* [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
@ 2020-10-21 15:03 Aubrey Li
  2020-10-21 17:40 ` kernel test robot
                   ` (5 more replies)
  0 siblings, 6 replies; 15+ messages in thread
From: Aubrey Li @ 2020-10-21 15:03 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, valentin.schneider
  Cc: tim.c.chen, linux-kernel, Aubrey Li, Qais Yousef, Jiang Biao, Aubrey Li

From: Aubrey Li <aubrey.li@intel.com>

Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
enters idle, its corresponding bit in the idle cpumask is set, and
when the CPU exits idle, its bit is cleared.

When a task wakes up and selects an idle CPU, scanning the idle cpumask
has a lower cost than scanning all the CPUs in the last level cache
domain, especially when the system is heavily loaded.

v2->v3:
- change setting idle cpumask to every idle entry, otherwise schbench
  has a regression of 99th percentile latency.
- change clearing idle cpumask to nohz_balancer_kick(), so updating
  idle cpumask is ratelimited in the idle exiting path.
- set SCHED_IDLE cpu in idle cpumask to allow it as a wakeup target.

v1->v2:
- idle cpumask is updated in the nohz routines, by initializing idle
  cpumask with sched_domain_span(sd), nohz=off case remains the original
  behavior.

Cc: Mel Gorman <mgorman@suse.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Qais Yousef <qais.yousef@arm.com>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Jiang Biao <benbjiang@gmail.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 include/linux/sched/topology.h | 13 ++++++++++
 kernel/sched/fair.c            | 45 +++++++++++++++++++++++++++++++++-
 kernel/sched/idle.c            |  1 +
 kernel/sched/sched.h           |  1 +
 kernel/sched/topology.c        |  3 ++-
 5 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..43a641d26154 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -65,8 +65,21 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	/*
+	 * Span of all idle CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 */
+	unsigned long	idle_cpus_span[];
 };
 
+static inline struct cpumask *sds_idle_cpus(struct sched_domain_shared *sds)
+{
+	return to_cpumask(sds->idle_cpus_span);
+}
+
 struct sched_domain {
 	/* These fields must be setup */
 	struct sched_domain __rcu *parent;	/* top domain must be null terminated */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b3b59cc51d6..088d1995594f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6023,6 +6023,38 @@ void __update_idle_core(struct rq *rq)
 	rcu_read_unlock();
 }
 
+static DEFINE_PER_CPU(bool, cpu_idle_state);
+/*
+ * Update cpu idle state and record this information
+ * in sd_llc_shared->idle_cpus_span.
+ */
+void update_idle_cpumask(struct rq *rq, bool idle_state)
+{
+	struct sched_domain *sd;
+	int cpu = cpu_of(rq);
+
+	/*
+	 * No need to update idle cpumask if the state
+	 * does not change.
+	 */
+	if (per_cpu(cpu_idle_state, cpu) == idle_state)
+		return;
+
+	per_cpu(cpu_idle_state, cpu) = idle_state;
+
+	rcu_read_lock();
+
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	if (!sd || !sd->shared)
+		goto unlock;
+	if (idle_state)
+		cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
+	else
+		cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
+unlock:
+	rcu_read_unlock();
+}
+
 /*
  * Scan the entire LLC domain for idle cores; this dynamically switches off if
  * there are no idle cores left in the system; tracked through
@@ -6136,7 +6168,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
 	time = cpu_clock(this);
 
-	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	/*
+	 * sched_domain_shared is set only at shared cache level,
+	 * this works only because select_idle_cpu is called with
+	 * sd_llc.
+	 */
+	cpumask_and(cpus, sds_idle_cpus(sd->shared), p->cpus_ptr);
 
 	for_each_cpu_wrap(cpu, cpus, target) {
 		if (!--nr)
@@ -10070,6 +10107,12 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (unlikely(rq->idle_balance))
 		return;
 
+	/* The CPU is not idle; update the idle cpumask */
+	if (unlikely(sched_idle_cpu(cpu))) {
+		/* Allow SCHED_IDLE cpu as a wakeup target */
+		update_idle_cpumask(rq, true);
+	} else
+		update_idle_cpumask(rq, false);
 	/*
 	 * We may be recently in ticked or tickless idle mode. At the first
 	 * busy tick after returning from idle, we will update the busy stats.
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1ae95b9150d3..ce1f929d7fbb 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -405,6 +405,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
 {
 	update_idle_core(rq);
+	update_idle_cpumask(rq, true);
 	schedstat_inc(rq->sched_goidle);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c82857e2e288..2d1655039ed5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,6 +1069,7 @@ static inline void update_idle_core(struct rq *rq)
 #else
 static inline void update_idle_core(struct rq *rq) { }
 #endif
+void update_idle_cpumask(struct rq *rq, bool idle_state);
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9079d865a935..f14a6ef4de57 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1407,6 +1407,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
 		atomic_inc(&sd->shared->ref);
 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+		cpumask_copy(sds_idle_cpus(sd->shared), sched_domain_span(sd));
 	}
 
 	sd->private = sdd;
@@ -1769,7 +1770,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 			*per_cpu_ptr(sdd->sd, j) = sd;
 
-			sds = kzalloc_node(sizeof(struct sched_domain_shared),
+			sds = kzalloc_node(sizeof(struct sched_domain_shared) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sds)
 				return -ENOMEM;
-- 
2.25.1



* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-10-21 15:03 [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup Aubrey Li
@ 2020-10-21 17:40 ` kernel test robot
  2020-10-22  1:28   ` Li, Aubrey
  2020-10-21 18:20 ` kernel test robot
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: kernel test robot @ 2020-10-21 17:40 UTC (permalink / raw)
  To: kbuild-all

Hi Aubrey,

[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master linus/master linux/master v5.9]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Aubrey-Li/sched-fair-select-idle-cpu-from-idle-cpumask-for-task-wakeup/20201021-230800
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git feff2e65efd8d84cf831668e182b2ce73c604bbb
config: nds32-allnoconfig (attached as .config)
compiler: nds32le-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/dbfb0d54ab82b9a27e28a8b850cfe1f1f542991a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aubrey-Li/sched-fair-select-idle-cpu-from-idle-cpumask-for-task-wakeup/20201021-230800
        git checkout dbfb0d54ab82b9a27e28a8b850cfe1f1f542991a
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=nds32 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   nds32le-linux-ld: kernel/sched/idle.o: in function `set_next_task_idle':
   idle.c:(.text+0x30): undefined reference to `update_idle_cpumask'
>> nds32le-linux-ld: idle.c:(.text+0x34): undefined reference to `update_idle_cpumask'
   nds32le-linux-ld: kernel/sched/idle.o: in function `pick_next_task_idle':
   idle.c:(.text+0x46): undefined reference to `update_idle_cpumask'
   nds32le-linux-ld: idle.c:(.text+0x4a): undefined reference to `update_idle_cpumask'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org



* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-10-21 15:03 [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup Aubrey Li
  2020-10-21 17:40 ` kernel test robot
@ 2020-10-21 18:20 ` kernel test robot
  2020-11-03 19:27 ` Valentin Schneider
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2020-10-21 18:20 UTC (permalink / raw)
  To: kbuild-all

Hi Aubrey,

[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master linus/master linux/master v5.9]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Aubrey-Li/sched-fair-select-idle-cpu-from-idle-cpumask-for-task-wakeup/20201021-230800
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git feff2e65efd8d84cf831668e182b2ce73c604bbb
config: arc-defconfig (attached as .config)
compiler: arc-elf-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/dbfb0d54ab82b9a27e28a8b850cfe1f1f542991a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aubrey-Li/sched-fair-select-idle-cpu-from-idle-cpumask-for-task-wakeup/20201021-230800
        git checkout dbfb0d54ab82b9a27e28a8b850cfe1f1f542991a
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arc-elf-ld: kernel/sched/fair.o: in function `trigger_load_balance':
>> fair.c:(.text+0x764e): undefined reference to `update_idle_cpumask'
>> arc-elf-ld: fair.c:(.text+0x764e): undefined reference to `update_idle_cpumask'
   arc-elf-ld: fair.c:(.text+0x7694): undefined reference to `update_idle_cpumask'
   arc-elf-ld: fair.c:(.text+0x7694): undefined reference to `update_idle_cpumask'
   arc-elf-ld: kernel/sched/idle.o: in function `set_next_task_idle':
   idle.c:(.text+0x30): undefined reference to `update_idle_cpumask'
   arc-elf-ld: kernel/sched/idle.o:idle.c:(.text+0x30): more undefined references to `update_idle_cpumask' follow

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org



* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-10-21 17:40 ` kernel test robot
@ 2020-10-22  1:28   ` Li, Aubrey
  0 siblings, 0 replies; 15+ messages in thread
From: Li, Aubrey @ 2020-10-22  1:28 UTC (permalink / raw)
  To: kbuild-all

On 2020/10/22 1:40, kernel test robot wrote:
> Hi Aubrey,
> 
> [FYI, it's a private test report for your RFC patch.]
> [auto build test ERROR on tip/sched/core]
> [also build test ERROR on tip/master linus/master linux/master v5.9]
> [If your patch is applied to the wrong git tree, kindly drop us a note.

Yes, this patch is based on v5.8.10 and I don't expect it to work on v5.9.
I posted it as an RFC to request more discussion from the community.

Thanks,
-Aubrey

> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
> 
> url:    https://github.com/0day-ci/linux/commits/Aubrey-Li/sched-fair-select-idle-cpu-from-idle-cpumask-for-task-wakeup/20201021-230800
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git feff2e65efd8d84cf831668e182b2ce73c604bbb
> config: nds32-allnoconfig (attached as .config)
> compiler: nds32le-linux-gcc (GCC) 9.3.0
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # https://github.com/0day-ci/linux/commit/dbfb0d54ab82b9a27e28a8b850cfe1f1f542991a
>         git remote add linux-review https://github.com/0day-ci/linux
>         git fetch --no-tags linux-review Aubrey-Li/sched-fair-select-idle-cpu-from-idle-cpumask-for-task-wakeup/20201021-230800
>         git checkout dbfb0d54ab82b9a27e28a8b850cfe1f1f542991a
>         # save the attached .config to linux build tree
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=nds32 
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>    nds32le-linux-ld: kernel/sched/idle.o: in function `set_next_task_idle':
>    idle.c:(.text+0x30): undefined reference to `update_idle_cpumask'
>>> nds32le-linux-ld: idle.c:(.text+0x34): undefined reference to `update_idle_cpumask'
>    nds32le-linux-ld: kernel/sched/idle.o: in function `pick_next_task_idle':
>    idle.c:(.text+0x46): undefined reference to `update_idle_cpumask'
>    nds32le-linux-ld: idle.c:(.text+0x4a): undefined reference to `update_idle_cpumask'
> 
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
> 


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-10-21 15:03 [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup Aubrey Li
  2020-10-21 17:40 ` kernel test robot
  2020-10-21 18:20 ` kernel test robot
@ 2020-11-03 19:27 ` Valentin Schneider
  2020-11-04 11:52   ` Li, Aubrey
  2020-11-06  7:58 ` Vincent Guittot
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Valentin Schneider @ 2020-11-03 19:27 UTC (permalink / raw)
  To: Aubrey Li
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, tim.c.chen, linux-kernel, Aubrey Li,
	Qais Yousef, Jiang Biao


Hi,

On 21/10/20 16:03, Aubrey Li wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6b3b59cc51d6..088d1995594f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6023,6 +6023,38 @@ void __update_idle_core(struct rq *rq)
>       rcu_read_unlock();
>  }
>
> +static DEFINE_PER_CPU(bool, cpu_idle_state);

I would've expected this to be far less compact than a cpumask, but that's
not the story readelf is telling me. Objdump tells me this is recouping
some of the padding in .data..percpu, at least with the arm64 defconfig.

In any case this ought to be better wrt cacheline bouncing, which I suppose
is what we ultimately want here.

Also, see rambling about init value below.

> @@ -10070,6 +10107,12 @@ static void nohz_balancer_kick(struct rq *rq)
>       if (unlikely(rq->idle_balance))
>               return;
>
> +	/* The CPU is not idle; update the idle cpumask */
> +	if (unlikely(sched_idle_cpu(cpu))) {
> +		/* Allow SCHED_IDLE cpu as a wakeup target */
> +		update_idle_cpumask(rq, true);
> +	} else
> +		update_idle_cpumask(rq, false);

This means that without CONFIG_NO_HZ_COMMON, a CPU going into idle will
never be accounted as going out of it, right? Eventually the cpumask
should end up full, which conceptually implements the previous behaviour of
select_idle_cpu() but in a fairly roundabout way...

> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9079d865a935..f14a6ef4de57 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1407,6 +1407,7 @@ sd_init(struct sched_domain_topology_level *tl,
>               sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
>               atomic_inc(&sd->shared->ref);
>               atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
> +		cpumask_copy(sds_idle_cpus(sd->shared), sched_domain_span(sd));

So at init you would have (single LLC for sake of simplicity):

  \all cpu : cpu_idle_state[cpu]  == false
  cpumask_full(sds_idle_cpus)     == true

IOW it'll require all CPUs to go idle at some point for these two states to
be properly aligned. Should cpu_idle_state not then be init'd to 1?

This then happens again for hotplug, except that cpu_idle_state[cpu] may be
either true or false when the sds_idle_cpus mask is reset to 1's.
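
For illustration, the init'd-to-1 variant would be a one-liner, assuming the
per-CPU flag keeps its current form (a sketch, untested):

	/* Match the initially-full sds_idle_cpus() mask at boot */
	static DEFINE_PER_CPU(bool, cpu_idle_state) = true;

though the hotplug path would then still need to re-sync the per-CPU flags
with the freshly reset mask.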


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-11-03 19:27 ` Valentin Schneider
@ 2020-11-04 11:52   ` Li, Aubrey
  2020-11-06 21:22     ` Valentin Schneider
  0 siblings, 1 reply; 15+ messages in thread
From: Li, Aubrey @ 2020-11-04 11:52 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, tim.c.chen, linux-kernel, Aubrey Li,
	Qais Yousef, Jiang Biao

Hi Valentin,

Thanks for your reply.

On 2020/11/4 3:27, Valentin Schneider wrote:
> 
> Hi,
> 
> On 21/10/20 16:03, Aubrey Li wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6b3b59cc51d6..088d1995594f 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6023,6 +6023,38 @@ void __update_idle_core(struct rq *rq)
>>       rcu_read_unlock();
>>  }
>>
>> +static DEFINE_PER_CPU(bool, cpu_idle_state);
> 
> I would've expected this to be far less compact than a cpumask, but that's
> not the story readelf is telling me. Objdump tells me this is recouping
> some of the padding in .data..percpu, at least with the arm64 defconfig.
> 
> In any case this ought to be better wrt cacheline bouncing, which I suppose
> is what we ultimately want here.

Yes, every CPU has a byte, so it may not be smaller than a cpumask. Perhaps
I could put it into struct rq; do you have any better suggestions?

> 
> Also, see rambling about init value below.
> 
>> @@ -10070,6 +10107,12 @@ static void nohz_balancer_kick(struct rq *rq)
>>       if (unlikely(rq->idle_balance))
>>               return;
>>
>> +	/* The CPU is not idle; update the idle cpumask */
>> +	if (unlikely(sched_idle_cpu(cpu))) {
>> +		/* Allow SCHED_IDLE cpu as a wakeup target */
>> +		update_idle_cpumask(rq, true);
>> +	} else
>> +		update_idle_cpumask(rq, false);
> 
> This means that without CONFIG_NO_HZ_COMMON, a CPU going into idle will
> never be accounted as going out of it, right? Eventually the cpumask
> should end up full, which conceptually implements the previous behaviour of
> select_idle_cpu() but in a fairly roundabout way...

Maybe I can move it to scheduler_tick().
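
For illustration, a rough sketch of that idea (hypothetical placement in
scheduler_tick() in kernel/sched/core.c, untested):

	void scheduler_tick(void)
	{
		int cpu = smp_processor_id();
		struct rq *rq = cpu_rq(cpu);

		/* ... existing tick work ... */

		/*
		 * A ticking CPU is busy, so clear it from the idle cpumask
		 * unless it is only running SCHED_IDLE tasks.
		 */
		update_idle_cpumask(rq, sched_idle_cpu(cpu));
	}

That would keep the clearing naturally rate-limited to the tick frequency
even with nohz=off.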

> 
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index 9079d865a935..f14a6ef4de57 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -1407,6 +1407,7 @@ sd_init(struct sched_domain_topology_level *tl,
>>               sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
>>               atomic_inc(&sd->shared->ref);
>>               atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
>> +		cpumask_copy(sds_idle_cpus(sd->shared), sched_domain_span(sd));
> 
> So at init you would have (single LLC for sake of simplicity):
> 
>   \all cpu : cpu_idle_state[cpu]  == false
>   cpumask_full(sds_idle_cpus)     == true
> 
> IOW it'll require all CPUs to go idle at some point for these two states to
> be properly aligned. Should cpu_idle_state not then be init'd to 1?
> 
> This then happens again for hotplug, except that cpu_idle_state[cpu] may be
> either true or false when the sds_idle_cpus mask is reset to 1's.
> 

okay, will refine this in the next version.

Thanks,
-Aubrey


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-10-21 15:03 [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup Aubrey Li
                   ` (2 preceding siblings ...)
  2020-11-03 19:27 ` Valentin Schneider
@ 2020-11-06  7:58 ` Vincent Guittot
  2020-11-09  6:05   ` Li, Aubrey
  2020-11-06 21:20 ` Valentin Schneider
  2020-11-12 10:57 ` Qais Yousef
  5 siblings, 1 reply; 15+ messages in thread
From: Vincent Guittot @ 2020-11-06  7:58 UTC (permalink / raw)
  To: Aubrey Li
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
	Tim Chen, linux-kernel, Aubrey Li, Qais Yousef, Jiang Biao

On Wed, 21 Oct 2020 at 17:05, Aubrey Li <aubrey.li@linux.intel.com> wrote:
>
> From: Aubrey Li <aubrey.li@intel.com>
>
> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
> enters idle, its corresponding bit in the idle cpumask is set, and
> when the CPU exits idle, its bit is cleared.
>
> When a task wakes up and selects an idle CPU, scanning the idle cpumask
> has a lower cost than scanning all the CPUs in the last level cache
> domain, especially when the system is heavily loaded.
>
> v2->v3:
> - change setting idle cpumask to every idle entry, otherwise schbench
>   has a regression of 99th percentile latency.
> - change clearing idle cpumask to nohz_balancer_kick(), so updating
>   idle cpumask is ratelimited in the idle exiting path.
> - set SCHED_IDLE cpu in idle cpumask to allow it as a wakeup target.
>
> v1->v2:
> - idle cpumask is updated in the nohz routines, by initializing idle
>   cpumask with sched_domain_span(sd), nohz=off case remains the original
>   behavior.
>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Qais Yousef <qais.yousef@arm.com>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Jiang Biao <benbjiang@gmail.com>
> Cc: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
> ---
>  include/linux/sched/topology.h | 13 ++++++++++
>  kernel/sched/fair.c            | 45 +++++++++++++++++++++++++++++++++-
>  kernel/sched/idle.c            |  1 +
>  kernel/sched/sched.h           |  1 +
>  kernel/sched/topology.c        |  3 ++-
>  5 files changed, 61 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index fb11091129b3..43a641d26154 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -65,8 +65,21 @@ struct sched_domain_shared {
>         atomic_t        ref;
>         atomic_t        nr_busy_cpus;
>         int             has_idle_cores;
> +       /*
> +        * Span of all idle CPUs in this domain.
> +        *
> +        * NOTE: this field is variable length. (Allocated dynamically
> +        * by attaching extra space to the end of the structure,
> +        * depending on how many CPUs the kernel has booted up with)
> +        */
> +       unsigned long   idle_cpus_span[];
>  };
>
> +static inline struct cpumask *sds_idle_cpus(struct sched_domain_shared *sds)
> +{
> +       return to_cpumask(sds->idle_cpus_span);
> +}
> +
>  struct sched_domain {
>         /* These fields must be setup */
>         struct sched_domain __rcu *parent;      /* top domain must be null terminated */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6b3b59cc51d6..088d1995594f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6023,6 +6023,38 @@ void __update_idle_core(struct rq *rq)
>         rcu_read_unlock();
>  }
>
> +static DEFINE_PER_CPU(bool, cpu_idle_state);
> +/*
> + * Update cpu idle state and record this information
> + * in sd_llc_shared->idle_cpus_span.
> + */
> +void update_idle_cpumask(struct rq *rq, bool idle_state)
> +{
> +       struct sched_domain *sd;
> +       int cpu = cpu_of(rq);
> +
> +       /*
> +        * No need to update idle cpumask if the state
> +        * does not change.
> +        */
> +       if (per_cpu(cpu_idle_state, cpu) == idle_state)
> +               return;
> +
> +       per_cpu(cpu_idle_state, cpu) = idle_state;
> +
> +       rcu_read_lock();
> +
> +       sd = rcu_dereference(per_cpu(sd_llc, cpu));
> +       if (!sd || !sd->shared)
> +               goto unlock;
> +       if (idle_state)
> +               cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
> +       else
> +               cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
> +unlock:
> +       rcu_read_unlock();
> +}
> +
>  /*
>   * Scan the entire LLC domain for idle cores; this dynamically switches off if
>   * there are no idle cores left in the system; tracked through
> @@ -6136,7 +6168,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>
>         time = cpu_clock(this);
>
> -       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +       /*
> +        * sched_domain_shared is set only at shared cache level,
> +        * this works only because select_idle_cpu is called with
> +        * sd_llc.
> +        */
> +       cpumask_and(cpus, sds_idle_cpus(sd->shared), p->cpus_ptr);
>
>         for_each_cpu_wrap(cpu, cpus, target) {
>                 if (!--nr)
> @@ -10070,6 +10107,12 @@ static void nohz_balancer_kick(struct rq *rq)
>         if (unlikely(rq->idle_balance))
>                 return;
>
> +       /* The CPU is not idle; update the idle cpumask */
> +       if (unlikely(sched_idle_cpu(cpu))) {
> +               /* Allow SCHED_IDLE cpu as a wakeup target */
> +               update_idle_cpumask(rq, true);
> +       } else
> +               update_idle_cpumask(rq, false);

update_idle_cpumask(rq, sched_idle_cpu(cpu)); ?
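
i.e. a sketch of the simplified hunk (untested):

	/*
	 * The CPU is not idle; update the idle cpumask. A CPU running
	 * only SCHED_IDLE tasks is still allowed as a wakeup target.
	 */
	update_idle_cpumask(rq, sched_idle_cpu(cpu));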


>         /*
>          * We may be recently in ticked or tickless idle mode. At the first
>          * busy tick after returning from idle, we will update the busy stats.
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index 1ae95b9150d3..ce1f929d7fbb 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -405,6 +405,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
>  static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
>  {
>         update_idle_core(rq);
> +       update_idle_cpumask(rq, true);
>         schedstat_inc(rq->sched_goidle);
>  }
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index c82857e2e288..2d1655039ed5 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1069,6 +1069,7 @@ static inline void update_idle_core(struct rq *rq)
>  #else
>  static inline void update_idle_core(struct rq *rq) { }
>  #endif
> +void update_idle_cpumask(struct rq *rq, bool idle_state);
>
>  DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9079d865a935..f14a6ef4de57 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1407,6 +1407,7 @@ sd_init(struct sched_domain_topology_level *tl,
>                 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
>                 atomic_inc(&sd->shared->ref);
>                 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
> +               cpumask_copy(sds_idle_cpus(sd->shared), sched_domain_span(sd));
>         }
>
>         sd->private = sdd;
> @@ -1769,7 +1770,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
>
>                         *per_cpu_ptr(sdd->sd, j) = sd;
>
> -                       sds = kzalloc_node(sizeof(struct sched_domain_shared),
> +                       sds = kzalloc_node(sizeof(struct sched_domain_shared) + cpumask_size(),
>                                         GFP_KERNEL, cpu_to_node(j));
>                         if (!sds)
>                                 return -ENOMEM;
> --
> 2.25.1
>


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-10-21 15:03 [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup Aubrey Li
                   ` (3 preceding siblings ...)
  2020-11-06  7:58 ` Vincent Guittot
@ 2020-11-06 21:20 ` Valentin Schneider
  2020-11-09 13:40   ` Li, Aubrey
  2020-11-12 10:57 ` Qais Yousef
  5 siblings, 1 reply; 15+ messages in thread
From: Valentin Schneider @ 2020-11-06 21:20 UTC (permalink / raw)
  To: Aubrey Li
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, tim.c.chen, linux-kernel, Aubrey Li,
	Qais Yousef, Jiang Biao


On 21/10/20 16:03, Aubrey Li wrote:
> From: Aubrey Li <aubrey.li@intel.com>
>
> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
> enters idle, its corresponding bit in the idle cpumask is set, and
> when the CPU exits idle, its bit is cleared.
>
> When a task wakes up and selects an idle CPU, scanning the idle cpumask
> has a lower cost than scanning all the CPUs in the last level cache
> domain, especially when the system is heavily loaded.
>

FWIW I gave this a spin on my arm64 desktop (Ampere eMAG, 32 core). I get
some barely noticeable (AIUI not statistically significant for bench sched)
changes for 100 iterations of:

| bench                              | metric   |   mean |     std |    q90 |    q99 |
|------------------------------------+----------+--------+---------+--------+--------|
| hackbench --loops 5000 --groups 1  | duration | -1.07% |  -2.23% | -0.88% | -0.25% |
| hackbench --loops 5000 --groups 2  | duration | -0.79% | +30.60% | -0.49% | -0.74% |
| hackbench --loops 5000 --groups 4  | duration | -0.54% |  +6.99% | -0.21% | -0.12% |
| perf bench sched pipe -T -l 100000 | ops/sec  | +1.05% |  -2.80% | -0.17% | +0.39% |

q90 & q99 being the 90th and 99th percentile.

Base was tip/sched/core at:
d8fcb81f1acf ("sched/fair: Check for idle core in wake_affine")

> v2->v3:
> - change setting idle cpumask to every idle entry, otherwise schbench
>   has a regression of 99th percentile latency.
> - change clearing idle cpumask to nohz_balancer_kick(), so updating
>   idle cpumask is ratelimited in the idle exiting path.
> - set SCHED_IDLE cpu in idle cpumask to allow it as a wakeup target.
>
> v1->v2:
> - idle cpumask is updated in the nohz routines, by initializing idle
>   cpumask with sched_domain_span(sd), nohz=off case remains the original
>   behavior.
>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Qais Yousef <qais.yousef@arm.com>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Jiang Biao <benbjiang@gmail.com>
> Cc: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-11-04 11:52   ` Li, Aubrey
@ 2020-11-06 21:22     ` Valentin Schneider
  0 siblings, 0 replies; 15+ messages in thread
From: Valentin Schneider @ 2020-11-06 21:22 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, tim.c.chen, linux-kernel, Aubrey Li,
	Qais Yousef, Jiang Biao


On 04/11/20 11:52, Li, Aubrey wrote:
> On 2020/11/4 3:27, Valentin Schneider wrote:
>>> +static DEFINE_PER_CPU(bool, cpu_idle_state);
>>
>> I would've expected this to be far less compact than a cpumask, but that's
>> not the story readelf is telling me. Objdump tells me this is recouping
>> some of the padding in .data..percpu, at least with the arm64 defconfig.
>>
>> In any case this ought to be better wrt cacheline bouncing, which I suppose
>> is what we ultimately want here.
>
> Yes, every CPU has a byte, so it may not be smaller than a cpumask. Perhaps
> I could put it into struct rq; do you have any better suggestions?
>

Not really, I'm afraid.


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-11-06  7:58 ` Vincent Guittot
@ 2020-11-09  6:05   ` Li, Aubrey
  0 siblings, 0 replies; 15+ messages in thread
From: Li, Aubrey @ 2020-11-09  6:05 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
	Tim Chen, linux-kernel, Aubrey Li, Qais Yousef, Jiang Biao

On 2020/11/6 15:58, Vincent Guittot wrote:
> On Wed, 21 Oct 2020 at 17:05, Aubrey Li <aubrey.li@linux.intel.com> wrote:
>>
>> From: Aubrey Li <aubrey.li@intel.com>
>>
>> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
>> enters idle, its corresponding bit in the idle cpumask is set, and
>> when the CPU exits idle, its bit is cleared.
>>
>> When a task wakes up and selects an idle CPU, scanning the idle cpumask
>> has a lower cost than scanning all the CPUs in the last level cache
>> domain, especially when the system is heavily loaded.
>>
>> v2->v3:
>> - change setting idle cpumask to every idle entry, otherwise schbench
>>   has a regression of 99th percentile latency.
>> - change clearing idle cpumask to nohz_balancer_kick(), so updating
>>   idle cpumask is ratelimited in the idle exiting path.
>> - set SCHED_IDLE cpu in idle cpumask to allow it as a wakeup target.
>>
>> v1->v2:
>> - idle cpumask is updated in the nohz routines, by initializing idle
>>   cpumask with sched_domain_span(sd), nohz=off case remains the original
>>   behavior.
>>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Vincent Guittot <vincent.guittot@linaro.org>
>> Cc: Qais Yousef <qais.yousef@arm.com>
>> Cc: Valentin Schneider <valentin.schneider@arm.com>
>> Cc: Jiang Biao <benbjiang@gmail.com>
>> Cc: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
>> ---
>>  include/linux/sched/topology.h | 13 ++++++++++
>>  kernel/sched/fair.c            | 45 +++++++++++++++++++++++++++++++++-
>>  kernel/sched/idle.c            |  1 +
>>  kernel/sched/sched.h           |  1 +
>>  kernel/sched/topology.c        |  3 ++-
>>  5 files changed, 61 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
>> index fb11091129b3..43a641d26154 100644
>> --- a/include/linux/sched/topology.h
>> +++ b/include/linux/sched/topology.h
>> @@ -65,8 +65,21 @@ struct sched_domain_shared {
>>         atomic_t        ref;
>>         atomic_t        nr_busy_cpus;
>>         int             has_idle_cores;
>> +       /*
>> +        * Span of all idle CPUs in this domain.
>> +        *
>> +        * NOTE: this field is variable length. (Allocated dynamically
>> +        * by attaching extra space to the end of the structure,
>> +        * depending on how many CPUs the kernel has booted up with)
>> +        */
>> +       unsigned long   idle_cpus_span[];
>>  };
>>
>> +static inline struct cpumask *sds_idle_cpus(struct sched_domain_shared *sds)
>> +{
>> +       return to_cpumask(sds->idle_cpus_span);
>> +}
>> +
>>  struct sched_domain {
>>         /* These fields must be setup */
>>         struct sched_domain __rcu *parent;      /* top domain must be null terminated */
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6b3b59cc51d6..088d1995594f 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6023,6 +6023,38 @@ void __update_idle_core(struct rq *rq)
>>         rcu_read_unlock();
>>  }
>>
>> +static DEFINE_PER_CPU(bool, cpu_idle_state);
>> +/*
>> + * Update cpu idle state and record this information
>> + * in sd_llc_shared->idle_cpus_span.
>> + */
>> +void update_idle_cpumask(struct rq *rq, bool idle_state)
>> +{
>> +       struct sched_domain *sd;
>> +       int cpu = cpu_of(rq);
>> +
>> +       /*
>> +        * No need to update idle cpumask if the state
>> +        * does not change.
>> +        */
>> +       if (per_cpu(cpu_idle_state, cpu) == idle_state)
>> +               return;
>> +
>> +       per_cpu(cpu_idle_state, cpu) = idle_state;
>> +
>> +       rcu_read_lock();
>> +
>> +       sd = rcu_dereference(per_cpu(sd_llc, cpu));
>> +       if (!sd || !sd->shared)
>> +               goto unlock;
>> +       if (idle_state)
>> +               cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
>> +       else
>> +               cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
>> +unlock:
>> +       rcu_read_unlock();
>> +}
>> +
>>  /*
>>   * Scan the entire LLC domain for idle cores; this dynamically switches off if
>>   * there are no idle cores left in the system; tracked through
>> @@ -6136,7 +6168,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>>
>>         time = cpu_clock(this);
>>
>> -       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
>> +       /*
>> +        * sched_domain_shared is set only at shared cache level,
>> +        * this works only because select_idle_cpu is called with
>> +        * sd_llc.
>> +        */
>> +       cpumask_and(cpus, sds_idle_cpus(sd->shared), p->cpus_ptr);
>>
>>         for_each_cpu_wrap(cpu, cpus, target) {
>>                 if (!--nr)
>> @@ -10070,6 +10107,12 @@ static void nohz_balancer_kick(struct rq *rq)
>>         if (unlikely(rq->idle_balance))
>>                 return;
>>
>> +       /* The CPU is not idle; update the idle cpumask */
>> +       if (unlikely(sched_idle_cpu(cpu))) {
>> +               /* Allow SCHED_IDLE cpu as a wakeup target */
>> +               update_idle_cpumask(rq, true);
>> +       } else
>> +               update_idle_cpumask(rq, false);
> 
> update_idle_cpumask(rq, sched_idle_cpu(cpu)); ?

This looks much better, thanks! :)


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-11-06 21:20 ` Valentin Schneider
@ 2020-11-09 13:40   ` Li, Aubrey
  2020-11-09 15:54     ` Valentin Schneider
  0 siblings, 1 reply; 15+ messages in thread
From: Li, Aubrey @ 2020-11-09 13:40 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, tim.c.chen, linux-kernel, Aubrey Li,
	Qais Yousef, Jiang Biao

On 2020/11/7 5:20, Valentin Schneider wrote:
> 
> On 21/10/20 16:03, Aubrey Li wrote:
>> From: Aubrey Li <aubrey.li@intel.com>
>>
>> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
>> enters idle, its corresponding bit in the idle cpumask is set, and
>> when the CPU exits idle, its bit is cleared.
>>
>> When a task wakes up and selects an idle CPU, scanning the idle cpumask
>> has a lower cost than scanning all the CPUs in the last level cache
>> domain, especially when the system is heavily loaded.
>>
> 
> FWIW I gave this a spin on my arm64 desktop (Ampere eMAG, 32 core). I get
> some barely noticeable (AIUI not statistically significant for bench sched)
> changes for 100 iterations of:
> 
> | bench                              | metric   |   mean |     std |    q90 |    q99 |
> |------------------------------------+----------+--------+---------+--------+--------|
> | hackbench --loops 5000 --groups 1  | duration | -1.07% |  -2.23% | -0.88% | -0.25% |
> | hackbench --loops 5000 --groups 2  | duration | -0.79% | +30.60% | -0.49% | -0.74% |
> | hackbench --loops 5000 --groups 4  | duration | -0.54% |  +6.99% | -0.21% | -0.12% |
> | perf bench sched pipe -T -l 100000 | ops/sec  | +1.05% |  -2.80% | -0.17% | +0.39% |
> 
> q90 & q99 being the 90th and 99th percentile.
> 
> Base was tip/sched/core at:
> d8fcb81f1acf ("sched/fair: Check for idle core in wake_affine")

Thanks for the data, Valentin! So does the negative value mean improvement?

If so, the data looks as expected to me. We set the idle cpumask every time
a CPU enters idle but only clear it at the tick frequency, so if the workload
is not heavy enough, there can be a lot of idle time between two ticks, and
the idle cpumask stays almost equal to sched_domain_span(sd), which makes no
difference.

But if the system load is heavy enough, a CPU has little or no chance to
enter idle, so the idle cpumask can be cleared during the tick, which makes
the number of set bits in sds_idle_cpus(sd->shared) far smaller than in
sched_domain_span(sd) when the LLC domain has a large CPU count.

For example, when I run 4 x overcommit uperf on a system with 192 CPUs,
I observed:
- default, the average of this_sd->avg_scan_cost is 223.12ns
- patch, the average of this_sd->avg_scan_cost is 63.4ns

And select_idle_cpu() is called 7670253 times per second, so per CPU the
saved scan cost is (223.12 - 63.4)ns * 7670253 / 192 ≈ 6.4ms per second.
As a result, I saw uperf throughput improve by 60+%.

Thanks,
-Aubrey

* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-11-09 13:40   ` Li, Aubrey
@ 2020-11-09 15:54     ` Valentin Schneider
  2020-11-11  8:38       ` Li, Aubrey
  0 siblings, 1 reply; 15+ messages in thread
From: Valentin Schneider @ 2020-11-09 15:54 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, tim.c.chen, linux-kernel, Aubrey Li,
	Qais Yousef, Jiang Biao


On 09/11/20 13:40, Li, Aubrey wrote:
> On 2020/11/7 5:20, Valentin Schneider wrote:
>>
>> On 21/10/20 16:03, Aubrey Li wrote:
>>> From: Aubrey Li <aubrey.li@intel.com>
>>>
>>> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
>>> enters idle, its corresponding bit in the idle cpumask is set, and
>>> when the CPU exits idle, its bit is cleared.
>>>
>>> When a task wakes up and selects an idle CPU, scanning the idle cpumask
>>> has a lower cost than scanning all the CPUs in the last level cache
>>> domain, especially when the system is heavily loaded.
>>>
>>
>> FWIW I gave this a spin on my arm64 desktop (Ampere eMAG, 32 core). I get
>> some barely noticeable (AIUI not statistically significant for bench sched)
>> changes for 100 iterations of:
>>
>> | bench                              | metric   |   mean |     std |    q90 |    q99 |
>> |------------------------------------+----------+--------+---------+--------+--------|
>> | hackbench --loops 5000 --groups 1  | duration | -1.07% |  -2.23% | -0.88% | -0.25% |
>> | hackbench --loops 5000 --groups 2  | duration | -0.79% | +30.60% | -0.49% | -0.74% |
>> | hackbench --loops 5000 --groups 4  | duration | -0.54% |  +6.99% | -0.21% | -0.12% |
>> | perf bench sched pipe -T -l 100000 | ops/sec  | +1.05% |  -2.80% | -0.17% | +0.39% |
>>
>> q90 & q99 being the 90th and 99th percentile.
>>
>> Base was tip/sched/core at:
>> d8fcb81f1acf ("sched/fair: Check for idle core in wake_affine")
>
> Thanks for the data, Valentin! So does the negative value mean improvement?
>

For hackbench yes (shorter is better); for perf bench sched no, since the
metric here is ops/sec so higher is better.

That said, I (use a tool that) run a 2-sample Kolmogorov–Smirnov test
against the two sample sets (tip/sched/core vs tip/sched/core+patch), and
the p-value for perf sched bench is quite high (~0.9) which means we can't
reject that both sample sets come from the same distribution; long story
short we can't say whether the patch had a noticeable impact for that
benchmark.

> If so, the data looks as expected to me. We set the idle cpumask every time
> a CPU enters idle but only clear it at the tick frequency, so if the workload
> is not heavy enough, there can be a lot of idle time between two ticks, and
> the idle cpumask stays almost equal to sched_domain_span(sd), which makes no
> difference.
>
> But if the system load is heavy enough, a CPU has little or no chance to
> enter idle, so the idle cpumask can be cleared during the tick, which makes
> the number of set bits in sds_idle_cpus(sd->shared) far smaller than in
> sched_domain_span(sd) when the LLC domain has a large CPU count.
>

With hackbench -g 4 that's 160 tasks (against 32 CPUs, all under same LLC),
although the work done by each task isn't much. I'll try bumping that a
notch, or increasing the size of the messages.

> For example, when I run 4 x overcommit uperf on a system with 192 CPUs,
> I observed:
> - default, the average of this_sd->avg_scan_cost is 223.12ns
> - patch, the average of this_sd->avg_scan_cost is 63.4ns
>
> And select_idle_cpu() is called 7670253 times per second, so per CPU the
> saved scan cost is (223.12 - 63.4)ns * 7670253 / 192 ≈ 6.4ms per second.
> As a result, I saw uperf throughput improve by 60+%.
>

That's ~1.2s of "extra" CPU time per second, which sounds pretty cool.

I don't think I've ever played with uperf. I'll give that a shot someday.

> Thanks,
> -Aubrey
>
>
>


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-11-09 15:54     ` Valentin Schneider
@ 2020-11-11  8:38       ` Li, Aubrey
  0 siblings, 0 replies; 15+ messages in thread
From: Li, Aubrey @ 2020-11-11  8:38 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, tim.c.chen, linux-kernel, Aubrey Li,
	Qais Yousef, Jiang Biao

On 2020/11/9 23:54, Valentin Schneider wrote:
> 
> On 09/11/20 13:40, Li, Aubrey wrote:
>> On 2020/11/7 5:20, Valentin Schneider wrote:
>>>
>>> On 21/10/20 16:03, Aubrey Li wrote:
>>>> From: Aubrey Li <aubrey.li@intel.com>
>>>>
>>>> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
>>>> enters idle, its corresponding bit in the idle cpumask is set, and
>>>> when the CPU exits idle, its bit is cleared.
>>>>
>>>> When a task wakes up and selects an idle CPU, scanning the idle cpumask
>>>> has a lower cost than scanning all the CPUs in the last level cache
>>>> domain, especially when the system is heavily loaded.
>>>>
>>>
>>> FWIW I gave this a spin on my arm64 desktop (Ampere eMAG, 32 core). I get
>>> some barely noticeable (AIUI not statistically significant for bench sched)
>>> changes for 100 iterations of:
>>>
>>> | bench                              | metric   |   mean |     std |    q90 |    q99 |
>>> |------------------------------------+----------+--------+---------+--------+--------|
>>> | hackbench --loops 5000 --groups 1  | duration | -1.07% |  -2.23% | -0.88% | -0.25% |
>>> | hackbench --loops 5000 --groups 2  | duration | -0.79% | +30.60% | -0.49% | -0.74% |
>>> | hackbench --loops 5000 --groups 4  | duration | -0.54% |  +6.99% | -0.21% | -0.12% |
>>> | perf bench sched pipe -T -l 100000 | ops/sec  | +1.05% |  -2.80% | -0.17% | +0.39% |
>>>
>>> q90 & q99 being the 90th and 99th percentile.
>>>
>>> Base was tip/sched/core at:
>>> d8fcb81f1acf ("sched/fair: Check for idle core in wake_affine")
>>
>> Thanks for the data, Valentin! So does the negative value mean improvement?
>>
> 
> For hackbench yes (shorter is better); for perf bench sched no, since the
> metric here is ops/sec so higher is better.
> 
> That said, I (use a tool that) run a 2-sample Kolmogorov–Smirnov test
> against the two sample sets (tip/sched/core vs tip/sched/core+patch), and
> the p-value for perf sched bench is quite high (~0.9) which means we can't
> reject that both sample sets come from the same distribution; long story
> short we can't say whether the patch had a noticeable impact for that
> benchmark.
> 
>> If so, the data looks as expected to me. We set the idle cpumask every time
>> a CPU enters idle but only clear it at the tick frequency, so if the workload
>> is not heavy enough, there can be a lot of idle time between two ticks, and
>> the idle cpumask stays almost equal to sched_domain_span(sd), which makes no
>> difference.
>>
>> But if the system load is heavy enough, a CPU has little or no chance to
>> enter idle, so the idle cpumask can be cleared during the tick, which makes
>> the number of set bits in sds_idle_cpus(sd->shared) far smaller than in
>> sched_domain_span(sd) when the LLC domain has a large CPU count.
>>
> 
> With hackbench -g 4 that's 160 tasks (against 32 CPUs, all under same LLC),
> although the work done by each task isn't much. I'll try bumping that a
> notch, or increasing the size of the messages.

As long as the system is busy enough that the idle thread is not scheduled
in, the idle cpumask will shrink tick by tick, and we'll see a lower
sd->avg_scan_cost.

This version of the patch sets the idle cpu bit every time a CPU enters
idle, so a heavy load is needed for the scheduler not to switch the idle
thread in.

I personally like the logic in the previous versions better; it worked
roughly as sketched below:
- when a CPU enters idle, the cpuidle governor returns a "stop_tick" flag
- if the tick is stopped, which indicates the CPU is not busy, the CPU
  can be set in the idle cpumask
- otherwise, the CPU is likely going to work again very soon, so it is
  not set in the idle cpumask.
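
A hypothetical sketch of that earlier idea (names as in kernel/sched/idle.c's
cpuidle_idle_call(); the governor setup and idle entry are elided, untested):

	static void cpuidle_idle_call(void)
	{
		bool stop_tick = true;
		int next_state;

		/* ... the governor picks a state and may veto stopping the tick ... */
		next_state = cpuidle_select(drv, dev, &stop_tick);

		/*
		 * Only a CPU whose tick is stopped is expected to stay idle
		 * for a while, so only then mark it in the idle cpumask.
		 */
		update_idle_cpumask(this_rq(), stop_tick);

		/* ... enter the selected idle state ... */
	}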

But apparently I missed the "nohz=off" case in the previous implementation.
For the "nohz=off" case I chose to keep the original behavior, which didn't
satisfy Mel. Probably I can refine it in the next version.

Do you have any suggestions?

Thanks,
-Aubrey


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-10-21 15:03 [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup Aubrey Li
                   ` (4 preceding siblings ...)
  2020-11-06 21:20 ` Valentin Schneider
@ 2020-11-12 10:57 ` Qais Yousef
  2020-11-12 12:12   ` Li, Aubrey
  5 siblings, 1 reply; 15+ messages in thread
From: Qais Yousef @ 2020-11-12 10:57 UTC (permalink / raw)
  To: Aubrey Li
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, valentin.schneider, tim.c.chen,
	linux-kernel, Aubrey Li, Jiang Biao

On 10/21/20 23:03, Aubrey Li wrote:
> From: Aubrey Li <aubrey.li@intel.com>
> 
> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
> enters idle, its corresponding bit in the idle cpumask is set, and
> when the CPU exits idle, its bit is cleared.
>
> When a task wakes up and selects an idle CPU, scanning the idle cpumask
> has a lower cost than scanning all the CPUs in the last level cache
> domain, especially when the system is heavily loaded.
> 
> v2->v3:
> - change setting idle cpumask to every idle entry, otherwise schbench
>   has a regression of 99th percentile latency.
> - change clearing idle cpumask to nohz_balancer_kick(), so updating
>   idle cpumask is ratelimited in the idle exiting path.
> - set SCHED_IDLE cpu in idle cpumask to allow it as a wakeup target.
> 
> v1->v2:
> - idle cpumask is updated in the nohz routines, by initializing idle
>   cpumask with sched_domain_span(sd), nohz=off case remains the original
>   behavior.

Did you intend to put the patch version history in the commit message?

I started looking at this last week but got distracted. I see you already got
enough reviews, so my 2p is that I faced some compilation issues:

	aarch64-linux-gnu-ld: kernel/sched/idle.o: in function `set_next_task_idle':
	/mnt/data/src/linux/kernel/sched/idle.c:405: undefined reference to `update_idle_cpumask'
	aarch64-linux-gnu-ld: kernel/sched/fair.o: in function `nohz_balancer_kick':
	/mnt/data/src/linux/kernel/sched/fair.c:10150: undefined reference to `update_idle_cpumask'
	aarch64-linux-gnu-ld: /mnt/data/src/linux/kernel/sched/fair.c:10148: undefined reference to `update_idle_cpumask'

Because of the missing CONFIG_SCHED_SMT in my .config. I think
update_idle_cpumask() should be defined unconditionally.
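
A minimal sketch of one way to do that (moving the definition out of the
CONFIG_SCHED_SMT section of kernel/sched/fair.c so it is always built;
untested):

	/* Built unconditionally so !CONFIG_SCHED_SMT configs still link */
	static DEFINE_PER_CPU(bool, cpu_idle_state);

	void update_idle_cpumask(struct rq *rq, bool idle_state)
	{
		struct sched_domain *sd;
		int cpu = cpu_of(rq);

		if (per_cpu(cpu_idle_state, cpu) == idle_state)
			return;

		per_cpu(cpu_idle_state, cpu) = idle_state;

		rcu_read_lock();
		sd = rcu_dereference(per_cpu(sd_llc, cpu));
		if (sd && sd->shared) {
			if (idle_state)
				cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
			else
				cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
		}
		rcu_read_unlock();
	}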

Thanks

--
Qais Yousef

> 
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Qais Yousef <qais.yousef@arm.com>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Jiang Biao <benbjiang@gmail.com>
> Cc: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
> ---
>  include/linux/sched/topology.h | 13 ++++++++++
>  kernel/sched/fair.c            | 45 +++++++++++++++++++++++++++++++++-
>  kernel/sched/idle.c            |  1 +
>  kernel/sched/sched.h           |  1 +
>  kernel/sched/topology.c        |  3 ++-
>  5 files changed, 61 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index fb11091129b3..43a641d26154 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -65,8 +65,21 @@ struct sched_domain_shared {
>  	atomic_t	ref;
>  	atomic_t	nr_busy_cpus;
>  	int		has_idle_cores;
> +	/*
> +	 * Span of all idle CPUs in this domain.
> +	 *
> +	 * NOTE: this field is variable length. (Allocated dynamically
> +	 * by attaching extra space to the end of the structure,
> +	 * depending on how many CPUs the kernel has booted up with)
> +	 */
> +	unsigned long	idle_cpus_span[];
>  };
>  
> +static inline struct cpumask *sds_idle_cpus(struct sched_domain_shared *sds)
> +{
> +	return to_cpumask(sds->idle_cpus_span);
> +}
> +
>  struct sched_domain {
>  	/* These fields must be setup */
>  	struct sched_domain __rcu *parent;	/* top domain must be null terminated */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6b3b59cc51d6..088d1995594f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6023,6 +6023,38 @@ void __update_idle_core(struct rq *rq)
>  	rcu_read_unlock();
>  }
>  
> +static DEFINE_PER_CPU(bool, cpu_idle_state);
> +/*
> + * Update cpu idle state and record this information
> + * in sd_llc_shared->idle_cpus_span.
> + */
> +void update_idle_cpumask(struct rq *rq, bool idle_state)
> +{
> +	struct sched_domain *sd;
> +	int cpu = cpu_of(rq);
> +
> +	/*
> +	 * No need to update idle cpumask if the state
> +	 * does not change.
> +	 */
> +	if (per_cpu(cpu_idle_state, cpu) == idle_state)
> +		return;
> +
> +	per_cpu(cpu_idle_state, cpu) = idle_state;
> +
> +	rcu_read_lock();
> +
> +	sd = rcu_dereference(per_cpu(sd_llc, cpu));
> +	if (!sd || !sd->shared)
> +		goto unlock;
> +	if (idle_state)
> +		cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
> +	else
> +		cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
> +unlock:
> +	rcu_read_unlock();
> +}
> +
>  /*
>   * Scan the entire LLC domain for idle cores; this dynamically switches off if
>   * there are no idle cores left in the system; tracked through
> @@ -6136,7 +6168,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>  
>  	time = cpu_clock(this);
>  
> -	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +	/*
> +	 * sched_domain_shared is set only at shared cache level,
> +	 * this works only because select_idle_cpu is called with
> +	 * sd_llc.
> +	 */
> +	cpumask_and(cpus, sds_idle_cpus(sd->shared), p->cpus_ptr);
>  
>  	for_each_cpu_wrap(cpu, cpus, target) {
>  		if (!--nr)
> @@ -10070,6 +10107,12 @@ static void nohz_balancer_kick(struct rq *rq)
>  	if (unlikely(rq->idle_balance))
>  		return;
>  
> +	/* The CPU is not idle; update the idle cpumask */
> +	if (unlikely(sched_idle_cpu(cpu))) {
> +		/* Allow SCHED_IDLE cpu as a wakeup target */
> +		update_idle_cpumask(rq, true);
> +	} else
> +		update_idle_cpumask(rq, false);
>  	/*
>  	 * We may be recently in ticked or tickless idle mode. At the first
>  	 * busy tick after returning from idle, we will update the busy stats.
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index 1ae95b9150d3..ce1f929d7fbb 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -405,6 +405,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
>  static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
>  {
>  	update_idle_core(rq);
> +	update_idle_cpumask(rq, true);
>  	schedstat_inc(rq->sched_goidle);
>  }
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index c82857e2e288..2d1655039ed5 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1069,6 +1069,7 @@ static inline void update_idle_core(struct rq *rq)
>  #else
>  static inline void update_idle_core(struct rq *rq) { }
>  #endif
> +void update_idle_cpumask(struct rq *rq, bool idle_state);
>  
>  DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
>  
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9079d865a935..f14a6ef4de57 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1407,6 +1407,7 @@ sd_init(struct sched_domain_topology_level *tl,
>  		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
>  		atomic_inc(&sd->shared->ref);
>  		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
> +		cpumask_copy(sds_idle_cpus(sd->shared), sched_domain_span(sd));
>  	}
>  
>  	sd->private = sdd;
> @@ -1769,7 +1770,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
>  
>  			*per_cpu_ptr(sdd->sd, j) = sd;
>  
> -			sds = kzalloc_node(sizeof(struct sched_domain_shared),
> +			sds = kzalloc_node(sizeof(struct sched_domain_shared) + cpumask_size(),
>  					GFP_KERNEL, cpu_to_node(j));
>  			if (!sds)
>  				return -ENOMEM;
> -- 
> 2.25.1
> 


* Re: [RFC PATCH v3] sched/fair: select idle cpu from idle cpumask for task wakeup
  2020-11-12 10:57 ` Qais Yousef
@ 2020-11-12 12:12   ` Li, Aubrey
  0 siblings, 0 replies; 15+ messages in thread
From: Li, Aubrey @ 2020-11-12 12:12 UTC (permalink / raw)
  To: Qais Yousef
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, valentin.schneider, tim.c.chen,
	linux-kernel, Aubrey Li, Jiang Biao

On 2020/11/12 18:57, Qais Yousef wrote:
> On 10/21/20 23:03, Aubrey Li wrote:
>> From: Aubrey Li <aubrey.li@intel.com>
>>
>> Add an idle cpumask to track idle CPUs in the sched domain. When a CPU
>> enters idle, its corresponding bit in the idle cpumask is set, and
>> when the CPU exits idle, its bit is cleared.
>>
>> When a task wakes up and selects an idle CPU, scanning the idle cpumask
>> has a lower cost than scanning all the CPUs in the last level cache
>> domain, especially when the system is heavily loaded.
>>
>> v2->v3:
>> - change setting idle cpumask to every idle entry, otherwise schbench
>>   has a regression of 99th percentile latency.
>> - change clearing idle cpumask to nohz_balancer_kick(), so updating
>>   idle cpumask is ratelimited in the idle exiting path.
>> - set SCHED_IDLE cpu in idle cpumask to allow it as a wakeup target.
>>
>> v1->v2:
>> - idle cpumask is updated in the nohz routines, by initializing idle
>>   cpumask with sched_domain_span(sd), nohz=off case remains the original
>>   behavior.
> 
> Did you intend to put the patch version history in the commit message?
> 
> I started looking at this last week but got distracted. I see you already got
> enough reviews, so my 2p is that I faced some compilation issues:
> 
> 	aarch64-linux-gnu-ld: kernel/sched/idle.o: in function `set_next_task_idle':
> 	/mnt/data/src/linux/kernel/sched/idle.c:405: undefined reference to `update_idle_cpumask'
> 	aarch64-linux-gnu-ld: kernel/sched/fair.o: in function `nohz_balancer_kick':
> 	/mnt/data/src/linux/kernel/sched/fair.c:10150: undefined reference to `update_idle_cpumask'
> 	aarch64-linux-gnu-ld: /mnt/data/src/linux/kernel/sched/fair.c:10148: undefined reference to `update_idle_cpumask'
> 
> Because of the missing CONFIG_SCHED_SMT in my .config. I think
> update_idle_cpumask() should be defined unconditionally.

Thanks for pointing this out so promptly :), I'll fix it in the next version.

-Aubrey

