[v3,03/10] sched/topology: Provide cfs_overload_cpus bitmap

Message ID 1541767840-93588-4-git-send-email-steven.sistare@oracle.com
State New
Series
  • steal tasks to improve CPU utilization

Commit Message

Steven Sistare Nov. 9, 2018, 12:50 p.m. UTC
From: Steve Sistare <steve.sistare@oracle.com>

Define and initialize a sparse bitmap of overloaded CPUs, per
last-level-cache scheduling domain, for use by the CFS scheduling class.
Save a pointer to cfs_overload_cpus in the rq for efficient access.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 21 +++++++++++++++++++--
 3 files changed, 22 insertions(+), 2 deletions(-)
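
The rq->cfs_overload_cpus pointer is published with rcu_assign_pointer()
and is meant to be read under RCU by the CFS class; the set/clear sites
arrive in later patches of the series. A rough sketch of the reader side
(the overload_set() name and the sparsemask_set_elem() argument order are
assumptions here, not taken from this patch):

----->8-----

static void overload_set(struct rq *rq)
{
	struct sparsemask *overload_cpus;

	rcu_read_lock();
	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
	if (overload_cpus)
		sparsemask_set_elem(rq->cpu, overload_cpus);
	rcu_read_unlock();
}

------8<-----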

Comments

Valentin Schneider Nov. 9, 2018, 5:38 p.m. UTC | #1
Hi Steve,

On 09/11/2018 12:50, Steve Sistare wrote:
[...]
> @@ -482,6 +484,10 @@ static void update_top_cache_domain(int cpu)
>  	dirty_sched_domain_sysctl(cpu);
>  	destroy_sched_domains(tmp);
>  
> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> +	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
> +	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
> +

Why not do this in update_top_cache_domain() where we also look for the
highest SD_SHARE_PKG_RESOURCES and setup shortcut pointers?

>  	update_top_cache_domain(cpu);
>  }
>  
> @@ -1619,9 +1625,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
>  	}
>  }
>  
> +#define ZALLOC_MASK(maskp, nelems, node)				  \
> +	(!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,		  \
> +					      SPARSEMASK_DENSITY_DEFAULT, \
> +					      GFP_KERNEL, node))	  \
> +
>  static int sd_llc_alloc(struct sched_domain *sd)
>  {
> -	/* Allocate sd->shared data here. Empty for now. */
> +	struct sched_domain_shared *sds = sd->shared;
> +	struct cpumask *span = sched_domain_span(sd);
> +	int nid = cpu_to_node(cpumask_first(span));
> +
> +	if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))

Mmm so this is called once on every CPU, but the !*(maskp) check in the
macro makes it so there is only one allocation per sd_llc_shared.

I wouldn't mind having that called out in a comment, or having the
pointer check done explicitly outside of the macro.
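
For illustration, the explicit-check variant might look like this (a
sketch against the v3 code, not part of the posted patch):

----->8-----

static int sd_llc_alloc(struct sched_domain *sd)
{
	struct sched_domain_shared *sds = sd->shared;
	struct cpumask *span = sched_domain_span(sd);
	int nid = cpu_to_node(cpumask_first(span));

	/*
	 * Called for every CPU in the LLC, but they all share the same
	 * sd->shared, so only the first caller allocates.
	 */
	if (sds->cfs_overload_cpus)
		return 0;

	/* ZALLOC_MASK() implies zalloc_sparsemask_node() is nonzero on success */
	if (!zalloc_sparsemask_node(&sds->cfs_overload_cpus, nr_cpu_ids,
				    SPARSEMASK_DENSITY_DEFAULT,
				    GFP_KERNEL, nid))
		return 1;

	return 0;
}

------8<-----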

[...]
Valentin Schneider Nov. 12, 2018, 4:42 p.m. UTC | #2
Hi Steve,

On 09/11/2018 12:50, Steve Sistare wrote:
> From: Steve Sistare <steve.sistare@oracle.com>
> 
> Define and initialize a sparse bitmap of overloaded CPUs, per
> last-level-cache scheduling domain, for use by the CFS scheduling class.
> Save a pointer to cfs_overload_cpus in the rq for efficient access.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  include/linux/sched/topology.h |  1 +
>  kernel/sched/sched.h           |  2 ++
>  kernel/sched/topology.c        | 21 +++++++++++++++++++--
>  3 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 6b99761..b173a77 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -72,6 +72,7 @@ struct sched_domain_shared {
>  	atomic_t	ref;
>  	atomic_t	nr_busy_cpus;
>  	int		has_idle_cores;
> +	struct sparsemask *cfs_overload_cpus;

Thinking about misfit stealing, we can't use the sd_llc_shared's because
on big.LITTLE misfit migrations happen across LLC domains.

I was thinking of adding a misfit sparsemask to the root_domain, but
then I thought we could do the same thing for cfs_overload_cpus.

By doing so we'd have a single source of information for overloaded CPUs,
and we could filter that down during idle balance - you mentioned earlier
wanting to try stealing at each SD level. This would also let you get
rid of [PATCH 02].

The main part of try_steal() could then be written down as something like
this:

----->8-----

for_each_domain(this_cpu, sd) {
	span = sched_domain_span(sd)
		
	for_each_sparse_wrap(src_cpu, overload_cpus) {
		if (cpumask_test_cpu(src_cpu, span) &&
		    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
			stolen = 1;
			goto out;
		}
	}
}

------8<-----

We could limit the stealing to stop at the highest SD_SHARE_PKG_RESOURCES
domain for now so there would be no behavioural change - but we'd
factorize the #ifdef SCHED_SMT bit. Furthermore, the door would be open
to further stealing.

What do you think?

[...]
Steven Sistare Nov. 19, 2018, 5:32 p.m. UTC | #3
On 11/9/2018 12:38 PM, Valentin Schneider wrote:
> Hi Steve,
> 
> On 09/11/2018 12:50, Steve Sistare wrote:
> [...]
>> @@ -482,6 +484,10 @@ static void update_top_cache_domain(int cpu)
>>  	dirty_sched_domain_sysctl(cpu);
>>  	destroy_sched_domains(tmp);
>>  
>> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>> +	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
>> +	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
>> +
> 
> Why not do this in update_top_cache_domain() where we also look for the
> highest SD_SHARE_PKG_RESOURCES and setup shortcut pointers?

My snippet needs rq which is currently referenced in cpu_attach_domain() but
not in update_top_cache_domain().  I could just as easily do it in 
update_top_cache_domain().  Either way is fine with me.
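
For reference, folded into update_top_cache_domain() it might look
something like this (a sketch against the existing shortcut-pointer
code, not a posted patch):

----->8-----

static void update_top_cache_domain(int cpu)
{
	struct sparsemask *cfs_overload_cpus = NULL;
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
		cfs_overload_cpus = sds->cfs_overload_cpus;
	}

	rcu_assign_pointer(cpu_rq(cpu)->cfs_overload_cpus, cfs_overload_cpus);

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	/* ... remaining shortcut assignments unchanged ... */
}

------8<-----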

>>  	update_top_cache_domain(cpu);
>>  }
>>  
>> @@ -1619,9 +1625,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
>>  	}
>>  }
>>  
>> +#define ZALLOC_MASK(maskp, nelems, node)				  \
>> +	(!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,		  \
>> +					      SPARSEMASK_DENSITY_DEFAULT, \
>> +					      GFP_KERNEL, node))	  \
>> +
>>  static int sd_llc_alloc(struct sched_domain *sd)
>>  {
>> -	/* Allocate sd->shared data here. Empty for now. */
>> +	struct sched_domain_shared *sds = sd->shared;
>> +	struct cpumask *span = sched_domain_span(sd);
>> +	int nid = cpu_to_node(cpumask_first(span));
>> +
>> +	if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))
> 
> Mmm so this is called once on every CPU, but the !*(maskp) check in the
> macro makes it so there is only one allocation per sd_llc_shared.
> 
> I wouldn't mind having that called out in a comment, or having the
> pointer check done explicitly outside of the macro.

OK, will add a comment.  I like the macro because the code is cleaner if/when 
multiple sets are created.

- Steve
Steven Sistare Nov. 19, 2018, 5:33 p.m. UTC | #4
On 11/12/2018 11:42 AM, Valentin Schneider wrote:
> Hi Steve,
> 
> On 09/11/2018 12:50, Steve Sistare wrote:
>> From: Steve Sistare <steve.sistare@oracle.com>
>>
>> Define and initialize a sparse bitmap of overloaded CPUs, per
>> last-level-cache scheduling domain, for use by the CFS scheduling class.
>> Save a pointer to cfs_overload_cpus in the rq for efficient access.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>  include/linux/sched/topology.h |  1 +
>>  kernel/sched/sched.h           |  2 ++
>>  kernel/sched/topology.c        | 21 +++++++++++++++++++--
>>  3 files changed, 22 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
>> index 6b99761..b173a77 100644
>> --- a/include/linux/sched/topology.h
>> +++ b/include/linux/sched/topology.h
>> @@ -72,6 +72,7 @@ struct sched_domain_shared {
>>  	atomic_t	ref;
>>  	atomic_t	nr_busy_cpus;
>>  	int		has_idle_cores;
>> +	struct sparsemask *cfs_overload_cpus;
> 
> Thinking about misfit stealing, we can't use the sd_llc_shared's because
> on big.LITTLE misfit migrations happen across LLC domains.
> 
> I was thinking of adding a misfit sparsemask to the root_domain, but
> then I thought we could do the same thing for cfs_overload_cpus.
> 
> By doing so we'd have a single source of information for overloaded CPUs,
> and we could filter that down during idle balance - you mentioned earlier
> wanting to try stealing at each SD level. This would also let you get
> rid of [PATCH 02].
> 
> The main part of try_steal() could then be written down as something like
> this:
> 
> ----->8-----
> 
> for_each_domain(this_cpu, sd) {
> 	span = sched_domain_span(sd)
> 		
> 	for_each_sparse_wrap(src_cpu, overload_cpus) {
> 		if (cpumask_test_cpu(src_cpu, span) &&
> 		    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
> 			stolen = 1;
> 			goto out;
> 		}
> 	}
> }
> 
> ------8<-----
> 
> We could limit the stealing to stop at the highest SD_SHARE_PKG_RESOURCES
> domain for now so there would be no behavioural change - but we'd
> factorize the #ifdef SCHED_SMT bit. Furthermore, the door would be open
> to further stealing.
> 
> What do you think?

That is not efficient for a multi-level search because at each domain level we
would (re)iterate over overloaded candidates that do not belong in that level.
To extend stealing across LLC, I would like to keep the per-LLC sparsemask, 
but add to each SD a list of sparsemask pointers.  The list nodes would be
private, but the sparsemask structs would be shared.  Each list would include
the masks that overlap the SD's members.  The list would be a singleton at the
core and LLC levels (same as the socket level for most processors), and would 
have multiple elements at the NUMA level.
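
Roughly, the shape would be something like this (all names below are
made up for illustration; nothing like this is in the posted series):

----->8-----

struct overload_masks {
	struct list_head	node;	/* private to this sched_domain */
	struct sparsemask	*mask;	/* shared, per-LLC bitmap */
};

/* in try_steal(), for a given sched_domain *sd: */
list_for_each_entry(masks, &sd->overload_list, node) {
	for_each_sparse_wrap(src_cpu, masks->mask) {
		if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
			stolen = 1;
			goto out;
		}
	}
}

------8<-----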

- Steve
Valentin Schneider Nov. 20, 2018, 12:42 p.m. UTC | #5
On 19/11/2018 17:33, Steven Sistare wrote:
[...]
>>
>> Thinking about misfit stealing, we can't use the sd_llc_shared's because
>> on big.LITTLE misfit migrations happen across LLC domains.
>>
>> I was thinking of adding a misfit sparsemask to the root_domain, but
>> then I thought we could do the same thing for cfs_overload_cpus.
>>
>> By doing so we'd have a single source of information for overloaded CPUs,
>> and we could filter that down during idle balance - you mentioned earlier
>> wanting to try stealing at each SD level. This would also let you get
>> rid of [PATCH 02].
>>
>> The main part of try_steal() could then be written down as something like
>> this:
>>
>> ----->8-----
>>
>> for_each_domain(this_cpu, sd) {
>> 	span = sched_domain_span(sd)
>> 		
>> 	for_each_sparse_wrap(src_cpu, overload_cpus) {
>> 		if (cpumask_test_cpu(src_cpu, span) &&
>> 		    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
>> 			stolen = 1;
>> 			goto out;
>> 		}
>> 	}
>> }
>>
>> ------8<-----
>>
>> We could limit the stealing to stop at the highest SD_SHARE_PKG_RESOURCES
>> domain for now so there would be no behavioural change - but we'd
>> factorize the #ifdef SCHED_SMT bit. Furthermore, the door would be open
>> to further stealing.
>>
>> What do you think?
> 
> That is not efficient for a multi-level search because at each domain level we
> would (re)iterate over overloaded candidates that do not belong in that level.


Mmm I was thinking we could abuse the wrap() and start at
(fls(prev_span) + 1), but we're not guaranteed to have contiguous spans -
the Arm Juno for instance has [0, 3, 4], [1, 2] as MC-level domains, so
that goes down the drain.

Another thing that has been rattling around in my head is some helper to
create a cpumask from a sparsemask (some sort of sparsemask_span()),
which would let us use the standard mask operators:

----->8-----
struct cpumask *overload_span = sparsemask_span(overload_cpus)

for_each_domain(this_cpu, sd)
	for_each_cpu_and(src_cpu, overload_span, sched_domain_span(sd))
		<steal_from here>
-----8<-----

The cpumask could be part of the sparsemask struct to save us the
allocation, and only updated when calling sparsemask_span().
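
Something like this, say (the span member and the racy rebuild are just
assumptions for the sketch):

----->8-----

/* struct sparsemask would grow a 'struct cpumask span;' member */

struct cpumask *sparsemask_span(struct sparsemask *mask)
{
	int cpu;

	/* Racy by design, same as reading the sparsemask itself */
	cpumask_clear(&mask->span);
	for_each_sparse_wrap(cpu, mask)
		cpumask_set_cpu(cpu, &mask->span);

	return &mask->span;
}

------8<-----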

> To extend stealing across LLC, I would like to keep the per-LLC sparsemask, 
> but add to each SD a list of sparsemask pointers.  The list nodes would be
> private, but the sparsemask structs would be shared.  Each list would include
> the masks that overlap the SD's members.  The list would be a singleton at the
> core and LLC levels (same as the socket level for most processors), and would 
> have multiple elements at the NUMA level.
> 

I see. As for misfit, creating asym_cpucapacity siblings of the sd_llc_*()
functions seems a bit much - there'd be a lot of redundancy for basically
just a single shared sparsemask, which is why I was rambling about moving
things to root_domain.

Having different locations where sparsemasks are stored is a bit of a
pain which I'd like to avoid, but if it can't be unified I suppose we'll
have to live with it.

> - Steve
>
Valentin Schneider Nov. 20, 2018, 12:52 p.m. UTC | #6
On 19/11/2018 17:32, Steven Sistare wrote:
> On 11/9/2018 12:38 PM, Valentin Schneider wrote:
>> Hi Steve,
>>
>> On 09/11/2018 12:50, Steve Sistare wrote:
>> [...]
>>> @@ -482,6 +484,10 @@ static void update_top_cache_domain(int cpu)
>>>  	dirty_sched_domain_sysctl(cpu);
>>>  	destroy_sched_domains(tmp);
>>>  
>>> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>>> +	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
>>> +	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
>>> +
>>
>> Why not do this in update_top_cache_domain() where we also look for the
>> highest SD_SHARE_PKG_RESOURCES and setup shortcut pointers?
> 
> My snippet needs rq which is currently referenced in cpu_attach_domain() but
> not in update_top_cache_domain().  I could just as easily do it in 
> update_top_cache_domain().  Either way is fine with me.
> 

I think having this in update_top_cache_domain() makes more sense - it's
where all the domain shortcuts are set, and we already play with LLC there.

[...]

Patch

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6b99761..b173a77 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,7 @@  struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	struct sparsemask *cfs_overload_cpus;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577f..eacf5db 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,6 +81,7 @@ 
 
 struct rq;
 struct cpuidle_state;
+struct sparsemask;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
@@ -812,6 +813,7 @@  struct rq {
 	struct cfs_rq		cfs;
 	struct rt_rq		rt;
 	struct dl_rq		dl;
+	struct sparsemask	*cfs_overload_cpus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this CPU: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3e72ce0..6455bde 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@ 
  * Scheduler topology setup/handling methods
  */
 #include "sched.h"
+#include <linux/sparsemask.h>
 
 DEFINE_MUTEX(sched_domains_mutex);
 
@@ -441,6 +442,7 @@  static void update_top_cache_domain(int cpu)
 static void
 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
+	struct sparsemask *cfs_overload_cpus;
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
 
@@ -482,6 +484,10 @@  static void update_top_cache_domain(int cpu)
 	dirty_sched_domain_sysctl(cpu);
 	destroy_sched_domains(tmp);
 
+	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
+	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
+
 	update_top_cache_domain(cpu);
 }
 
@@ -1619,9 +1625,19 @@  static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+#define ZALLOC_MASK(maskp, nelems, node)				  \
+	(!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,		  \
+					      SPARSEMASK_DENSITY_DEFAULT, \
+					      GFP_KERNEL, node))	  \
+
 static int sd_llc_alloc(struct sched_domain *sd)
 {
-	/* Allocate sd->shared data here. Empty for now. */
+	struct sched_domain_shared *sds = sd->shared;
+	struct cpumask *span = sched_domain_span(sd);
+	int nid = cpu_to_node(cpumask_first(span));
+
+	if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))
+		return 1;
 
 	return 0;
 }
@@ -1633,7 +1649,8 @@  static void sd_llc_free(struct sched_domain *sd)
 	if (!sds)
 		return;
 
-	/* Free data here. Empty for now. */
+	free_sparsemask(sds->cfs_overload_cpus);
+	sds->cfs_overload_cpus = NULL;
 }
 
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)