[v3,03/10] sched/topology: Provide cfs_overload_cpus bitmap

Message ID 1541767840-93588-4-git-send-email-steven.sistare@oracle.com
State Superseded
Series
  • steal tasks to improve CPU utilization

Commit Message

Steven Sistare Nov. 9, 2018, 12:50 p.m. UTC
From: Steve Sistare <steve.sistare@oracle.com>

Define and initialize a sparse bitmap of overloaded CPUs, per
last-level-cache scheduling domain, for use by the CFS scheduling class.
Save a pointer to cfs_overload_cpus in the rq for efficient access.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 21 +++++++++++++++++++--
 3 files changed, 22 insertions(+), 2 deletions(-)

Comments

Valentin Schneider Nov. 9, 2018, 5:38 p.m. UTC | #1
Hi Steve,

On 09/11/2018 12:50, Steve Sistare wrote:
[...]
> @@ -482,6 +484,10 @@ static void update_top_cache_domain(int cpu)
>  	dirty_sched_domain_sysctl(cpu);
>  	destroy_sched_domains(tmp);
>  
> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> +	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
> +	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
> +

Why not do this in update_top_cache_domain() where we also look for the
highest SD_SHARE_PKG_RESOURCES and setup shortcut pointers?

>  	update_top_cache_domain(cpu);
>  }
>  
> @@ -1619,9 +1625,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
>  	}
>  }
>  
> +#define ZALLOC_MASK(maskp, nelems, node)				  \
> +	(!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,		  \
> +					      SPARSEMASK_DENSITY_DEFAULT, \
> +					      GFP_KERNEL, node))	  \
> +
>  static int sd_llc_alloc(struct sched_domain *sd)
>  {
> -	/* Allocate sd->shared data here. Empty for now. */
> +	struct sched_domain_shared *sds = sd->shared;
> +	struct cpumask *span = sched_domain_span(sd);
> +	int nid = cpu_to_node(cpumask_first(span));
> +
> +	if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))

Mmm so this is called once on every CPU, but the !*(maskp) check in the
macro makes it so there is only one allocation per sd_llc_shared.

I wouldn't mind having that called out in a comment, or having the
pointer check done explicitly outside of the macro.
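
Something like the below, for instance (rough sketch; I'm assuming
zalloc_sparsemask_node() returns true on success, which is what the '!' in
the macro suggests):

----->8-----
	/*
	 * sd->shared is referenced by every CPU in the LLC, so this runs
	 * once per CPU but must allocate only once per LLC.
	 */
	if (sds->cfs_overload_cpus)
		return 0;

	if (!zalloc_sparsemask_node(&sds->cfs_overload_cpus, nr_cpu_ids,
				    SPARSEMASK_DENSITY_DEFAULT,
				    GFP_KERNEL, nid))
		return 1;
------8<-----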

[...]
Valentin Schneider Nov. 12, 2018, 4:42 p.m. UTC | #2
Hi Steve,

On 09/11/2018 12:50, Steve Sistare wrote:
> From: Steve Sistare <steve.sistare@oracle.com>
> 
> Define and initialize a sparse bitmap of overloaded CPUs, per
> last-level-cache scheduling domain, for use by the CFS scheduling class.
> Save a pointer to cfs_overload_cpus in the rq for efficient access.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  include/linux/sched/topology.h |  1 +
>  kernel/sched/sched.h           |  2 ++
>  kernel/sched/topology.c        | 21 +++++++++++++++++++--
>  3 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 6b99761..b173a77 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -72,6 +72,7 @@ struct sched_domain_shared {
>  	atomic_t	ref;
>  	atomic_t	nr_busy_cpus;
>  	int		has_idle_cores;
> +	struct sparsemask *cfs_overload_cpus;

Thinking about misfit stealing, we can't use the sd_llc_shared's because
on big.LITTLE misfit migrations happen across LLC domains.

I was thinking of adding a misfit sparsemask to the root_domain, but
then I thought we could do the same thing for cfs_overload_cpus.

By doing so we'd have a single source of information for overloaded CPUs,
and we could filter that down during idle balance - you mentioned earlier
wanting to try stealing at each SD level. This would also let you get
rid of [PATCH 02].

The main part of try_steal() could then be written down as something like
this:

----->8-----

for_each_domain(this_cpu, sd) {
	span = sched_domain_span(sd)
		
	for_each_sparse_wrap(src_cpu, overload_cpus) {
		if (cpumask_test_cpu(src_cpu, span) &&
		    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
			stolen = 1;
			goto out;
		}
	}
}

------8<-----

We could limit the stealing to stop at the highest SD_SHARE_PKG_RESOURCES
domain for now so there would be no behavioural change - but we'd
factorize the #ifdef SCHED_SMT bit. Furthermore, the door would be open
to further stealing.

What do you think?

[...]
Steven Sistare Nov. 19, 2018, 5:32 p.m. UTC | #3
On 11/9/2018 12:38 PM, Valentin Schneider wrote:
> Hi Steve,
> 
> On 09/11/2018 12:50, Steve Sistare wrote:
> [...]
>> @@ -482,6 +484,10 @@ static void update_top_cache_domain(int cpu)
>>  	dirty_sched_domain_sysctl(cpu);
>>  	destroy_sched_domains(tmp);
>>  
>> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>> +	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
>> +	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
>> +
> 
> Why not do this in update_top_cache_domain() where we also look for the
> highest SD_SHARE_PKG_RESOURCES and setup shortcut pointers?

My snippet needs rq which is currently referenced in cpu_attach_domain() but
not in update_top_cache_domain().  I could just as easily do it in 
update_top_cache_domain().  Either way is fine with me.

>>  	update_top_cache_domain(cpu);
>>  }
>>  
>> @@ -1619,9 +1625,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
>>  	}
>>  }
>>  
>> +#define ZALLOC_MASK(maskp, nelems, node)				  \
>> +	(!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,		  \
>> +					      SPARSEMASK_DENSITY_DEFAULT, \
>> +					      GFP_KERNEL, node))	  \
>> +
>>  static int sd_llc_alloc(struct sched_domain *sd)
>>  {
>> -	/* Allocate sd->shared data here. Empty for now. */
>> +	struct sched_domain_shared *sds = sd->shared;
>> +	struct cpumask *span = sched_domain_span(sd);
>> +	int nid = cpu_to_node(cpumask_first(span));
>> +
>> +	if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))
> 
> Mmm so this is called once on every CPU, but the !*(maskp) check in the
> macro makes it so there is only one allocation per sd_llc_shared.
> 
> I wouldn't mind having that called out in a comment, or having the
> pointer check done explicitly outside of the macro.

OK, will add a comment.  I like the macro because the code is cleaner if/when 
multiple sets are created.

- Steve
Steven Sistare Nov. 19, 2018, 5:33 p.m. UTC | #4
On 11/12/2018 11:42 AM, Valentin Schneider wrote:
> Hi Steve,
> 
> On 09/11/2018 12:50, Steve Sistare wrote:
>> From: Steve Sistare <steve.sistare@oracle.com>
>>
>> Define and initialize a sparse bitmap of overloaded CPUs, per
>> last-level-cache scheduling domain, for use by the CFS scheduling class.
>> Save a pointer to cfs_overload_cpus in the rq for efficient access.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>  include/linux/sched/topology.h |  1 +
>>  kernel/sched/sched.h           |  2 ++
>>  kernel/sched/topology.c        | 21 +++++++++++++++++++--
>>  3 files changed, 22 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
>> index 6b99761..b173a77 100644
>> --- a/include/linux/sched/topology.h
>> +++ b/include/linux/sched/topology.h
>> @@ -72,6 +72,7 @@ struct sched_domain_shared {
>>  	atomic_t	ref;
>>  	atomic_t	nr_busy_cpus;
>>  	int		has_idle_cores;
>> +	struct sparsemask *cfs_overload_cpus;
> 
> Thinking about misfit stealing, we can't use the sd_llc_shared's because
> on big.LITTLE misfit migrations happen across LLC domains.
> 
> I was thinking of adding a misfit sparsemask to the root_domain, but
> then I thought we could do the same thing for cfs_overload_cpus.
> 
> By doing so we'd have a single source of information for overloaded CPUs,
> and we could filter that down during idle balance - you mentioned earlier
> wanting to try stealing at each SD level. This would also let you get
> rid of [PATCH 02].
> 
> The main part of try_steal() could then be written down as something like
> this:
> 
> ----->8-----
> 
> for_each_domain(this_cpu, sd) {
> 	span = sched_domain_span(sd)
> 		
> 	for_each_sparse_wrap(src_cpu, overload_cpus) {
> 		if (cpumask_test_cpu(src_cpu, span) &&
> 		    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
> 			stolen = 1;
> 			goto out;
> 		}
> 	}
> }
> 
> ------8<-----
> 
> We could limit the stealing to stop at the highest SD_SHARE_PKG_RESOURCES
> domain for now so there would be no behavioural change - but we'd
> factorize the #ifdef SCHED_SMT bit. Furthermore, the door would be open
> to further stealing.
> 
> What do you think?

That is not efficient for a multi-level search because at each domain level we 
would (re) iterate over overloaded candidates that do not belong in that level.
To extend stealing across LLC, I would like to keep the per-LLC sparsemask, 
but add to each SD a list of sparsemask pointers.  The list nodes would be
private, but the sparsemask structs would be shared.  Each list would include
the masks that overlap the SD's members.  The list would be a singleton at the
core and LLC levels (same as the socket level for most processors), and would 
have multiple elements at the NUMA level.
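
Roughly (sketch only, names made up, nothing like this is in the current
series):

----->8-----
/* private per-SD list node; the sparsemask it points to is shared */
struct overload_masks {
	struct list_head	node;
	struct sparsemask	*mask;	/* some LLC's cfs_overload_cpus */
};

struct sched_domain {
	...
	/* masks of all LLCs that overlap this domain's span */
	struct list_head	overload_masks;
	...
};
------8<-----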

- Steve
Valentin Schneider Nov. 20, 2018, 12:42 p.m. UTC | #5
On 19/11/2018 17:33, Steven Sistare wrote:
[...]
>>
>> Thinking about misfit stealing, we can't use the sd_llc_shared's because
>> on big.LITTLE misfit migrations happen across LLC domains.
>>
>> I was thinking of adding a misfit sparsemask to the root_domain, but
>> then I thought we could do the same thing for cfs_overload_cpus.
>>
>> By doing so we'd have a single source of information for overloaded CPUs,
>> and we could filter that down during idle balance - you mentioned earlier
>> wanting to try stealing at each SD level. This would also let you get
>> rid of [PATCH 02].
>>
>> The main part of try_steal() could then be written down as something like
>> this:
>>
>> ----->8-----
>>
>> for_each_domain(this_cpu, sd) {
>> 	span = sched_domain_span(sd)
>> 		
>> 	for_each_sparse_wrap(src_cpu, overload_cpus) {
>> 		if (cpumask_test_cpu(src_cpu, span) &&
>> 		    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
>> 			stolen = 1;
>> 			goto out;
>> 		}
>> 	}
>> }
>>
>> ------8<-----
>>
>> We could limit the stealing to stop at the highest SD_SHARE_PKG_RESOURCES
>> domain for now so there would be no behavioural change - but we'd
>> factorize the #ifdef SCHED_SMT bit. Furthermore, the door would be open
>> to further stealing.
>>
>> What do you think?
> 
> That is not efficient for a multi-level search because at each domain level we 
> would (re) iterate over overloaded candidates that do not belong in that level.


Mmm I was thinking we could abuse the wrap() and start at
(fls(prev_span) + 1), but we're not guaranteed to have contiguous spans -
the Arm Juno for instance has [0, 3, 4], [1, 2] as MC-level domains, so
that goes down the drain.

Another thing that has been trotting in my head would be some helper to
create a cpumask from a sparsemask (some sort of sparsemask_span()),
which would let us use the standard mask operators:

----->8-----
struct cpumask *overload_span = sparsemask_span(overload_cpus)

for_each_domain(this_cpu, sd)
	for_each_cpu_and(src_cpu, overload_span, sched_domain_span(sd))
		<steal_from here>
-----8>-----

The cpumask could be part of the sparsemask struct to save us the
allocation, and only updated when calling sparsemask_span().
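
Something along these lines (sketch; the embedded cpumask and the plain
iterator are made up, but patch 01 could provide them):

----->8-----
struct cpumask *sparsemask_span(struct sparsemask *mask)
{
	struct cpumask *span = &mask->span;	/* hypothetical embedded cpumask */
	int cpu;

	cpumask_clear(span);
	for_each_sparse(cpu, mask)		/* hypothetical plain iterator */
		cpumask_set_cpu(cpu, span);

	return span;
}
------8<-----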

> To extend stealing across LLC, I would like to keep the per-LLC sparsemask, 
> but add to each SD a list of sparsemask pointers.  The list nodes would be
> private, but the sparsemask structs would be shared.  Each list would include
> the masks that overlap the SD's members.  The list would be a singleton at the
> core and LLC levels (same as the socket level for most processors), and would 
> have multiple elements at the NUMA level.
> 

I see. As for misfit, creating asym_cpucapacity siblings of the sd_llc_*()
functions seems a bit much - there'd be a lot of redundancy for basically
just a single shared sparsemask, which is why I was rambling about moving
things to root_domain.

Having different locations where sparsemasks are stored is a bit of a
pain which I'd like to avoid, but if it can't be unified I suppose we'll
have to live with it.

> - Steve
>
Valentin Schneider Nov. 20, 2018, 12:52 p.m. UTC | #6
On 19/11/2018 17:32, Steven Sistare wrote:
> On 11/9/2018 12:38 PM, Valentin Schneider wrote:
>> Hi Steve,
>>
>> On 09/11/2018 12:50, Steve Sistare wrote:
>> [...]
>>> @@ -482,6 +484,10 @@ static void update_top_cache_domain(int cpu)
>>>  	dirty_sched_domain_sysctl(cpu);
>>>  	destroy_sched_domains(tmp);
>>>  
>>> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>>> +	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
>>> +	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
>>> +
>>
>> Why not do this in update_top_cache_domain() where we also look for the
>> highest SD_SHARE_PKG_RESOURCES and setup shortcut pointers?
> 
> My snippet needs rq which is currently referenced in cpu_attach_domain() but
> not in update_top_cache_domain().  I could just as easily do it in 
> update_top_cache_domain().  Either way is fine with me.
> 

I think having this in update_top_cache_domain() makes more sense - it's
where all the domain shortcuts are set, and we already play with LLC there.
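
I.e. something like this (untested sketch, reusing the sd/sds that
update_top_cache_domain() already computes for the LLC shortcuts):

----->8-----
	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
	}

	rcu_assign_pointer(cpu_rq(cpu)->cfs_overload_cpus,
			   sds ? sds->cfs_overload_cpus : NULL);

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	...
------8<-----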

[...]
Steven Sistare Nov. 26, 2018, 7:06 p.m. UTC | #7
On 11/20/2018 7:42 AM, Valentin Schneider wrote:
> On 19/11/2018 17:33, Steven Sistare wrote:
> [...]
>>>
>>> Thinking about misfit stealing, we can't use the sd_llc_shared's because
>>> on big.LITTLE misfit migrations happen across LLC domains.
>>>
>>> I was thinking of adding a misfit sparsemask to the root_domain, but
>>> then I thought we could do the same thing for cfs_overload_cpus.
>>>
>>> By doing so we'd have a single source of information for overloaded CPUs,
>>> and we could filter that down during idle balance - you mentioned earlier
>>> wanting to try stealing at each SD level. This would also let you get
>>> rid of [PATCH 02].
>>>
>>> The main part of try_steal() could then be written down as something like
>>> this:
>>>
>>> ----->8-----
>>>
>>> for_each_domain(this_cpu, sd) {
>>> 	span = sched_domain_span(sd)
>>> 		
>>> 	for_each_sparse_wrap(src_cpu, overload_cpus) {
>>> 		if (cpumask_test_cpu(src_cpu, span) &&
>>> 		    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
>>> 			stolen = 1;
>>> 			goto out;
>>> 		}
>>> 	}
>>> }
>>>
>>> ------8<-----
>>>
>>> We could limit the stealing to stop at the highest SD_SHARE_PKG_RESOURCES
>>> domain for now so there would be no behavioural change - but we'd
>>> factorize the #ifdef SCHED_SMT bit. Furthermore, the door would be open
>>> to further stealing.
>>>
>>> What do you think?
>>
>> That is not efficient for a multi-level search because at each domain level we 
>> would (re) iterate over overloaded candidates that do not belong in that level.
> 
> 
> Mmm I was thinking we could abuse the wrap() and start at
> (fls(prev_span) + 1), but we're not guaranteed to have contiguous spans -
> the Arm Juno for instance has [0, 3, 4], [1, 2] as MC-level domains, so
> that goes down the drain.
> 
> Another thing that has been trotting in my head would be some helper to
> create a cpumask from a sparsemask (some sort of sparsemask_span()),
> which would let us use the standard mask operators:
> 
> ----->8-----
> struct cpumask *overload_span = sparsemask_span(overload_cpus)
> 
> for_each_domain(this_cpu, sd)
> 	for_each_cpu_and(src_cpu, overload_span, sched_domain_span(sd))
> 		<steal_from here>
> -----8>-----
> 
> The cpumask could be part of the sparsemask struct to save us the
> allocation, and only updated when calling sparsemask_span().

I thought of providing something like this along with other sparsemask
utility functions, but I decided to be minimalist, and let others add
more functions if/when they become needed.  this_cpu_cpumask_var_ptr(select_idle_mask) 
is a temporary that could be used as the destination of the conversion.

Also, conversion adds cost, particularly on larger systems.  When comparing a
cpumask and a sparsemask, it is more efficient to iterate over the smaller
set, and test for membership in the larger, such as in try_steal:

    for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
            if (sparsemask_test_elem(src_cpu, overload_cpus)
 
>> To extend stealing across LLC, I would like to keep the per-LLC sparsemask, 
>> but add to each SD a list of sparsemask pointers.  The list nodes would be
>> private, but the sparsemask structs would be shared.  Each list would include
>> the masks that overlap the SD's members.  The list would be a singleton at the
>> core and LLC levels (same as the socket level for most processors), and would 
>> have multiple elements at the NUMA level.
> 
> I see. As for misfit, creating asym_cpucapacity siblings of the sd_llc_*()
> functions seems a bit much - there'd be a lot of redundancy for basically
> just a single shared sparsemask, which is why I was rambling about moving
> things to root_domain.
> 
> Having different locations where sparsemasks are stored is a bit of a
> pain which I'd like to avoid, but if it can't be unified I suppose we'll
> have to live with it.

I don't follow.  A per-LLC sparsemask representing misfits can be allocated with
one more line in sd_llc_alloc, and you can steal across LLC using the list I
briefly described above.
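
I.e. something like this (sketch; cfs_misfit_cpus would be a new field next
to cfs_overload_cpus in sched_domain_shared, and this is where the
ZALLOC_MASK macro keeps things tidy):

----->8-----
	if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid) ||
	    ZALLOC_MASK(&sds->cfs_misfit_cpus, nr_cpu_ids, nid))
		return 1;
------8<-----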

- Steve
Valentin Schneider Dec. 3, 2018, 4:56 p.m. UTC | #8
Hi Steve,

On 26/11/2018 19:06, Steven Sistare wrote:
> [...]
>> Mmm I was thinking we could abuse the wrap() and start at
>> (fls(prev_span) + 1), but we're not guaranteed to have contiguous spans -
>> the Arm Juno for instance has [0, 3, 4], [1, 2] as MC-level domains, so
>> that goes down the drain.
>>
>> Another thing that has been trotting in my head would be some helper to
>> create a cpumask from a sparsemask (some sort of sparsemask_span()),
>> which would let us use the standard mask operators:
>>
>> ----->8-----
>> struct cpumask *overload_span = sparsemask_span(overload_cpus)
>>
>> for_each_domain(this_cpu, sd)
>> 	for_each_cpu_and(src_cpu, overload_span, sched_domain_span(sd))
>> 		<steal_from here>
>> -----8>-----
>>
>> The cpumask could be part of the sparsemask struct to save us the
>> allocation, and only updated when calling sparsemask_span().
> 
> I thought of providing something like this along with other sparsemask
> utility functions, but I decided to be minimalist, and let others add
> more functions if/when they become needed.  this_cpu_cpumask_var_ptr(select_idle_mask) 
> is a temporary that could be used as the destination of the conversion.
> 
> Also, conversion adds cost, particularly on larger systems.  When comparing a
> cpumask and a sparsemask, it is more efficient to iterate over the smaller
> set, and test for membership in the larger, such as in try_steal:
> 
>     for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
>             if (sparsemask_test_elem(src_cpu, overload_cpus)
>  
>>> To extend stealing across LLC, I would like to keep the per-LLC sparsemask, 
>>> but add to each SD a list of sparsemask pointers.  The list nodes would be
>>> private, but the sparsemask structs would be shared.  Each list would include
>>> the masks that overlap the SD's members.  The list would be a singleton at the
>>> core and LLC levels (same as the socket level for most processors), and would 
>>> have multiple elements at the NUMA level.
>>
>> I see. As for misfit, creating asym_cpucapacity siblings of the sd_llc_*()
>> functions seems a bit much - there'd be a lot of redundancy for basically
>> just a single shared sparsemask, which is why I was rambling about moving
>> things to root_domain.
>>
>> Having different locations where sparsemasks are stored is a bit of a
>> pain which I'd like to avoid, but if it can't be unified I suppose we'll
>> have to live with it.
> 
> I don't follow.  A per-LLC sparsemask representing misfits can be allocated with
> one more line in sd_llc_alloc, and you can steal across LLC using the list I
> briefly described above.
> 

Ah yes, that would work. Thing is, I had excluded having the misfit masks
being in the sd_llc_shareds, since from a logical standpoint they don't
really belong there.

With asymmetric CPU capacities we kind of disregard the cache landscape
and focus on, well, CPU asymmetries. There's a few commits laying around
that forgo some cache usage optimisations for asymmetric systems -
this one comes to mind:

    9c63e84db29b ("sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains")

So in truth I was envisioning separate SD_ASYM_CPUCAPACITY-based 
sparsemasks, which is why I was rambling about SD_ASYM_CPUCAPACITY siblings
of sd_llc_*()... *But* after I had a go at it, it looked to me like that
was a lot of duplicated code.

My root_domain suggestion stems from the fact that we only really need one
single sparsemask for misfit stealing, and it provides a unique location
to store the sparsemasks (and you mask them however you want when it comes
to using them).

Sadly I think that doesn't work as well for cfs_overload_cpus since you
can't split a sparsemask's chunks over several NUMA nodes, so we'd be
stuck with an allocation on a single node (but we already do that in some
places, e.g. for nohz.idle_cpus_mask, so... Is it that bad?).

> - Steve
>
Steven Sistare Dec. 6, 2018, 4:40 p.m. UTC | #9
On 12/3/2018 11:56 AM, Valentin Schneider wrote:
> Hi Steve,
> On 26/11/2018 19:06, Steven Sistare wrote:
>> [...]
>>> Mmm I was thinking we could abuse the wrap() and start at
>>> (fls(prev_span) + 1), but we're not guaranteed to have contiguous spans -
>>> the Arm Juno for instance has [0, 3, 4], [1, 2] as MC-level domains, so
>>> that goes down the drain.
>>>
>>> Another thing that has been trotting in my head would be some helper to
>>> create a cpumask from a sparsemask (some sort of sparsemask_span()),
>>> which would let us use the standard mask operators:
>>>
>>> ----->8-----
>>> struct cpumask *overload_span = sparsemask_span(overload_cpus)
>>>
>>> for_each_domain(this_cpu, sd)
>>> 	for_each_cpu_and(src_cpu, overload_span, sched_domain_span(sd))
>>> 		<steal_from here>
>>> -----8>-----
>>>
>>> The cpumask could be part of the sparsemask struct to save us the
>>> allocation, and only updated when calling sparsemask_span().
>>
>> I thought of providing something like this along with other sparsemask
>> utility functions, but I decided to be minimalist, and let others add
>> more functions if/when they become needed.  this_cpu_cpumask_var_ptr(select_idle_mask) 
>> is a temporary that could be used as the destination of the conversion.
>>
>> Also, conversion adds cost, particularly on larger systems.  When comparing a
>> cpumask and a sparsemask, it is more efficient to iterate over the smaller
>> set, and test for membership in the larger, such as in try_steal:
>>
>>     for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
>>             if (sparsemask_test_elem(src_cpu, overload_cpus)
>>  
>>>> To extend stealing across LLC, I would like to keep the per-LLC sparsemask, 
>>>> but add to each SD a list of sparsemask pointers.  The list nodes would be
>>>> private, but the sparsemask structs would be shared.  Each list would include
>>>> the masks that overlap the SD's members.  The list would be a singleton at the
>>>> core and LLC levels (same as the socket level for most processors), and would 
>>>> have multiple elements at the NUMA level.
>>>
>>> I see. As for misfit, creating asym_cpucapacity siblings of the sd_llc_*()
>>> functions seems a bit much - there'd be a lot of redundancy for basically
>>> just a single shared sparsemask, which is why I was rambling about moving
>>> things to root_domain.
>>>
>>> Having different locations where sparsemasks are stored is a bit of a
>>> pain which I'd like to avoid, but if it can't be unified I suppose we'll
>>> have to live with it.
>>
>> I don't follow.  A per-LLC sparsemask representing misfits can be allocated with
>> one more line in sd_llc_alloc, and you can steal across LLC using the list I
>> briefly described above.
> 
> Ah yes, that would work. Thing is, I had excluded having the misfit masks
> being in the sd_llc_shareds, since from a logical standpoint they don't
> really belong there.
> 
> With asymmetric CPU capacities we kind of disregard the cache landscape

Sure, but adding awareness of the cache hierarchy can only make it better,
and a per-LLC mask organization can serve both the overloaded and misfit
use cases quite naturally.

> and focus on, well, CPU asymmetries. There's a few commits laying around
> that forgo some cache usage optimisations for asymmetric systems -
> this one comes to mind:
> 
>     9c63e84db29b ("sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains")
> 
> So in truth I was envisioning separate SD_ASYM_CPUCAPACITY-based 
> sparsemasks, which is why I was rambling about SD_ASYM_CPUCAPACITY siblings
> of sd_llc_*()... *But* after I had a go at it, it looked to me like that
> was a lot of duplicated code.

I would be happy to review your code and make suggestions to reduce duplication,
and happy to continue to discuss clean and optimal handling for misfits. However, 
I have a request: can we push my patches across the finish line first?  Stealing 
for misfits can be its own patch series.  Please consider sending your reviewed-by
for the next version of my series.  I will send it soon.

> My root_domain suggestion stems from the fact that we only really need one
> single sparsemask for misfit stealing, and it provides a unique location
> to store the sparsemasks (and you mask them however you want when it comes
> to using them).
> 
> Sadly I think that doesn't work as well for cfs_overload_cpus since you
> can't split a sparsemask's chunks over several NUMA nodes, so we'd be
> stuck with an allocation on a single node (but we already do that in some
> places, e.g. for nohz.idle_cpus_mask, so... Is it that bad?).

It can be bad for high memory bandwidth workloads, as the sparsemasks will
be displaced from cache and we incur remote memory latencies on next access.

- Steve
Valentin Schneider Dec. 6, 2018, 5:28 p.m. UTC | #10
Hi Steve,

On 06/12/2018 16:40, Steven Sistare wrote:
> [...]
>>
>> Ah yes, that would work. Thing is, I had excluded having the misfit masks
>> being in the sd_llc_shareds, since from a logical standpoint they don't
>> really belong there.
>>
>> With asymmetric CPU capacities we kind of disregard the cache landscape
> 
> Sure, but adding awareness of the cache hierarchy can only make it better,
> and a per-LLC mask organization can serve both the overloaded and misfit
> use cases quite naturally.
> [...]
>> So in truth I was envisioning separate SD_ASYM_CPUCAPACITY-based 
>> sparsemasks, which is why I was rambling about SD_ASYM_CPUCAPACITY siblings
>> of sd_llc_*()... *But* after I had a go at it, it looked to me like that
>> was a lot of duplicated code.
> 
> I would be happy to review your code and make suggestions to reduce duplication,
> and happy to continue to discuss clean and optimal handling for misfits. However, 
> I have a request: can we push my patches across the finish line first?  Stealing 
> for misfits can be its own patch series.  Please consider sending your reviewed-by
> for the next version of my series.  I will send it soon.
> 

Sure, as things stand right now I'm fairly convinced this doesn't harm
asymmetric systems.

The only thing I would add (ignoring misfits) is that with EAS we would
need to gate stealing with something like:

    !static_branch_unlikely(&sched_energy_present) ||
    READ_ONCE(rq->rd->overutilized)

And who "gets" to add this gating (or at least, when must it be added)
depends on which patch-set gets in first.
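
For concreteness, that could be an early bail-out at the top of try_steal(),
something like (sketch; dst_rq being the destination rq that try_steal()
already takes):

----->8-----
	if (static_branch_unlikely(&sched_energy_present) &&
	    !READ_ONCE(dst_rq->rd->overutilized))
		return 0;	/* EAS in charge and nothing is overutilized */
------8<-----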

[...]
>> Sadly I think that doesn't work as well for cfs_overload_cpus since you
>> can't split a sparsemask's chunks over several NUMA nodes, so we'd be
>> stuck with an allocation on a single node (but we already do that in some
>> places, e.g. for nohz.idle_cpus_mask, so... Is it that bad?).
> 
> It can be bad for high memory bandwidth workloads, as the sparsemasks will
> be displaced from cache and we incur remote memory latencies on next access.
> 

Aye, I just caught up with the LPC videos and was about to reply here to
say that, all things considered, it's probably not such a good idea...

> - Steve
>

Patch

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6b99761..b173a77 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,7 @@  struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	struct sparsemask *cfs_overload_cpus;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577f..eacf5db 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -81,6 +81,7 @@ 
 
 struct rq;
 struct cpuidle_state;
+struct sparsemask;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
@@ -812,6 +813,7 @@  struct rq {
 	struct cfs_rq		cfs;
 	struct rt_rq		rt;
 	struct dl_rq		dl;
+	struct sparsemask	*cfs_overload_cpus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this CPU: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3e72ce0..6455bde 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@ 
  * Scheduler topology setup/handling methods
  */
 #include "sched.h"
+#include <linux/sparsemask.h>
 
 DEFINE_MUTEX(sched_domains_mutex);
 
@@ -441,6 +442,7 @@  static void update_top_cache_domain(int cpu)
 static void
 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
+	struct sparsemask *cfs_overload_cpus;
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
 
@@ -482,6 +484,10 @@  static void update_top_cache_domain(int cpu)
 	dirty_sched_domain_sysctl(cpu);
 	destroy_sched_domains(tmp);
 
+	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	cfs_overload_cpus = (sd ? sd->shared->cfs_overload_cpus : NULL);
+	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
+
 	update_top_cache_domain(cpu);
 }
 
@@ -1619,9 +1625,19 @@  static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+#define ZALLOC_MASK(maskp, nelems, node)				  \
+	(!*(maskp) && !zalloc_sparsemask_node(maskp, nelems,		  \
+					      SPARSEMASK_DENSITY_DEFAULT, \
+					      GFP_KERNEL, node))	  \
+
 static int sd_llc_alloc(struct sched_domain *sd)
 {
-	/* Allocate sd->shared data here. Empty for now. */
+	struct sched_domain_shared *sds = sd->shared;
+	struct cpumask *span = sched_domain_span(sd);
+	int nid = cpu_to_node(cpumask_first(span));
+
+	if (ZALLOC_MASK(&sds->cfs_overload_cpus, nr_cpu_ids, nid))
+		return 1;
 
 	return 0;
 }
@@ -1633,7 +1649,8 @@  static void sd_llc_free(struct sched_domain *sd)
 	if (!sds)
 		return;
 
-	/* Free data here. Empty for now. */
+	free_sparsemask(sds->cfs_overload_cpus);
+	sds->cfs_overload_cpus = NULL;
 }
 
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)