* [PATCH 0/2 v5] unlink misfit task from cpu overutilized
@ 2023-02-01 14:36 Vincent Guittot
  2023-02-01 14:36 ` [PATCH 1/2 v5] sched/fair: " Vincent Guittot
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Vincent Guittot @ 2023-02-01 14:36 UTC (permalink / raw)
  To: mingo, peterz, dietmar.eggemann, qyousef, rafael, viresh.kumar,
	vschneid, linux-pm, linux-kernel, kajetan.puchalski
  Cc: lukasz.luba, wvw, xuewen.yan94, han.lin, Jonathan.JMChen,
	Vincent Guittot

unlink misfit task detection from cpu overutilized and use the new -1 value
returned by util_fits_cpu() to extend the search for the best CPU.
Remove the capacity inversion detection, which is now covered by these changes.

Changes since v4:
- Added patch 2 to remove capacity inversion detection

Vincent Guittot (2):
  sched/fair: unlink misfit task from cpu overutilized
  sched/fair: Remove capacity inversion detection

 kernel/sched/fair.c  | 189 ++++++++++++++++++++-----------------------
 kernel/sched/sched.h |  19 -----
 2 files changed, 87 insertions(+), 121 deletions(-)

-- 
2.34.1



* [PATCH 1/2 v5] sched/fair: unlink misfit task from cpu overutilized
  2023-02-01 14:36 [PATCH 0/2 v5] unlink misfit task from cpu overutilized Vincent Guittot
@ 2023-02-01 14:36 ` Vincent Guittot
  2023-02-11 10:30   ` [tip: sched/core] " tip-bot2 for Vincent Guittot
  2023-02-01 14:36 ` [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection Vincent Guittot
  2023-02-02 11:29 ` [PATCH 0/2 v5] unlink misfit task from cpu overutilized Rafael J. Wysocki
  2 siblings, 1 reply; 9+ messages in thread
From: Vincent Guittot @ 2023-02-01 14:36 UTC (permalink / raw)
  To: mingo, peterz, dietmar.eggemann, qyousef, rafael, viresh.kumar,
	vschneid, linux-pm, linux-kernel, kajetan.puchalski
  Cc: lukasz.luba, wvw, xuewen.yan94, han.lin, Jonathan.JMChen,
	Vincent Guittot

By taking into account uclamp_min, the 1:1 relation between task misfit
and cpu overutilized is no longer true, as a task with a small util_avg
may not fit a high capacity cpu because of the uclamp_min constraint.

Add a new state in util_fits_cpu() to reflect the case where a task would
fit a CPU except for the uclamp_min hint, which is a performance
requirement.

Use -1 to reflect that a CPU doesn't fit only because of uclamp_min, so we
can use this new value to take additional action to select the best CPU
that doesn't match the uclamp_min hint.

When util_fits_cpu() returns -1, we will continue to look for a possible
CPU with better performance. This replaces the capacity inversion
detection with a capacity_orig_of() - thermal_load_avg comparison to
detect a capacity inversion.
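
To make the new convention easier to follow, here is a minimal,
self-contained sketch of the tri-state return value. It is an
illustration only: uclamp_max handling is omitted and the capacity
numbers are hypothetical, not taken from the patch.

/*
 * Illustration only -- simplified stand-in for the kernel code below.
 *  1  -> utilization and performance hints all fit this CPU
 *  0  -> the utilization itself doesn't fit (the CPU is overutilized)
 * -1  -> only the uclamp_min performance hint isn't satisfied; callers
 *        keep searching for a CPU with better thermally-adjusted capacity
 */
#include <stdio.h>

static int util_fits_cpu(unsigned long util, unsigned long uclamp_min,
			 unsigned long capacity_orig,
			 unsigned long capacity_orig_thermal)
{
	int fits = util <= capacity_orig;

	if (fits && util < uclamp_min && uclamp_min > capacity_orig_thermal)
		return -1;

	return fits;
}

int main(void)
{
	/* Hypothetical big CPU throttled from 1024 down to 800 */
	unsigned long cap = 1024, cap_thermal = 800;
	/* Small task (util 100) with a uclamp_min request of 900 */
	int fits = util_fits_cpu(100, 900, cap, cap_thermal);

	/*
	 * fits == -1 here: the CPU is not overutilized (util fits), but
	 * the task is still treated as misfit, so the CPU search goes on.
	 */
	printf("fits = %d\n", fits);
	return 0;
}

With this value, select_idle_capacity() and find_energy_efficient_cpu()
in the hunks below can prefer -1 over 0 and fall back to the CPU with
the largest capacity_orig_of() - thermal_load_avg() value.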

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
Reviewed-and-tested-by: Qais Yousef <qyousef@layalina.io>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---

Changes since v4:
- Add more details in commit message
- Fix typo
- Add Tag

 kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 82 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c46485d65d7..074742f107c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
 	 * handle the case uclamp_min > uclamp_max.
 	 */
 	uclamp_min = min(uclamp_min, uclamp_max);
-	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
-		fits = fits && (uclamp_min <= capacity_orig_thermal);
+	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+		return -1;
 
 	return fits;
 }
@@ -4572,7 +4572,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
 	unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
 	unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
 	unsigned long util = task_util_est(p);
-	return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
+	/*
+	 * Return true only if the cpu fully fits the task requirements, which
+	 * include the utilization but also the performance hints.
+	 */
+	return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
 	unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
 	unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
 
+	/* Return true only if the utilization doesn't fit CPU's capacity */
 	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
 }
 
@@ -6931,6 +6936,7 @@ static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
 	unsigned long task_util, util_min, util_max, best_cap = 0;
+	int fits, best_fits = 0;
 	int cpu, best_cpu = -1;
 	struct cpumask *cpus;
 
@@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 
 		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
 			continue;
-		if (util_fits_cpu(task_util, util_min, util_max, cpu))
+
+		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+		/* This CPU fits with all requirements */
+		if (fits > 0)
 			return cpu;
+		/*
+		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+		 * Look for the CPU with best capacity.
+		 */
+		else if (fits < 0)
+			cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
 
-		if (cpu_cap > best_cap) {
+		/*
+		 * First, select CPU which fits better (-1 being better than 0).
+		 * Then, select the one with best capacity at same level.
+		 */
+		if ((fits < best_fits) ||
+		    ((fits == best_fits) && (cpu_cap > best_cap))) {
 			best_cap = cpu_cap;
 			best_cpu = cpu;
+			best_fits = fits;
 		}
 	}
 
@@ -6964,7 +6986,11 @@ static inline bool asym_fits_cpu(unsigned long util,
 				 int cpu)
 {
 	if (sched_asym_cpucap_active())
-		return util_fits_cpu(util, util_min, util_max, cpu);
+		/*
+		 * Return true only if the cpu fully fits the task requirements
+		 * which include the utilization and the performance hints.
+		 */
+		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
 
 	return true;
 }
@@ -7331,6 +7357,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
 	struct root_domain *rd = this_rq()->rd;
 	int cpu, best_energy_cpu, target = -1;
+	int prev_fits = -1, best_fits = -1;
+	unsigned long best_thermal_cap = 0;
+	unsigned long prev_thermal_cap = 0;
 	struct sched_domain *sd;
 	struct perf_domain *pd;
 	struct energy_env eenv;
@@ -7366,6 +7395,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		unsigned long prev_spare_cap = 0;
 		int max_spare_cap_cpu = -1;
 		unsigned long base_energy;
+		int fits, max_fits = -1;
 
 		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
 
@@ -7415,7 +7445,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 				util_min = max(rq_util_min, p_util_min);
 				util_max = max(rq_util_max, p_util_max);
 			}
-			if (!util_fits_cpu(util, util_min, util_max, cpu))
+
+			fits = util_fits_cpu(util, util_min, util_max, cpu);
+			if (!fits)
 				continue;
 
 			lsub_positive(&cpu_cap, util);
@@ -7423,7 +7455,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			if (cpu == prev_cpu) {
 				/* Always use prev_cpu as a candidate. */
 				prev_spare_cap = cpu_cap;
-			} else if (cpu_cap > max_spare_cap) {
+				prev_fits = fits;
+			} else if ((fits > max_fits) ||
+				   ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
 				/*
 				 * Find the CPU with the maximum spare capacity
 				 * among the remaining CPUs in the performance
@@ -7431,6 +7465,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 				 */
 				max_spare_cap = cpu_cap;
 				max_spare_cap_cpu = cpu;
+				max_fits = fits;
 			}
 		}
 
@@ -7449,26 +7484,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			if (prev_delta < base_energy)
 				goto unlock;
 			prev_delta -= base_energy;
+			prev_thermal_cap = cpu_thermal_cap;
 			best_delta = min(best_delta, prev_delta);
 		}
 
 		/* Evaluate the energy impact of using max_spare_cap_cpu. */
 		if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+			/* Current best energy cpu fits better */
+			if (max_fits < best_fits)
+				continue;
+
+			/*
+			 * Both don't fit performance hint (i.e. uclamp_min)
+			 * but best energy cpu has better capacity.
+			 */
+			if ((max_fits < 0) &&
+			    (cpu_thermal_cap <= best_thermal_cap))
+				continue;
+
 			cur_delta = compute_energy(&eenv, pd, cpus, p,
 						   max_spare_cap_cpu);
 			/* CPU utilization has changed */
 			if (cur_delta < base_energy)
 				goto unlock;
 			cur_delta -= base_energy;
-			if (cur_delta < best_delta) {
-				best_delta = cur_delta;
-				best_energy_cpu = max_spare_cap_cpu;
-			}
+
+			/*
+			 * Both fit for the task but best energy cpu has lower
+			 * energy impact.
+			 */
+			if ((max_fits > 0) && (best_fits > 0) &&
+			    (cur_delta >= best_delta))
+				continue;
+
+			best_delta = cur_delta;
+			best_energy_cpu = max_spare_cap_cpu;
+			best_fits = max_fits;
+			best_thermal_cap = cpu_thermal_cap;
 		}
 	}
 	rcu_read_unlock();
 
-	if (best_delta < prev_delta)
+	if ((best_fits > prev_fits) ||
+	    ((best_fits > 0) && (best_delta < prev_delta)) ||
+	    ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
 		target = best_energy_cpu;
 
 	return target;
@@ -10271,24 +10330,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);
 
-	if (sched_energy_enabled()) {
-		struct root_domain *rd = env->dst_rq->rd;
-
-		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
-			goto out_balanced;
-	}
-
-	local = &sds.local_stat;
-	busiest = &sds.busiest_stat;
-
 	/* There is no busy sibling group to pull tasks from */
 	if (!sds.busiest)
 		goto out_balanced;
 
+	busiest = &sds.busiest_stat;
+
 	/* Misfit tasks should be dealt with regardless of the avg load */
 	if (busiest->group_type == group_misfit_task)
 		goto force_balance;
 
+	if (sched_energy_enabled()) {
+		struct root_domain *rd = env->dst_rq->rd;
+
+		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+			goto out_balanced;
+	}
+
 	/* ASYM feature bypasses nice load balance check */
 	if (busiest->group_type == group_asym_packing)
 		goto force_balance;
@@ -10301,6 +10359,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
+	local = &sds.local_stat;
 	/*
 	 * If the local group is busier than the selected busiest group
 	 * don't try and pull any tasks.
-- 
2.34.1



* [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection
  2023-02-01 14:36 [PATCH 0/2 v5] unlink misfit task from cpu overutilized Vincent Guittot
  2023-02-01 14:36 ` [PATCH 1/2 v5] sched/fair: " Vincent Guittot
@ 2023-02-01 14:36 ` Vincent Guittot
  2023-02-04 18:42   ` Qais Yousef
  2023-02-11 10:30   ` [tip: sched/core] " tip-bot2 for Vincent Guittot
  2023-02-02 11:29 ` [PATCH 0/2 v5] unlink misfit task from cpu overutilized Rafael J. Wysocki
  2 siblings, 2 replies; 9+ messages in thread
From: Vincent Guittot @ 2023-02-01 14:36 UTC (permalink / raw)
  To: mingo, peterz, dietmar.eggemann, qyousef, rafael, viresh.kumar,
	vschneid, linux-pm, linux-kernel, kajetan.puchalski
  Cc: lukasz.luba, wvw, xuewen.yan94, han.lin, Jonathan.JMChen,
	Vincent Guittot

Remove the capacity inversion detection, which is now handled by
util_fits_cpu() returning -1 when we need to continue to look for a
potential CPU with better performance.

This ends up almost reverting the patches below, except for some comments:
commit da07d2f9c153 ("sched/fair: Fixes for capacity inversion detection")
commit aa69c36f31aa ("sched/fair: Consider capacity inversion in util_fits_cpu()")
commit 44c7b80bffc3 ("sched/fair: Detect capacity inversion")

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c  | 84 +++-----------------------------------------
 kernel/sched/sched.h | 19 ----------
 2 files changed, 5 insertions(+), 98 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 074742f107c0..c6c8e7f52935 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
 	 *
 	 * For uclamp_max, we can tolerate a drop in performance level as the
 	 * goal is to cap the task. So it's okay if it's getting less.
-	 *
-	 * In case of capacity inversion we should honour the inverted capacity
-	 * for both uclamp_min and uclamp_max all the time.
 	 */
-	capacity_orig = cpu_in_capacity_inversion(cpu);
-	if (capacity_orig) {
-		capacity_orig_thermal = capacity_orig;
-	} else {
-		capacity_orig = capacity_orig_of(cpu);
-		capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
-	}
+	capacity_orig = capacity_orig_of(cpu);
+	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
 
 	/*
 	 * We want to force a task to fit a cpu as implied by uclamp_max.
@@ -9027,82 +9019,16 @@ static unsigned long scale_rt_capacity(int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
 	unsigned long capacity = scale_rt_capacity(cpu);
 	struct sched_group *sdg = sd->groups;
-	struct rq *rq = cpu_rq(cpu);
 
-	rq->cpu_capacity_orig = capacity_orig;
+	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
 
 	if (!capacity)
 		capacity = 1;
 
-	rq->cpu_capacity = capacity;
-
-	/*
-	 * Detect if the performance domain is in capacity inversion state.
-	 *
-	 * Capacity inversion happens when another perf domain with equal or
-	 * lower capacity_orig_of() ends up having higher capacity than this
-	 * domain after subtracting thermal pressure.
-	 *
-	 * We only take into account thermal pressure in this detection as it's
-	 * the only metric that actually results in *real* reduction of
-	 * capacity due to performance points (OPPs) being dropped/become
-	 * unreachable due to thermal throttling.
-	 *
-	 * We assume:
-	 *   * That all cpus in a perf domain have the same capacity_orig
-	 *     (same uArch).
-	 *   * Thermal pressure will impact all cpus in this perf domain
-	 *     equally.
-	 */
-	if (sched_energy_enabled()) {
-		unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
-		struct perf_domain *pd;
-
-		rcu_read_lock();
-
-		pd = rcu_dereference(rq->rd->pd);
-		rq->cpu_capacity_inverted = 0;
-
-		for (; pd; pd = pd->next) {
-			struct cpumask *pd_span = perf_domain_span(pd);
-			unsigned long pd_cap_orig, pd_cap;
-
-			/* We can't be inverted against our own pd */
-			if (cpumask_test_cpu(cpu_of(rq), pd_span))
-				continue;
-
-			cpu = cpumask_any(pd_span);
-			pd_cap_orig = arch_scale_cpu_capacity(cpu);
-
-			if (capacity_orig < pd_cap_orig)
-				continue;
-
-			/*
-			 * handle the case of multiple perf domains have the
-			 * same capacity_orig but one of them is under higher
-			 * thermal pressure. We record it as capacity
-			 * inversion.
-			 */
-			if (capacity_orig == pd_cap_orig) {
-				pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
-
-				if (pd_cap > inv_cap) {
-					rq->cpu_capacity_inverted = inv_cap;
-					break;
-				}
-			} else if (pd_cap_orig > inv_cap) {
-				rq->cpu_capacity_inverted = inv_cap;
-				break;
-			}
-		}
-
-		rcu_read_unlock();
-	}
-
-	trace_sched_cpu_capacity_tp(rq);
+	cpu_rq(cpu)->cpu_capacity = capacity;
+	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
 
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = capacity;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1072502976df..3e8df6d31c1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1044,7 +1044,6 @@ struct rq {
 
 	unsigned long		cpu_capacity;
 	unsigned long		cpu_capacity_orig;
-	unsigned long		cpu_capacity_inverted;
 
 	struct balance_callback *balance_callback;
 
@@ -2899,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity_orig;
 }
 
-/*
- * Returns inverted capacity if the CPU is in capacity inversion state.
- * 0 otherwise.
- *
- * Capacity inversion detection only considers thermal impact where actual
- * performance points (OPPs) gets dropped.
- *
- * Capacity inversion state happens when another performance domain that has
- * equal or lower capacity_orig_of() becomes effectively larger than the perf
- * domain this CPU belongs to due to thermal pressure throttling it hard.
- *
- * See comment in update_cpu_capacity().
- */
-static inline unsigned long cpu_in_capacity_inversion(int cpu)
-{
-	return cpu_rq(cpu)->cpu_capacity_inverted;
-}
-
 /**
  * enum cpu_util_type - CPU utilization type
  * @FREQUENCY_UTIL:	Utilization used to select frequency
-- 
2.34.1



* Re: [PATCH 0/2 v5] unlink misfit task from cpu overutilized
  2023-02-01 14:36 [PATCH 0/2 v5] unlink misfit task from cpu overutilized Vincent Guittot
  2023-02-01 14:36 ` [PATCH 1/2 v5] sched/fair: " Vincent Guittot
  2023-02-01 14:36 ` [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection Vincent Guittot
@ 2023-02-02 11:29 ` Rafael J. Wysocki
  2 siblings, 0 replies; 9+ messages in thread
From: Rafael J. Wysocki @ 2023-02-02 11:29 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, peterz, dietmar.eggemann, qyousef, rafael, viresh.kumar,
	vschneid, linux-pm, linux-kernel, kajetan.puchalski, lukasz.luba,
	wvw, xuewen.yan94, han.lin, Jonathan.JMChen

On Wed, Feb 1, 2023 at 3:36 PM Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> unlink misfit task detection from cpu overutilized and use the new -1 value
> returned by util_fits_cpu() to extend the search for the best CPU.
> Remove capacity inversion detection that is covered by these changes.
>
> Changes since v4:
> - Added patch 2 to remove capacity inversion detection
>
> Vincent Guittot (2):
>   sched/fair: unlink misfit task from cpu overutilized
>   sched/fair: Remove capacity inversion detection
>
>  kernel/sched/fair.c  | 189 ++++++++++++++++++++-----------------------
>  kernel/sched/sched.h |  19 -----
>  2 files changed, 87 insertions(+), 121 deletions(-)
>
> --

Acked-by: Rafael J. Wysocki <rafael@kernel.org>

for the series.


* Re: [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection
  2023-02-01 14:36 ` [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection Vincent Guittot
@ 2023-02-04 18:42   ` Qais Yousef
  2023-02-05 17:29     ` Vincent Guittot
  2023-02-11 10:30   ` [tip: sched/core] " tip-bot2 for Vincent Guittot
  1 sibling, 1 reply; 9+ messages in thread
From: Qais Yousef @ 2023-02-04 18:42 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, peterz, dietmar.eggemann, rafael, viresh.kumar, vschneid,
	linux-pm, linux-kernel, kajetan.puchalski, lukasz.luba, wvw,
	xuewen.yan94, han.lin, Jonathan.JMChen

On 02/01/23 15:36, Vincent Guittot wrote:
> Remove the capacity inversion detection which is now handled by
> util_fits_cpu() returning -1 when we need to continue to look for a
> potential CPU with better performance.
> 
> This ends up almost reverting patches below except for some comments:

nit: I think this comment must be removed/reworded though

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 691a2f9c4efa..c6c8e7f52935 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,10 +4476,6 @@ static inline int util_fits_cpu(unsigned long util,
         *
         * For uclamp_max, we can tolerate a drop in performance level as the
         * goal is to cap the task. So it's okay if it's getting less.
-        *
-        * In case of capacity inversion, which is not handled yet, we should
-        * honour the inverted capacity for both uclamp_min and uclamp_max all
-        * the time.
         */
        capacity_orig = capacity_orig_of(cpu);
        capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);

> commit da07d2f9c153 ("sched/fair: Fixes for capacity inversion detection")
> commit aa69c36f31aa ("sched/fair: Consider capacity inversion in util_fits_cpu()")
> commit 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

Apart from that, LGTM.

Reviewed-by: Qais Yousef <qyousef@layalina.io>


Thanks!

--
Qais Yousef

> ---
>  kernel/sched/fair.c  | 84 +++-----------------------------------------
>  kernel/sched/sched.h | 19 ----------
>  2 files changed, 5 insertions(+), 98 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 074742f107c0..c6c8e7f52935 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
>  	 *
>  	 * For uclamp_max, we can tolerate a drop in performance level as the
>  	 * goal is to cap the task. So it's okay if it's getting less.
> -	 *
> -	 * In case of capacity inversion we should honour the inverted capacity
> -	 * for both uclamp_min and uclamp_max all the time.
>  	 */
> -	capacity_orig = cpu_in_capacity_inversion(cpu);
> -	if (capacity_orig) {
> -		capacity_orig_thermal = capacity_orig;
> -	} else {
> -		capacity_orig = capacity_orig_of(cpu);
> -		capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> -	}
> +	capacity_orig = capacity_orig_of(cpu);
> +	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
>  
>  	/*
>  	 * We want to force a task to fit a cpu as implied by uclamp_max.
> @@ -9027,82 +9019,16 @@ static unsigned long scale_rt_capacity(int cpu)
>  
>  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
>  {
> -	unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
>  	unsigned long capacity = scale_rt_capacity(cpu);
>  	struct sched_group *sdg = sd->groups;
> -	struct rq *rq = cpu_rq(cpu);
>  
> -	rq->cpu_capacity_orig = capacity_orig;
> +	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
>  
>  	if (!capacity)
>  		capacity = 1;
>  
> -	rq->cpu_capacity = capacity;
> -
> -	/*
> -	 * Detect if the performance domain is in capacity inversion state.
> -	 *
> -	 * Capacity inversion happens when another perf domain with equal or
> -	 * lower capacity_orig_of() ends up having higher capacity than this
> -	 * domain after subtracting thermal pressure.
> -	 *
> -	 * We only take into account thermal pressure in this detection as it's
> -	 * the only metric that actually results in *real* reduction of
> -	 * capacity due to performance points (OPPs) being dropped/become
> -	 * unreachable due to thermal throttling.
> -	 *
> -	 * We assume:
> -	 *   * That all cpus in a perf domain have the same capacity_orig
> -	 *     (same uArch).
> -	 *   * Thermal pressure will impact all cpus in this perf domain
> -	 *     equally.
> -	 */
> -	if (sched_energy_enabled()) {
> -		unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> -		struct perf_domain *pd;
> -
> -		rcu_read_lock();
> -
> -		pd = rcu_dereference(rq->rd->pd);
> -		rq->cpu_capacity_inverted = 0;
> -
> -		for (; pd; pd = pd->next) {
> -			struct cpumask *pd_span = perf_domain_span(pd);
> -			unsigned long pd_cap_orig, pd_cap;
> -
> -			/* We can't be inverted against our own pd */
> -			if (cpumask_test_cpu(cpu_of(rq), pd_span))
> -				continue;
> -
> -			cpu = cpumask_any(pd_span);
> -			pd_cap_orig = arch_scale_cpu_capacity(cpu);
> -
> -			if (capacity_orig < pd_cap_orig)
> -				continue;
> -
> -			/*
> -			 * handle the case of multiple perf domains have the
> -			 * same capacity_orig but one of them is under higher
> -			 * thermal pressure. We record it as capacity
> -			 * inversion.
> -			 */
> -			if (capacity_orig == pd_cap_orig) {
> -				pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
> -
> -				if (pd_cap > inv_cap) {
> -					rq->cpu_capacity_inverted = inv_cap;
> -					break;
> -				}
> -			} else if (pd_cap_orig > inv_cap) {
> -				rq->cpu_capacity_inverted = inv_cap;
> -				break;
> -			}
> -		}
> -
> -		rcu_read_unlock();
> -	}
> -
> -	trace_sched_cpu_capacity_tp(rq);
> +	cpu_rq(cpu)->cpu_capacity = capacity;
> +	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
>  
>  	sdg->sgc->capacity = capacity;
>  	sdg->sgc->min_capacity = capacity;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 1072502976df..3e8df6d31c1e 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1044,7 +1044,6 @@ struct rq {
>  
>  	unsigned long		cpu_capacity;
>  	unsigned long		cpu_capacity_orig;
> -	unsigned long		cpu_capacity_inverted;
>  
>  	struct balance_callback *balance_callback;
>  
> @@ -2899,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
>  	return cpu_rq(cpu)->cpu_capacity_orig;
>  }
>  
> -/*
> - * Returns inverted capacity if the CPU is in capacity inversion state.
> - * 0 otherwise.
> - *
> - * Capacity inversion detection only considers thermal impact where actual
> - * performance points (OPPs) gets dropped.
> - *
> - * Capacity inversion state happens when another performance domain that has
> - * equal or lower capacity_orig_of() becomes effectively larger than the perf
> - * domain this CPU belongs to due to thermal pressure throttling it hard.
> - *
> - * See comment in update_cpu_capacity().
> - */
> -static inline unsigned long cpu_in_capacity_inversion(int cpu)
> -{
> -	return cpu_rq(cpu)->cpu_capacity_inverted;
> -}
> -
>  /**
>   * enum cpu_util_type - CPU utilization type
>   * @FREQUENCY_UTIL:	Utilization used to select frequency
> -- 
> 2.34.1
> 


* Re: [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection
  2023-02-04 18:42   ` Qais Yousef
@ 2023-02-05 17:29     ` Vincent Guittot
  2023-02-05 20:24       ` Qais Yousef
  0 siblings, 1 reply; 9+ messages in thread
From: Vincent Guittot @ 2023-02-05 17:29 UTC (permalink / raw)
  To: Qais Yousef
  Cc: mingo, peterz, dietmar.eggemann, rafael, viresh.kumar, vschneid,
	linux-pm, linux-kernel, kajetan.puchalski, lukasz.luba, wvw,
	xuewen.yan94, han.lin, Jonathan.JMChen

On Sat, 4 Feb 2023 at 19:42, Qais Yousef <qyousef@layalina.io> wrote:
>
> On 02/01/23 15:36, Vincent Guittot wrote:
> > Remove the capacity inversion detection which is now handled by
> > util_fits_cpu() returning -1 when we need to continue to look for a
> > potential CPU with better performance.
> >
> > This ends up almost reverting patches below except for some comments:
>
> nit: I think this comment must be removed/reworeded though

This comment has already been removed. That's why I said it's almost a
revert except for some comments in the commit message.

>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 691a2f9c4efa..c6c8e7f52935 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4476,10 +4476,6 @@ static inline int util_fits_cpu(unsigned long util,
>          *
>          * For uclamp_max, we can tolerate a drop in performance level as the
>          * goal is to cap the task. So it's okay if it's getting less.
> -        *
> -        * In case of capacity inversion, which is not handled yet, we should
> -        * honour the inverted capacity for both uclamp_min and uclamp_max all
> -        * the time.
>          */
>         capacity_orig = capacity_orig_of(cpu);
>         capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
>
> > commit da07d2f9c153 ("sched/fair: Fixes for capacity inversion detection")
> > commit aa69c36f31aa ("sched/fair: Consider capacity inversion in util_fits_cpu()")
> > commit 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>
> Apart from that, LGTM.
>
> Reviewed-by: Qais Yousef <qyousef@layalina.io>

Thanks

>
>
> Thanks!
>
> --
> Qais Yousef
>
> > ---
> >  kernel/sched/fair.c  | 84 +++-----------------------------------------
> >  kernel/sched/sched.h | 19 ----------
> >  2 files changed, 5 insertions(+), 98 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 074742f107c0..c6c8e7f52935 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
> >        *
> >        * For uclamp_max, we can tolerate a drop in performance level as the
> >        * goal is to cap the task. So it's okay if it's getting less.
> > -      *
> > -      * In case of capacity inversion we should honour the inverted capacity
> > -      * for both uclamp_min and uclamp_max all the time.
> >        */
> > -     capacity_orig = cpu_in_capacity_inversion(cpu);
> > -     if (capacity_orig) {
> > -             capacity_orig_thermal = capacity_orig;
> > -     } else {
> > -             capacity_orig = capacity_orig_of(cpu);
> > -             capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > -     }
> > +     capacity_orig = capacity_orig_of(cpu);
> > +     capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> >
> >       /*
> >        * We want to force a task to fit a cpu as implied by uclamp_max.
> > @@ -9027,82 +9019,16 @@ static unsigned long scale_rt_capacity(int cpu)
> >
> >  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> >  {
> > -     unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
> >       unsigned long capacity = scale_rt_capacity(cpu);
> >       struct sched_group *sdg = sd->groups;
> > -     struct rq *rq = cpu_rq(cpu);
> >
> > -     rq->cpu_capacity_orig = capacity_orig;
> > +     cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
> >
> >       if (!capacity)
> >               capacity = 1;
> >
> > -     rq->cpu_capacity = capacity;
> > -
> > -     /*
> > -      * Detect if the performance domain is in capacity inversion state.
> > -      *
> > -      * Capacity inversion happens when another perf domain with equal or
> > -      * lower capacity_orig_of() ends up having higher capacity than this
> > -      * domain after subtracting thermal pressure.
> > -      *
> > -      * We only take into account thermal pressure in this detection as it's
> > -      * the only metric that actually results in *real* reduction of
> > -      * capacity due to performance points (OPPs) being dropped/become
> > -      * unreachable due to thermal throttling.
> > -      *
> > -      * We assume:
> > -      *   * That all cpus in a perf domain have the same capacity_orig
> > -      *     (same uArch).
> > -      *   * Thermal pressure will impact all cpus in this perf domain
> > -      *     equally.
> > -      */
> > -     if (sched_energy_enabled()) {
> > -             unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > -             struct perf_domain *pd;
> > -
> > -             rcu_read_lock();
> > -
> > -             pd = rcu_dereference(rq->rd->pd);
> > -             rq->cpu_capacity_inverted = 0;
> > -
> > -             for (; pd; pd = pd->next) {
> > -                     struct cpumask *pd_span = perf_domain_span(pd);
> > -                     unsigned long pd_cap_orig, pd_cap;
> > -
> > -                     /* We can't be inverted against our own pd */
> > -                     if (cpumask_test_cpu(cpu_of(rq), pd_span))
> > -                             continue;
> > -
> > -                     cpu = cpumask_any(pd_span);
> > -                     pd_cap_orig = arch_scale_cpu_capacity(cpu);
> > -
> > -                     if (capacity_orig < pd_cap_orig)
> > -                             continue;
> > -
> > -                     /*
> > -                      * handle the case of multiple perf domains have the
> > -                      * same capacity_orig but one of them is under higher
> > -                      * thermal pressure. We record it as capacity
> > -                      * inversion.
> > -                      */
> > -                     if (capacity_orig == pd_cap_orig) {
> > -                             pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
> > -
> > -                             if (pd_cap > inv_cap) {
> > -                                     rq->cpu_capacity_inverted = inv_cap;
> > -                                     break;
> > -                             }
> > -                     } else if (pd_cap_orig > inv_cap) {
> > -                             rq->cpu_capacity_inverted = inv_cap;
> > -                             break;
> > -                     }
> > -             }
> > -
> > -             rcu_read_unlock();
> > -     }
> > -
> > -     trace_sched_cpu_capacity_tp(rq);
> > +     cpu_rq(cpu)->cpu_capacity = capacity;
> > +     trace_sched_cpu_capacity_tp(cpu_rq(cpu));
> >
> >       sdg->sgc->capacity = capacity;
> >       sdg->sgc->min_capacity = capacity;
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 1072502976df..3e8df6d31c1e 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -1044,7 +1044,6 @@ struct rq {
> >
> >       unsigned long           cpu_capacity;
> >       unsigned long           cpu_capacity_orig;
> > -     unsigned long           cpu_capacity_inverted;
> >
> >       struct balance_callback *balance_callback;
> >
> > @@ -2899,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
> >       return cpu_rq(cpu)->cpu_capacity_orig;
> >  }
> >
> > -/*
> > - * Returns inverted capacity if the CPU is in capacity inversion state.
> > - * 0 otherwise.
> > - *
> > - * Capacity inversion detection only considers thermal impact where actual
> > - * performance points (OPPs) gets dropped.
> > - *
> > - * Capacity inversion state happens when another performance domain that has
> > - * equal or lower capacity_orig_of() becomes effectively larger than the perf
> > - * domain this CPU belongs to due to thermal pressure throttling it hard.
> > - *
> > - * See comment in update_cpu_capacity().
> > - */
> > -static inline unsigned long cpu_in_capacity_inversion(int cpu)
> > -{
> > -     return cpu_rq(cpu)->cpu_capacity_inverted;
> > -}
> > -
> >  /**
> >   * enum cpu_util_type - CPU utilization type
> >   * @FREQUENCY_UTIL:  Utilization used to select frequency
> > --
> > 2.34.1
> >


* Re: [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection
  2023-02-05 17:29     ` Vincent Guittot
@ 2023-02-05 20:24       ` Qais Yousef
  0 siblings, 0 replies; 9+ messages in thread
From: Qais Yousef @ 2023-02-05 20:24 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, peterz, dietmar.eggemann, rafael, viresh.kumar, vschneid,
	linux-pm, linux-kernel, kajetan.puchalski, lukasz.luba, wvw,
	xuewen.yan94, han.lin, Jonathan.JMChen

On 02/05/23 18:29, Vincent Guittot wrote:
> On Sat, 4 Feb 2023 at 19:42, Qais Yousef <qyousef@layalina.io> wrote:
> >
> > On 02/01/23 15:36, Vincent Guittot wrote:
> > > Remove the capacity inversion detection which is now handled by
> > > util_fits_cpu() returning -1 when we need to continue to look for a
> > > potential CPU with better performance.
> > >
> > > This ends up almost reverting patches below except for some comments:
> >
> > nit: I think this comment must be removed/reworeded though
> 
> This comment has already been removed. That's why I said almost revert
> except for some comments in the commit message

Oh, my diff cmd had the wrong order then, sorry.

> 
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 691a2f9c4efa..c6c8e7f52935 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4476,10 +4476,6 @@ static inline int util_fits_cpu(unsigned long util,
> >          *
> >          * For uclamp_max, we can tolerate a drop in performance level as the
> >          * goal is to cap the task. So it's okay if it's getting less.
> > -        *
> > -        * In case of capacity inversion, which is not handled yet, we should
> > -        * honour the inverted capacity for both uclamp_min and uclamp_max all
> > -        * the time.
> >          */
> >         capacity_orig = capacity_orig_of(cpu);
> >         capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> >
> > > commit da07d2f9c153 ("sched/fair: Fixes for capacity inversion detection")
> > > commit aa69c36f31aa ("sched/fair: Consider capacity inversion in util_fits_cpu()")
> > > commit 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
> > >
> > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> >
> > Apart from that, LGTM.
> >
> > Reviewed-by: Qais Yousef <qyousef@layalina.io>
> 
> Thanks
> 
> >
> >
> > Thanks!
> >
> > --
> > Qais Yousef
> >
> > > ---
> > >  kernel/sched/fair.c  | 84 +++-----------------------------------------
> > >  kernel/sched/sched.h | 19 ----------
> > >  2 files changed, 5 insertions(+), 98 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 074742f107c0..c6c8e7f52935 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
> > >        *
> > >        * For uclamp_max, we can tolerate a drop in performance level as the
> > >        * goal is to cap the task. So it's okay if it's getting less.
> > > -      *
> > > -      * In case of capacity inversion we should honour the inverted capacity
> > > -      * for both uclamp_min and uclamp_max all the time.
> > >        */
> > > -     capacity_orig = cpu_in_capacity_inversion(cpu);
> > > -     if (capacity_orig) {
> > > -             capacity_orig_thermal = capacity_orig;
> > > -     } else {
> > > -             capacity_orig = capacity_orig_of(cpu);
> > > -             capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > > -     }
> > > +     capacity_orig = capacity_orig_of(cpu);
> > > +     capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > >
> > >       /*
> > >        * We want to force a task to fit a cpu as implied by uclamp_max.
> > > @@ -9027,82 +9019,16 @@ static unsigned long scale_rt_capacity(int cpu)
> > >
> > >  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > >  {
> > > -     unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
> > >       unsigned long capacity = scale_rt_capacity(cpu);
> > >       struct sched_group *sdg = sd->groups;
> > > -     struct rq *rq = cpu_rq(cpu);
> > >
> > > -     rq->cpu_capacity_orig = capacity_orig;
> > > +     cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
> > >
> > >       if (!capacity)
> > >               capacity = 1;
> > >
> > > -     rq->cpu_capacity = capacity;
> > > -
> > > -     /*
> > > -      * Detect if the performance domain is in capacity inversion state.
> > > -      *
> > > -      * Capacity inversion happens when another perf domain with equal or
> > > -      * lower capacity_orig_of() ends up having higher capacity than this
> > > -      * domain after subtracting thermal pressure.
> > > -      *
> > > -      * We only take into account thermal pressure in this detection as it's
> > > -      * the only metric that actually results in *real* reduction of
> > > -      * capacity due to performance points (OPPs) being dropped/become
> > > -      * unreachable due to thermal throttling.
> > > -      *
> > > -      * We assume:
> > > -      *   * That all cpus in a perf domain have the same capacity_orig
> > > -      *     (same uArch).
> > > -      *   * Thermal pressure will impact all cpus in this perf domain
> > > -      *     equally.
> > > -      */
> > > -     if (sched_energy_enabled()) {
> > > -             unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > > -             struct perf_domain *pd;
> > > -
> > > -             rcu_read_lock();
> > > -
> > > -             pd = rcu_dereference(rq->rd->pd);
> > > -             rq->cpu_capacity_inverted = 0;
> > > -
> > > -             for (; pd; pd = pd->next) {
> > > -                     struct cpumask *pd_span = perf_domain_span(pd);
> > > -                     unsigned long pd_cap_orig, pd_cap;
> > > -
> > > -                     /* We can't be inverted against our own pd */
> > > -                     if (cpumask_test_cpu(cpu_of(rq), pd_span))
> > > -                             continue;
> > > -
> > > -                     cpu = cpumask_any(pd_span);
> > > -                     pd_cap_orig = arch_scale_cpu_capacity(cpu);
> > > -
> > > -                     if (capacity_orig < pd_cap_orig)
> > > -                             continue;
> > > -
> > > -                     /*
> > > -                      * handle the case of multiple perf domains have the
> > > -                      * same capacity_orig but one of them is under higher
> > > -                      * thermal pressure. We record it as capacity
> > > -                      * inversion.
> > > -                      */
> > > -                     if (capacity_orig == pd_cap_orig) {
> > > -                             pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
> > > -
> > > -                             if (pd_cap > inv_cap) {
> > > -                                     rq->cpu_capacity_inverted = inv_cap;
> > > -                                     break;
> > > -                             }
> > > -                     } else if (pd_cap_orig > inv_cap) {
> > > -                             rq->cpu_capacity_inverted = inv_cap;
> > > -                             break;
> > > -                     }
> > > -             }
> > > -
> > > -             rcu_read_unlock();
> > > -     }
> > > -
> > > -     trace_sched_cpu_capacity_tp(rq);
> > > +     cpu_rq(cpu)->cpu_capacity = capacity;
> > > +     trace_sched_cpu_capacity_tp(cpu_rq(cpu));
> > >
> > >       sdg->sgc->capacity = capacity;
> > >       sdg->sgc->min_capacity = capacity;
> > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > index 1072502976df..3e8df6d31c1e 100644
> > > --- a/kernel/sched/sched.h
> > > +++ b/kernel/sched/sched.h
> > > @@ -1044,7 +1044,6 @@ struct rq {
> > >
> > >       unsigned long           cpu_capacity;
> > >       unsigned long           cpu_capacity_orig;
> > > -     unsigned long           cpu_capacity_inverted;
> > >
> > >       struct balance_callback *balance_callback;
> > >
> > > @@ -2899,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
> > >       return cpu_rq(cpu)->cpu_capacity_orig;
> > >  }
> > >
> > > -/*
> > > - * Returns inverted capacity if the CPU is in capacity inversion state.
> > > - * 0 otherwise.
> > > - *
> > > - * Capacity inversion detection only considers thermal impact where actual
> > > - * performance points (OPPs) gets dropped.
> > > - *
> > > - * Capacity inversion state happens when another performance domain that has
> > > - * equal or lower capacity_orig_of() becomes effectively larger than the perf
> > > - * domain this CPU belongs to due to thermal pressure throttling it hard.
> > > - *
> > > - * See comment in update_cpu_capacity().
> > > - */
> > > -static inline unsigned long cpu_in_capacity_inversion(int cpu)
> > > -{
> > > -     return cpu_rq(cpu)->cpu_capacity_inverted;
> > > -}
> > > -
> > >  /**
> > >   * enum cpu_util_type - CPU utilization type
> > >   * @FREQUENCY_UTIL:  Utilization used to select frequency
> > > --
> > > 2.34.1
> > >


* [tip: sched/core] sched/fair: Remove capacity inversion detection
  2023-02-01 14:36 ` [PATCH 2/2 v5] sched/fair: Remove capacity inversion detection Vincent Guittot
  2023-02-04 18:42   ` Qais Yousef
@ 2023-02-11 10:30   ` tip-bot2 for Vincent Guittot
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot2 for Vincent Guittot @ 2023-02-11 10:30 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Vincent Guittot, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     a2e90611b9f425adbbfcdaa5b5e49958ddf6f61b
Gitweb:        https://git.kernel.org/tip/a2e90611b9f425adbbfcdaa5b5e49958ddf6f61b
Author:        Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate:    Wed, 01 Feb 2023 15:36:28 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 11 Feb 2023 11:18:09 +01:00

sched/fair: Remove capacity inversion detection

Remove the capacity inversion detection, which is now handled by
util_fits_cpu() returning -1 when we need to continue to look for a
potential CPU with better performance.

This ends up almost reverting the patches below, except for some comments:
commit da07d2f9c153 ("sched/fair: Fixes for capacity inversion detection")
commit aa69c36f31aa ("sched/fair: Consider capacity inversion in util_fits_cpu()")
commit 44c7b80bffc3 ("sched/fair: Detect capacity inversion")

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230201143628.270912-3-vincent.guittot@linaro.org
---
 kernel/sched/fair.c  | 84 ++-----------------------------------------
 kernel/sched/sched.h | 19 +----------
 2 files changed, 5 insertions(+), 98 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 074742f..c6c8e7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
 	 *
 	 * For uclamp_max, we can tolerate a drop in performance level as the
 	 * goal is to cap the task. So it's okay if it's getting less.
-	 *
-	 * In case of capacity inversion we should honour the inverted capacity
-	 * for both uclamp_min and uclamp_max all the time.
 	 */
-	capacity_orig = cpu_in_capacity_inversion(cpu);
-	if (capacity_orig) {
-		capacity_orig_thermal = capacity_orig;
-	} else {
-		capacity_orig = capacity_orig_of(cpu);
-		capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
-	}
+	capacity_orig = capacity_orig_of(cpu);
+	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
 
 	/*
 	 * We want to force a task to fit a cpu as implied by uclamp_max.
@@ -9027,82 +9019,16 @@ static unsigned long scale_rt_capacity(int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
 	unsigned long capacity = scale_rt_capacity(cpu);
 	struct sched_group *sdg = sd->groups;
-	struct rq *rq = cpu_rq(cpu);
 
-	rq->cpu_capacity_orig = capacity_orig;
+	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
 
 	if (!capacity)
 		capacity = 1;
 
-	rq->cpu_capacity = capacity;
-
-	/*
-	 * Detect if the performance domain is in capacity inversion state.
-	 *
-	 * Capacity inversion happens when another perf domain with equal or
-	 * lower capacity_orig_of() ends up having higher capacity than this
-	 * domain after subtracting thermal pressure.
-	 *
-	 * We only take into account thermal pressure in this detection as it's
-	 * the only metric that actually results in *real* reduction of
-	 * capacity due to performance points (OPPs) being dropped/become
-	 * unreachable due to thermal throttling.
-	 *
-	 * We assume:
-	 *   * That all cpus in a perf domain have the same capacity_orig
-	 *     (same uArch).
-	 *   * Thermal pressure will impact all cpus in this perf domain
-	 *     equally.
-	 */
-	if (sched_energy_enabled()) {
-		unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
-		struct perf_domain *pd;
-
-		rcu_read_lock();
-
-		pd = rcu_dereference(rq->rd->pd);
-		rq->cpu_capacity_inverted = 0;
-
-		for (; pd; pd = pd->next) {
-			struct cpumask *pd_span = perf_domain_span(pd);
-			unsigned long pd_cap_orig, pd_cap;
-
-			/* We can't be inverted against our own pd */
-			if (cpumask_test_cpu(cpu_of(rq), pd_span))
-				continue;
-
-			cpu = cpumask_any(pd_span);
-			pd_cap_orig = arch_scale_cpu_capacity(cpu);
-
-			if (capacity_orig < pd_cap_orig)
-				continue;
-
-			/*
-			 * handle the case of multiple perf domains have the
-			 * same capacity_orig but one of them is under higher
-			 * thermal pressure. We record it as capacity
-			 * inversion.
-			 */
-			if (capacity_orig == pd_cap_orig) {
-				pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
-
-				if (pd_cap > inv_cap) {
-					rq->cpu_capacity_inverted = inv_cap;
-					break;
-				}
-			} else if (pd_cap_orig > inv_cap) {
-				rq->cpu_capacity_inverted = inv_cap;
-				break;
-			}
-		}
-
-		rcu_read_unlock();
-	}
-
-	trace_sched_cpu_capacity_tp(rq);
+	cpu_rq(cpu)->cpu_capacity = capacity;
+	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
 
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = capacity;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1072502..3e8df6d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1044,7 +1044,6 @@ struct rq {
 
 	unsigned long		cpu_capacity;
 	unsigned long		cpu_capacity_orig;
-	unsigned long		cpu_capacity_inverted;
 
 	struct balance_callback *balance_callback;
 
@@ -2899,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity_orig;
 }
 
-/*
- * Returns inverted capacity if the CPU is in capacity inversion state.
- * 0 otherwise.
- *
- * Capacity inversion detection only considers thermal impact where actual
- * performance points (OPPs) gets dropped.
- *
- * Capacity inversion state happens when another performance domain that has
- * equal or lower capacity_orig_of() becomes effectively larger than the perf
- * domain this CPU belongs to due to thermal pressure throttling it hard.
- *
- * See comment in update_cpu_capacity().
- */
-static inline unsigned long cpu_in_capacity_inversion(int cpu)
-{
-	return cpu_rq(cpu)->cpu_capacity_inverted;
-}
-
 /**
  * enum cpu_util_type - CPU utilization type
  * @FREQUENCY_UTIL:	Utilization used to select frequency


* [tip: sched/core] sched/fair: unlink misfit task from cpu overutilized
  2023-02-01 14:36 ` [PATCH 1/2 v5] sched/fair: " Vincent Guittot
@ 2023-02-11 10:30   ` tip-bot2 for Vincent Guittot
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot2 for Vincent Guittot @ 2023-02-11 10:30 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Vincent Guittot, Peter Zijlstra (Intel),
	Dietmar Eggemann, Kajetan Puchalski, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     e5ed0550c04c5469ecdc1634d8aa18c8609590f0
Gitweb:        https://git.kernel.org/tip/e5ed0550c04c5469ecdc1634d8aa18c8609590f0
Author:        Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate:    Wed, 01 Feb 2023 15:36:27 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 11 Feb 2023 11:18:09 +01:00

sched/fair: unlink misfit task from cpu overutilized

By taking into account uclamp_min, the 1:1 relation between task misfit
and cpu overutilized is no longer true, as a task with a small util_avg
may not fit a high capacity cpu because of the uclamp_min constraint.

Add a new state in util_fits_cpu() to reflect the case where a task would
fit a CPU except for the uclamp_min hint, which is a performance
requirement.

Use -1 to reflect that a CPU doesn't fit only because of uclamp_min, so we
can use this new value to take additional action to select the best CPU
that doesn't match the uclamp_min hint.

When util_fits_cpu() returns -1, we will continue to look for a possible
CPU with better performance. This replaces the capacity inversion
detection with a capacity_orig_of() - thermal_load_avg comparison to
detect a capacity inversion.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-and-tested-by: Qais Yousef <qyousef@layalina.io>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
Link: https://lore.kernel.org/r/20230201143628.270912-2-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 105 +++++++++++++++++++++++++++++++++----------
 1 file changed, 82 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c46485..074742f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
 	 * handle the case uclamp_min > uclamp_max.
 	 */
 	uclamp_min = min(uclamp_min, uclamp_max);
-	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
-		fits = fits && (uclamp_min <= capacity_orig_thermal);
+	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+		return -1;
 
 	return fits;
 }
@@ -4572,7 +4572,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
 	unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
 	unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
 	unsigned long util = task_util_est(p);
-	return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
+	/*
+	 * Return true only if the cpu fully fits the task requirements, which
+	 * include the utilization but also the performance hints.
+	 */
+	return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
 	unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
 	unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
 
+	/* Return true only if the utilization doesn't fit CPU's capacity */
 	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
 }
 
@@ -6931,6 +6936,7 @@ static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
 	unsigned long task_util, util_min, util_max, best_cap = 0;
+	int fits, best_fits = 0;
 	int cpu, best_cpu = -1;
 	struct cpumask *cpus;
 
@@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 
 		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
 			continue;
-		if (util_fits_cpu(task_util, util_min, util_max, cpu))
+
+		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+		/* This CPU fits with all requirements */
+		if (fits > 0)
 			return cpu;
+		/*
+		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+		 * Look for the CPU with best capacity.
+		 */
+		else if (fits < 0)
+			cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
 
-		if (cpu_cap > best_cap) {
+		/*
+		 * First, select CPU which fits better (-1 being better than 0).
+		 * Then, select the one with best capacity at same level.
+		 */
+		if ((fits < best_fits) ||
+		    ((fits == best_fits) && (cpu_cap > best_cap))) {
 			best_cap = cpu_cap;
 			best_cpu = cpu;
+			best_fits = fits;
 		}
 	}
 
@@ -6964,7 +6986,11 @@ static inline bool asym_fits_cpu(unsigned long util,
 				 int cpu)
 {
 	if (sched_asym_cpucap_active())
-		return util_fits_cpu(util, util_min, util_max, cpu);
+		/*
+		 * Return true only if the cpu fully fits the task requirements
+		 * which include the utilization and the performance hints.
+		 */
+		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
 
 	return true;
 }
@@ -7331,6 +7357,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
 	struct root_domain *rd = this_rq()->rd;
 	int cpu, best_energy_cpu, target = -1;
+	int prev_fits = -1, best_fits = -1;
+	unsigned long best_thermal_cap = 0;
+	unsigned long prev_thermal_cap = 0;
 	struct sched_domain *sd;
 	struct perf_domain *pd;
 	struct energy_env eenv;
@@ -7366,6 +7395,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		unsigned long prev_spare_cap = 0;
 		int max_spare_cap_cpu = -1;
 		unsigned long base_energy;
+		int fits, max_fits = -1;
 
 		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
 
@@ -7415,7 +7445,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 				util_min = max(rq_util_min, p_util_min);
 				util_max = max(rq_util_max, p_util_max);
 			}
-			if (!util_fits_cpu(util, util_min, util_max, cpu))
+
+			fits = util_fits_cpu(util, util_min, util_max, cpu);
+			if (!fits)
 				continue;
 
 			lsub_positive(&cpu_cap, util);
@@ -7423,7 +7455,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			if (cpu == prev_cpu) {
 				/* Always use prev_cpu as a candidate. */
 				prev_spare_cap = cpu_cap;
-			} else if (cpu_cap > max_spare_cap) {
+				prev_fits = fits;
+			} else if ((fits > max_fits) ||
+				   ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
 				/*
 				 * Find the CPU with the maximum spare capacity
 				 * among the remaining CPUs in the performance
@@ -7431,6 +7465,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 				 */
 				max_spare_cap = cpu_cap;
 				max_spare_cap_cpu = cpu;
+				max_fits = fits;
 			}
 		}
 
@@ -7449,26 +7484,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			if (prev_delta < base_energy)
 				goto unlock;
 			prev_delta -= base_energy;
+			prev_thermal_cap = cpu_thermal_cap;
 			best_delta = min(best_delta, prev_delta);
 		}
 
 		/* Evaluate the energy impact of using max_spare_cap_cpu. */
 		if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+			/* Current best energy cpu fits better */
+			if (max_fits < best_fits)
+				continue;
+
+			/*
+			 * Both don't fit performance hint (i.e. uclamp_min)
+			 * but best energy cpu has better capacity.
+			 */
+			if ((max_fits < 0) &&
+			    (cpu_thermal_cap <= best_thermal_cap))
+				continue;
+
 			cur_delta = compute_energy(&eenv, pd, cpus, p,
 						   max_spare_cap_cpu);
 			/* CPU utilization has changed */
 			if (cur_delta < base_energy)
 				goto unlock;
 			cur_delta -= base_energy;
-			if (cur_delta < best_delta) {
-				best_delta = cur_delta;
-				best_energy_cpu = max_spare_cap_cpu;
-			}
+
+			/*
+			 * Both fit for the task but best energy cpu has lower
+			 * energy impact.
+			 */
+			if ((max_fits > 0) && (best_fits > 0) &&
+			    (cur_delta >= best_delta))
+				continue;
+
+			best_delta = cur_delta;
+			best_energy_cpu = max_spare_cap_cpu;
+			best_fits = max_fits;
+			best_thermal_cap = cpu_thermal_cap;
 		}
 	}
 	rcu_read_unlock();
 
-	if (best_delta < prev_delta)
+	if ((best_fits > prev_fits) ||
+	    ((best_fits > 0) && (best_delta < prev_delta)) ||
+	    ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
 		target = best_energy_cpu;
 
 	return target;
@@ -10271,24 +10330,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);
 
-	if (sched_energy_enabled()) {
-		struct root_domain *rd = env->dst_rq->rd;
-
-		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
-			goto out_balanced;
-	}
-
-	local = &sds.local_stat;
-	busiest = &sds.busiest_stat;
-
 	/* There is no busy sibling group to pull tasks from */
 	if (!sds.busiest)
 		goto out_balanced;
 
+	busiest = &sds.busiest_stat;
+
 	/* Misfit tasks should be dealt with regardless of the avg load */
 	if (busiest->group_type == group_misfit_task)
 		goto force_balance;
 
+	if (sched_energy_enabled()) {
+		struct root_domain *rd = env->dst_rq->rd;
+
+		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+			goto out_balanced;
+	}
+
 	/* ASYM feature bypasses nice load balance check */
 	if (busiest->group_type == group_asym_packing)
 		goto force_balance;
@@ -10301,6 +10359,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
+	local = &sds.local_stat;
 	/*
 	 * If the local group is busier than the selected busiest group
 	 * don't try and pull any tasks.

