All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
@ 2023-09-01  6:51 Shrikanth Hegde
  2023-09-01  6:57 ` Shrikanth Hegde
  0 siblings, 1 reply; 13+ messages in thread
From: Shrikanth Hegde @ 2023-09-01  6:51 UTC (permalink / raw)
  To: mingo, peterz, vincent.guittot
  Cc: sshegde, dietmar.eggemann, vschneid, linux-kernel,
	ionela.voinescu, quentin.perret, srikar, mgorman, mingo

Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
some of the architectures. IIUC its meant to either force rebuild the
perf domains or cleanup the perf domains by echoing 1 or 0 respectively.

perf domains are not built when there is SMT, or when there is no
Asymmetric CPU topologies or when there is no frequency invariance.
Since such cases EAS is not set and perf domains are not built. By
changing the values of sysctl_sched_energy_aware, its not possible to
force build the perf domains. Hence remove this sysctl on such platforms
that dont support it. Some of the settings can be changed later
such as smt_active by offlining the CPU's, In those cases if
build_perf_domains returns true, re-enable the sysctl.

Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
is set when building the perf domains. Making use of that to find out if
the change is happening by sysctl or dynamic system change.

Taking different cases:
Case1. system while booting has EAS capability, sysctl will be set 1. Hence
perf domains will be built if needed. On changing the sysctl to 0, since
sched_energy_update is true, perf domains would be freed and sysctl will
not be removed. later sysctl is changed to 1, enabling the perf domains
rebuild again. Since sysctl is already there, it will skip register.

Case2. System while booting doesn't have EAS Capability. Later after system
change it becomes capable of EAS. sched_energy_update is false. Though
sysctl is 0, will go ahead and try to enable eas. This is the current
behaviour. if has_eas  is true, then sysctl will be registered. After
that any sysctl change is same as Case1.

Case3. System becomes not capable of EAS due to system change. Here since
sched_energy_update is false, build_perf_domains return has_eas as false
due to one of the checks and Since this is dynamic change remove the sysctl.
Any further change which enables EAS is Case2

Note: This hasn't been tested on platform which supports EAS. If the
change can be verified on that it would really help. This has been
tested on power10 which doesn't support EAS. sysctl_sched_energy_aware
is removed with patch.

changes since v1:
Chen Yu had pointed out that this will not destroy the perf domains on
architectures where EAS is supported by changing the sysctl. This patch
addresses that.
[v1] Link: https://lore.kernel.org/lkml/20230829065040.920629-1-sshegde@linux.vnet.ibm.com/#t

Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
---
 kernel/sched/topology.c | 45 +++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a5bc678c08..4d16269ac21a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -208,7 +208,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-static unsigned int sysctl_sched_energy_aware = 1;
+static unsigned int sysctl_sched_energy_aware;
+static struct ctl_table_header *sysctl_eas_header;
 static DEFINE_MUTEX(sched_energy_mutex);
 static bool sched_energy_update;

@@ -226,6 +227,7 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;
+	int prev_val = sysctl_sched_energy_aware;

 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -233,8 +235,11 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		state = static_branch_unlikely(&sched_energy_present);
-		if (state != sysctl_sched_energy_aware)
+		if (state != sysctl_sched_energy_aware && prev_val != sysctl_sched_energy_aware) {
+			if (sysctl_sched_energy_aware && !state)
+				pr_warn("Attempt to build energy domains when EAS is disabled\n");
 			rebuild_sched_domains_energy();
+		}
 	}

 	return ret;
@@ -255,7 +260,14 @@ static struct ctl_table sched_energy_aware_sysctls[] = {

 static int __init sched_energy_aware_sysctl_init(void)
 {
-	register_sysctl_init("kernel", sched_energy_aware_sysctls);
+	int cpu = cpumask_first(cpu_active_mask);
+
+	if (sched_smt_active() || !per_cpu(sd_asym_cpucapacity, cpu) ||
+	    !arch_scale_freq_invariant())
+		return 0;
+
+	sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
+	sysctl_sched_energy_aware = 1;
 	return 0;
 }

@@ -336,10 +348,28 @@ static void sched_energy_set(bool has_eas)
 		if (sched_debug())
 			pr_info("%s: stopping EAS\n", __func__);
 		static_branch_disable_cpuslocked(&sched_energy_present);
+#ifdef CONFIG_PROC_SYSCTL
+		/*
+		 * if the architecture supports EAS and forcefully
+		 * perf domains are destroyed, there should be a sysctl
+		 * to enable it later. If this was due to dynamic system
+		 * change such as SMT<->NON_SMT then remove sysctl.
+		 */
+		if (sysctl_eas_header && !sched_energy_update) {
+			unregister_sysctl_table(sysctl_eas_header);
+			sysctl_eas_header = NULL;
+		}
+#endif
+		sysctl_sched_energy_aware = 0;
 	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
 		if (sched_debug())
 			pr_info("%s: starting EAS\n", __func__);
 		static_branch_enable_cpuslocked(&sched_energy_present);
+#ifdef CONFIG_PROC_SYSCTL
+		if (!sysctl_eas_header)
+			sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
+#endif
+		sysctl_sched_energy_aware = 1;
 	}
 }

@@ -380,15 +410,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
 	struct cpufreq_policy *policy;
 	struct cpufreq_governor *gov;

-	if (!sysctl_sched_energy_aware)
+	if (!sysctl_sched_energy_aware && sched_energy_update)
 		goto free;

 	/* EAS is enabled for asymmetric CPU capacity topologies. */
 	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
-		if (sched_debug()) {
-			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
-					cpumask_pr_args(cpu_map));
-		}
+		if (sched_debug())
+			pr_info("rd %*pbl: Disabling EAS,  CPUs do not have asymmetric capacities\n",
+				cpumask_pr_args(cpu_map));
 		goto free;
 	}

--
2.31.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-01  6:51 [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture Shrikanth Hegde
@ 2023-09-01  6:57 ` Shrikanth Hegde
  0 siblings, 0 replies; 13+ messages in thread
From: Shrikanth Hegde @ 2023-09-01  6:57 UTC (permalink / raw)
  To: mingo, peterz, vincent.guittot
  Cc: dietmar.eggemann, vschneid, linux-kernel, ionela.voinescu,
	quentin.perret, srikar, mgorman, mingo



On 9/1/23 12:21 PM, Shrikanth Hegde wrote:
> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
> some of the architectures. IIUC its meant to either force rebuild the
> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
> 
Please ignore this mail.

This is duplicate of 
https://lore.kernel.org/lkml/20230901065249.137242-1-sshegde@linux.vnet.ibm.com/

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-07 10:34     ` Pierre Gondois
@ 2023-09-13 11:55       ` Shrikanth Hegde
  0 siblings, 0 replies; 13+ messages in thread
From: Shrikanth Hegde @ 2023-09-13 11:55 UTC (permalink / raw)
  To: Pierre Gondois, Tim Chen
  Cc: dietmar.eggemann, vincent.guittot, peterz, mingo, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo, yu.c.chen



On 9/7/23 4:04 PM, Pierre Gondois wrote:
> Hello Shrikanth,
> 
> On 9/6/23 13:35, Shrikanth Hegde wrote:
>>
>>
>> On 9/5/23 7:33 PM, Pierre Gondois wrote:
>>> Hello Shrikanth,
>>> I tried the patch (on a platform using the cppc_cpufreq driver). The
>>> platform
>>> normally has EAS enabled, but the patch removed the sched_energy_aware
>>> sysctl.
>>> It seemed the following happened (in the below order):
>>>
>>> 1. sched_energy_aware_sysctl_init()
>>> Doesn't set sysctl_sched_energy_aware as cpufreq_freq_invariance
>>> isn't set
>>> and arch_scale_freq_invariant() returns false
>>>
>>> 2. cpufreq_register_driver()
>>> Sets cpufreq_freq_invariance during cpufreq initialization
>>> sched_energy_set()
>>>
>>> 3. sched_energy_set()
>>> Is called with has_eas=0 since build_perf_domains() doesn't see the
>>> platform
>>> as EAS compatible. Indeed sysctl_sched_energy_aware=0.
>>> So with sysctl_sched_energy_aware=0 and has_eas=0, sched_energy_aware
>>> sysctl
>>> is not enabled even though EAS should be possible.
>>>
>>>
>>> On 9/1/23 08:52, Shrikanth Hegde wrote:
>>>> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
>>>> some of the architectures. IIUC its meant to either force rebuild the
>>>> perf domains or cleanup the perf domains by echoing 1 or 0
>>>> respectively.
>>>
>>> There is a definition of the sysctl at:
>>> Documentation/admin-guide/sysctl/kernel.rst::sched_energy_aware
>> [...]
>>>>
>>>>
>>>> +static unsigned int sysctl_sched_energy_aware;
>>>> +static struct ctl_table_header *sysctl_eas_header;
>>>
>>> The variables around the presence/absence of EAS are:
>>> - sched_energy_present:
>>> EAS is up and running
>>>
>>> - sysctl_sched_energy_aware:
>>> The user wants to use EAS (or not). Doesn't mean EAS can run on the
>>> platform.
>>>
>>> - sched_energy_set/partition_sched_domains_locked's "has_eas":
>>> Local variable. Represent whether EAS can run on the platform.
>>>
>>> IMO it would be simpler to (un)register sched_energy_aware sysctl
>>> in partition_sched_domains_locked(), based on the value of "has_eas".
>>> This would allow to let all the logic as it is right now, inside
>>> build_perf_domains(), and then advertise sched_energy_aware sysctl
>>> if EAS can run on the platform.
>>> sched_energy_aware_sysctl_init() would be deleted then.
>>>
>>>
>> yes. that is true. and there is no variable which holds the info if
>> the system
>> is capable of EAS.
>>
>> Retrospecting, the reason for starting this patch series was this,
>> sysctl_sched_energy_aware didnt make sense on power10 platform since it
>> has SMT and symmetric CPU capacities.  with current code writing 1 to
>> it cause rebuild of sched domains but EAS wouldn't be possible.
>>
>>
>> Possible Approaches:
>>
>> 1.
>> Make this sysctl write as NOP if the platform doesn't has EAS
>> capabilities at
>> the moment.  Do those checks in sched_energy_aware_handler before
>> handling  the
>> change in value. Return EINVAL.  And Update sysctl description that on
>> such
>> platforms value change is NOP. Relatively simpler change.
>>
>> 2.
>> Current patch approach, remove the sysctl completely on non supported
>> architectures and re-enable it if the system becomes capable of doing
>> EAS.
>> With the current patch, instead of using sched_energy_update, use another
>> variable called sched_energy_change_in_sysctl(maybe different name). 
>> I think
>> that would handle all the cases. Another variable can be avoided by
>> encoding
>> the info in sysctl_sched_energy_aware itself in the handler call,
>> since it
>> takes only 1 or 0 as the value. upper bits are free to use. update the
>> sysctl
>> as well with this behavior. plus minor cleanup to remove the init of
>> sysctl.
>>
> 
> FWIW I think it makes more sense to remove the sysctl if EAS isn't
> possible on
> the platform, as this patch intends to do.
> From a code perspective I m not sure I follow exactly your intend. I can
> test
> your v3 if necessary,
> 

Hi Pierre,

Just sent out v3 of the series. 
https://lore.kernel.org/lkml/20230913114807.665094-1-sshegde@linux.vnet.ibm.com/

It would really help if you can test this out. I don't have system where EAS can be 
enabled at the moment. 

> Regards,
> Pierre
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-07 10:21   ` Pierre Gondois
@ 2023-09-07 15:04     ` Chen Yu
  0 siblings, 0 replies; 13+ messages in thread
From: Chen Yu @ 2023-09-07 15:04 UTC (permalink / raw)
  To: Pierre Gondois
  Cc: Shrikanth Hegde, mingo, peterz, vincent.guittot,
	dietmar.eggemann, vschneid, linux-kernel, ionela.voinescu,
	quentin.perret, srikar, mgorman, mingo

Hi Pierre,

On 2023-09-07 at 12:21:27 +0200, Pierre Gondois wrote:
> Hello Chen,
> 
> On 9/1/23 11:49, Chen Yu wrote:
> > Hi Shrikanth,
> > 
> > On 2023-09-01 at 12:22:49 +0530, Shrikanth Hegde wrote:
> > > Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
> > > some of the architectures. IIUC its meant to either force rebuild the
> > > perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
> > > 
> > > perf domains are not built when there is SMT, or when there is no
> > > Asymmetric CPU topologies or when there is no frequency invariance.
> > > Since such cases EAS is not set and perf domains are not built. By
> > > changing the values of sysctl_sched_energy_aware, its not possible to
> > > force build the perf domains. Hence remove this sysctl on such platforms
> > > that dont support it. Some of the settings can be changed later
> > > such as smt_active by offlining the CPU's, In those cases if
> > > build_perf_domains returns true, re-enable the sysctl.
> > > 
> > > Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
> > > is set when building the perf domains. Making use of that to find out if
> > > the change is happening by sysctl or dynamic system change.
> > > 
> > > Taking different cases:
> > > Case1. system while booting has EAS capability, sysctl will be set 1. Hence
> > > perf domains will be built if needed. On changing the sysctl to 0, since
> > > sched_energy_update is true, perf domains would be freed and sysctl will
> > > not be removed. later sysctl is changed to 1, enabling the perf domains
> > > rebuild again. Since sysctl is already there, it will skip register.
> > > 
> > > Case2. System while booting doesn't have EAS Capability. Later after system
> > > change it becomes capable of EAS. sched_energy_update is false. Though
> > > sysctl is 0, will go ahead and try to enable eas. This is the current
> > > behaviour. if has_eas  is true, then sysctl will be registered. After
> > > that any sysctl change is same as Case1.
> > > 
> > 
> > I think this change makes sense. Just one question for case 2,
> > sched_energy_update is not strictly tied with sysctl change, right?
> > sched_energy_update is true in rebuild_sched_domains_energy().
> > rebuild_sched_domains_energy() will not only be invoked by sysctl
> > path via sched_energy_aware_handler(), but also by other path, such
> > as update_scale_freq_invariant(). If the system boots with EAS capability
> > disabled, then it becomes EAS capable due to the frequency invariant
> > readiness(cpufreq policy change?), then
> > cpufreq_notifier(CPUFREQ_CREATE_POLICY) -> init_amu_fie_callback() ->
> > amu_fie_setup() -> opology_set_scale_freq_source() ->
> > update_scale_freq_invariant(true) -> rebuild_sched_domains_energy()
> > Since sched_energy_update is true, the rebuild of perf domain will be skipped(but
> > actually we want to create it) Please correct me if I miss something.
> > 
> 
> I thought 'sched_energy_update' was here to force rebuilding the
> perf_domains instead. If sched_energy_update=1, then it prevents from finding
> a pre-existing perf_domain and skipping the perf_domain rebuild, unless I also
> missed something:
> 
> 
> #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
> 	/* Build perf. domains: */
> 	for (i = 0; i < ndoms_new; i++) {
> 		for (j = 0; j < n && !sched_energy_update; j++) {
> 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
> 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
> 				has_eas = true;
> 				goto match3;
> 			}
> 		}
> 		/* No match - add perf. domains for a new rd */
> 		has_eas |= build_perf_domains(doms_new[i]);
> match3:
> 		;
> 	}
> 	sched_energy_set(has_eas);
> #endif

Yes, it enters build_perf_domains(), in which we have this:

-	if (!sysctl_sched_energy_aware)
+	if (!sysctl_sched_energy_aware && sched_energy_update)
 		goto free;

and domain rebuild will be skipped.

thanks,
Chenyu

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-06 11:35   ` Shrikanth Hegde
@ 2023-09-07 10:34     ` Pierre Gondois
  2023-09-13 11:55       ` Shrikanth Hegde
  0 siblings, 1 reply; 13+ messages in thread
From: Pierre Gondois @ 2023-09-07 10:34 UTC (permalink / raw)
  To: Shrikanth Hegde, Tim Chen
  Cc: dietmar.eggemann, vincent.guittot, peterz, mingo, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo, yu.c.chen

Hello Shrikanth,

On 9/6/23 13:35, Shrikanth Hegde wrote:
> 
> 
> On 9/5/23 7:33 PM, Pierre Gondois wrote:
>> Hello Shrikanth,
>> I tried the patch (on a platform using the cppc_cpufreq driver). The
>> platform
>> normally has EAS enabled, but the patch removed the sched_energy_aware
>> sysctl.
>> It seemed the following happened (in the below order):
>>
>> 1. sched_energy_aware_sysctl_init()
>> Doesn't set sysctl_sched_energy_aware as cpufreq_freq_invariance isn't set
>> and arch_scale_freq_invariant() returns false
>>
>> 2. cpufreq_register_driver()
>> Sets cpufreq_freq_invariance during cpufreq initialization
>> sched_energy_set()
>>
>> 3. sched_energy_set()
>> Is called with has_eas=0 since build_perf_domains() doesn't see the
>> platform
>> as EAS compatible. Indeed sysctl_sched_energy_aware=0.
>> So with sysctl_sched_energy_aware=0 and has_eas=0, sched_energy_aware
>> sysctl
>> is not enabled even though EAS should be possible.
>>
>>
>> On 9/1/23 08:52, Shrikanth Hegde wrote:
>>> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
>>> some of the architectures. IIUC its meant to either force rebuild the
>>> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
>>
>> There is a definition of the sysctl at:
>> Documentation/admin-guide/sysctl/kernel.rst::sched_energy_aware
> [...]
>>>
>>>
>>> +static unsigned int sysctl_sched_energy_aware;
>>> +static struct ctl_table_header *sysctl_eas_header;
>>
>> The variables around the presence/absence of EAS are:
>> - sched_energy_present:
>> EAS is up and running
>>
>> - sysctl_sched_energy_aware:
>> The user wants to use EAS (or not). Doesn't mean EAS can run on the
>> platform.
>>
>> - sched_energy_set/partition_sched_domains_locked's "has_eas":
>> Local variable. Represent whether EAS can run on the platform.
>>
>> IMO it would be simpler to (un)register sched_energy_aware sysctl
>> in partition_sched_domains_locked(), based on the value of "has_eas".
>> This would allow to let all the logic as it is right now, inside
>> build_perf_domains(), and then advertise sched_energy_aware sysctl
>> if EAS can run on the platform.
>> sched_energy_aware_sysctl_init() would be deleted then.
>>
>>
> yes. that is true. and there is no variable which holds the info if the system
> is capable of EAS.
> 
> Retrospecting, the reason for starting this patch series was this,
> sysctl_sched_energy_aware didnt make sense on power10 platform since it
> has SMT and symmetric CPU capacities.  with current code writing 1 to
> it cause rebuild of sched domains but EAS wouldn't be possible.
> 
> 
> Possible Approaches:
> 
> 1.
> Make this sysctl write as NOP if the platform doesn't has EAS capabilities at
> the moment.  Do those checks in sched_energy_aware_handler before handling  the
> change in value. Return EINVAL.  And Update sysctl description that on such
> platforms value change is NOP. Relatively simpler change.
> 
> 2.
> Current patch approach, remove the sysctl completely on non supported
> architectures and re-enable it if the system becomes capable of doing EAS.
> With the current patch, instead of using sched_energy_update, use another
> variable called sched_energy_change_in_sysctl(maybe different name).  I think
> that would handle all the cases. Another variable can be avoided by encoding
> the info in sysctl_sched_energy_aware itself in the handler call, since it
> takes only 1 or 0 as the value. upper bits are free to use. update the sysctl
> as well with this behavior. plus minor cleanup to remove the init of sysctl.
> 

FWIW I think it makes more sense to remove the sysctl if EAS isn't possible on
the platform, as this patch intends to do.
 From a code perspective I m not sure I follow exactly your intend. I can test
your v3 if necessary,

Regards,
Pierre


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-01  9:49 ` Chen Yu
  2023-09-01 13:08   ` Shrikanth Hegde
@ 2023-09-07 10:21   ` Pierre Gondois
  2023-09-07 15:04     ` Chen Yu
  1 sibling, 1 reply; 13+ messages in thread
From: Pierre Gondois @ 2023-09-07 10:21 UTC (permalink / raw)
  To: Chen Yu, Shrikanth Hegde
  Cc: mingo, peterz, vincent.guittot, dietmar.eggemann, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo

Hello Chen,

On 9/1/23 11:49, Chen Yu wrote:
> Hi Shrikanth,
> 
> On 2023-09-01 at 12:22:49 +0530, Shrikanth Hegde wrote:
>> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
>> some of the architectures. IIUC its meant to either force rebuild the
>> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
>>
>> perf domains are not built when there is SMT, or when there is no
>> Asymmetric CPU topologies or when there is no frequency invariance.
>> Since such cases EAS is not set and perf domains are not built. By
>> changing the values of sysctl_sched_energy_aware, its not possible to
>> force build the perf domains. Hence remove this sysctl on such platforms
>> that dont support it. Some of the settings can be changed later
>> such as smt_active by offlining the CPU's, In those cases if
>> build_perf_domains returns true, re-enable the sysctl.
>>
>> Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
>> is set when building the perf domains. Making use of that to find out if
>> the change is happening by sysctl or dynamic system change.
>>
>> Taking different cases:
>> Case1. system while booting has EAS capability, sysctl will be set 1. Hence
>> perf domains will be built if needed. On changing the sysctl to 0, since
>> sched_energy_update is true, perf domains would be freed and sysctl will
>> not be removed. later sysctl is changed to 1, enabling the perf domains
>> rebuild again. Since sysctl is already there, it will skip register.
>>
>> Case2. System while booting doesn't have EAS Capability. Later after system
>> change it becomes capable of EAS. sched_energy_update is false. Though
>> sysctl is 0, will go ahead and try to enable eas. This is the current
>> behaviour. if has_eas  is true, then sysctl will be registered. After
>> that any sysctl change is same as Case1.
>>
> 
> I think this change makes sense. Just one question for case 2,
> sched_energy_update is not strictly tied with sysctl change, right?
> sched_energy_update is true in rebuild_sched_domains_energy().
> rebuild_sched_domains_energy() will not only be invoked by sysctl
> path via sched_energy_aware_handler(), but also by other path, such
> as update_scale_freq_invariant(). If the system boots with EAS capability
> disabled, then it becomes EAS capable due to the frequency invariant
> readiness(cpufreq policy change?), then
> cpufreq_notifier(CPUFREQ_CREATE_POLICY) -> init_amu_fie_callback() ->
> amu_fie_setup() -> opology_set_scale_freq_source() ->
> update_scale_freq_invariant(true) -> rebuild_sched_domains_energy()
> Since sched_energy_update is true, the rebuild of perf domain will be skipped(but
> actually we want to create it) Please correct me if I miss something.
> 

I thought 'sched_energy_update' was here to force rebuilding the
perf_domains instead. If sched_energy_update=1, then it prevents from finding
a pre-existing perf_domain and skipping the perf_domain rebuild, unless I also
missed something:


#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
	/* Build perf. domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !sched_energy_update; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				has_eas = true;
				goto match3;
			}
		}
		/* No match - add perf. domains for a new rd */
		has_eas |= build_perf_domains(doms_new[i]);
match3:
		;
	}
	sched_energy_set(has_eas);
#endif

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-05 14:03 ` Pierre Gondois
  2023-09-05 21:53   ` Tim Chen
  2023-09-06  3:21   ` Shrikanth Hegde
@ 2023-09-06 11:35   ` Shrikanth Hegde
  2023-09-07 10:34     ` Pierre Gondois
  2 siblings, 1 reply; 13+ messages in thread
From: Shrikanth Hegde @ 2023-09-06 11:35 UTC (permalink / raw)
  To: Pierre Gondois, Tim Chen
  Cc: dietmar.eggemann, vincent.guittot, peterz, mingo, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo, yu.c.chen



On 9/5/23 7:33 PM, Pierre Gondois wrote:
> Hello Shrikanth,
> I tried the patch (on a platform using the cppc_cpufreq driver). The
> platform
> normally has EAS enabled, but the patch removed the sched_energy_aware
> sysctl.
> It seemed the following happened (in the below order):
> 
> 1. sched_energy_aware_sysctl_init()
> Doesn't set sysctl_sched_energy_aware as cpufreq_freq_invariance isn't set
> and arch_scale_freq_invariant() returns false
> 
> 2. cpufreq_register_driver()
> Sets cpufreq_freq_invariance during cpufreq initialization
> sched_energy_set()
> 
> 3. sched_energy_set()
> Is called with has_eas=0 since build_perf_domains() doesn't see the
> platform
> as EAS compatible. Indeed sysctl_sched_energy_aware=0.
> So with sysctl_sched_energy_aware=0 and has_eas=0, sched_energy_aware
> sysctl
> is not enabled even though EAS should be possible.
> 
> 
> On 9/1/23 08:52, Shrikanth Hegde wrote:
>> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
>> some of the architectures. IIUC its meant to either force rebuild the
>> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
> 
> There is a definition of the sysctl at:
> Documentation/admin-guide/sysctl/kernel.rst::sched_energy_aware
[...]
>>
>>
>> +static unsigned int sysctl_sched_energy_aware;
>> +static struct ctl_table_header *sysctl_eas_header;
> 
> The variables around the presence/absence of EAS are:
> - sched_energy_present:
> EAS is up and running
> 
> - sysctl_sched_energy_aware:
> The user wants to use EAS (or not). Doesn't mean EAS can run on the
> platform.
> 
> - sched_energy_set/partition_sched_domains_locked's "has_eas":
> Local variable. Represent whether EAS can run on the platform.
> 
> IMO it would be simpler to (un)register sched_energy_aware sysctl
> in partition_sched_domains_locked(), based on the value of "has_eas".
> This would allow to let all the logic as it is right now, inside
> build_perf_domains(), and then advertise sched_energy_aware sysctl
> if EAS can run on the platform.
> sched_energy_aware_sysctl_init() would be deleted then.
> 
> 
yes. that is true. and there is no variable which holds the info if the system
is capable of EAS.

Retrospecting, the reason for starting this patch series was this,
sysctl_sched_energy_aware didnt make sense on power10 platform since it
has SMT and symmetric CPU capacities.  with current code writing 1 to
it cause rebuild of sched domains but EAS wouldn't be possible.


Possible Approaches: 

1. 
Make this sysctl write as NOP if the platform doesn't has EAS capabilities at
the moment.  Do those checks in sched_energy_aware_handler before handling  the
change in value. Return EINVAL.  And Update sysctl description that on such
platforms value change is NOP. Relatively simpler change.

2. 
Current patch approach, remove the sysctl completely on non supported
architectures and re-enable it if the system becomes capable of doing EAS.
With the current patch, instead of using sched_energy_update, use another
variable called sched_energy_change_in_sysctl(maybe different name).  I think
that would handle all the cases. Another variable can be avoided by encoding
the info in sysctl_sched_energy_aware itself in the handler call, since it
takes only 1 or 0 as the value. upper bits are free to use. update the sysctl
as well with this behavior. plus minor cleanup to remove the init of sysctl. 

Suggestions?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-05 14:03 ` Pierre Gondois
  2023-09-05 21:53   ` Tim Chen
@ 2023-09-06  3:21   ` Shrikanth Hegde
  2023-09-06 11:35   ` Shrikanth Hegde
  2 siblings, 0 replies; 13+ messages in thread
From: Shrikanth Hegde @ 2023-09-06  3:21 UTC (permalink / raw)
  To: Pierre Gondois
  Cc: dietmar.eggemann, vincent.guittot, peterz, mingo, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo, yu.c.chen



On 9/5/23 7:33 PM, Pierre Gondois wrote:
> Hello Shrikanth,

Hi Pierre. Thanks for the review. 

> I tried the patch (on a platform using the cppc_cpufreq driver). The
> platform
> normally has EAS enabled, but the patch removed the sched_energy_aware
> sysctl.
> It seemed the following happened (in the below order):
> 
> 1. sched_energy_aware_sysctl_init()
> Doesn't set sysctl_sched_energy_aware as cpufreq_freq_invariance isn't set
> and arch_scale_freq_invariant() returns false
> 
> 2. cpufreq_register_driver()
> Sets cpufreq_freq_invariance during cpufreq initialization
> sched_energy_set()
> 
> 3. sched_energy_set()
> Is called with has_eas=0 since build_perf_domains() doesn't see the
> platform
> as EAS compatible. Indeed sysctl_sched_energy_aware=0.
> So with sysctl_sched_energy_aware=0 and has_eas=0, sched_energy_aware
> sysctl
> is not enabled even though EAS should be possible.
> 
> 

Yes. That is the case mentioned by Chen Yu. this patch missed that case. 

> On 9/1/23 08:52, Shrikanth Hegde wrote:
>> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
>> some of the architectures. IIUC its meant to either force rebuild the
>> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
> 
> There is a definition of the sysctl at:
> Documentation/admin-guide/sysctl/kernel.rst::sched_energy_aware
> 

Sorry my bad, i didnt read it, It is as below.
"Enables/disables Energy Aware Scheduling (EAS). EAS starts                      
automatically on platforms where it can run (that is,                           
platforms with asymmetric CPU topologies and having an Energy                   
Model available). If your platform happens to meet the                          
requirements for EAS but you do not want to use it, change                      
this value to 0.   "


It doesn't define whats the expected behavior on platforms 
which doesn't support EAS.




> Also a personal comment about the commit message (FWIW), I think it should
> be a bit more impersonal and factual. The commit message seems to describe
> the code rather than the desired behaviour.
> 
>>
>> perf domains are not built when there is SMT, or when there is no
>> Asymmetric CPU topologies or when there is no frequency invariance.
>> Since such cases EAS is not set and perf domains are not built. By
>> changing the values of sysctl_sched_energy_aware, its not possible to
>> force build the perf domains. Hence remove this sysctl on such platforms
>> that dont support it. Some of the settings can be changed later
>> such as smt_active by offlining the CPU's, In those cases if
>> build_perf_domains returns true, re-enable the sysctl.
>>
>> Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
>> is set when building the perf domains. Making use of that to find out if
>> the change is happening by sysctl or dynamic system change.
>>
>> Taking different cases:
>> Case1. system while booting has EAS capability, sysctl will be set 1.
>> Hence
>> perf domains will be built if needed. On changing the sysctl to 0, since
>> sched_energy_update is true, perf domains would be freed and sysctl will
>> not be removed. later sysctl is changed to 1, enabling the perf domains
>> rebuild again. Since sysctl is already there, it will skip register.
>>
>> Case2. System while booting doesn't have EAS Capability. Later after
>> system
>> change it becomes capable of EAS. sched_energy_update is false. Though
>> sysctl is 0, will go ahead and try to enable eas. This is the current
>> behaviour. if has_eas  is true, then sysctl will be registered. After
>> that any sysctl change is same as Case1.
>>
>> Case3. System becomes not capable of EAS due to system change. Here since
>> sched_energy_update is false, build_perf_domains return has_eas as false
>> due to one of the checks and Since this is dynamic change remove the
>> sysctl.
>> Any further change which enables EAS is Case2
>>
>> Note: This hasn't been tested on platform which supports EAS. If the
>> change can be verified on that it would really help. This has been
>> tested on power10 which doesn't support EAS. sysctl_sched_energy_aware
>> is removed with patch.
>>
>> changes since v1:
>> Chen Yu had pointed out that this will not destroy the perf domains on
>> architectures where EAS is supported by changing the sysctl. This patch
>> addresses that.
>> [v1] Link:
>> https://lore.kernel.org/lkml/20230829065040.920629-1-sshegde@linux.vnet.ibm.com/#t
>>
>> Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
>> ---
>>   kernel/sched/topology.c | 45 +++++++++++++++++++++++++++++++++--------
>>   1 file changed, 37 insertions(+), 8 deletions(-)
>>
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index 05a5bc678c08..4d16269ac21a 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -208,7 +208,8 @@ sd_parent_degenerate(struct sched_domain *sd,
>> struct sched_domain *parent)
>>
>>   #if defined(CONFIG_ENERGY_MODEL) &&
>> defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
>>   DEFINE_STATIC_KEY_FALSE(sched_energy_present);
>> -static unsigned int sysctl_sched_energy_aware = 1;
>> +static unsigned int sysctl_sched_energy_aware;
>> +static struct ctl_table_header *sysctl_eas_header;
> 
> The variables around the presence/absence of EAS are:
> - sched_energy_present:
> EAS is up and running
> 
> - sysctl_sched_energy_aware:
> The user wants to use EAS (or not). Doesn't mean EAS can run on the
> platform.
> 
> - sched_energy_set/partition_sched_domains_locked's "has_eas":
> Local variable. Represent whether EAS can run on the platform.
> 
> IMO it would be simpler to (un)register sched_energy_aware sysctl
> in partition_sched_domains_locked(), based on the value of "has_eas".
> This would allow to let all the logic as it is right now, inside
> build_perf_domains(), and then advertise sched_energy_aware sysctl
> if EAS can run on the platform.
> sched_energy_aware_sysctl_init() would be deleted then.
> 
> 

problem with that is for systems that have EAS capability. 
i.e system has EAS, but admin wanted to to disable the EAS. hence 
has_eas will return false. But sysctl cannot be removed in this case. 

>>   static DEFINE_MUTEX(sched_energy_mutex);
>>   static bool sched_energy_update;
>>
>> @@ -226,6 +227,7 @@ static int sched_energy_aware_handler(struct
>> ctl_table *table, int write,
>>           void *buffer, size_t *lenp, loff_t *ppos)
>>   {
>>       int ret, state;
>> +    int prev_val = sysctl_sched_energy_aware;
>>
>>       if (write && !capable(CAP_SYS_ADMIN))
>>           return -EPERM;
>> @@ -233,8 +235,11 @@ static int sched_energy_aware_handler(struct
>> ctl_table *table, int write,
>>       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
>>       if (!ret && write) {
>>           state = static_branch_unlikely(&sched_energy_present);
>> -        if (state != sysctl_sched_energy_aware)
>> +        if (state != sysctl_sched_energy_aware && prev_val !=
>> sysctl_sched_energy_aware) {
>> +            if (sysctl_sched_energy_aware && !state)
>> +                pr_warn("Attempt to build energy domains when EAS is
>> disabled\n");
>>               rebuild_sched_domains_energy();
>> +        }
>>       }
>>
>>       return ret;
>> @@ -255,7 +260,14 @@ static struct ctl_table
>> sched_energy_aware_sysctls[] = {
>>
>>   static int __init sched_energy_aware_sysctl_init(void)
>>   {
>> -    register_sysctl_init("kernel", sched_energy_aware_sysctls);
>> +    int cpu = cpumask_first(cpu_active_mask);
>> +
>> +    if (sched_smt_active() || !per_cpu(sd_asym_cpucapacity, cpu) ||
>> +        !arch_scale_freq_invariant())
>> +        return 0;
>> +
>> +    sysctl_eas_header = register_sysctl("kernel",
>> sched_energy_aware_sysctls);
>> +    sysctl_sched_energy_aware = 1;
>>       return 0;
>>   }
>>
>> @@ -336,10 +348,28 @@ static void sched_energy_set(bool has_eas)
>>           if (sched_debug())
>>               pr_info("%s: stopping EAS\n", __func__);
>>           static_branch_disable_cpuslocked(&sched_energy_present);
>> +#ifdef CONFIG_PROC_SYSCTL
>> +        /*
>> +         * if the architecture supports EAS and forcefully
>> +         * perf domains are destroyed, there should be a sysctl
>> +         * to enable it later. If this was due to dynamic system
>> +         * change such as SMT<->NON_SMT then remove sysctl.
>> +         */
>> +        if (sysctl_eas_header && !sched_energy_update) {
>> +            unregister_sysctl_table(sysctl_eas_header);
>> +            sysctl_eas_header = NULL;
>> +        }
>> +#endif
>> +        sysctl_sched_energy_aware = 0;
>>       } else if (has_eas &&
>> !static_branch_unlikely(&sched_energy_present)) {
>>           if (sched_debug())
>>               pr_info("%s: starting EAS\n", __func__);
>>           static_branch_enable_cpuslocked(&sched_energy_present);
>> +#ifdef CONFIG_PROC_SYSCTL
>> +        if (!sysctl_eas_header)
>> +            sysctl_eas_header = register_sysctl("kernel",
>> sched_energy_aware_sysctls);
>> +#endif
>> +        sysctl_sched_energy_aware = 1;
>>       }
>>   }
>>
>> @@ -380,15 +410,14 @@ static bool build_perf_domains(const struct
>> cpumask *cpu_map)
>>       struct cpufreq_policy *policy;
>>       struct cpufreq_governor *gov;
>>
>> -    if (!sysctl_sched_energy_aware)
>> +    if (!sysctl_sched_energy_aware && sched_energy_update)
>>           goto free;
>>
>>       /* EAS is enabled for asymmetric CPU capacity topologies. */
>>       if (!per_cpu(sd_asym_cpucapacity, cpu)) {
>> -        if (sched_debug()) {
>> -            pr_info("rd %*pbl: CPUs do not have asymmetric
>> capacities\n",
>> -                    cpumask_pr_args(cpu_map));
>> -        }
>> +        if (sched_debug())
>> +            pr_info("rd %*pbl: Disabling EAS,  CPUs do not have
>> asymmetric capacities\n",
>> +                cpumask_pr_args(cpu_map));
>>           goto free;
>>       }
>>
>> -- 
>> 2.31.1
>>
>>
> 
> Regards,
> Pierre

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-05 14:03 ` Pierre Gondois
@ 2023-09-05 21:53   ` Tim Chen
  2023-09-06  3:21   ` Shrikanth Hegde
  2023-09-06 11:35   ` Shrikanth Hegde
  2 siblings, 0 replies; 13+ messages in thread
From: Tim Chen @ 2023-09-05 21:53 UTC (permalink / raw)
  To: Pierre Gondois, Shrikanth Hegde
  Cc: dietmar.eggemann, vincent.guittot, peterz, mingo, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo, yu.c.chen

On Tue, 2023-09-05 at 16:03 +0200, Pierre Gondois wrote:
> Hello Shrikanth,
> I tried the patch (on a platform using the cppc_cpufreq driver). The platform
> normally has EAS enabled, but the patch removed the sched_energy_aware sysctl.
> It seemed the following happened (in the below order):
> 
> 1. sched_energy_aware_sysctl_init()
> Doesn't set sysctl_sched_energy_aware as cpufreq_freq_invariance isn't set
> and arch_scale_freq_invariant() returns false
> 
> 2. cpufreq_register_driver()
> Sets cpufreq_freq_invariance during cpufreq initialization sched_energy_set()
> 
> 3. sched_energy_set()
> Is called with has_eas=0 since build_perf_domains() doesn't see the platform
> as EAS compatible. Indeed sysctl_sched_energy_aware=0.
> So with sysctl_sched_energy_aware=0 and has_eas=0, sched_energy_aware sysctl
> is not enabled even though EAS should be possible.
> 
> 
> On 9/1/23 08:52, Shrikanth Hegde wrote:
> > Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
> > some of the architectures. IIUC its meant to either force rebuild the
> > perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
> 
> There is a definition of the sysctl at:
> Documentation/admin-guide/sysctl/kernel.rst::sched_energy_aware
> 
> Also a personal comment about the commit message (FWIW), I think it should
> be a bit more impersonal and factual. The commit message seems to describe
> the code rather than the desired behaviour.

I also wonder if Shrikanth's description of the operations can be simplified.

In my mind, There are 3 variables describing the system:

1. sched_energy_capable : whether system is EAS capable
2. sched_energy_aware   : whether the admin wants to enables EAS
3. sched_energy_status  : sched_energy_capable && sched_energy_aware

Whenever there is a change in sched_energy_status, then we should trigger a rebuild
of the sched domain.  We should expose sched_energy_capable
to user rather than removing sched_energy_aware when sched_energy_capable == 0.

If the user know the value of sched_energy_capable, the user will know
if setting sched_energy_aware will change the system's sched_energy_status.

For system that can never support EAS,
we should simply make sched_energy_aware to be 0 and disallow it from getting written.

On systems that allow sched_energy_capable to be enabled (e.g. by brining smt on/offline),
we should allow setting sched_energy_aware even when sched_energy_capable is 0.
Once sched_energy_capable becomes 1, EAS is enabled.


Tim 
  
> 
> > 
> > perf domains are not built when there is SMT, or when there is no
> > Asymmetric CPU topologies or when there is no frequency invariance.
> > Since such cases EAS is not set and perf domains are not built. By
> > changing the values of sysctl_sched_energy_aware, its not possible to
> > force build the perf domains. Hence remove this sysctl on such platforms
> > that dont support it. Some of the settings can be changed later
> > such as smt_active by offlining the CPU's, In those cases if
> > build_perf_domains returns true, re-enable the sysctl.
> > 
> > Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
> > is set when building the perf domains. Making use of that to find out if
> > the change is happening by sysctl or dynamic system change.
> > 
> > Taking different cases:
> > Case1. system while booting has EAS capability, sysctl will be set 1. Hence
> > perf domains will be built if needed. On changing the sysctl to 0, since
> > sched_energy_update is true, perf domains would be freed and sysctl will
> > not be removed. later sysctl is changed to 1, enabling the perf domains
> > rebuild again. Since sysctl is already there, it will skip register.
> > 
> > Case2. System while booting doesn't have EAS Capability. Later after system
> > change it becomes capable of EAS. sched_energy_update is false. Though
> > sysctl is 0, will go ahead and try to enable eas. This is the current
> > behaviour. if has_eas  is true, then sysctl will be registered. After
> > that any sysctl change is same as Case1.
> > 
> > Case3. System becomes not capable of EAS due to system change. Here since
> > sched_energy_update is false, build_perf_domains return has_eas as false
> > due to one of the checks and Since this is dynamic change remove the sysctl.
> > Any further change which enables EAS is Case2
> > 
> > Note: This hasn't been tested on platform which supports EAS. If the
> > change can be verified on that it would really help. This has been
> > tested on power10 which doesn't support EAS. sysctl_sched_energy_aware
> > is removed with patch.
> > 
> > changes since v1:
> > Chen Yu had pointed out that this will not destroy the perf domains on
> > architectures where EAS is supported by changing the sysctl. This patch
> > addresses that.
> > [v1] Link: https://lore.kernel.org/lkml/20230829065040.920629-1-sshegde@linux.vnet.ibm.com/#t
> > 
> > Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
> > ---
> >   kernel/sched/topology.c | 45 +++++++++++++++++++++++++++++++++--------
> >   1 file changed, 37 insertions(+), 8 deletions(-)
> > 
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 05a5bc678c08..4d16269ac21a 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -208,7 +208,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
> > 
> >   #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
> >   DEFINE_STATIC_KEY_FALSE(sched_energy_present);
> > -static unsigned int sysctl_sched_energy_aware = 1;
> > +static unsigned int sysctl_sched_energy_aware;
> > +static struct ctl_table_header *sysctl_eas_header;
> 
> The variables around the presence/absence of EAS are:
> - sched_energy_present:
> EAS is up and running
> 
> - sysctl_sched_energy_aware:
> The user wants to use EAS (or not). Doesn't mean EAS can run on the
> platform.
> 
> - sched_energy_set/partition_sched_domains_locked's "has_eas":
> Local variable. Represent whether EAS can run on the platform.
> 
> IMO it would be simpler to (un)register sched_energy_aware sysctl
> in partition_sched_domains_locked(), based on the value of "has_eas".
> This would allow to let all the logic as it is right now, inside
> build_perf_domains(), and then advertise sched_energy_aware sysctl
> if EAS can run on the platform.
> sched_energy_aware_sysctl_init() would be deleted then.
> 
> 
> >   static DEFINE_MUTEX(sched_energy_mutex);
> >   static bool sched_energy_update;
> > 
> > @@ -226,6 +227,7 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
> >   		void *buffer, size_t *lenp, loff_t *ppos)
> >   {
> >   	int ret, state;
> > +	int prev_val = sysctl_sched_energy_aware;
> > 
> >   	if (write && !capable(CAP_SYS_ADMIN))
> >   		return -EPERM;
> > @@ -233,8 +235,11 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
> >   	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> >   	if (!ret && write) {
> >   		state = static_branch_unlikely(&sched_energy_present);
> > -		if (state != sysctl_sched_energy_aware)
> > +		if (state != sysctl_sched_energy_aware && prev_val != sysctl_sched_energy_aware) {
> > +			if (sysctl_sched_energy_aware && !state)
> > +				pr_warn("Attempt to build energy domains when EAS is disabled\n");
> >   			rebuild_sched_domains_energy();
> > +		}
> >   	}
> > 
> >   	return ret;
> > @@ -255,7 +260,14 @@ static struct ctl_table sched_energy_aware_sysctls[] = {
> > 
> >   static int __init sched_energy_aware_sysctl_init(void)
> >   {
> > -	register_sysctl_init("kernel", sched_energy_aware_sysctls);
> > +	int cpu = cpumask_first(cpu_active_mask);
> > +
> > +	if (sched_smt_active() || !per_cpu(sd_asym_cpucapacity, cpu) ||
> > +	    !arch_scale_freq_invariant())
> > +		return 0;
> > +
> > +	sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
> > +	sysctl_sched_energy_aware = 1;
> >   	return 0;
> >   }
> > 
> > @@ -336,10 +348,28 @@ static void sched_energy_set(bool has_eas)
> >   		if (sched_debug())
> >   			pr_info("%s: stopping EAS\n", __func__);
> >   		static_branch_disable_cpuslocked(&sched_energy_present);
> > +#ifdef CONFIG_PROC_SYSCTL
> > +		/*
> > +		 * if the architecture supports EAS and forcefully
> > +		 * perf domains are destroyed, there should be a sysctl
> > +		 * to enable it later. If this was due to dynamic system
> > +		 * change such as SMT<->NON_SMT then remove sysctl.
> > +		 */
> > +		if (sysctl_eas_header && !sched_energy_update) {
> > +			unregister_sysctl_table(sysctl_eas_header);
> > +			sysctl_eas_header = NULL;
> > +		}
> > +#endif
> > +		sysctl_sched_energy_aware = 0;
> >   	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
> >   		if (sched_debug())
> >   			pr_info("%s: starting EAS\n", __func__);
> >   		static_branch_enable_cpuslocked(&sched_energy_present);
> > +#ifdef CONFIG_PROC_SYSCTL
> > +		if (!sysctl_eas_header)
> > +			sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
> > +#endif
> > +		sysctl_sched_energy_aware = 1;
> >   	}
> >   }
> > 
> > @@ -380,15 +410,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
> >   	struct cpufreq_policy *policy;
> >   	struct cpufreq_governor *gov;
> > 
> > -	if (!sysctl_sched_energy_aware)
> > +	if (!sysctl_sched_energy_aware && sched_energy_update)
> >   		goto free;
> > 
> >   	/* EAS is enabled for asymmetric CPU capacity topologies. */
> >   	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
> > -		if (sched_debug()) {
> > -			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
> > -					cpumask_pr_args(cpu_map));
> > -		}
> > +		if (sched_debug())
> > +			pr_info("rd %*pbl: Disabling EAS,  CPUs do not have asymmetric capacities\n",
> > +				cpumask_pr_args(cpu_map));
> >   		goto free;
> >   	}
> > 
> > --
> > 2.31.1
> > 
> > 
> 
> Regards,
> Pierre


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-01  6:52 Shrikanth Hegde
  2023-09-01  9:49 ` Chen Yu
@ 2023-09-05 14:03 ` Pierre Gondois
  2023-09-05 21:53   ` Tim Chen
                     ` (2 more replies)
  1 sibling, 3 replies; 13+ messages in thread
From: Pierre Gondois @ 2023-09-05 14:03 UTC (permalink / raw)
  To: Shrikanth Hegde
  Cc: dietmar.eggemann, vincent.guittot, peterz, mingo, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo, yu.c.chen

Hello Shrikanth,
I tried the patch (on a platform using the cppc_cpufreq driver). The platform
normally has EAS enabled, but the patch removed the sched_energy_aware sysctl.
It seemed the following happened (in the below order):

1. sched_energy_aware_sysctl_init()
Doesn't set sysctl_sched_energy_aware as cpufreq_freq_invariance isn't set
and arch_scale_freq_invariant() returns false

2. cpufreq_register_driver()
Sets cpufreq_freq_invariance during cpufreq initialization sched_energy_set()

3. sched_energy_set()
Is called with has_eas=0 since build_perf_domains() doesn't see the platform
as EAS compatible. Indeed sysctl_sched_energy_aware=0.
So with sysctl_sched_energy_aware=0 and has_eas=0, sched_energy_aware sysctl
is not enabled even though EAS should be possible.


On 9/1/23 08:52, Shrikanth Hegde wrote:
> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
> some of the architectures. IIUC its meant to either force rebuild the
> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.

There is a definition of the sysctl at:
Documentation/admin-guide/sysctl/kernel.rst::sched_energy_aware

Also a personal comment about the commit message (FWIW), I think it should
be a bit more impersonal and factual. The commit message seems to describe
the code rather than the desired behaviour.

> 
> perf domains are not built when there is SMT, or when there is no
> Asymmetric CPU topologies or when there is no frequency invariance.
> Since such cases EAS is not set and perf domains are not built. By
> changing the values of sysctl_sched_energy_aware, its not possible to
> force build the perf domains. Hence remove this sysctl on such platforms
> that dont support it. Some of the settings can be changed later
> such as smt_active by offlining the CPU's, In those cases if
> build_perf_domains returns true, re-enable the sysctl.
> 
> Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
> is set when building the perf domains. Making use of that to find out if
> the change is happening by sysctl or dynamic system change.
> 
> Taking different cases:
> Case1. system while booting has EAS capability, sysctl will be set 1. Hence
> perf domains will be built if needed. On changing the sysctl to 0, since
> sched_energy_update is true, perf domains would be freed and sysctl will
> not be removed. later sysctl is changed to 1, enabling the perf domains
> rebuild again. Since sysctl is already there, it will skip register.
> 
> Case2. System while booting doesn't have EAS Capability. Later after system
> change it becomes capable of EAS. sched_energy_update is false. Though
> sysctl is 0, will go ahead and try to enable eas. This is the current
> behaviour. if has_eas  is true, then sysctl will be registered. After
> that any sysctl change is same as Case1.
> 
> Case3. System becomes not capable of EAS due to system change. Here since
> sched_energy_update is false, build_perf_domains return has_eas as false
> due to one of the checks and Since this is dynamic change remove the sysctl.
> Any further change which enables EAS is Case2
> 
> Note: This hasn't been tested on platform which supports EAS. If the
> change can be verified on that it would really help. This has been
> tested on power10 which doesn't support EAS. sysctl_sched_energy_aware
> is removed with patch.
> 
> changes since v1:
> Chen Yu had pointed out that this will not destroy the perf domains on
> architectures where EAS is supported by changing the sysctl. This patch
> addresses that.
> [v1] Link: https://lore.kernel.org/lkml/20230829065040.920629-1-sshegde@linux.vnet.ibm.com/#t
> 
> Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
> ---
>   kernel/sched/topology.c | 45 +++++++++++++++++++++++++++++++++--------
>   1 file changed, 37 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 05a5bc678c08..4d16269ac21a 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -208,7 +208,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
> 
>   #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
>   DEFINE_STATIC_KEY_FALSE(sched_energy_present);
> -static unsigned int sysctl_sched_energy_aware = 1;
> +static unsigned int sysctl_sched_energy_aware;
> +static struct ctl_table_header *sysctl_eas_header;

The variables around the presence/absence of EAS are:
- sched_energy_present:
EAS is up and running

- sysctl_sched_energy_aware:
The user wants to use EAS (or not). Doesn't mean EAS can run on the
platform.

- sched_energy_set/partition_sched_domains_locked's "has_eas":
Local variable. Represent whether EAS can run on the platform.

IMO it would be simpler to (un)register sched_energy_aware sysctl
in partition_sched_domains_locked(), based on the value of "has_eas".
This would allow to let all the logic as it is right now, inside
build_perf_domains(), and then advertise sched_energy_aware sysctl
if EAS can run on the platform.
sched_energy_aware_sysctl_init() would be deleted then.


>   static DEFINE_MUTEX(sched_energy_mutex);
>   static bool sched_energy_update;
> 
> @@ -226,6 +227,7 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
>   		void *buffer, size_t *lenp, loff_t *ppos)
>   {
>   	int ret, state;
> +	int prev_val = sysctl_sched_energy_aware;
> 
>   	if (write && !capable(CAP_SYS_ADMIN))
>   		return -EPERM;
> @@ -233,8 +235,11 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
>   	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
>   	if (!ret && write) {
>   		state = static_branch_unlikely(&sched_energy_present);
> -		if (state != sysctl_sched_energy_aware)
> +		if (state != sysctl_sched_energy_aware && prev_val != sysctl_sched_energy_aware) {
> +			if (sysctl_sched_energy_aware && !state)
> +				pr_warn("Attempt to build energy domains when EAS is disabled\n");
>   			rebuild_sched_domains_energy();
> +		}
>   	}
> 
>   	return ret;
> @@ -255,7 +260,14 @@ static struct ctl_table sched_energy_aware_sysctls[] = {
> 
>   static int __init sched_energy_aware_sysctl_init(void)
>   {
> -	register_sysctl_init("kernel", sched_energy_aware_sysctls);
> +	int cpu = cpumask_first(cpu_active_mask);
> +
> +	if (sched_smt_active() || !per_cpu(sd_asym_cpucapacity, cpu) ||
> +	    !arch_scale_freq_invariant())
> +		return 0;
> +
> +	sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
> +	sysctl_sched_energy_aware = 1;
>   	return 0;
>   }
> 
> @@ -336,10 +348,28 @@ static void sched_energy_set(bool has_eas)
>   		if (sched_debug())
>   			pr_info("%s: stopping EAS\n", __func__);
>   		static_branch_disable_cpuslocked(&sched_energy_present);
> +#ifdef CONFIG_PROC_SYSCTL
> +		/*
> +		 * if the architecture supports EAS and forcefully
> +		 * perf domains are destroyed, there should be a sysctl
> +		 * to enable it later. If this was due to dynamic system
> +		 * change such as SMT<->NON_SMT then remove sysctl.
> +		 */
> +		if (sysctl_eas_header && !sched_energy_update) {
> +			unregister_sysctl_table(sysctl_eas_header);
> +			sysctl_eas_header = NULL;
> +		}
> +#endif
> +		sysctl_sched_energy_aware = 0;
>   	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
>   		if (sched_debug())
>   			pr_info("%s: starting EAS\n", __func__);
>   		static_branch_enable_cpuslocked(&sched_energy_present);
> +#ifdef CONFIG_PROC_SYSCTL
> +		if (!sysctl_eas_header)
> +			sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
> +#endif
> +		sysctl_sched_energy_aware = 1;
>   	}
>   }
> 
> @@ -380,15 +410,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
>   	struct cpufreq_policy *policy;
>   	struct cpufreq_governor *gov;
> 
> -	if (!sysctl_sched_energy_aware)
> +	if (!sysctl_sched_energy_aware && sched_energy_update)
>   		goto free;
> 
>   	/* EAS is enabled for asymmetric CPU capacity topologies. */
>   	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
> -		if (sched_debug()) {
> -			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
> -					cpumask_pr_args(cpu_map));
> -		}
> +		if (sched_debug())
> +			pr_info("rd %*pbl: Disabling EAS,  CPUs do not have asymmetric capacities\n",
> +				cpumask_pr_args(cpu_map));
>   		goto free;
>   	}
> 
> --
> 2.31.1
> 
> 

Regards,
Pierre

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-01  9:49 ` Chen Yu
@ 2023-09-01 13:08   ` Shrikanth Hegde
  2023-09-07 10:21   ` Pierre Gondois
  1 sibling, 0 replies; 13+ messages in thread
From: Shrikanth Hegde @ 2023-09-01 13:08 UTC (permalink / raw)
  To: Chen Yu
  Cc: mingo, peterz, vincent.guittot, dietmar.eggemann, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo



On 9/1/23 3:19 PM, Chen Yu wrote:
> Hi Shrikanth,
>

Hi Chen, Thanks for the review.
 
> On 2023-09-01 at 12:22:49 +0530, Shrikanth Hegde wrote:
>> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
>> some of the architectures. IIUC its meant to either force rebuild the
>> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
>>
>> perf domains are not built when there is SMT, or when there is no
>> Asymmetric CPU topologies or when there is no frequency invariance.
>> Since such cases EAS is not set and perf domains are not built. By
>> changing the values of sysctl_sched_energy_aware, its not possible to
>> force build the perf domains. Hence remove this sysctl on such platforms
>> that dont support it. Some of the settings can be changed later
>> such as smt_active by offlining the CPU's, In those cases if
>> build_perf_domains returns true, re-enable the sysctl.
>>
>> Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
>> is set when building the perf domains. Making use of that to find out if
>> the change is happening by sysctl or dynamic system change.
>>
>> Taking different cases:
>> Case1. system while booting has EAS capability, sysctl will be set 1. Hence
>> perf domains will be built if needed. On changing the sysctl to 0, since
>> sched_energy_update is true, perf domains would be freed and sysctl will
>> not be removed. later sysctl is changed to 1, enabling the perf domains
>> rebuild again. Since sysctl is already there, it will skip register.
>>
>> Case2. System while booting doesn't have EAS Capability. Later after system
>> change it becomes capable of EAS. sched_energy_update is false. Though
>> sysctl is 0, will go ahead and try to enable eas. This is the current
>> behaviour. if has_eas  is true, then sysctl will be registered. After
>> that any sysctl change is same as Case1.
>>
> 
> I think this change makes sense. Just one question for case 2,
> sched_energy_update is not strictly tied with sysctl change, right?
> sched_energy_update is true in rebuild_sched_domains_energy().
> rebuild_sched_domains_energy() will not only be invoked by sysctl
> path via sched_energy_aware_handler(), but also by other path, such
> as update_scale_freq_invariant(). If the system boots with EAS capability
> disabled, then it becomes EAS capable due to the frequency invariant
> readiness(cpufreq policy change?), then
> cpufreq_notifier(CPUFREQ_CREATE_POLICY) -> init_amu_fie_callback() ->
> amu_fie_setup() -> opology_set_scale_freq_source() -> 
> update_scale_freq_invariant(true) -> rebuild_sched_domains_energy()
> Since sched_energy_update is true, the rebuild of perf domain will be skipped(but
> actually we want to create it) Please correct me if I miss something.
>

Ah, More cases!

You are right. let me see what can be done.  
maybe specific variable be used for sysctl change? it already quite a few variables there. 
Will think if this can be simplified somehow. 





^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
  2023-09-01  6:52 Shrikanth Hegde
@ 2023-09-01  9:49 ` Chen Yu
  2023-09-01 13:08   ` Shrikanth Hegde
  2023-09-07 10:21   ` Pierre Gondois
  2023-09-05 14:03 ` Pierre Gondois
  1 sibling, 2 replies; 13+ messages in thread
From: Chen Yu @ 2023-09-01  9:49 UTC (permalink / raw)
  To: Shrikanth Hegde
  Cc: mingo, peterz, vincent.guittot, dietmar.eggemann, vschneid,
	linux-kernel, ionela.voinescu, quentin.perret, srikar, mgorman,
	mingo

Hi Shrikanth,

On 2023-09-01 at 12:22:49 +0530, Shrikanth Hegde wrote:
> Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
> some of the architectures. IIUC its meant to either force rebuild the
> perf domains or cleanup the perf domains by echoing 1 or 0 respectively.
> 
> perf domains are not built when there is SMT, or when there is no
> Asymmetric CPU topologies or when there is no frequency invariance.
> Since such cases EAS is not set and perf domains are not built. By
> changing the values of sysctl_sched_energy_aware, its not possible to
> force build the perf domains. Hence remove this sysctl on such platforms
> that dont support it. Some of the settings can be changed later
> such as smt_active by offlining the CPU's, In those cases if
> build_perf_domains returns true, re-enable the sysctl.
> 
> Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
> is set when building the perf domains. Making use of that to find out if
> the change is happening by sysctl or dynamic system change.
> 
> Taking different cases:
> Case1. system while booting has EAS capability, sysctl will be set 1. Hence
> perf domains will be built if needed. On changing the sysctl to 0, since
> sched_energy_update is true, perf domains would be freed and sysctl will
> not be removed. later sysctl is changed to 1, enabling the perf domains
> rebuild again. Since sysctl is already there, it will skip register.
> 
> Case2. System while booting doesn't have EAS Capability. Later after system
> change it becomes capable of EAS. sched_energy_update is false. Though
> sysctl is 0, will go ahead and try to enable eas. This is the current
> behaviour. if has_eas  is true, then sysctl will be registered. After
> that any sysctl change is same as Case1.
>

I think this change makes sense. Just one question for case 2,
sched_energy_update is not strictly tied with sysctl change, right?
sched_energy_update is true in rebuild_sched_domains_energy().
rebuild_sched_domains_energy() will not only be invoked by sysctl
path via sched_energy_aware_handler(), but also by other path, such
as update_scale_freq_invariant(). If the system boots with EAS capability
disabled, then it becomes EAS capable due to the frequency invariant
readiness(cpufreq policy change?), then
cpufreq_notifier(CPUFREQ_CREATE_POLICY) -> init_amu_fie_callback() ->
amu_fie_setup() -> opology_set_scale_freq_source() -> 
update_scale_freq_invariant(true) -> rebuild_sched_domains_energy()
Since sched_energy_update is true, the rebuild of perf domain will be skipped(but
actually we want to create it) Please correct me if I miss something.

thanks,
Chenyu

thanks,
Chenyu

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture
@ 2023-09-01  6:52 Shrikanth Hegde
  2023-09-01  9:49 ` Chen Yu
  2023-09-05 14:03 ` Pierre Gondois
  0 siblings, 2 replies; 13+ messages in thread
From: Shrikanth Hegde @ 2023-09-01  6:52 UTC (permalink / raw)
  To: mingo, peterz, vincent.guittot
  Cc: sshegde, dietmar.eggemann, vschneid, linux-kernel,
	ionela.voinescu, quentin.perret, srikar, mgorman, mingo,
	yu.c.chen

Currently sysctl_sched_energy_aware doesn't alter the said behaviour on
some of the architectures. IIUC its meant to either force rebuild the
perf domains or cleanup the perf domains by echoing 1 or 0 respectively.

perf domains are not built when there is SMT, or when there is no
Asymmetric CPU topologies or when there is no frequency invariance.
Since such cases EAS is not set and perf domains are not built. By
changing the values of sysctl_sched_energy_aware, its not possible to
force build the perf domains. Hence remove this sysctl on such platforms
that dont support it. Some of the settings can be changed later
such as smt_active by offlining the CPU's, In those cases if
build_perf_domains returns true, re-enable the sysctl.

Anytime, when sysctl_sched_energy_aware is changed sched_energy_update
is set when building the perf domains. Making use of that to find out if
the change is happening by sysctl or dynamic system change.

Taking different cases:
Case1. system while booting has EAS capability, sysctl will be set 1. Hence
perf domains will be built if needed. On changing the sysctl to 0, since
sched_energy_update is true, perf domains would be freed and sysctl will
not be removed. later sysctl is changed to 1, enabling the perf domains
rebuild again. Since sysctl is already there, it will skip register.

Case2. System while booting doesn't have EAS Capability. Later after system
change it becomes capable of EAS. sched_energy_update is false. Though
sysctl is 0, will go ahead and try to enable eas. This is the current
behaviour. if has_eas  is true, then sysctl will be registered. After
that any sysctl change is same as Case1.

Case3. System becomes not capable of EAS due to system change. Here since
sched_energy_update is false, build_perf_domains return has_eas as false
due to one of the checks and Since this is dynamic change remove the sysctl.
Any further change which enables EAS is Case2

Note: This hasn't been tested on platform which supports EAS. If the
change can be verified on that it would really help. This has been
tested on power10 which doesn't support EAS. sysctl_sched_energy_aware
is removed with patch.

changes since v1:
Chen Yu had pointed out that this will not destroy the perf domains on
architectures where EAS is supported by changing the sysctl. This patch
addresses that.
[v1] Link: https://lore.kernel.org/lkml/20230829065040.920629-1-sshegde@linux.vnet.ibm.com/#t

Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
---
 kernel/sched/topology.c | 45 +++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a5bc678c08..4d16269ac21a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -208,7 +208,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-static unsigned int sysctl_sched_energy_aware = 1;
+static unsigned int sysctl_sched_energy_aware;
+static struct ctl_table_header *sysctl_eas_header;
 static DEFINE_MUTEX(sched_energy_mutex);
 static bool sched_energy_update;

@@ -226,6 +227,7 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;
+	int prev_val = sysctl_sched_energy_aware;

 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -233,8 +235,11 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		state = static_branch_unlikely(&sched_energy_present);
-		if (state != sysctl_sched_energy_aware)
+		if (state != sysctl_sched_energy_aware && prev_val != sysctl_sched_energy_aware) {
+			if (sysctl_sched_energy_aware && !state)
+				pr_warn("Attempt to build energy domains when EAS is disabled\n");
 			rebuild_sched_domains_energy();
+		}
 	}

 	return ret;
@@ -255,7 +260,14 @@ static struct ctl_table sched_energy_aware_sysctls[] = {

 static int __init sched_energy_aware_sysctl_init(void)
 {
-	register_sysctl_init("kernel", sched_energy_aware_sysctls);
+	int cpu = cpumask_first(cpu_active_mask);
+
+	if (sched_smt_active() || !per_cpu(sd_asym_cpucapacity, cpu) ||
+	    !arch_scale_freq_invariant())
+		return 0;
+
+	sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
+	sysctl_sched_energy_aware = 1;
 	return 0;
 }

@@ -336,10 +348,28 @@ static void sched_energy_set(bool has_eas)
 		if (sched_debug())
 			pr_info("%s: stopping EAS\n", __func__);
 		static_branch_disable_cpuslocked(&sched_energy_present);
+#ifdef CONFIG_PROC_SYSCTL
+		/*
+		 * if the architecture supports EAS and forcefully
+		 * perf domains are destroyed, there should be a sysctl
+		 * to enable it later. If this was due to dynamic system
+		 * change such as SMT<->NON_SMT then remove sysctl.
+		 */
+		if (sysctl_eas_header && !sched_energy_update) {
+			unregister_sysctl_table(sysctl_eas_header);
+			sysctl_eas_header = NULL;
+		}
+#endif
+		sysctl_sched_energy_aware = 0;
 	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
 		if (sched_debug())
 			pr_info("%s: starting EAS\n", __func__);
 		static_branch_enable_cpuslocked(&sched_energy_present);
+#ifdef CONFIG_PROC_SYSCTL
+		if (!sysctl_eas_header)
+			sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
+#endif
+		sysctl_sched_energy_aware = 1;
 	}
 }

@@ -380,15 +410,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
 	struct cpufreq_policy *policy;
 	struct cpufreq_governor *gov;

-	if (!sysctl_sched_energy_aware)
+	if (!sysctl_sched_energy_aware && sched_energy_update)
 		goto free;

 	/* EAS is enabled for asymmetric CPU capacity topologies. */
 	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
-		if (sched_debug()) {
-			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
-					cpumask_pr_args(cpu_map));
-		}
+		if (sched_debug())
+			pr_info("rd %*pbl: Disabling EAS,  CPUs do not have asymmetric capacities\n",
+				cpumask_pr_args(cpu_map));
 		goto free;
 	}

--
2.31.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2023-09-13 11:55 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-01  6:51 [PATCH v2] sched/topology: remove sysctl_sched_energy_aware depending on the architecture Shrikanth Hegde
2023-09-01  6:57 ` Shrikanth Hegde
2023-09-01  6:52 Shrikanth Hegde
2023-09-01  9:49 ` Chen Yu
2023-09-01 13:08   ` Shrikanth Hegde
2023-09-07 10:21   ` Pierre Gondois
2023-09-07 15:04     ` Chen Yu
2023-09-05 14:03 ` Pierre Gondois
2023-09-05 21:53   ` Tim Chen
2023-09-06  3:21   ` Shrikanth Hegde
2023-09-06 11:35   ` Shrikanth Hegde
2023-09-07 10:34     ` Pierre Gondois
2023-09-13 11:55       ` Shrikanth Hegde

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.