linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
@ 2014-09-09  0:10 Anup Chenthamarakshan
  2014-09-09  5:03 ` Viresh Kumar
  2014-09-09 15:15 ` Dirk Brandewie
  0 siblings, 2 replies; 15+ messages in thread
From: Anup Chenthamarakshan @ 2014-09-09  0:10 UTC (permalink / raw)
  To: Dirk Brandewie
  Cc: Sameer Nanda, Rafael J. Wysocki, Viresh Kumar, linux-pm,
	linux-kernel, Anup Chenthamarakshan

Exported stats appear in
<sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:

## CPU 0
400000 3647
500000 24342
600000 144150
700000 202469
## CPU 1
400000 4813
500000 22628
600000 149564
700000 211885
800000 173890

Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>
---
 drivers/cpufreq/intel_pstate.c | 77 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0668b38..7be89bd 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -84,6 +84,11 @@ struct _pid {
 	int32_t last_err;
 };
 
+struct pstate_stat {
+	int pstate;
+	u64 time;
+};
+
 struct cpudata {
 	int cpu;
 
@@ -97,6 +102,9 @@ struct cpudata {
 	u64	prev_aperf;
 	u64	prev_mperf;
 	struct sample sample;
+
+	struct pstate_stat *stat;
+	u64	last_updated;
 };
 
 static struct cpudata **all_cpu_data;
@@ -218,6 +226,18 @@ static inline void intel_pstate_reset_all_pid(void)
 	}
 }
 
+static void intel_pstate_account_time_to_current_pstate(struct cpudata *cpu)
+{
+	/* Handle the initial call from intel_pstate_init_cpu */
+	if (likely(cpu->stat)) {
+		u64 now = jiffies;
+		int index = cpu->pstate.current_pstate - cpu->pstate.min_pstate;
+
+		cpu->stat[index].time += now - cpu->last_updated;
+		cpu->last_updated = now;
+	}
+}
+
 /************************** debugfs begin ************************/
 static int pid_param_set(void *data, u64 val)
 {
@@ -323,6 +343,40 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 	return count;
 }
 
+static ssize_t show_time_in_state(struct kobject *kobj, struct attribute *attr,
+				char *buf)
+{
+	unsigned int cpu;
+	struct cpudata *cpudata;
+	int i, len = 0, total_states;
+
+	for_each_online_cpu(cpu) {
+		if (!all_cpu_data[cpu])
+			continue;
+
+		cpudata = all_cpu_data[cpu];
+		len += snprintf(buf + len, PAGE_SIZE - len, "## CPU %d\n", cpu);
+		if (len >= PAGE_SIZE)
+			return len;
+
+		total_states = cpudata->pstate.turbo_pstate -
+			cpudata->pstate.min_pstate + 1;
+
+		intel_pstate_account_time_to_current_pstate(cpudata);
+
+		for (i = 0; i < total_states; i++) {
+			len += snprintf(buf + len, PAGE_SIZE - len, "%d %llu\n",
+					cpudata->stat[i].pstate * 100000,
+					cpudata->stat[i].time);
+
+			if (len >= PAGE_SIZE)
+				return len;
+		}
+	}
+
+	return len;
+}
+
 show_one(no_turbo, no_turbo);
 show_one(max_perf_pct, max_perf_pct);
 show_one(min_perf_pct, min_perf_pct);
@@ -331,10 +385,13 @@ define_one_global_rw(no_turbo);
 define_one_global_rw(max_perf_pct);
 define_one_global_rw(min_perf_pct);
 
+define_one_global_ro(time_in_state);
+
 static struct attribute *intel_pstate_attributes[] = {
 	&no_turbo.attr,
 	&max_perf_pct.attr,
 	&min_perf_pct.attr,
+	&time_in_state.attr,
 	NULL
 };
 
@@ -525,9 +582,11 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
 
 	trace_cpu_frequency(pstate * 100000, cpu->cpu);
 
-	cpu->pstate.current_pstate = pstate;
-
 	pstate_funcs.set(cpu, pstate);
+
+	intel_pstate_account_time_to_current_pstate(cpu);
+
+	cpu->pstate.current_pstate = pstate;
 }
 
 static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
@@ -751,6 +810,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
 
 	del_timer_sync(&all_cpu_data[cpu_num]->timer);
 	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
+	kfree(all_cpu_data[cpu_num]->stat);
 	kfree(all_cpu_data[cpu_num]);
 	all_cpu_data[cpu_num] = NULL;
 }
@@ -758,7 +818,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
 static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 {
 	struct cpudata *cpu;
-	int rc;
+	int rc, i, total_states;
 	u64 misc_en;
 
 	rc = intel_pstate_init_cpu(policy->cpu);
@@ -787,6 +847,16 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
 	cpumask_set_cpu(policy->cpu, policy->cpus);
 
+	total_states = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
+	cpu->stat = kcalloc(total_states, sizeof(struct pstate_stat),
+			GFP_KERNEL);
+
+	if (cpu->stat)
+		for (i = 0; i < total_states; i++)
+			cpu->stat[i].pstate = i + cpu->pstate.min_pstate;
+
+	cpu->last_updated = get_jiffies_64();
+
 	return 0;
 }
 
@@ -958,6 +1028,7 @@ out:
 	for_each_online_cpu(cpu) {
 		if (all_cpu_data[cpu]) {
 			del_timer_sync(&all_cpu_data[cpu]->timer);
+			kfree(all_cpu_data[cpu]->stat);
 			kfree(all_cpu_data[cpu]);
 		}
 	}
-- 
1.8.3.2


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09  0:10 [PATCH] intel_pstate: track and export frequency residency stats via sysfs Anup Chenthamarakshan
@ 2014-09-09  5:03 ` Viresh Kumar
  2014-09-09  5:32   ` Anup Chenthamarakshan
  2014-09-09 15:15 ` Dirk Brandewie
  1 sibling, 1 reply; 15+ messages in thread
From: Viresh Kumar @ 2014-09-09  5:03 UTC (permalink / raw)
  To: Anup Chenthamarakshan
  Cc: Dirk Brandewie, Sameer Nanda, Rafael J. Wysocki, linux-pm,
	Linux Kernel Mailing List

On 9 September 2014 05:40, Anup Chenthamarakshan <anupc@chromium.org> wrote:
> Exported stats appear in
> <sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:
>
> ## CPU 0
> 400000 3647
> 500000 24342
> 600000 144150
> 700000 202469
> ## CPU 1
> 400000 4813
> 500000 22628
> 600000 149564
> 700000 211885
> 800000 173890
>
> Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>
> ---
>  drivers/cpufreq/intel_pstate.c | 77 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 74 insertions(+), 3 deletions(-)

Why don't we reuse cpufreq_stats.c for all this?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09  5:03 ` Viresh Kumar
@ 2014-09-09  5:32   ` Anup Chenthamarakshan
  2014-09-09  6:26     ` Viresh Kumar
  0 siblings, 1 reply; 15+ messages in thread
From: Anup Chenthamarakshan @ 2014-09-09  5:32 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Dirk Brandewie, Sameer Nanda, Rafael J. Wysocki, linux-pm,
	Linux Kernel Mailing List

On Mon, Sep 8, 2014 at 10:03 PM, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> On 9 September 2014 05:40, Anup Chenthamarakshan <anupc@chromium.org> wrote:
>> Exported stats appear in
>> <sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:
>>
>> ## CPU 0
>> 400000 3647
>> 500000 24342
>> 600000 144150
>> 700000 202469
>> ## CPU 1
>> 400000 4813
>> 500000 22628
>> 600000 149564
>> 700000 211885
>> 800000 173890
>>
>> Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>
>> ---
>>  drivers/cpufreq/intel_pstate.c | 77 ++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 74 insertions(+), 3 deletions(-)
>
> Why don't we reuse cpufreq_stats.c for all this?

Thanks for taking a look into this.

I had initially tried reusing cpufreq_stats.c to export stats.
Calling cpufreq_stats_update() via the cpufreq notifier added
some amount of overhead while switching frequencies. Specifically,
looking up the index of the new frequency in freq_table_get_index()
is a linear search through all available frequencies (vs a single
subtraction with custom stats export). Also, the notifier mechanism
itself added a level of indirection before calling stats_update.

There is a 5X increase in time taken to complete intel_pstate_set_pstate
while using cpufreq_stats compared to having custom stats exported.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09  5:32   ` Anup Chenthamarakshan
@ 2014-09-09  6:26     ` Viresh Kumar
  2014-09-09 23:31       ` Anup Chenthamarakshan
  0 siblings, 1 reply; 15+ messages in thread
From: Viresh Kumar @ 2014-09-09  6:26 UTC (permalink / raw)
  To: Anup Chenthamarakshan
  Cc: Dirk Brandewie, Sameer Nanda, Rafael J. Wysocki, linux-pm,
	Linux Kernel Mailing List

On Tue, Sep 9, 2014 at 11:02 AM, Anup Chenthamarakshan
<anupc@chromium.org> wrote:
> I had initially tried reusing cpufreq_stats.c to export stats.
> Calling cpufreq_stats_update() via the cpufreq notifier added
> some amount of overhead while switching frequencies. Specifically,
> looking up the index of the new frequency in freq_table_get_index()
> is a linear search through all available frequencies (vs a single
> subtraction with custom stats export). Also, the notifier mechanism

I don't think just this linear search will make things so bad..

> itself added a level of indirection before calling stats_update.

Probably some other notifier is registered which is taking a considerable
amount of time. Try checking everything that is registered with cpufreq-core.

> There is a 5X increase in time taken to complete intel_pstate_set_pstate
> while using cpufreq_stats compared to having custom stats exported.

Try calling cpufreq_stat_notifier_trans() directly instead of via a notifier
and let's see if this makes it any better.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09  0:10 [PATCH] intel_pstate: track and export frequency residency stats via sysfs Anup Chenthamarakshan
  2014-09-09  5:03 ` Viresh Kumar
@ 2014-09-09 15:15 ` Dirk Brandewie
  2014-09-09 23:22   ` Anup Chenthamarakshan
  1 sibling, 1 reply; 15+ messages in thread
From: Dirk Brandewie @ 2014-09-09 15:15 UTC (permalink / raw)
  To: Anup Chenthamarakshan, Dirk Brandewie
  Cc: dirk.brandewie, Sameer Nanda, Rafael J. Wysocki, Viresh Kumar,
	linux-pm, linux-kernel

On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
> Exported stats appear in
> <sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:
> 
> ## CPU 0
> 400000 3647
> 500000 24342
> 600000 144150
> 700000 202469
> ## CPU 1
> 400000 4813
> 500000 22628
> 600000 149564
> 700000 211885
> 800000 173890
> 
> Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>

What is this information being used for?

Tracking the current P state request for each core is only part of the
story.  The processor aggregates the requests from all cores and then decides
what frequency the package will run at; this evaluation happens on a ~1ms time
frame.  If a core is idle then it loses its vote for what the package frequency will
be and its frequency will be zero even though it may have been requesting
a high P state when it went idle.  Tracking the residency of the requested
P state doesn't provide much useful information other than ensuring that the
requests are changing over time IMHO.

This interface will not be supportable with upcoming processors using
hardware P states as documented in volume 3 of the current SDM Section 14.4
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
The OS will have no way of knowing what the P state requests for a
given core are.

--Dirk 
> ---
>   drivers/cpufreq/intel_pstate.c | 77 ++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 74 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 0668b38..7be89bd 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -84,6 +84,11 @@ struct _pid {
>   	int32_t last_err;
>   };
>   
> +struct pstate_stat {
> +	int pstate;
> +	u64 time;
> +};
> +
>   struct cpudata {
>   	int cpu;
>   
> @@ -97,6 +102,9 @@ struct cpudata {
>   	u64	prev_aperf;
>   	u64	prev_mperf;
>   	struct sample sample;
> +
> +	struct pstate_stat *stat;
> +	u64	last_updated;
>   };
>   
>   static struct cpudata **all_cpu_data;
> @@ -218,6 +226,18 @@ static inline void intel_pstate_reset_all_pid(void)
>   	}
>   }
>   
> +static void intel_pstate_account_time_to_current_pstate(struct cpudata *cpu)
> +{
> +	/* Handle the initial call from intel_pstate_init_cpu */
> +	if (likely(cpu->stat)) {
> +		u64 now = jiffies;
> +		int index = cpu->pstate.current_pstate - cpu->pstate.min_pstate;
> +
> +		cpu->stat[index].time += now - cpu->last_updated;
> +		cpu->last_updated = now;
> +	}
> +}
> +
>   /************************** debugfs begin ************************/
>   static int pid_param_set(void *data, u64 val)
>   {
> @@ -323,6 +343,40 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
>   	return count;
>   }
>   
> +static ssize_t show_time_in_state(struct kobject *kobj, struct attribute *attr,
> +				char *buf)
> +{
> +	unsigned int cpu;
> +	struct cpudata *cpudata;
> +	int i, len = 0, total_states;
> +
> +	for_each_online_cpu(cpu) {
> +		if (!all_cpu_data[cpu])
> +			continue;
> +
> +		cpudata = all_cpu_data[cpu];
> +		len += snprintf(buf + len, PAGE_SIZE - len, "## CPU %d\n", cpu);
> +		if (len >= PAGE_SIZE)
> +			return len;
> +
> +		total_states = cpudata->pstate.turbo_pstate -
> +			cpudata->pstate.min_pstate + 1;
> +
> +		intel_pstate_account_time_to_current_pstate(cpudata);
> +
> +		for (i = 0; i < total_states; i++) {
> +			len += snprintf(buf + len, PAGE_SIZE - len, "%d %llu\n",
> +					cpudata->stat[i].pstate * 100000,
> +					cpudata->stat[i].time);
> +
> +			if (len >= PAGE_SIZE)
> +				return len;
> +		}
> +	}
> +
> +	return len;
> +}
> +
>   show_one(no_turbo, no_turbo);
>   show_one(max_perf_pct, max_perf_pct);
>   show_one(min_perf_pct, min_perf_pct);
> @@ -331,10 +385,13 @@ define_one_global_rw(no_turbo);
>   define_one_global_rw(max_perf_pct);
>   define_one_global_rw(min_perf_pct);
>   
> +define_one_global_ro(time_in_state);
> +
>   static struct attribute *intel_pstate_attributes[] = {
>   	&no_turbo.attr,
>   	&max_perf_pct.attr,
>   	&min_perf_pct.attr,
> +	&time_in_state.attr,
>   	NULL
>   };
>   
> @@ -525,9 +582,11 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
>   
>   	trace_cpu_frequency(pstate * 100000, cpu->cpu);
>   
> -	cpu->pstate.current_pstate = pstate;
> -
>   	pstate_funcs.set(cpu, pstate);
> +
> +	intel_pstate_account_time_to_current_pstate(cpu);
> +
> +	cpu->pstate.current_pstate = pstate;
>   }
>   
>   static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
> @@ -751,6 +810,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
>   
>   	del_timer_sync(&all_cpu_data[cpu_num]->timer);
>   	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
> +	kfree(all_cpu_data[cpu_num]->stat);
>   	kfree(all_cpu_data[cpu_num]);
>   	all_cpu_data[cpu_num] = NULL;
>   }
> @@ -758,7 +818,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
>   static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>   {
>   	struct cpudata *cpu;
> -	int rc;
> +	int rc, i, total_states;
>   	u64 misc_en;
>   
>   	rc = intel_pstate_init_cpu(policy->cpu);
> @@ -787,6 +847,16 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>   	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
>   	cpumask_set_cpu(policy->cpu, policy->cpus);
>   
> +	total_states = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
> +	cpu->stat = kcalloc(total_states, sizeof(struct pstate_stat),
> +			GFP_KERNEL);
> +
> +	if (cpu->stat)
> +		for (i = 0; i < total_states; i++)
> +			cpu->stat[i].pstate = i + cpu->pstate.min_pstate;
> +
> +	cpu->last_updated = get_jiffies_64();
> +
>   	return 0;
>   }
>   
> @@ -958,6 +1028,7 @@ out:
>   	for_each_online_cpu(cpu) {
>   		if (all_cpu_data[cpu]) {
>   			del_timer_sync(&all_cpu_data[cpu]->timer);
> +			kfree(all_cpu_data[cpu]->stat);
>   			kfree(all_cpu_data[cpu]);
>   		}
>   	}
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09 15:15 ` Dirk Brandewie
@ 2014-09-09 23:22   ` Anup Chenthamarakshan
  2014-09-10 16:39     ` Dirk Brandewie
  0 siblings, 1 reply; 15+ messages in thread
From: Anup Chenthamarakshan @ 2014-09-09 23:22 UTC (permalink / raw)
  To: Dirk Brandewie
  Cc: Dirk Brandewie, Sameer Nanda, Rafael J. Wysocki, Viresh Kumar,
	linux-pm, linux-kernel

On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:
> On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
> > Exported stats appear in
> > <sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:
> > 
> > ## CPU 0
> > 400000 3647
> > 500000 24342
> > 600000 144150
> > 700000 202469
> > ## CPU 1
> > 400000 4813
> > 500000 22628
> > 600000 149564
> > 700000 211885
> > 800000 173890
> > 
> > Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>
> 
> What is this information being used for?

I'm using P-state residency information in power consumption tests to calculate
the proportion of time spent in each P-state across all processors (one global set
of percentages, corresponding to each P-state). This is used to validate new
changes from the power perspective. Essentially, sanity checks to flag changes
with a large difference in P-state residency.

So far, we've been using the data exported by acpi-cpufreq to track this.

> 
> Tracking the current P state request for each core is only part of the 
> story.  The processor aggregates the requests from all cores and then decides
> what frequency the package will run at, this evaluation happens at ~1ms time
> frame.  If a core is idle then it loses its vote for that package frequency will
> be and its frequency will be zero even though it may have been requesting
> a high P state when it went idle.  Tracking the residency of the requested
> P state doesn't provide much useful information other than ensuring the the 
> requests are changing over time IMHO.

This is exactly why we're trying to track it.

> 
> This interface will not be supportable with upcoming processors using
> hardware P states as documented in volume 3 of the current SDM Section 14.4
> http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
> The OS will have no way of knowing what the P state requests are for a
> given core are.

Will there be any means to determine the proportion of time spent in different
HWP-states when HWP gets enabled (maybe at a package level)?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09  6:26     ` Viresh Kumar
@ 2014-09-09 23:31       ` Anup Chenthamarakshan
  2014-09-10  6:49         ` Viresh Kumar
  0 siblings, 1 reply; 15+ messages in thread
From: Anup Chenthamarakshan @ 2014-09-09 23:31 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Dirk Brandewie, Sameer Nanda, Rafael J. Wysocki, linux-pm,
	Linux Kernel Mailing List

On Tue, Sep 09, 2014 at 11:56:18AM +0530, Viresh Kumar wrote:
> On Tue, Sep 9, 2014 at 11:02 AM, Anup Chenthamarakshan
> <anupc@chromium.org> wrote:
> > I had initially tried reusing cpufreq_stats.c to export stats.
> > Calling cpufreq_stats_update() via the cpufreq notifier added
> > some amount of overhead while switching frequencies. Specifically,
> > looking up the index of the new frequency in freq_table_get_index()
> > is a linear search through all available frequencies (vs a single
> > subtraction with custom stats export). Also, the notifier mechanism
> 
> I don't think just this linear search will make things so bad..

Linear search usually slows down transition to higher P-states because
it has to go through the full list (and the list is longer in intel_pstate
compared to acpi-cpufreq). But, yes, this is probably not a bottleneck.

> 
> > itself added a level of indirection before calling stats_update.
> 
> Probably some other notifier is registered which is taking considerable
> amount of time.. Try checking what all registered with cpufreq-core.

There was no other notifier listening to frequency transition events.

> 
> > There is a 5X increase in time taken to complete intel_pstate_set_pstate
> > while using cpufreq_stats compared to having custom stats exported.
> 
> Try calling cpufreq_stat_notifier_trans() directly instead of a notifier
> and lets see if this makes it any better.

Here's a comparison of time taken to run intel_pstate_set_pstate() using
different approaches:
                                  average time for a transition
no stats                                      4.7us
intel_pstate-stats                            5.7us
direct call to cpufreq_stat_notifier_trans    8.1us
cpufreq-notifier-event                       10.6us


I was wrong about the 5x increase (got tripped up by calls to
intel_pstate_set_pstate where no change in P-state actually happened).

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09 23:31       ` Anup Chenthamarakshan
@ 2014-09-10  6:49         ` Viresh Kumar
  0 siblings, 0 replies; 15+ messages in thread
From: Viresh Kumar @ 2014-09-10  6:49 UTC (permalink / raw)
  To: Anup Chenthamarakshan
  Cc: Dirk Brandewie, Sameer Nanda, Rafael J. Wysocki, linux-pm,
	Linux Kernel Mailing List

On 10 September 2014 05:01, Anup Chenthamarakshan <anupc@chromium.org> wrote:
> Here's a comparison of time taken to run intel_pstate_set_pstate() using
> different approaches:
>                                   average time for a transition
> no stats                                      4.7us
> intel_pstate-stats                            5.7us
> direct call to cpufreq_stat_notifier_trans    8.1us
> cpufreq-notifier-event                       10.6us
>
>
> I was wrong about the 5x increase (got tripped by calls to
> intel_pstate_set_pstate where no change in P-state actually happened)

Okay, let's first see whether this is the right thing to do. So finish your
discussion with Dirk, and then we will see what the best way to do it is.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-09 23:22   ` Anup Chenthamarakshan
@ 2014-09-10 16:39     ` Dirk Brandewie
  2014-09-10 22:15       ` Anup Chenthamarakshan
  0 siblings, 1 reply; 15+ messages in thread
From: Dirk Brandewie @ 2014-09-10 16:39 UTC (permalink / raw)
  To: Anup Chenthamarakshan, Dirk Brandewie
  Cc: Sameer Nanda, Rafael J. Wysocki, Viresh Kumar, linux-pm, linux-kernel

On 09/09/2014 04:22 PM, Anup Chenthamarakshan wrote:
> On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:
>> On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
>>> Exported stats appear in
>>> <sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:
>>>
>>> ## CPU 0
>>> 400000 3647
>>> 500000 24342
>>> 600000 144150
>>> 700000 202469
>>> ## CPU 1
>>> 400000 4813
>>> 500000 22628
>>> 600000 149564
>>> 700000 211885
>>> 800000 173890
>>>
>>> Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>
>>
>> What is this information being used for?
>
> I'm using P-state residency information in power consumption tests to calculate
> proportion of time spent in each P-state across all processors (one global set
> of percentages, corresponding to each P-state). This is used to validate new
> changes from the power perspective. Essentially, sanity checks to flag changes
> with large difference in P-state residency.
>
> So far, we've been using the data exported by acpi-cpufreq to track this.
>
>>
>> Tracking the current P state request for each core is only part of the
>> story.  The processor aggregates the requests from all cores and then decides
>> what frequency the package will run at, this evaluation happens at ~1ms time
>> frame.  If a core is idle then it loses its vote for that package frequency will
>> be and its frequency will be zero even though it may have been requesting
>> a high P state when it went idle.  Tracking the residency of the requested
>> P state doesn't provide much useful information other than ensuring the the
>> requests are changing over time IMHO.
>
> This is exactly why we're trying to track it.

My point is that you are tracking the residency of the request and not
the P state the package was running at.  On a lightly loaded system
it is not unusual for a core that was very busy and requesting a high
P state to go idle for several seconds.  In this case that core would
lose its vote for the package P state but the stats would show that
the P state was high for a very long time when its real frequency
was zero.

There are a couple of ways to get what I consider better information
about what is actually going on.

   The current turbostat provides C state residency and calculates the
   average/effective frequency of the core over its sample time.
   Turbostat will also measure the power consumption from the CPU point
   of view if your processor supports the RAPL registers.

   Reading MSR 0x198 MSR_IA32_PERF_STATUS will tell you what the core
   would run at if it were not idle; this reflects the decision that the
   package made based on current requests.

   Using perf to collect power:pstate_sample event will give information
   about each sample on the core and give you timestamps to detect idle
   times.

   Using perf to collect power:cpu_frequency will show when the P state
   request was changed on each core and is triggered by intel_pstate and
   acpi_cpufreq.

   Powertop collects the same information as turbostat and a bunch of
   other information useful in seeing where you could be burning power
   for no good reason.

For getting an idea of real power turbostat is the easiest to use and
is available on most systems.  Using perf will give you a very fine grained
view of what is going on as well as point to the culprit for bad
behaviour in most cases.

>
>>
>> This interface will not be supportable with upcoming processors using
>> hardware P states as documented in volume 3 of the current SDM Section 14.4
>> http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
>> The OS will have no way of knowing what the P state requests are for a
>> given core are.
>
> Will there be any means to determine the proportion of time spent in different
> HWP-states when HWP gets enabled (maybe at a package level)?
>
Not that I am aware of :-(  There is MSR_PPERF (section 14.4.5.1), which will give
the CPU's view of the amount of productive work/scalability of the current load.

--Dirk

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-10 16:39     ` Dirk Brandewie
@ 2014-09-10 22:15       ` Anup Chenthamarakshan
  2014-09-10 22:49         ` Rafael J. Wysocki
  0 siblings, 1 reply; 15+ messages in thread
From: Anup Chenthamarakshan @ 2014-09-10 22:15 UTC (permalink / raw)
  To: Dirk Brandewie
  Cc: Sameer Nanda, Rafael J. Wysocki, Viresh Kumar, linux-pm, linux-kernel

On Wed, Sep 10, 2014 at 09:39:30AM -0700, Dirk Brandewie wrote:
> On 09/09/2014 04:22 PM, Anup Chenthamarakshan wrote:
> >On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:
> >>On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
> >>>Exported stats appear in
> >>><sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:
> >>>
> >>>## CPU 0
> >>>400000 3647
> >>>500000 24342
> >>>600000 144150
> >>>700000 202469
> >>>## CPU 1
> >>>400000 4813
> >>>500000 22628
> >>>600000 149564
> >>>700000 211885
> >>>800000 173890
> >>>
> >>>Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>
> >>
> >>What is this information being used for?
> >
> >I'm using P-state residency information in power consumption tests to calculate
> >proportion of time spent in each P-state across all processors (one global set
> >of percentages, corresponding to each P-state). This is used to validate new
> >changes from the power perspective. Essentially, sanity checks to flag changes
> >with large difference in P-state residency.
> >
> >So far, we've been using the data exported by acpi-cpufreq to track this.
> >
> >>
> >>Tracking the current P state request for each core is only part of the
> >>story.  The processor aggregates the requests from all cores and then decides
> >>what frequency the package will run at, this evaluation happens at ~1ms time
> >>frame.  If a core is idle then it loses its vote for that package frequency will
> >>be and its frequency will be zero even though it may have been requesting
> >>a high P state when it went idle.  Tracking the residency of the requested
> >>P state doesn't provide much useful information other than ensuring the the
> >>requests are changing over time IMHO.
> >
> >This is exactly why we're trying to track it.
> 
> My point is that you are tracking the residency of the request and not
> the P state the package was running at.  On a lightly loaded system
> it is not unusual for a core that was very busy and requesting a high
> P state to go idle for several seconds.  In this case that core would
> lose its vote for the package P state but the stats would show that
> the P state was high for a very long time when its real frequency
> was zero.

I see what you're saying. Requesting a p-state does not necessarily mean that is
the state the CPU is in.

> 
> There are a couple of ways to get what I consider better information
> about what is actually going on.
> 
>   The current turbostat provides C state residency and calculates the
>   average/effective frequency of the core over its sample time.
>   Turbostat will also measure the power consumption from the CPU point
>   of view if your processor supports the RAPL registers.
> 
>   Reading MSR 0x198 MSR_IA32_PERF_STATUS will tell you what the core
>   would run at if it not idle, this reflects the decision that the
>   package made based on current requests.
> 
>   Using perf to collect power:pstate_sample event will give information
>   about each sample on the core and give you timestamps to detect idle
>   times.
> 
>   Using perf to collect power:cpu_frequency will show when the P state
>   request was changed on each core and is triggered by intel_pstate and
>   acpi_cpufreq.
> 
>   Powertop collects that same information as turbostat and a bunch of
>   other information useful in seeing where you could be burning power
>   for no good reason.
> 
> For getting an idea of real power turbostat is the easiest to use and
> is available on most systems.  Using perf will give you a very fine grained
> view of what is going on as well as point to the culprit for bad
> behaviour in most cases.

Tools like powertop and turbostat are not present by default on all systems,
so it is not always possible to use them :(

Will it make sense to expose the current (64-bit) value of aperf and mperf
through sysfs? This will let userspace tools calculate the average frequency
of a CPU across a large period of time. For example, a load test that runs for
1 hour will only need to poll sysfs twice (per CPU) to do this operation,
instead of polling MSRs on each CPU once every second or so (to account for
overruns).

> 
> >
> >>
> >>This interface will not be supportable with upcoming processors using
> >>hardware P states as documented in volume 3 of the current SDM Section 14.4
> >>http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
> >>The OS will have no way of knowing what the P state requests are for a
> >>given core are.
> >
> >Will there be any means to determine the proportion of time spent in different
> >HWP-states when HWP gets enabled (maybe at a package level)?
> >
> Not that I am aware of :-(  There is MSR_PPERF section 14.4.5.1 that will give
> the CPUs view of the amount of productive work/scalability of the current load.
> 
> --Dirk

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-10 22:15       ` Anup Chenthamarakshan
@ 2014-09-10 22:49         ` Rafael J. Wysocki
  2014-09-10 23:39           ` Anup Chenthamarakshan
  0 siblings, 1 reply; 15+ messages in thread
From: Rafael J. Wysocki @ 2014-09-10 22:49 UTC (permalink / raw)
  To: Anup Chenthamarakshan
  Cc: Dirk Brandewie, Sameer Nanda, Viresh Kumar, linux-pm, linux-kernel

On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:
> On Wed, Sep 10, 2014 at 09:39:30AM -0700, Dirk Brandewie wrote:
> > On 09/09/2014 04:22 PM, Anup Chenthamarakshan wrote:
> > >On Tue, Sep 09, 2014 at 08:15:13AM -0700, Dirk Brandewie wrote:
> > >>On 09/08/2014 05:10 PM, Anup Chenthamarakshan wrote:
> > >>>Exported stats appear in
> > >>><sysfs>/devices/system/cpu/intel_pstate/time_in_state as follows:
> > >>>
> > >>>## CPU 0
> > >>>400000 3647
> > >>>500000 24342
> > >>>600000 144150
> > >>>700000 202469
> > >>>## CPU 1
> > >>>400000 4813
> > >>>500000 22628
> > >>>600000 149564
> > >>>700000 211885
> > >>>800000 173890
> > >>>
> > >>>Signed-off-by: Anup Chenthamarakshan <anupc@chromium.org>
> > >>
> > >>What is this information being used for?
> > >
> > >I'm using P-state residency information in power consumption tests to calculate
> > >proportion of time spent in each P-state across all processors (one global set
> > >of percentages, corresponding to each P-state). This is used to validate new
> > >changes from the power perspective. Essentially, sanity checks to flag changes
> > >with large difference in P-state residency.
> > >
> > >So far, we've been using the data exported by acpi-cpufreq to track this.
> > >
> > >>
> > >>Tracking the current P state request for each core is only part of the
> > >>story.  The processor aggregates the requests from all cores and then decides
> > >>what frequency the package will run at, this evaluation happens at ~1ms time
> > >>frame.  If a core is idle then it loses its vote for what the package frequency will
> > >>be and its frequency will be zero even though it may have been requesting
> > >>a high P state when it went idle.  Tracking the residency of the requested
> > >>P state doesn't provide much useful information other than ensuring that the
> > >>requests are changing over time IMHO.
> > >
> > >This is exactly why we're trying to track it.
> > 
> > My point is that you are tracking the residency of the request and not
> > the P state the package was running at.  On a lightly loaded system
> > it is not unusual for a core that was very busy and requesting a high
> > P state to go idle for several seconds.  In this case that core would
> > lose its vote for the package P state but the stats would show that
> > the P state was high for a very long time when its real frequency
> > was zero.
> 
> I see what you're saying. Requesting a p-state does not necessarily mean that is
> the state the CPU is in.
> 
> > 
> > There are a couple of ways to get what I consider better information
> > about what is actually going on.
> > 
> >   The current turbostat provides C state residency and calculates the
> >   average/effective frequency of the core over its sample time.
> >   Turbostat will also measure the power consumption from the CPU point
> >   of view if your processor supports the RAPL registers.
> > 
> >   Reading MSR 0x198 MSR_IA32_PERF_STATUS will tell you what the core
> >   would run at if it were not idle; this reflects the decision that the
> >   package made based on current requests.
> > 
> >   Using perf to collect power:pstate_sample event will give information
> >   about each sample on the core and give you timestamps to detect idle
> >   times.
> > 
> >   Using perf to collect power:cpu_frequency will show when the P state
> >   request was changed on each core and is triggered by intel_pstate and
> >   acpi_cpufreq.
> > 
> >   Powertop collects the same information as turbostat and a bunch of
> >   other information useful in seeing where you could be burning power
> >   for no good reason.
> > 
> > For getting an idea of real power turbostat is the easiest to use and
> > is available on most systems.  Using perf will give you a very fine grained
> > view of what is going on as well as point to the culprit for bad
> > behaviour in most cases.
> 
> Tools like powertop and turbostat are not present by default on all systems,
> so it is not always possible to use them :(

Which systems are you referring to in particular?

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-10 22:49         ` Rafael J. Wysocki
@ 2014-09-10 23:39           ` Anup Chenthamarakshan
  2014-09-11  0:04             ` Rafael J. Wysocki
  0 siblings, 1 reply; 15+ messages in thread
From: Anup Chenthamarakshan @ 2014-09-10 23:39 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Dirk Brandewie, Sameer Nanda, Viresh Kumar, linux-pm, linux-kernel

On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:
> On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:
> > 
> > Tools like powertop and turbostat are not present by default on all systems,
> > so it is not always possible to use them :(
> 
> Which systems are you referring to in particular?

We're testing on Chrome OS devices (Chromebooks).

> 
> -- 
> I speak only for myself.
> Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-10 23:39           ` Anup Chenthamarakshan
@ 2014-09-11  0:04             ` Rafael J. Wysocki
  2014-09-11  1:04               ` Sameer Nanda
  0 siblings, 1 reply; 15+ messages in thread
From: Rafael J. Wysocki @ 2014-09-11  0:04 UTC (permalink / raw)
  To: Anup Chenthamarakshan
  Cc: Dirk Brandewie, Sameer Nanda, Viresh Kumar, linux-pm, linux-kernel

On Wednesday, September 10, 2014 04:39:05 PM Anup Chenthamarakshan wrote:
> On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:
> > On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:
> > > 
> > > Tools like powertop and turbostat are not present by default on all systems,
> > > so it is not always possible to use them :(
> > 
> > Which systems are you referring to in particular?
> 
> We're testing on Chrome OS devices (Chromebooks).

How big of a deal is it to install the tools mentioned above on such a system?

At least turbostat is shipped with the kernel source.

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-11  0:04             ` Rafael J. Wysocki
@ 2014-09-11  1:04               ` Sameer Nanda
  2014-09-11 15:37                 ` Dirk Brandewie
  0 siblings, 1 reply; 15+ messages in thread
From: Sameer Nanda @ 2014-09-11  1:04 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Anup Chenthamarakshan, Dirk Brandewie, Viresh Kumar, linux-pm,
	linux-kernel

On Wed, Sep 10, 2014 at 5:04 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> On Wednesday, September 10, 2014 04:39:05 PM Anup Chenthamarakshan wrote:
>> On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:
>> > On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:
>> > >
>> > > Tools like powertop and turbostat are not present by default on all systems,
>> > > so it is not always possible to use them :(
>> >
>> > Which systems are you referring to in particular?
>>
>> We're testing on Chrome OS devices (Chromebooks).
>
> How big of a deal is it to install the tools mentioned above on such a system?
>
> At least turbostat is shipped with the kernel source.

Given the web browser based front end of Chrome OS, installing these
tools will only get us so far -- if the system is in developer mode,
the tools are accessible, but when the system is in normal (verified
boot) mode these tools cannot be launched directly.

We are in the process of switching Chrome OS x86 kernels from ondemand
governor to intel_pstate.  When debugging power consumption issues,
losing the ability to easily get CPU frequency related information as
a side-effect of this switch is less than ideal.

We are happy to spin this patch to expose aperf/mperf based CPU
frequency information if you think that is the better route to take
longer term.

>
> --
> I speak only for myself.
> Rafael J. Wysocki, Intel Open Source Technology Center.



-- 
Sameer

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] intel_pstate: track and export frequency residency stats via sysfs.
  2014-09-11  1:04               ` Sameer Nanda
@ 2014-09-11 15:37                 ` Dirk Brandewie
  0 siblings, 0 replies; 15+ messages in thread
From: Dirk Brandewie @ 2014-09-11 15:37 UTC (permalink / raw)
  To: Sameer Nanda, Rafael J. Wysocki
  Cc: dirk.brandewie, Anup Chenthamarakshan, Viresh Kumar, linux-pm,
	linux-kernel

On 09/10/2014 06:04 PM, Sameer Nanda wrote:
> On Wed, Sep 10, 2014 at 5:04 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
>> On Wednesday, September 10, 2014 04:39:05 PM Anup Chenthamarakshan wrote:
>>> On Thu, Sep 11, 2014 at 12:49:48AM +0200, Rafael J. Wysocki wrote:
>>>> On Wednesday, September 10, 2014 03:15:08 PM Anup Chenthamarakshan wrote:
>>>>>
>>>>> Tools like powertop and turbostat are not present by default on all systems,
>>>>> so it is not always possible to use them :(
>>>>
>>>> Which systems are you referring to in particular?
>>>
>>> We're testing on Chrome OS devices (Chromebooks).
>>
>> How big of a deal is it to install the tools mentioned above on such a system?
>>
>> At least turbostat is shipped with the kernel source.
>
> Given the web browser based front end of Chrome OS, installing these
> tools will only get us so far -- if the system is in developer mode,
> the tools are accessible but when the system is in normal (verified
> boot mode) these tools cannot be launched directly.
>
> We are in the process of switching Chrome OS x86 kernels from ondemand
> governor to intel_pstate.  When debugging power consumption issues,
> losing the ability to easily get CPU frequency related information as
> a side-effect of this switch is less than ideal.
>
> We are happy to spin this patch to expose aperf/mperf based CPU
> frequency information if you think that is the better route to take
> longer term.

You can get the frequency as measured by intel_pstate from /proc/cpuinfo
or /sys/devices/system/cpu/cpu[n]/cpufreq/cpuinfo_cur_freq but this is only
for the most recent sample on cpu[n].

Reading MSR 0x199 at some reasonable rate will let you graph what request
is being made on each core.

>
>>
>> --
>> I speak only for myself.
>> Rafael J. Wysocki, Intel Open Source Technology Center.
>
>
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2014-09-11 15:37 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-09  0:10 [PATCH] intel_pstate: track and export frequency residency stats via sysfs Anup Chenthamarakshan
2014-09-09  5:03 ` Viresh Kumar
2014-09-09  5:32   ` Anup Chenthamarakshan
2014-09-09  6:26     ` Viresh Kumar
2014-09-09 23:31       ` Anup Chenthamarakshan
2014-09-10  6:49         ` Viresh Kumar
2014-09-09 15:15 ` Dirk Brandewie
2014-09-09 23:22   ` Anup Chenthamarakshan
2014-09-10 16:39     ` Dirk Brandewie
2014-09-10 22:15       ` Anup Chenthamarakshan
2014-09-10 22:49         ` Rafael J. Wysocki
2014-09-10 23:39           ` Anup Chenthamarakshan
2014-09-11  0:04             ` Rafael J. Wysocki
2014-09-11  1:04               ` Sameer Nanda
2014-09-11 15:37                 ` Dirk Brandewie

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).