All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2 0/4] cpufreq: Record stats with fast-switching
@ 2020-09-16  6:45 Viresh Kumar
  2020-09-16  6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar
                   ` (3 more replies)
  0 siblings, 4 replies; 23+ messages in thread
From: Viresh Kumar @ 2020-09-16  6:45 UTC (permalink / raw)
  To: Rafael Wysocki, Ben Segall, Dietmar Eggemann, Ingo Molnar,
	Juri Lelli, Mel Gorman, Peter Zijlstra, Steven Rostedt,
	Vincent Guittot, Viresh Kumar
  Cc: linux-pm, Lukasz Luba, cristian.marussi, sudeep.holla, linux-kernel

Hi,

We disabled recording cpufreq stats when fast switching was introduced
to the cpufreq core as the cpufreq stats required to take a spinlock and
that can't be allowed (for performance reasons) on scheduler's hot path.

Here is an attempt to get rid of the lock and bring back the support.

V1-V2:
- Use READ_ONCE/WRITE_ONCE instead of atomic in the first patch.

--
Viresh

Viresh Kumar (4):
  cpufreq: stats: Defer stats update to
    cpufreq_stats_record_transition()
  cpufreq: stats: Remove locking
  cpufreq: stats: Enable stats for fast-switch as well
  cpufreq: Move traces and update to policy->cur to cpufreq core

 drivers/cpufreq/cpufreq.c        | 16 +++++-
 drivers/cpufreq/cpufreq_stats.c  | 87 ++++++++++++++++++++------------
 kernel/sched/cpufreq_schedutil.c | 12 +----
 3 files changed, 72 insertions(+), 43 deletions(-)


base-commit: 856deb866d16e29bd65952e0289066f6078af773
-- 
2.25.0.rc1.19.g042ed3e048af


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-16  6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar
@ 2020-09-16  6:45 ` Viresh Kumar
  2020-09-23 13:48   ` Rafael J. Wysocki
  2020-09-25  8:21   ` Lukasz Luba
  2020-09-16  6:45 ` [PATCH V2 2/4] cpufreq: stats: Remove locking Viresh Kumar
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 23+ messages in thread
From: Viresh Kumar @ 2020-09-16  6:45 UTC (permalink / raw)
  To: Rafael Wysocki, Viresh Kumar
  Cc: linux-pm, Vincent Guittot, Lukasz Luba, cristian.marussi,
	sudeep.holla, linux-kernel

In order to prepare for lock-less stats update, add support to defer any
updates to it until cpufreq_stats_record_transition() is called.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
 1 file changed, 56 insertions(+), 19 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 94d959a8e954..3e7eee29ee86 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -22,17 +22,22 @@ struct cpufreq_stats {
 	spinlock_t lock;
 	unsigned int *freq_table;
 	unsigned int *trans_table;
+
+	/* Deferred reset */
+	unsigned int reset_pending;
+	unsigned long long reset_time;
 };
 
-static void cpufreq_stats_update(struct cpufreq_stats *stats)
+static void cpufreq_stats_update(struct cpufreq_stats *stats,
+				 unsigned long long time)
 {
 	unsigned long long cur_time = get_jiffies_64();
 
-	stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
+	stats->time_in_state[stats->last_index] += cur_time - time;
 	stats->last_time = cur_time;
 }
 
-static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
+static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
 {
 	unsigned int count = stats->max_state;
 
@@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
 	memset(stats->trans_table, 0, count * count * sizeof(int));
 	stats->last_time = get_jiffies_64();
 	stats->total_trans = 0;
+
+	/* Adjust for the time elapsed since reset was requested */
+	WRITE_ONCE(stats->reset_pending, 0);
+	cpufreq_stats_update(stats, stats->reset_time);
 	spin_unlock(&stats->lock);
 }
 
 static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
 {
-	return sprintf(buf, "%d\n", policy->stats->total_trans);
+	struct cpufreq_stats *stats = policy->stats;
+
+	if (READ_ONCE(stats->reset_pending))
+		return sprintf(buf, "%d\n", 0);
+	else
+		return sprintf(buf, "%d\n", stats->total_trans);
 }
 cpufreq_freq_attr_ro(total_trans);
 
 static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
 {
 	struct cpufreq_stats *stats = policy->stats;
+	bool pending = READ_ONCE(stats->reset_pending);
+	unsigned long long time;
 	ssize_t len = 0;
 	int i;
 
 	if (policy->fast_switch_enabled)
 		return 0;
 
-	spin_lock(&stats->lock);
-	cpufreq_stats_update(stats);
-	spin_unlock(&stats->lock);
-
 	for (i = 0; i < stats->state_num; i++) {
+		if (pending) {
+			if (i == stats->last_index)
+				time = get_jiffies_64() - stats->reset_time;
+			else
+				time = 0;
+		} else {
+			time = stats->time_in_state[i];
+			if (i == stats->last_index)
+				time += get_jiffies_64() - stats->last_time;
+		}
+
 		len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i],
-			(unsigned long long)
-			jiffies_64_to_clock_t(stats->time_in_state[i]));
+			       jiffies_64_to_clock_t(time));
 	}
 	return len;
 }
 cpufreq_freq_attr_ro(time_in_state);
 
+/* We don't care what is written to the attribute */
 static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
 			   size_t count)
 {
-	/* We don't care what is written to the attribute. */
-	cpufreq_stats_clear_table(policy->stats);
+	struct cpufreq_stats *stats = policy->stats;
+
+	/*
+	 * Defer resetting of stats to cpufreq_stats_record_transition() to
+	 * avoid races.
+	 */
+	WRITE_ONCE(stats->reset_pending, 1);
+	stats->reset_time = get_jiffies_64();
+
 	return count;
 }
 cpufreq_freq_attr_wo(reset);
@@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset);
 static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 {
 	struct cpufreq_stats *stats = policy->stats;
+	bool pending = READ_ONCE(stats->reset_pending);
 	ssize_t len = 0;
-	int i, j;
+	int i, j, count;
 
 	if (policy->fast_switch_enabled)
 		return 0;
@@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 		for (j = 0; j < stats->state_num; j++) {
 			if (len >= PAGE_SIZE)
 				break;
-			len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ",
-					stats->trans_table[i*stats->max_state+j]);
+
+			if (pending)
+				count = 0;
+			else
+				count = stats->trans_table[i * stats->max_state + j];
+
+			len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count);
 		}
 		if (len >= PAGE_SIZE)
 			break;
@@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
 	struct cpufreq_stats *stats = policy->stats;
 	int old_index, new_index;
 
-	if (!stats) {
-		pr_debug("%s: No stats found\n", __func__);
+	if (!stats)
 		return;
-	}
+
+	if (READ_ONCE(stats->reset_pending))
+		cpufreq_stats_reset_table(stats);
 
 	old_index = stats->last_index;
 	new_index = freq_table_get_index(stats, new_freq);
@@ -241,7 +278,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
 		return;
 
 	spin_lock(&stats->lock);
-	cpufreq_stats_update(stats);
+	cpufreq_stats_update(stats, stats->last_time);
 
 	stats->last_index = new_index;
 	stats->trans_table[old_index * stats->max_state + new_index]++;
-- 
2.25.0.rc1.19.g042ed3e048af


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH V2 2/4] cpufreq: stats: Remove locking
  2020-09-16  6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar
  2020-09-16  6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar
@ 2020-09-16  6:45 ` Viresh Kumar
  2020-09-16  6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar
  2020-09-16  6:45 ` [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core Viresh Kumar
  3 siblings, 0 replies; 23+ messages in thread
From: Viresh Kumar @ 2020-09-16  6:45 UTC (permalink / raw)
  To: Rafael Wysocki, Viresh Kumar
  Cc: linux-pm, Vincent Guittot, Lukasz Luba, cristian.marussi,
	sudeep.holla, linux-kernel

The locking isn't required anymore as stats can get updated only from
one place at a time. Remove it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq_stats.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 3e7eee29ee86..314fa1d506d0 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -19,7 +19,6 @@ struct cpufreq_stats {
 	unsigned int state_num;
 	unsigned int last_index;
 	u64 *time_in_state;
-	spinlock_t lock;
 	unsigned int *freq_table;
 	unsigned int *trans_table;
 
@@ -41,7 +40,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
 {
 	unsigned int count = stats->max_state;
 
-	spin_lock(&stats->lock);
 	memset(stats->time_in_state, 0, count * sizeof(u64));
 	memset(stats->trans_table, 0, count * count * sizeof(int));
 	stats->last_time = get_jiffies_64();
@@ -50,7 +48,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
 	/* Adjust for the time elapsed since reset was requested */
 	WRITE_ONCE(stats->reset_pending, 0);
 	cpufreq_stats_update(stats, stats->reset_time);
-	spin_unlock(&stats->lock);
 }
 
 static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
@@ -244,7 +241,6 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy)
 	stats->state_num = i;
 	stats->last_time = get_jiffies_64();
 	stats->last_index = freq_table_get_index(stats, policy->cur);
-	spin_lock_init(&stats->lock);
 
 	policy->stats = stats;
 	ret = sysfs_create_group(&policy->kobj, &stats_attr_group);
@@ -277,11 +273,9 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
 	if (old_index == -1 || new_index == -1 || old_index == new_index)
 		return;
 
-	spin_lock(&stats->lock);
 	cpufreq_stats_update(stats, stats->last_time);
 
 	stats->last_index = new_index;
 	stats->trans_table[old_index * stats->max_state + new_index]++;
 	stats->total_trans++;
-	spin_unlock(&stats->lock);
 }
-- 
2.25.0.rc1.19.g042ed3e048af


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well
  2020-09-16  6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar
  2020-09-16  6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar
  2020-09-16  6:45 ` [PATCH V2 2/4] cpufreq: stats: Remove locking Viresh Kumar
@ 2020-09-16  6:45 ` Viresh Kumar
  2020-09-23 15:14   ` Rafael J. Wysocki
  2020-09-16  6:45 ` [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core Viresh Kumar
  3 siblings, 1 reply; 23+ messages in thread
From: Viresh Kumar @ 2020-09-16  6:45 UTC (permalink / raw)
  To: Rafael Wysocki, Viresh Kumar
  Cc: linux-pm, Vincent Guittot, Lukasz Luba, cristian.marussi,
	sudeep.holla, linux-kernel

Now that all the blockers are gone for enabling stats in fast-switching
case, enable it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c       | 6 +++++-
 drivers/cpufreq/cpufreq_stats.c | 6 ------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 47aa90f9a7c2..d5fe64e96be9 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2057,8 +2057,12 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
 					unsigned int target_freq)
 {
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
+	target_freq = cpufreq_driver->fast_switch(policy, target_freq);
 
-	return cpufreq_driver->fast_switch(policy, target_freq);
+	if (target_freq)
+		cpufreq_stats_record_transition(policy, target_freq);
+
+	return target_freq;
 }
 EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
 
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 314fa1d506d0..daea17f0c36c 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -69,9 +69,6 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
 	ssize_t len = 0;
 	int i;
 
-	if (policy->fast_switch_enabled)
-		return 0;
-
 	for (i = 0; i < stats->state_num; i++) {
 		if (pending) {
 			if (i == stats->last_index)
@@ -115,9 +112,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 	ssize_t len = 0;
 	int i, j, count;
 
-	if (policy->fast_switch_enabled)
-		return 0;
-
 	len += scnprintf(buf + len, PAGE_SIZE - len, "   From  :    To\n");
 	len += scnprintf(buf + len, PAGE_SIZE - len, "         : ");
 	for (i = 0; i < stats->state_num; i++) {
-- 
2.25.0.rc1.19.g042ed3e048af


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core
  2020-09-16  6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar
                   ` (2 preceding siblings ...)
  2020-09-16  6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar
@ 2020-09-16  6:45 ` Viresh Kumar
  3 siblings, 0 replies; 23+ messages in thread
From: Viresh Kumar @ 2020-09-16  6:45 UTC (permalink / raw)
  To: Rafael Wysocki, Viresh Kumar, Ingo Molnar, Peter Zijlstra,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman
  Cc: linux-pm, Lukasz Luba, cristian.marussi, sudeep.holla, linux-kernel

The cpufreq core handles the updates to policy->cur and recording of
cpufreq trace events for all the governors except schedutil's fast
switch case.

Move that as well to cpufreq core for consistency and readability.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c        | 14 ++++++++++++--
 kernel/sched/cpufreq_schedutil.c | 12 +-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d5fe64e96be9..bc930f6ecff6 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2056,11 +2056,21 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier);
 unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
 					unsigned int target_freq)
 {
+	int cpu;
+
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
 	target_freq = cpufreq_driver->fast_switch(policy, target_freq);
 
-	if (target_freq)
-		cpufreq_stats_record_transition(policy, target_freq);
+	if (!target_freq)
+		return 0;
+
+	policy->cur = target_freq;
+	cpufreq_stats_record_transition(policy, target_freq);
+
+	if (trace_cpu_frequency_enabled()) {
+		for_each_cpu(cpu, policy->cpus)
+			trace_cpu_frequency(target_freq, cpu);
+	}
 
 	return target_freq;
 }
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e39008242cf4..28f6d1ad608b 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -115,21 +115,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
 			      unsigned int next_freq)
 {
 	struct cpufreq_policy *policy = sg_policy->policy;
-	int cpu;
 
 	if (!sugov_update_next_freq(sg_policy, time, next_freq))
 		return;
 
-	next_freq = cpufreq_driver_fast_switch(policy, next_freq);
-	if (!next_freq)
-		return;
-
-	policy->cur = next_freq;
-
-	if (trace_cpu_frequency_enabled()) {
-		for_each_cpu(cpu, policy->cpus)
-			trace_cpu_frequency(next_freq, cpu);
-	}
+	cpufreq_driver_fast_switch(policy, next_freq);
 }
 
 static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
-- 
2.25.0.rc1.19.g042ed3e048af


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-16  6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar
@ 2020-09-23 13:48   ` Rafael J. Wysocki
  2020-09-24  9:25     ` Lukasz Luba
  2020-09-24 13:15     ` Viresh Kumar
  2020-09-25  8:21   ` Lukasz Luba
  1 sibling, 2 replies; 23+ messages in thread
From: Rafael J. Wysocki @ 2020-09-23 13:48 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List

On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> In order to prepare for lock-less stats update, add support to defer any
> updates to it until cpufreq_stats_record_transition() is called.

This is a bit devoid of details.

I guess you mean reset in particular, but that's not clear from the above.

Also, it would be useful to describe the design somewhat.

> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
>  1 file changed, 56 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
> index 94d959a8e954..3e7eee29ee86 100644
> --- a/drivers/cpufreq/cpufreq_stats.c
> +++ b/drivers/cpufreq/cpufreq_stats.c
> @@ -22,17 +22,22 @@ struct cpufreq_stats {
>         spinlock_t lock;
>         unsigned int *freq_table;
>         unsigned int *trans_table;
> +
> +       /* Deferred reset */
> +       unsigned int reset_pending;
> +       unsigned long long reset_time;
>  };
>
> -static void cpufreq_stats_update(struct cpufreq_stats *stats)
> +static void cpufreq_stats_update(struct cpufreq_stats *stats,
> +                                unsigned long long time)
>  {
>         unsigned long long cur_time = get_jiffies_64();
>
> -       stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
> +       stats->time_in_state[stats->last_index] += cur_time - time;
>         stats->last_time = cur_time;
>  }
>
> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
>  {
>         unsigned int count = stats->max_state;
>
> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
>         memset(stats->trans_table, 0, count * count * sizeof(int));
>         stats->last_time = get_jiffies_64();
>         stats->total_trans = 0;
> +
> +       /* Adjust for the time elapsed since reset was requested */
> +       WRITE_ONCE(stats->reset_pending, 0);

What if this runs in parallel with store_reset()?

The latter may update reset_pending to 1 before the below runs.
Conversely, this may clear reset_pending right after store_reset() has
set it to 1, but before it manages to set reset_time.  Is that not a
problem?

> +       cpufreq_stats_update(stats, stats->reset_time);
>
>         spin_unlock(&stats->lock);
>  }
>
>  static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
>  {
> -       return sprintf(buf, "%d\n", policy->stats->total_trans);
> +       struct cpufreq_stats *stats = policy->stats;
> +
> +       if (READ_ONCE(stats->reset_pending))
> +               return sprintf(buf, "%d\n", 0);
> +       else
> +               return sprintf(buf, "%d\n", stats->total_trans);
>  }
>  cpufreq_freq_attr_ro(total_trans);
>
>  static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
>  {
>         struct cpufreq_stats *stats = policy->stats;
> +       bool pending = READ_ONCE(stats->reset_pending);
> +       unsigned long long time;
>         ssize_t len = 0;
>         int i;
>
>         if (policy->fast_switch_enabled)
>                 return 0;
>
> -       spin_lock(&stats->lock);
> -       cpufreq_stats_update(stats);
> -       spin_unlock(&stats->lock);
> -
>         for (i = 0; i < stats->state_num; i++) {
> +               if (pending) {
> +                       if (i == stats->last_index)
> +                               time = get_jiffies_64() - stats->reset_time;

What if this runs in parallel with store_reset() and reads reset_time
before the latter manages to update it?

> +                       else
> +                               time = 0;
> +               } else {
> +                       time = stats->time_in_state[i];
> +                       if (i == stats->last_index)
> +                               time += get_jiffies_64() - stats->last_time;
> +               }
> +
>                 len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i],
> -                       (unsigned long long)
> -                       jiffies_64_to_clock_t(stats->time_in_state[i]));
> +                              jiffies_64_to_clock_t(time));
>         }
>         return len;
>  }
>  cpufreq_freq_attr_ro(time_in_state);
>
> +/* We don't care what is written to the attribute */
>  static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
>                            size_t count)
>  {
> -       /* We don't care what is written to the attribute. */
> -       cpufreq_stats_clear_table(policy->stats);
> +       struct cpufreq_stats *stats = policy->stats;
> +
> +       /*
> +        * Defer resetting of stats to cpufreq_stats_record_transition() to
> +        * avoid races.
> +        */
> +       WRITE_ONCE(stats->reset_pending, 1);
> +       stats->reset_time = get_jiffies_64();

AFAICS, there is nothing to ensure that reset_time will be updated in
one go and even to ensure that it won't be partially updated before
setting reset_pending.

This should at least be addressed in a comment to explain why it is
not a problem.

> +
>         return count;
>  }
>  cpufreq_freq_attr_wo(reset);
> @@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset);
>  static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
>  {
>         struct cpufreq_stats *stats = policy->stats;
> +       bool pending = READ_ONCE(stats->reset_pending);
>         ssize_t len = 0;
> -       int i, j;
> +       int i, j, count;
>
>         if (policy->fast_switch_enabled)
>                 return 0;
> @@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
>                 for (j = 0; j < stats->state_num; j++) {
>                         if (len >= PAGE_SIZE)
>                                 break;
> -                       len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ",
> -                                       stats->trans_table[i*stats->max_state+j]);
> +
> +                       if (pending)
> +                               count = 0;
> +                       else
> +                               count = stats->trans_table[i * stats->max_state + j];
> +
> +                       len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count);
>                 }
>                 if (len >= PAGE_SIZE)
>                         break;
> @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
>         struct cpufreq_stats *stats = policy->stats;
>         int old_index, new_index;
>
> -       if (!stats) {
> -               pr_debug("%s: No stats found\n", __func__);
> +       if (!stats)
>                 return;
> -       }
> +
> +       if (READ_ONCE(stats->reset_pending))
> +               cpufreq_stats_reset_table(stats);

This is a bit confusing, because cpufreq_stats_reset_table() calls
cpufreq_stats_update() and passes reset_time to it, but it is called
again below with last_time as the second arg.

It is not particularly clear to me why this needs to be done this way.

>
>         old_index = stats->last_index;
>         new_index = freq_table_get_index(stats, new_freq);
> @@ -241,7 +278,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
>                 return;
>
>         spin_lock(&stats->lock);
> -       cpufreq_stats_update(stats);
> +       cpufreq_stats_update(stats, stats->last_time);
>
>         stats->last_index = new_index;
>         stats->trans_table[old_index * stats->max_state + new_index]++;
> --
> 2.25.0.rc1.19.g042ed3e048af
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well
  2020-09-16  6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar
@ 2020-09-23 15:14   ` Rafael J. Wysocki
  2020-09-23 15:17     ` Rafael J. Wysocki
  0 siblings, 1 reply; 23+ messages in thread
From: Rafael J. Wysocki @ 2020-09-23 15:14 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List

On Wed, Sep 16, 2020 at 8:46 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> Now that all the blockers are gone for enabling stats in fast-switching
> case, enable it.
>
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  drivers/cpufreq/cpufreq.c       | 6 +++++-
>  drivers/cpufreq/cpufreq_stats.c | 6 ------
>  2 files changed, 5 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 47aa90f9a7c2..d5fe64e96be9 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2057,8 +2057,12 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
>                                         unsigned int target_freq)
>  {
>         target_freq = clamp_val(target_freq, policy->min, policy->max);
> +       target_freq = cpufreq_driver->fast_switch(policy, target_freq);
>
> -       return cpufreq_driver->fast_switch(policy, target_freq);
> +       if (target_freq)
> +               cpufreq_stats_record_transition(policy, target_freq);

So this adds two extra branches in the scheduler path for the cases
when the stats are not used at all which seems avoidable to some
extent.

Can we check policy->stats upfront here and bail out right away if it
is not set, for example?

> +
> +       return target_freq;
>  }
>  EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
>
> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
> index 314fa1d506d0..daea17f0c36c 100644
> --- a/drivers/cpufreq/cpufreq_stats.c
> +++ b/drivers/cpufreq/cpufreq_stats.c
> @@ -69,9 +69,6 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
>         ssize_t len = 0;
>         int i;
>
> -       if (policy->fast_switch_enabled)
> -               return 0;
> -
>         for (i = 0; i < stats->state_num; i++) {
>                 if (pending) {
>                         if (i == stats->last_index)
> @@ -115,9 +112,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
>         ssize_t len = 0;
>         int i, j, count;
>
> -       if (policy->fast_switch_enabled)
> -               return 0;
> -
>         len += scnprintf(buf + len, PAGE_SIZE - len, "   From  :    To\n");
>         len += scnprintf(buf + len, PAGE_SIZE - len, "         : ");
>         for (i = 0; i < stats->state_num; i++) {
> --
> 2.25.0.rc1.19.g042ed3e048af
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well
  2020-09-23 15:14   ` Rafael J. Wysocki
@ 2020-09-23 15:17     ` Rafael J. Wysocki
  0 siblings, 0 replies; 23+ messages in thread
From: Rafael J. Wysocki @ 2020-09-23 15:17 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List

On Wed, Sep 23, 2020 at 5:14 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Wed, Sep 16, 2020 at 8:46 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >
> > Now that all the blockers are gone for enabling stats in fast-switching
> > case, enable it.
> >
> > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> > ---
> >  drivers/cpufreq/cpufreq.c       | 6 +++++-
> >  drivers/cpufreq/cpufreq_stats.c | 6 ------
> >  2 files changed, 5 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 47aa90f9a7c2..d5fe64e96be9 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2057,8 +2057,12 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
> >                                         unsigned int target_freq)
> >  {
> >         target_freq = clamp_val(target_freq, policy->min, policy->max);
> > +       target_freq = cpufreq_driver->fast_switch(policy, target_freq);
> >
> > -       return cpufreq_driver->fast_switch(policy, target_freq);
> > +       if (target_freq)
> > +               cpufreq_stats_record_transition(policy, target_freq);
>
> So this adds two extra branches in the scheduler path for the cases
> when the stats are not used at all which seems avoidable to some
> extent.
>
> Can we check policy->stats upfront here and bail out right away if it
> is not set, for example?

Well, scratch this, the next patch fixes it up.

Cheers!

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-23 13:48   ` Rafael J. Wysocki
@ 2020-09-24  9:25     ` Lukasz Luba
  2020-09-24 10:24       ` Rafael J. Wysocki
  2020-09-24 13:15     ` Viresh Kumar
  1 sibling, 1 reply; 23+ messages in thread
From: Lukasz Luba @ 2020-09-24  9:25 UTC (permalink / raw)
  To: Rafael J. Wysocki, Viresh Kumar
  Cc: Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi,
	Sudeep Holla, Linux Kernel Mailing List

Hi Rafael,

On 9/23/20 2:48 PM, Rafael J. Wysocki wrote:
> On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>>
>> In order to prepare for lock-less stats update, add support to defer any
>> updates to it until cpufreq_stats_record_transition() is called.
> 
> This is a bit devoid of details.
> 
> I guess you mean reset in particular, but that's not clear from the above.
> 
> Also, it would be useful to describe the design somewhat.
> 
>> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
>> ---
>>   drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
>>   1 file changed, 56 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
>> index 94d959a8e954..3e7eee29ee86 100644
>> --- a/drivers/cpufreq/cpufreq_stats.c
>> +++ b/drivers/cpufreq/cpufreq_stats.c
>> @@ -22,17 +22,22 @@ struct cpufreq_stats {
>>          spinlock_t lock;
>>          unsigned int *freq_table;
>>          unsigned int *trans_table;
>> +
>> +       /* Deferred reset */
>> +       unsigned int reset_pending;
>> +       unsigned long long reset_time;
>>   };
>>
>> -static void cpufreq_stats_update(struct cpufreq_stats *stats)
>> +static void cpufreq_stats_update(struct cpufreq_stats *stats,
>> +                                unsigned long long time)
>>   {
>>          unsigned long long cur_time = get_jiffies_64();
>>
>> -       stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
>> +       stats->time_in_state[stats->last_index] += cur_time - time;
>>          stats->last_time = cur_time;
>>   }
>>
>> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
>> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
>>   {
>>          unsigned int count = stats->max_state;
>>
>> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
>>          memset(stats->trans_table, 0, count * count * sizeof(int));
>>          stats->last_time = get_jiffies_64();
>>          stats->total_trans = 0;
>> +
>> +       /* Adjust for the time elapsed since reset was requested */
>> +       WRITE_ONCE(stats->reset_pending, 0);
> 
> What if this runs in parallel with store_reset()?
> 
> The latter may update reset_pending to 1 before the below runs.
> Conversely, this may clear reset_pending right after store_reset() has
> set it to 1, but before it manages to set reset_time.  Is that not a
> problem?

I wonder if we could just drop the reset feature. Is there a tool
which uses this file? The 'reset' sysfs would probably have to stay
forever, but an empty implementation is not an option?
The documentation states:
'This can be useful for evaluating system behaviour under different
governors without the need for a reboot.'
With the scenario of fast-switch this resetting complicates the
implementation and the justification of having it just for experiments
avoiding reboot is IMO weak. The real production code would have to pay
extra cycles every time. Also, we would probably not experiment with
cpufreq different governors, since the SchedUtil is considered the best
option.

Regards,
Lukasz

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-24  9:25     ` Lukasz Luba
@ 2020-09-24 10:24       ` Rafael J. Wysocki
  2020-09-24 11:00         ` Lukasz Luba
  0 siblings, 1 reply; 23+ messages in thread
From: Rafael J. Wysocki @ 2020-09-24 10:24 UTC (permalink / raw)
  To: Lukasz Luba
  Cc: Rafael J. Wysocki, Viresh Kumar, Rafael Wysocki, Linux PM,
	Vincent Guittot, cristian.marussi, Sudeep Holla,
	Linux Kernel Mailing List

On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
>
> Hi Rafael,
>
> On 9/23/20 2:48 PM, Rafael J. Wysocki wrote:
> > On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >>
> >> In order to prepare for lock-less stats update, add support to defer any
> >> updates to it until cpufreq_stats_record_transition() is called.
> >
> > This is a bit devoid of details.
> >
> > I guess you mean reset in particular, but that's not clear from the above.
> >
> > Also, it would be useful to describe the design somewhat.
> >
> >> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> >> ---
> >>   drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
> >>   1 file changed, 56 insertions(+), 19 deletions(-)
> >>
> >> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
> >> index 94d959a8e954..3e7eee29ee86 100644
> >> --- a/drivers/cpufreq/cpufreq_stats.c
> >> +++ b/drivers/cpufreq/cpufreq_stats.c
> >> @@ -22,17 +22,22 @@ struct cpufreq_stats {
> >>          spinlock_t lock;
> >>          unsigned int *freq_table;
> >>          unsigned int *trans_table;
> >> +
> >> +       /* Deferred reset */
> >> +       unsigned int reset_pending;
> >> +       unsigned long long reset_time;
> >>   };
> >>
> >> -static void cpufreq_stats_update(struct cpufreq_stats *stats)
> >> +static void cpufreq_stats_update(struct cpufreq_stats *stats,
> >> +                                unsigned long long time)
> >>   {
> >>          unsigned long long cur_time = get_jiffies_64();
> >>
> >> -       stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
> >> +       stats->time_in_state[stats->last_index] += cur_time - time;
> >>          stats->last_time = cur_time;
> >>   }
> >>
> >> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> >> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
> >>   {
> >>          unsigned int count = stats->max_state;
> >>
> >> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> >>          memset(stats->trans_table, 0, count * count * sizeof(int));
> >>          stats->last_time = get_jiffies_64();
> >>          stats->total_trans = 0;
> >> +
> >> +       /* Adjust for the time elapsed since reset was requested */
> >> +       WRITE_ONCE(stats->reset_pending, 0);
> >
> > What if this runs in parallel with store_reset()?
> >
> > The latter may update reset_pending to 1 before the below runs.
> > Conversely, this may clear reset_pending right after store_reset() has
> > set it to 1, but before it manages to set reset_time.  Is that not a
> > problem?
>
> I wonder if we could just drop the reset feature. Is there a tool
> which uses this file? The 'reset' sysfs would probably have to stay
> forever, but an empty implementation is not an option?

Well, having an empty sysfs attr would be a bit ugly, but the
implementation of it could be simplified.

> The documentation states:
> 'This can be useful for evaluating system behaviour under different
> governors without the need for a reboot.'
> With the scenario of fast-switch this resetting complicates the
> implementation and the justification of having it just for experiments
> avoiding reboot is IMO weak. The real production code would have to pay
> extra cycles every time. Also, we would probably not experiment with
> cpufreq different governors, since the SchedUtil is considered the best
> option.

It would still be good to have a way to test it against the other
available options, though.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-24 10:24       ` Rafael J. Wysocki
@ 2020-09-24 11:00         ` Lukasz Luba
  2020-09-24 11:07           ` Rafael J. Wysocki
  0 siblings, 1 reply; 23+ messages in thread
From: Lukasz Luba @ 2020-09-24 11:00 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Viresh Kumar, Rafael Wysocki, Linux PM, Vincent Guittot,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List



On 9/24/20 11:24 AM, Rafael J. Wysocki wrote:
> On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
>>
>> Hi Rafael,
>>
>> On 9/23/20 2:48 PM, Rafael J. Wysocki wrote:
>>> On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>>>>
>>>> In order to prepare for lock-less stats update, add support to defer any
>>>> updates to it until cpufreq_stats_record_transition() is called.
>>>
>>> This is a bit devoid of details.
>>>
>>> I guess you mean reset in particular, but that's not clear from the above.
>>>
>>> Also, it would be useful to describe the design somewhat.
>>>
>>>> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
>>>> ---
>>>>    drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
>>>>    1 file changed, 56 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
>>>> index 94d959a8e954..3e7eee29ee86 100644
>>>> --- a/drivers/cpufreq/cpufreq_stats.c
>>>> +++ b/drivers/cpufreq/cpufreq_stats.c
>>>> @@ -22,17 +22,22 @@ struct cpufreq_stats {
>>>>           spinlock_t lock;
>>>>           unsigned int *freq_table;
>>>>           unsigned int *trans_table;
>>>> +
>>>> +       /* Deferred reset */
>>>> +       unsigned int reset_pending;
>>>> +       unsigned long long reset_time;
>>>>    };
>>>>
>>>> -static void cpufreq_stats_update(struct cpufreq_stats *stats)
>>>> +static void cpufreq_stats_update(struct cpufreq_stats *stats,
>>>> +                                unsigned long long time)
>>>>    {
>>>>           unsigned long long cur_time = get_jiffies_64();
>>>>
>>>> -       stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
>>>> +       stats->time_in_state[stats->last_index] += cur_time - time;
>>>>           stats->last_time = cur_time;
>>>>    }
>>>>
>>>> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
>>>> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
>>>>    {
>>>>           unsigned int count = stats->max_state;
>>>>
>>>> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
>>>>           memset(stats->trans_table, 0, count * count * sizeof(int));
>>>>           stats->last_time = get_jiffies_64();
>>>>           stats->total_trans = 0;
>>>> +
>>>> +       /* Adjust for the time elapsed since reset was requested */
>>>> +       WRITE_ONCE(stats->reset_pending, 0);
>>>
>>> What if this runs in parallel with store_reset()?
>>>
>>> The latter may update reset_pending to 1 before the below runs.
>>> Conversely, this may clear reset_pending right after store_reset() has
>>> set it to 1, but before it manages to set reset_time.  Is that not a
>>> problem?
>>
>> I wonder if we could just drop the reset feature. Is there a tool
>> which uses this file? The 'reset' sysfs would probably have to stay
>> forever, but an empty implementation is not an option?
> 
> Well, having an empty sysfs attr would be a bit ugly, but the
> implementation of it could be simplified.
> 
>> The documentation states:
>> 'This can be useful for evaluating system behaviour under different
>> governors without the need for a reboot.'
>> With the scenario of fast-switch this resetting complicates the
>> implementation and the justification of having it just for experiments
>> avoiding reboot is IMO weak. The real production code would have to pay
>> extra cycles every time. Also, we would probably not experiment with
>> cpufreq different governors, since the SchedUtil is considered the best
>> option.
> 
> It would still be good to have a way to test it against the other
> available options, though.
> 

Experimenting with different governors would still be possible, just
the user-space would have to take a snapshot of the stats when switching
to a new governor. Then the values presented in the stats would just
need to be calculated in this user tool against the snapshot.

The resetting is also not that bad, since nowadays more components
maintain some kind of local statistics/history (scheduler, thermal).
I would recommend to reset the whole system and repeat the same tests
with different governor, just to be sure that everything starts from
similar state (utilization, temperature, other devfreq devices
frequencies etc).



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-24 11:00         ` Lukasz Luba
@ 2020-09-24 11:07           ` Rafael J. Wysocki
  2020-09-24 12:39             ` Viresh Kumar
  0 siblings, 1 reply; 23+ messages in thread
From: Rafael J. Wysocki @ 2020-09-24 11:07 UTC (permalink / raw)
  To: Lukasz Luba
  Cc: Rafael J. Wysocki, Viresh Kumar, Rafael Wysocki, Linux PM,
	Vincent Guittot, cristian.marussi, Sudeep Holla,
	Linux Kernel Mailing List

On Thu, Sep 24, 2020 at 1:00 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>
>
>
> On 9/24/20 11:24 AM, Rafael J. Wysocki wrote:
> > On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
> >>
> >> Hi Rafael,
> >>
> >> On 9/23/20 2:48 PM, Rafael J. Wysocki wrote:
> >>> On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >>>>
> >>>> In order to prepare for lock-less stats update, add support to defer any
> >>>> updates to it until cpufreq_stats_record_transition() is called.
> >>>
> >>> This is a bit devoid of details.
> >>>
> >>> I guess you mean reset in particular, but that's not clear from the above.
> >>>
> >>> Also, it would be useful to describe the design somewhat.
> >>>
> >>>> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> >>>> ---
> >>>>    drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
> >>>>    1 file changed, 56 insertions(+), 19 deletions(-)
> >>>>
> >>>> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
> >>>> index 94d959a8e954..3e7eee29ee86 100644
> >>>> --- a/drivers/cpufreq/cpufreq_stats.c
> >>>> +++ b/drivers/cpufreq/cpufreq_stats.c
> >>>> @@ -22,17 +22,22 @@ struct cpufreq_stats {
> >>>>           spinlock_t lock;
> >>>>           unsigned int *freq_table;
> >>>>           unsigned int *trans_table;
> >>>> +
> >>>> +       /* Deferred reset */
> >>>> +       unsigned int reset_pending;
> >>>> +       unsigned long long reset_time;
> >>>>    };
> >>>>
> >>>> -static void cpufreq_stats_update(struct cpufreq_stats *stats)
> >>>> +static void cpufreq_stats_update(struct cpufreq_stats *stats,
> >>>> +                                unsigned long long time)
> >>>>    {
> >>>>           unsigned long long cur_time = get_jiffies_64();
> >>>>
> >>>> -       stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
> >>>> +       stats->time_in_state[stats->last_index] += cur_time - time;
> >>>>           stats->last_time = cur_time;
> >>>>    }
> >>>>
> >>>> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> >>>> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
> >>>>    {
> >>>>           unsigned int count = stats->max_state;
> >>>>
> >>>> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> >>>>           memset(stats->trans_table, 0, count * count * sizeof(int));
> >>>>           stats->last_time = get_jiffies_64();
> >>>>           stats->total_trans = 0;
> >>>> +
> >>>> +       /* Adjust for the time elapsed since reset was requested */
> >>>> +       WRITE_ONCE(stats->reset_pending, 0);
> >>>
> >>> What if this runs in parallel with store_reset()?
> >>>
> >>> The latter may update reset_pending to 1 before the below runs.
> >>> Conversely, this may clear reset_pending right after store_reset() has
> >>> set it to 1, but before it manages to set reset_time.  Is that not a
> >>> problem?
> >>
> >> I wonder if we could just drop the reset feature. Is there a tool
> >> which uses this file? The 'reset' sysfs would probably have to stay
> >> forever, but an empty implementation is not an option?
> >
> > Well, having an empty sysfs attr would be a bit ugly, but the
> > implementation of it could be simplified.
> >
> >> The documentation states:
> >> 'This can be useful for evaluating system behaviour under different
> >> governors without the need for a reboot.'
> >> With the scenario of fast-switch this resetting complicates the
> >> implementation and the justification of having it just for experiments
> >> avoiding reboot is IMO weak. The real production code would have to pay
> >> extra cycles every time. Also, we would probably not experiment with
> >> cpufreq different governors, since the SchedUtil is considered the best
> >> option.
> >
> > It would still be good to have a way to test it against the other
> > available options, though.
> >
>
> Experimenting with different governors would still be possible, just
> the user-space would have to take a snapshot of the stats when switching
> to a new governor. Then the values presented in the stats would just
> need to be calculated in this user tool against the snapshot.
>
> The resetting is also not that bad, since nowadays more components
> maintain some kind of local statistics/history (scheduler, thermal).
> I would recommend to reset the whole system and repeat the same tests
> with different governor, just to be sure that everything starts from
> similar state (utilization, temperature, other devfreq devices
> frequencies etc).

Well, if everyone agrees on removing the reset feature, let's drop the
sysfs attr too, as it would be useless going forward.

Admittedly, I don't have a strong opinion and since intel_pstate
doesn't use a frequency table, this is not relevant for systems using
that driver anyway.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-24 11:07           ` Rafael J. Wysocki
@ 2020-09-24 12:39             ` Viresh Kumar
  2020-09-24 16:10               ` Lukasz Luba
  0 siblings, 1 reply; 23+ messages in thread
From: Viresh Kumar @ 2020-09-24 12:39 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Lukasz Luba, Rafael Wysocki, Linux PM, Vincent Guittot,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List

On 24-09-20, 13:07, Rafael J. Wysocki wrote:
> On Thu, Sep 24, 2020 at 1:00 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
> > On 9/24/20 11:24 AM, Rafael J. Wysocki wrote:
> > > On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote:

> > >> I wonder if we could just drop the reset feature. Is there a tool
> > >> which uses this file? The 'reset' sysfs would probably have to stay
> > >> forever, but an empty implementation is not an option?
> > >
> > > Well, having an empty sysfs attr would be a bit ugly, but the
> > > implementation of it could be simplified.
> > >
> > >> The documentation states:
> > >> 'This can be useful for evaluating system behaviour under different
> > >> governors without the need for a reboot.'
> > >> With the scenario of fast-switch this resetting complicates the
> > >> implementation and the justification of having it just for experiments
> > >> avoiding reboot is IMO weak. The real production code would have to pay
> > >> extra cycles every time. Also, we would probably not experiment with
> > >> cpufreq different governors, since the SchedUtil is considered the best
> > >> option.
> > >
> > > It would still be good to have a way to test it against the other
> > > available options, though.
> > >
> >
> > Experimenting with different governors would still be possible, just
> > the user-space would have to take a snapshot of the stats when switching
> > to a new governor. Then the values presented in the stats would just
> > need to be calculated in this user tool against the snapshot.
> >
> > The resetting is also not that bad, since nowadays more components
> > maintain some kind of local statistics/history (scheduler, thermal).
> > I would recommend to reset the whole system and repeat the same tests
> > with different governor, just to be sure that everything starts from
> > similar state (utilization, temperature, other devfreq devices
> > frequencies etc).
> 
> Well, if everyone agrees on removing the reset feature, let's drop the
> sysfs attr too, as it would be useless going forward.
> 
> Admittedly, I don't have a strong opinion and since intel_pstate
> doesn't use a frequency table, this is not relevant for systems using
> that driver anyway.

I added this file sometime back as it made my life a lot easier while testing
some scheduler related changes and see how they affect cpufreq updates. IMO this
is a useful feature and we don't really need to get rid of it.

Lets see where the discussion goes about the feedback you gave.

-- 
viresh

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-23 13:48   ` Rafael J. Wysocki
  2020-09-24  9:25     ` Lukasz Luba
@ 2020-09-24 13:15     ` Viresh Kumar
  2020-09-25 10:04       ` Rafael J. Wysocki
  1 sibling, 1 reply; 23+ messages in thread
From: Viresh Kumar @ 2020-09-24 13:15 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List

On 23-09-20, 15:48, Rafael J. Wysocki wrote:
> On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >
> > In order to prepare for lock-less stats update, add support to defer any
> > updates to it until cpufreq_stats_record_transition() is called.
> 
> This is a bit devoid of details.
> 
> I guess you mean reset in particular, but that's not clear from the above.

We used to update the stats from two places earlier, reset and
show_time_in_state, the later got removed with this patch.

> Also, it would be useful to describe the design somewhat.

Okay. I will work on it.

> > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> > ---
> >  drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
> >  1 file changed, 56 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
> > index 94d959a8e954..3e7eee29ee86 100644
> > --- a/drivers/cpufreq/cpufreq_stats.c
> > +++ b/drivers/cpufreq/cpufreq_stats.c
> > @@ -22,17 +22,22 @@ struct cpufreq_stats {
> >         spinlock_t lock;
> >         unsigned int *freq_table;
> >         unsigned int *trans_table;
> > +
> > +       /* Deferred reset */
> > +       unsigned int reset_pending;
> > +       unsigned long long reset_time;
> >  };
> >
> > -static void cpufreq_stats_update(struct cpufreq_stats *stats)
> > +static void cpufreq_stats_update(struct cpufreq_stats *stats,
> > +                                unsigned long long time)
> >  {
> >         unsigned long long cur_time = get_jiffies_64();
> >
> > -       stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
> > +       stats->time_in_state[stats->last_index] += cur_time - time;
> >         stats->last_time = cur_time;
> >  }
> >
> > -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> > +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
> >  {
> >         unsigned int count = stats->max_state;
> >
> > @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> >         memset(stats->trans_table, 0, count * count * sizeof(int));
> >         stats->last_time = get_jiffies_64();
> >         stats->total_trans = 0;
> > +
> > +       /* Adjust for the time elapsed since reset was requested */
> > +       WRITE_ONCE(stats->reset_pending, 0);
> 
> What if this runs in parallel with store_reset()?
> 
> The latter may update reset_pending to 1 before the below runs.
> Conversely, this may clear reset_pending right after store_reset() has
> set it to 1, but before it manages to set reset_time.  Is that not a
> problem?

What I tried to do here with reset_time is to capture the delta time-in-state
that has elapsed since the last time reset was called and reset was actually
performed, so we keep showing it in stats.

In the first race you mentioned, whatever update we make here won't matter much
as reset-pending will be 1 and so below stats will not be used anywhere.

In the second race, we may get the delta time-in-state wrong, since this is just
stats it shouldn't matter much. But still I think we need to fix the way we do
reset as you pointed out. More later.

> > +       cpufreq_stats_update(stats, stats->reset_time);
> >
> >         spin_unlock(&stats->lock);
> >  }
> >
> >  static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
> >  {
> > -       return sprintf(buf, "%d\n", policy->stats->total_trans);
> > +       struct cpufreq_stats *stats = policy->stats;
> > +
> > +       if (READ_ONCE(stats->reset_pending))
> > +               return sprintf(buf, "%d\n", 0);
> > +       else
> > +               return sprintf(buf, "%d\n", stats->total_trans);
> >  }
> >  cpufreq_freq_attr_ro(total_trans);
> >
> >  static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
> >  {
> >         struct cpufreq_stats *stats = policy->stats;
> > +       bool pending = READ_ONCE(stats->reset_pending);
> > +       unsigned long long time;
> >         ssize_t len = 0;
> >         int i;
> >
> >         if (policy->fast_switch_enabled)
> >                 return 0;
> >
> > -       spin_lock(&stats->lock);
> > -       cpufreq_stats_update(stats);
> > -       spin_unlock(&stats->lock);
> > -
> >         for (i = 0; i < stats->state_num; i++) {
> > +               if (pending) {
> > +                       if (i == stats->last_index)
> > +                               time = get_jiffies_64() - stats->reset_time;
> 
> What if this runs in parallel with store_reset() and reads reset_time
> before the latter manages to update it?

We should update reset-time first I think. More later.

> > +                       else
> > +                               time = 0;
> > +               } else {
> > +                       time = stats->time_in_state[i];
> > +                       if (i == stats->last_index)
> > +                               time += get_jiffies_64() - stats->last_time;
> > +               }
> > +
> >                 len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i],
> > -                       (unsigned long long)
> > -                       jiffies_64_to_clock_t(stats->time_in_state[i]));
> > +                              jiffies_64_to_clock_t(time));
> >         }
> >         return len;
> >  }
> >  cpufreq_freq_attr_ro(time_in_state);
> >
> > +/* We don't care what is written to the attribute */
> >  static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
> >                            size_t count)
> >  {
> > -       /* We don't care what is written to the attribute. */
> > -       cpufreq_stats_clear_table(policy->stats);
> > +       struct cpufreq_stats *stats = policy->stats;
> > +
> > +       /*
> > +        * Defer resetting of stats to cpufreq_stats_record_transition() to
> > +        * avoid races.
> > +        */
> > +       WRITE_ONCE(stats->reset_pending, 1);
> > +       stats->reset_time = get_jiffies_64();
> 
> AFAICS, there is nothing to ensure that reset_time will be updated in
> one go and even to ensure that it won't be partially updated before
> setting reset_pending.
> 
> This should at least be addressed in a comment to explain why it is
> not a problem.

I propose something like this for it:

       WRITE_ONCE(stats->reset_time, get_jiffies_64());
       WRITE_ONCE(stats->reset_pending, 1);

Whatever race happens here, the worst would be like getting time-in-state for
one of the frequencies wrong only once and I don't think that should be a
real problem as this is just stats and won't make the kernel behave badly.

> > +
> >         return count;
> >  }
> >  cpufreq_freq_attr_wo(reset);
> > @@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset);
> >  static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
> >  {
> >         struct cpufreq_stats *stats = policy->stats;
> > +       bool pending = READ_ONCE(stats->reset_pending);
> >         ssize_t len = 0;
> > -       int i, j;
> > +       int i, j, count;
> >
> >         if (policy->fast_switch_enabled)
> >                 return 0;
> > @@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
> >                 for (j = 0; j < stats->state_num; j++) {
> >                         if (len >= PAGE_SIZE)
> >                                 break;
> > -                       len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ",
> > -                                       stats->trans_table[i*stats->max_state+j]);
> > +
> > +                       if (pending)
> > +                               count = 0;
> > +                       else
> > +                               count = stats->trans_table[i * stats->max_state + j];
> > +
> > +                       len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count);
> >                 }
> >                 if (len >= PAGE_SIZE)
> >                         break;
> > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
> >         struct cpufreq_stats *stats = policy->stats;
> >         int old_index, new_index;
> >
> > -       if (!stats) {
> > -               pr_debug("%s: No stats found\n", __func__);
> > +       if (!stats)
> >                 return;
> > -       }
> > +
> > +       if (READ_ONCE(stats->reset_pending))
> > +               cpufreq_stats_reset_table(stats);
> 
> This is a bit confusing, because cpufreq_stats_reset_table() calls
> cpufreq_stats_update() and passes reset_time to it, but it is called
> again below with last_time as the second arg.
> 
> It is not particularly clear to me why this needs to be done this way.

Because we need to account for the elapsed time-in-state, for the currently
running frequency.

When reset-stats is called, we note down the reset-time, then if reset was
pending then we call reset-table which calls cpufreq_stats_update(reset_time),
and so we account for the elapsed time in current state, where we also update
last-time.

The second call here won't update much in case reset was pending, but will
update normally otherwise.

-- 
viresh

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-24 12:39             ` Viresh Kumar
@ 2020-09-24 16:10               ` Lukasz Luba
  2020-09-25  6:09                 ` Viresh Kumar
  0 siblings, 1 reply; 23+ messages in thread
From: Lukasz Luba @ 2020-09-24 16:10 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List



On 9/24/20 1:39 PM, Viresh Kumar wrote:
> On 24-09-20, 13:07, Rafael J. Wysocki wrote:
>> On Thu, Sep 24, 2020 at 1:00 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>>> On 9/24/20 11:24 AM, Rafael J. Wysocki wrote:
>>>> On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote:
> 
>>>>> I wonder if we could just drop the reset feature. Is there a tool
>>>>> which uses this file? The 'reset' sysfs would probably have to stay
>>>>> forever, but an empty implementation is not an option?
>>>>
>>>> Well, having an empty sysfs attr would be a bit ugly, but the
>>>> implementation of it could be simplified.
>>>>
>>>>> The documentation states:
>>>>> 'This can be useful for evaluating system behaviour under different
>>>>> governors without the need for a reboot.'
>>>>> With the scenario of fast-switch this resetting complicates the
>>>>> implementation and the justification of having it just for experiments
>>>>> avoiding reboot is IMO weak. The real production code would have to pay
>>>>> extra cycles every time. Also, we would probably not experiment with
>>>>> cpufreq different governors, since the SchedUtil is considered the best
>>>>> option.
>>>>
>>>> It would still be good to have a way to test it against the other
>>>> available options, though.
>>>>
>>>
>>> Experimenting with different governors would still be possible, just
>>> the user-space would have to take a snapshot of the stats when switching
>>> to a new governor. Then the values presented in the stats would just
>>> need to be calculated in this user tool against the snapshot.
>>>
>>> The resetting is also not that bad, since nowadays more components
>>> maintain some kind of local statistics/history (scheduler, thermal).
>>> I would recommend to reset the whole system and repeat the same tests
>>> with different governor, just to be sure that everything starts from
>>> similar state (utilization, temperature, other devfreq devices
>>> frequencies etc).
>>
>> Well, if everyone agrees on removing the reset feature, let's drop the
>> sysfs attr too, as it would be useless going forward.
>>
>> Admittedly, I don't have a strong opinion and since intel_pstate
>> doesn't use a frequency table, this is not relevant for systems using
>> that driver anyway.
> 
> I added this file sometime back as it made my life a lot easier while testing
> some scheduler related changes and see how they affect cpufreq updates. IMO this
> is a useful feature and we don't really need to get rid of it.
> 
> Lets see where the discussion goes about the feedback you gave.
> 

Because of supporting this reset file, the code is going to be a bit
complex and also visited from the scheduler. I don't know if the
config for stats is enabled for production kernels but if yes,
then forcing all to keep that reset code might be too much.
For the engineering kernel version is OK.

I would say for most normal checks these sysfs stats are very useful.
If there is a need for investigation like you described, the trace
event is there, just have to be enabled. Tools like LISA would
help with parsing the trace and mapping to some plots or even
merging with scheduler context.

 From time to time some engineers are asking why the stats
don't show the values (missing fast-switch tracking). I think
they are interested in a simple use case, otherwise they would use the
tracing.

Regards,
Lukasz

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-24 16:10               ` Lukasz Luba
@ 2020-09-25  6:09                 ` Viresh Kumar
  2020-09-25  8:10                   ` Lukasz Luba
  0 siblings, 1 reply; 23+ messages in thread
From: Viresh Kumar @ 2020-09-25  6:09 UTC (permalink / raw)
  To: Lukasz Luba
  Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List

On 24-09-20, 17:10, Lukasz Luba wrote:
> Because of supporting this reset file, the code is going to be a bit
> complex

I will say not very straight forward, but it isn't complex as well.

> and also visited from the scheduler. I don't know if the
> config for stats is enabled for production kernels but if yes,
> then forcing all to keep that reset code might be too much.
> For the engineering kernel version is OK.

I am not sure either if they are enabled for production kernels, but even if it
then also this code won't hit performance.

> I would say for most normal checks these sysfs stats are very useful.
> If there is a need for investigation like you described, the trace
> event is there, just have to be enabled. Tools like LISA would
> help with parsing the trace and mapping to some plots or even
> merging with scheduler context.

Right, but stats is much easier in my opinion and providing this reset
functionality does make it easier to track. And that it is already there now.

> From time to time some engineers are asking why the stats
> don't show the values (missing fast-switch tracking). I think
> they are interested in a simple use case, otherwise they would use the
> tracing.

Right and I completely agree with that and so this patchset. I think there
aren't any serious race conditions here that would make things bad for anyone
and that this patchset will eventually get in after a little rearrangement.

-- 
viresh

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-25  6:09                 ` Viresh Kumar
@ 2020-09-25  8:10                   ` Lukasz Luba
  0 siblings, 0 replies; 23+ messages in thread
From: Lukasz Luba @ 2020-09-25  8:10 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List



On 9/25/20 7:09 AM, Viresh Kumar wrote:
> On 24-09-20, 17:10, Lukasz Luba wrote:
>> Because of supporting this reset file, the code is going to be a bit
>> complex
> 
> I will say not very straight forward, but it isn't complex as well.
> 
>> and also visited from the scheduler. I don't know if the
>> config for stats is enabled for production kernels but if yes,
>> then forcing all to keep that reset code might be too much.
>> For the engineering kernel version is OK.
> 
> I am not sure either if they are enabled for production kernels, but even if it
> then also this code won't hit performance.
> 
>> I would say for most normal checks these sysfs stats are very useful.
>> If there is a need for investigation like you described, the trace
>> event is there, just have to be enabled. Tools like LISA would
>> help with parsing the trace and mapping to some plots or even
>> merging with scheduler context.
> 
> Right, but stats is much easier in my opinion and providing this reset
> functionality does make it easier to track. And that it is already there now.
> 
>>  From time to time some engineers are asking why the stats
>> don't show the values (missing fast-switch tracking). I think
>> they are interested in a simple use case, otherwise they would use the
>> tracing.
> 
> Right and I completely agree with that and so this patchset. I think there
> aren't any serious race conditions here that would make things bad for anyone
> and that this patchset will eventually get in after a little rearrangement.
> 

Fair enough, let see the final implementation. We can check it with perf
the speculative exec and cache util.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-16  6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar
  2020-09-23 13:48   ` Rafael J. Wysocki
@ 2020-09-25  8:21   ` Lukasz Luba
  2020-09-25 10:46     ` Viresh Kumar
  1 sibling, 1 reply; 23+ messages in thread
From: Lukasz Luba @ 2020-09-25  8:21 UTC (permalink / raw)
  To: Viresh Kumar, Rafael Wysocki
  Cc: linux-pm, Vincent Guittot, cristian.marussi, sudeep.holla, linux-kernel



On 9/16/20 7:45 AM, Viresh Kumar wrote:
> In order to prepare for lock-less stats update, add support to defer any
> updates to it until cpufreq_stats_record_transition() is called.
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>   drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
>   1 file changed, 56 insertions(+), 19 deletions(-)
> 

[snip]

> @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
>   	struct cpufreq_stats *stats = policy->stats;
>   	int old_index, new_index;
>   
> -	if (!stats) {
> -		pr_debug("%s: No stats found\n", __func__);
> +	if (!stats)
>   		return;
> -	}
> +
> +	if (READ_ONCE(stats->reset_pending))
> +		cpufreq_stats_reset_table(stats);
>   

This is in the hot path code, called from the scheduler. I wonder if we
avoid it or make that branch 'unlikely'?

if (unlikely(READ_ONCE(stats->reset_pending)))

Probably the CPU (when it has good prefetcher) would realize about it,
but maybe we can help a bit here.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-24 13:15     ` Viresh Kumar
@ 2020-09-25 10:04       ` Rafael J. Wysocki
  2020-09-25 10:58         ` Viresh Kumar
  0 siblings, 1 reply; 23+ messages in thread
From: Rafael J. Wysocki @ 2020-09-25 10:04 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot,
	Lukasz Luba, cristian.marussi, Sudeep Holla,
	Linux Kernel Mailing List

On Thu, Sep 24, 2020 at 3:15 PM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> On 23-09-20, 15:48, Rafael J. Wysocki wrote:
> > On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> > >
> > > In order to prepare for lock-less stats update, add support to defer any
> > > updates to it until cpufreq_stats_record_transition() is called.
> >
> > This is a bit devoid of details.
> >
> > I guess you mean reset in particular, but that's not clear from the above.
>
> We used to update the stats from two places earlier, reset and
> show_time_in_state, the later got removed with this patch.
>
> > Also, it would be useful to describe the design somewhat.
>
> Okay. I will work on it.
>
> > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> > > ---
> > >  drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
> > >  1 file changed, 56 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
> > > index 94d959a8e954..3e7eee29ee86 100644
> > > --- a/drivers/cpufreq/cpufreq_stats.c
> > > +++ b/drivers/cpufreq/cpufreq_stats.c
> > > @@ -22,17 +22,22 @@ struct cpufreq_stats {
> > >         spinlock_t lock;
> > >         unsigned int *freq_table;
> > >         unsigned int *trans_table;
> > > +
> > > +       /* Deferred reset */
> > > +       unsigned int reset_pending;
> > > +       unsigned long long reset_time;
> > >  };
> > >
> > > -static void cpufreq_stats_update(struct cpufreq_stats *stats)
> > > +static void cpufreq_stats_update(struct cpufreq_stats *stats,
> > > +                                unsigned long long time)
> > >  {
> > >         unsigned long long cur_time = get_jiffies_64();
> > >
> > > -       stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
> > > +       stats->time_in_state[stats->last_index] += cur_time - time;
> > >         stats->last_time = cur_time;
> > >  }
> > >
> > > -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> > > +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats)
> > >  {
> > >         unsigned int count = stats->max_state;
> > >
> > > @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
> > >         memset(stats->trans_table, 0, count * count * sizeof(int));
> > >         stats->last_time = get_jiffies_64();
> > >         stats->total_trans = 0;
> > > +
> > > +       /* Adjust for the time elapsed since reset was requested */
> > > +       WRITE_ONCE(stats->reset_pending, 0);
> >
> > What if this runs in parallel with store_reset()?
> >
> > The latter may update reset_pending to 1 before the below runs.
> > Conversely, this may clear reset_pending right after store_reset() has
> > set it to 1, but before it manages to set reset_time.  Is that not a
> > problem?
>
> What I tried to do here with reset_time is to capture the delta time-in-state
> that has elapsed since the last time reset was called and reset was actually
> performed, so we keep showing it in stats.
>
> In the first race you mentioned, whatever update we make here won't matter much
> as reset-pending will be 1 and so below stats will not be used anywhere.
>
> In the second race, we may get the delta time-in-state wrong, since this is just
> stats it shouldn't matter much. But still I think we need to fix the way we do
> reset as you pointed out. More later.
>
> > > +       cpufreq_stats_update(stats, stats->reset_time);
> > >
> > >         spin_unlock(&stats->lock);
> > >  }
> > >
> > >  static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
> > >  {
> > > -       return sprintf(buf, "%d\n", policy->stats->total_trans);
> > > +       struct cpufreq_stats *stats = policy->stats;
> > > +
> > > +       if (READ_ONCE(stats->reset_pending))
> > > +               return sprintf(buf, "%d\n", 0);
> > > +       else
> > > +               return sprintf(buf, "%d\n", stats->total_trans);
> > >  }
> > >  cpufreq_freq_attr_ro(total_trans);
> > >
> > >  static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
> > >  {
> > >         struct cpufreq_stats *stats = policy->stats;
> > > +       bool pending = READ_ONCE(stats->reset_pending);
> > > +       unsigned long long time;
> > >         ssize_t len = 0;
> > >         int i;
> > >
> > >         if (policy->fast_switch_enabled)
> > >                 return 0;
> > >
> > > -       spin_lock(&stats->lock);
> > > -       cpufreq_stats_update(stats);
> > > -       spin_unlock(&stats->lock);
> > > -
> > >         for (i = 0; i < stats->state_num; i++) {
> > > +               if (pending) {
> > > +                       if (i == stats->last_index)
> > > +                               time = get_jiffies_64() - stats->reset_time;
> >
> > What if this runs in parallel with store_reset() and reads reset_time
> > before the latter manages to update it?
>
> We should update reset-time first I think. More later.
>
> > > +                       else
> > > +                               time = 0;
> > > +               } else {
> > > +                       time = stats->time_in_state[i];
> > > +                       if (i == stats->last_index)
> > > +                               time += get_jiffies_64() - stats->last_time;
> > > +               }
> > > +
> > >                 len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i],
> > > -                       (unsigned long long)
> > > -                       jiffies_64_to_clock_t(stats->time_in_state[i]));
> > > +                              jiffies_64_to_clock_t(time));
> > >         }
> > >         return len;
> > >  }
> > >  cpufreq_freq_attr_ro(time_in_state);
> > >
> > > +/* We don't care what is written to the attribute */
> > >  static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
> > >                            size_t count)
> > >  {
> > > -       /* We don't care what is written to the attribute. */
> > > -       cpufreq_stats_clear_table(policy->stats);
> > > +       struct cpufreq_stats *stats = policy->stats;
> > > +
> > > +       /*
> > > +        * Defer resetting of stats to cpufreq_stats_record_transition() to
> > > +        * avoid races.
> > > +        */
> > > +       WRITE_ONCE(stats->reset_pending, 1);
> > > +       stats->reset_time = get_jiffies_64();
> >
> > AFAICS, there is nothing to ensure that reset_time will be updated in
> > one go and even to ensure that it won't be partially updated before
> > setting reset_pending.
> >
> > This should at least be addressed in a comment to explain why it is
> > not a problem.
>
> I propose something like this for it:
>
>        WRITE_ONCE(stats->reset_time, get_jiffies_64());
>        WRITE_ONCE(stats->reset_pending, 1);
>
> Whatever race happens here, the worst would be like getting time-in-state for
> one of the frequencies wrong only once and I don't think that should be a
> real problem as this is just stats and won't make the kernel behave badly.

I'm actually wondering if reset_time is necessary at all.

If cpufreq_stats_record_transition() is the only updater of the stats,
which will be the case after applying this series IIUC, it may as well
simply set the new starting point and discard all of the data
collected so far if reset_pending is set.

IOW, the time when the reset has been requested isn't particularly
relevant IMV (and it is not exact anyway), because the user is
basically asking for discarding "history" and that may very well be
interpreted to include the current sample.

> > > +
> > >         return count;
> > >  }
> > >  cpufreq_freq_attr_wo(reset);
> > > @@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset);
> > >  static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
> > >  {
> > >         struct cpufreq_stats *stats = policy->stats;
> > > +       bool pending = READ_ONCE(stats->reset_pending);
> > >         ssize_t len = 0;
> > > -       int i, j;
> > > +       int i, j, count;
> > >
> > >         if (policy->fast_switch_enabled)
> > >                 return 0;
> > > @@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
> > >                 for (j = 0; j < stats->state_num; j++) {
> > >                         if (len >= PAGE_SIZE)
> > >                                 break;
> > > -                       len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ",
> > > -                                       stats->trans_table[i*stats->max_state+j]);
> > > +
> > > +                       if (pending)
> > > +                               count = 0;
> > > +                       else
> > > +                               count = stats->trans_table[i * stats->max_state + j];
> > > +
> > > +                       len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count);
> > >                 }
> > >                 if (len >= PAGE_SIZE)
> > >                         break;
> > > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
> > >         struct cpufreq_stats *stats = policy->stats;
> > >         int old_index, new_index;
> > >
> > > -       if (!stats) {
> > > -               pr_debug("%s: No stats found\n", __func__);
> > > +       if (!stats)
> > >                 return;
> > > -       }
> > > +
> > > +       if (READ_ONCE(stats->reset_pending))
> > > +               cpufreq_stats_reset_table(stats);
> >
> > This is a bit confusing, because cpufreq_stats_reset_table() calls
> > cpufreq_stats_update() and passes reset_time to it, but it is called
> > again below with last_time as the second arg.
> >
> > It is not particularly clear to me why this needs to be done this way.
>
> Because we need to account for the elapsed time-in-state, for the currently
> running frequency.
>
> When reset-stats is called, we note down the reset-time, then if reset was
> pending then we call reset-table which calls cpufreq_stats_update(reset_time),
> and so we account for the elapsed time in current state, where we also update
> last-time.
>
> The second call here won't update much in case reset was pending, but will
> update normally otherwise.
>
> --
> viresh

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-25  8:21   ` Lukasz Luba
@ 2020-09-25 10:46     ` Viresh Kumar
  0 siblings, 0 replies; 23+ messages in thread
From: Viresh Kumar @ 2020-09-25 10:46 UTC (permalink / raw)
  To: Lukasz Luba
  Cc: Rafael Wysocki, linux-pm, Vincent Guittot, cristian.marussi,
	sudeep.holla, linux-kernel

On 25-09-20, 09:21, Lukasz Luba wrote:
> 
> 
> On 9/16/20 7:45 AM, Viresh Kumar wrote:
> > In order to prepare for lock-less stats update, add support to defer any
> > updates to it until cpufreq_stats_record_transition() is called.
> > 
> > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> > ---
> >   drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++---------
> >   1 file changed, 56 insertions(+), 19 deletions(-)
> > 
> 
> [snip]
> 
> > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy,
> >   	struct cpufreq_stats *stats = policy->stats;
> >   	int old_index, new_index;
> > -	if (!stats) {
> > -		pr_debug("%s: No stats found\n", __func__);
> > +	if (!stats)
> >   		return;
> > -	}
> > +
> > +	if (READ_ONCE(stats->reset_pending))
> > +		cpufreq_stats_reset_table(stats);
> 
> This is in the hot path code, called from the scheduler. I wonder if we
> avoid it or make that branch 'unlikely'?
> 
> if (unlikely(READ_ONCE(stats->reset_pending)))
> 
> Probably the CPU (when it has good prefetcher) would realize about it,
> but maybe we can help a bit here.

Sure.

-- 
viresh

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-25 10:04       ` Rafael J. Wysocki
@ 2020-09-25 10:58         ` Viresh Kumar
  2020-09-25 11:09           ` Rafael J. Wysocki
  0 siblings, 1 reply; 23+ messages in thread
From: Viresh Kumar @ 2020-09-25 10:58 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba,
	cristian.marussi, Sudeep Holla, Linux Kernel Mailing List

On 25-09-20, 12:04, Rafael J. Wysocki wrote:
> I'm actually wondering if reset_time is necessary at all.
> 
> If cpufreq_stats_record_transition() is the only updater of the stats,
> which will be the case after applying this series IIUC, it may as well
> simply set the new starting point and discard all of the data
> collected so far if reset_pending is set.
> 
> IOW, the time when the reset has been requested isn't particularly
> relevant IMV (and it is not exact anyway), because the user is
> basically asking for discarding "history" and that may very well be
> interpreted to include the current sample.

There are times when this would be visible to userspace and won't look nice.

Like, set governor to performance, reset the stats and after 10 seconds, read
the stats again, everything will be 0. Because cpufreq_stats_record_transition()
doesn't get called at all here, we would never clear them until the time
governor is changed and so we need to keep a track of reset-time.

-- 
viresh

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-25 10:58         ` Viresh Kumar
@ 2020-09-25 11:09           ` Rafael J. Wysocki
  2020-09-25 11:26             ` Viresh Kumar
  0 siblings, 1 reply; 23+ messages in thread
From: Rafael J. Wysocki @ 2020-09-25 11:09 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot,
	Lukasz Luba, cristian.marussi, Sudeep Holla,
	Linux Kernel Mailing List

On Fri, Sep 25, 2020 at 12:58 PM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> On 25-09-20, 12:04, Rafael J. Wysocki wrote:
> > I'm actually wondering if reset_time is necessary at all.
> >
> > If cpufreq_stats_record_transition() is the only updater of the stats,
> > which will be the case after applying this series IIUC, it may as well
> > simply set the new starting point and discard all of the data
> > collected so far if reset_pending is set.
> >
> > IOW, the time when the reset has been requested isn't particularly
> > relevant IMV (and it is not exact anyway), because the user is
> > basically asking for discarding "history" and that may very well be
> > interpreted to include the current sample.
>
> There are times when this would be visible to userspace and won't look nice.
>
> Like, set governor to performance, reset the stats and after 10 seconds, read
> the stats again, everything will be 0.

Unless I'm missing something, the real reset happens when
cpufreq_stats_record_transition() runs next time, so the old stats
will still be visible at that point, won't they?

> Because cpufreq_stats_record_transition()
> doesn't get called at all here, we would never clear them until the time
> governor is changed and so we need to keep a track of reset-time.

Or trigger a forced update.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()
  2020-09-25 11:09           ` Rafael J. Wysocki
@ 2020-09-25 11:26             ` Viresh Kumar
  0 siblings, 0 replies; 23+ messages in thread
From: Viresh Kumar @ 2020-09-25 11:26 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba,
	Cristian Marussi, Sudeep Holla, Linux Kernel Mailing List

On Fri, 25 Sep 2020 at 16:39, Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Fri, Sep 25, 2020 at 12:58 PM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >
> > On 25-09-20, 12:04, Rafael J. Wysocki wrote:
> > > I'm actually wondering if reset_time is necessary at all.
> > >
> > > If cpufreq_stats_record_transition() is the only updater of the stats,
> > > which will be the case after applying this series IIUC, it may as well
> > > simply set the new starting point and discard all of the data
> > > collected so far if reset_pending is set.
> > >
> > > IOW, the time when the reset has been requested isn't particularly
> > > relevant IMV (and it is not exact anyway), because the user is
> > > basically asking for discarding "history" and that may very well be
> > > interpreted to include the current sample.
> >
> > There are times when this would be visible to userspace and won't look nice.
> >
> > Like, set governor to performance, reset the stats and after 10 seconds, read
> > the stats again, everything will be 0.
>
> Unless I'm missing something, the real reset happens when
> cpufreq_stats_record_transition() runs next time, so the old stats
> will still be visible at that point, won't they?

For userspace the stats shouldn't be visible after reset is requested
by it and so with
this series, we check for reset-pending in all the show_*() helpers and print
stats since the time reset was requested.

> > Because cpufreq_stats_record_transition()
> > doesn't get called at all here, we would never clear them until the time
> > governor is changed and so we need to keep a track of reset-time.
>
> Or trigger a forced update.

That would add races while updating the actual stats. And so I found the current
way to be more reliable.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2020-09-25 11:27 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-16  6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar
2020-09-16  6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar
2020-09-23 13:48   ` Rafael J. Wysocki
2020-09-24  9:25     ` Lukasz Luba
2020-09-24 10:24       ` Rafael J. Wysocki
2020-09-24 11:00         ` Lukasz Luba
2020-09-24 11:07           ` Rafael J. Wysocki
2020-09-24 12:39             ` Viresh Kumar
2020-09-24 16:10               ` Lukasz Luba
2020-09-25  6:09                 ` Viresh Kumar
2020-09-25  8:10                   ` Lukasz Luba
2020-09-24 13:15     ` Viresh Kumar
2020-09-25 10:04       ` Rafael J. Wysocki
2020-09-25 10:58         ` Viresh Kumar
2020-09-25 11:09           ` Rafael J. Wysocki
2020-09-25 11:26             ` Viresh Kumar
2020-09-25  8:21   ` Lukasz Luba
2020-09-25 10:46     ` Viresh Kumar
2020-09-16  6:45 ` [PATCH V2 2/4] cpufreq: stats: Remove locking Viresh Kumar
2020-09-16  6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar
2020-09-23 15:14   ` Rafael J. Wysocki
2020-09-23 15:17     ` Rafael J. Wysocki
2020-09-16  6:45 ` [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core Viresh Kumar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.