* [PATCH V2 0/4] cpufreq: Record stats with fast-switching @ 2020-09-16 6:45 Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar ` (3 more replies) 0 siblings, 4 replies; 23+ messages in thread From: Viresh Kumar @ 2020-09-16 6:45 UTC (permalink / raw) To: Rafael Wysocki, Ben Segall, Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman, Peter Zijlstra, Steven Rostedt, Vincent Guittot, Viresh Kumar Cc: linux-pm, Lukasz Luba, cristian.marussi, sudeep.holla, linux-kernel Hi, We disabled recording cpufreq stats when fast switching was introduced to the cpufreq core as the cpufreq stats required to take a spinlock and that can't be allowed (for performance reasons) on scheduler's hot path. Here is an attempt to get rid of the lock and bring back the support. V1-V2: - Use READ_ONCE/WRITE_ONCE instead of atomic in the first patch. -- Viresh Viresh Kumar (4): cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() cpufreq: stats: Remove locking cpufreq: stats: Enable stats for fast-switch as well cpufreq: Move traces and update to policy->cur to cpufreq core drivers/cpufreq/cpufreq.c | 16 +++++- drivers/cpufreq/cpufreq_stats.c | 87 ++++++++++++++++++++------------ kernel/sched/cpufreq_schedutil.c | 12 +---- 3 files changed, 72 insertions(+), 43 deletions(-) base-commit: 856deb866d16e29bd65952e0289066f6078af773 -- 2.25.0.rc1.19.g042ed3e048af ^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-16 6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar @ 2020-09-16 6:45 ` Viresh Kumar 2020-09-23 13:48 ` Rafael J. Wysocki 2020-09-25 8:21 ` Lukasz Luba 2020-09-16 6:45 ` [PATCH V2 2/4] cpufreq: stats: Remove locking Viresh Kumar ` (2 subsequent siblings) 3 siblings, 2 replies; 23+ messages in thread From: Viresh Kumar @ 2020-09-16 6:45 UTC (permalink / raw) To: Rafael Wysocki, Viresh Kumar Cc: linux-pm, Vincent Guittot, Lukasz Luba, cristian.marussi, sudeep.holla, linux-kernel In order to prepare for lock-less stats update, add support to defer any updates to it until cpufreq_stats_record_transition() is called. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 94d959a8e954..3e7eee29ee86 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -22,17 +22,22 @@ struct cpufreq_stats { spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; + + /* Deferred reset */ + unsigned int reset_pending; + unsigned long long reset_time; }; -static void cpufreq_stats_update(struct cpufreq_stats *stats) +static void cpufreq_stats_update(struct cpufreq_stats *stats, + unsigned long long time) { unsigned long long cur_time = get_jiffies_64(); - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; + stats->time_in_state[stats->last_index] += cur_time - time; stats->last_time = cur_time; } -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; + + /* Adjust for the time elapsed since reset was requested */ + WRITE_ONCE(stats->reset_pending, 0); + cpufreq_stats_update(stats, stats->reset_time); spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { - return sprintf(buf, "%d\n", policy->stats->total_trans); + struct cpufreq_stats *stats = policy->stats; + + if (READ_ONCE(stats->reset_pending)) + return sprintf(buf, "%d\n", 0); + else + return sprintf(buf, "%d\n", stats->total_trans); } cpufreq_freq_attr_ro(total_trans); static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); + unsigned long long time; ssize_t len = 0; int i; if (policy->fast_switch_enabled) return 0; - spin_lock(&stats->lock); - cpufreq_stats_update(stats); - spin_unlock(&stats->lock); - for (i = 0; i < stats->state_num; i++) { + if (pending) { + if (i == stats->last_index) + time = get_jiffies_64() - stats->reset_time; + else + time = 0; + } else { + time = stats->time_in_state[i]; + if (i == stats->last_index) + time += get_jiffies_64() - stats->last_time; + } + len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], - (unsigned long long) - jiffies_64_to_clock_t(stats->time_in_state[i])); + jiffies_64_to_clock_t(time)); } return len; } cpufreq_freq_attr_ro(time_in_state); +/* We don't care what is written to the attribute */ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, size_t count) { - /* We don't care what is written to the attribute. */ - cpufreq_stats_clear_table(policy->stats); + struct cpufreq_stats *stats = policy->stats; + + /* + * Defer resetting of stats to cpufreq_stats_record_transition() to + * avoid races. + */ + WRITE_ONCE(stats->reset_pending, 1); + stats->reset_time = get_jiffies_64(); + return count; } cpufreq_freq_attr_wo(reset); @@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset); static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); ssize_t len = 0; - int i, j; + int i, j, count; if (policy->fast_switch_enabled) return 0; @@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) for (j = 0; j < stats->state_num; j++) { if (len >= PAGE_SIZE) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", - stats->trans_table[i*stats->max_state+j]); + + if (pending) + count = 0; + else + count = stats->trans_table[i * stats->max_state + j]; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); } if (len >= PAGE_SIZE) break; @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (!stats) { - pr_debug("%s: No stats found\n", __func__); + if (!stats) return; - } + + if (READ_ONCE(stats->reset_pending)) + cpufreq_stats_reset_table(stats); old_index = stats->last_index; new_index = freq_table_get_index(stats, new_freq); @@ -241,7 +278,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, return; spin_lock(&stats->lock); - cpufreq_stats_update(stats); + cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; -- 2.25.0.rc1.19.g042ed3e048af ^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-16 6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar @ 2020-09-23 13:48 ` Rafael J. Wysocki 2020-09-24 9:25 ` Lukasz Luba 2020-09-24 13:15 ` Viresh Kumar 2020-09-25 8:21 ` Lukasz Luba 1 sibling, 2 replies; 23+ messages in thread From: Rafael J. Wysocki @ 2020-09-23 13:48 UTC (permalink / raw) To: Viresh Kumar Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > In order to prepare for lock-less stats update, add support to defer any > updates to it until cpufreq_stats_record_transition() is called. This is a bit devoid of details. I guess you mean reset in particular, but that's not clear from the above. Also, it would be useful to describe the design somewhat. > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > --- > drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- > 1 file changed, 56 insertions(+), 19 deletions(-) > > diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c > index 94d959a8e954..3e7eee29ee86 100644 > --- a/drivers/cpufreq/cpufreq_stats.c > +++ b/drivers/cpufreq/cpufreq_stats.c > @@ -22,17 +22,22 @@ struct cpufreq_stats { > spinlock_t lock; > unsigned int *freq_table; > unsigned int *trans_table; > + > + /* Deferred reset */ > + unsigned int reset_pending; > + unsigned long long reset_time; > }; > > -static void cpufreq_stats_update(struct cpufreq_stats *stats) > +static void cpufreq_stats_update(struct cpufreq_stats *stats, > + unsigned long long time) > { > unsigned long long cur_time = get_jiffies_64(); > > - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; > + stats->time_in_state[stats->last_index] += cur_time - time; > stats->last_time = cur_time; > } > > -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) > { > unsigned int count = stats->max_state; > > @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > memset(stats->trans_table, 0, count * count * sizeof(int)); > stats->last_time = get_jiffies_64(); > stats->total_trans = 0; > + > + /* Adjust for the time elapsed since reset was requested */ > + WRITE_ONCE(stats->reset_pending, 0); What if this runs in parallel with store_reset()? The latter may update reset_pending to 1 before the below runs. Conversely, this may clear reset_pending right after store_reset() has set it to 1, but before it manages to set reset_time. Is that not a problem? > + cpufreq_stats_update(stats, stats->reset_time); > > spin_unlock(&stats->lock); > } > > static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) > { > - return sprintf(buf, "%d\n", policy->stats->total_trans); > + struct cpufreq_stats *stats = policy->stats; > + > + if (READ_ONCE(stats->reset_pending)) > + return sprintf(buf, "%d\n", 0); > + else > + return sprintf(buf, "%d\n", stats->total_trans); > } > cpufreq_freq_attr_ro(total_trans); > > static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) > { > struct cpufreq_stats *stats = policy->stats; > + bool pending = READ_ONCE(stats->reset_pending); > + unsigned long long time; > ssize_t len = 0; > int i; > > if (policy->fast_switch_enabled) > return 0; > > - spin_lock(&stats->lock); > - cpufreq_stats_update(stats); > - spin_unlock(&stats->lock); > - > for (i = 0; i < stats->state_num; i++) { > + if (pending) { > + if (i == stats->last_index) > + time = get_jiffies_64() - stats->reset_time; What if this runs in parallel with store_reset() and reads reset_time before the latter manages to update it? > + else > + time = 0; > + } else { > + time = stats->time_in_state[i]; > + if (i == stats->last_index) > + time += get_jiffies_64() - stats->last_time; > + } > + > len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], > - (unsigned long long) > - jiffies_64_to_clock_t(stats->time_in_state[i])); > + jiffies_64_to_clock_t(time)); > } > return len; > } > cpufreq_freq_attr_ro(time_in_state); > > +/* We don't care what is written to the attribute */ > static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, > size_t count) > { > - /* We don't care what is written to the attribute. */ > - cpufreq_stats_clear_table(policy->stats); > + struct cpufreq_stats *stats = policy->stats; > + > + /* > + * Defer resetting of stats to cpufreq_stats_record_transition() to > + * avoid races. > + */ > + WRITE_ONCE(stats->reset_pending, 1); > + stats->reset_time = get_jiffies_64(); AFAICS, there is nothing to ensure that reset_time will be updated in one go and even to ensure that it won't be partially updated before setting reset_pending. This should at least be addressed in a comment to explain why it is not a problem. > + > return count; > } > cpufreq_freq_attr_wo(reset); > @@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset); > static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) > { > struct cpufreq_stats *stats = policy->stats; > + bool pending = READ_ONCE(stats->reset_pending); > ssize_t len = 0; > - int i, j; > + int i, j, count; > > if (policy->fast_switch_enabled) > return 0; > @@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) > for (j = 0; j < stats->state_num; j++) { > if (len >= PAGE_SIZE) > break; > - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", > - stats->trans_table[i*stats->max_state+j]); > + > + if (pending) > + count = 0; > + else > + count = stats->trans_table[i * stats->max_state + j]; > + > + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); > } > if (len >= PAGE_SIZE) > break; > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, > struct cpufreq_stats *stats = policy->stats; > int old_index, new_index; > > - if (!stats) { > - pr_debug("%s: No stats found\n", __func__); > + if (!stats) > return; > - } > + > + if (READ_ONCE(stats->reset_pending)) > + cpufreq_stats_reset_table(stats); This is a bit confusing, because cpufreq_stats_reset_table() calls cpufreq_stats_update() and passes reset_time to it, but it is called again below with last_time as the second arg. It is not particularly clear to me why this needs to be done this way. > > old_index = stats->last_index; > new_index = freq_table_get_index(stats, new_freq); > @@ -241,7 +278,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, > return; > > spin_lock(&stats->lock); > - cpufreq_stats_update(stats); > + cpufreq_stats_update(stats, stats->last_time); > > stats->last_index = new_index; > stats->trans_table[old_index * stats->max_state + new_index]++; > -- > 2.25.0.rc1.19.g042ed3e048af > ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-23 13:48 ` Rafael J. Wysocki @ 2020-09-24 9:25 ` Lukasz Luba 2020-09-24 10:24 ` Rafael J. Wysocki 2020-09-24 13:15 ` Viresh Kumar 1 sibling, 1 reply; 23+ messages in thread From: Lukasz Luba @ 2020-09-24 9:25 UTC (permalink / raw) To: Rafael J. Wysocki, Viresh Kumar Cc: Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List Hi Rafael, On 9/23/20 2:48 PM, Rafael J. Wysocki wrote: > On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: >> >> In order to prepare for lock-less stats update, add support to defer any >> updates to it until cpufreq_stats_record_transition() is called. > > This is a bit devoid of details. > > I guess you mean reset in particular, but that's not clear from the above. > > Also, it would be useful to describe the design somewhat. > >> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> >> --- >> drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- >> 1 file changed, 56 insertions(+), 19 deletions(-) >> >> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c >> index 94d959a8e954..3e7eee29ee86 100644 >> --- a/drivers/cpufreq/cpufreq_stats.c >> +++ b/drivers/cpufreq/cpufreq_stats.c >> @@ -22,17 +22,22 @@ struct cpufreq_stats { >> spinlock_t lock; >> unsigned int *freq_table; >> unsigned int *trans_table; >> + >> + /* Deferred reset */ >> + unsigned int reset_pending; >> + unsigned long long reset_time; >> }; >> >> -static void cpufreq_stats_update(struct cpufreq_stats *stats) >> +static void cpufreq_stats_update(struct cpufreq_stats *stats, >> + unsigned long long time) >> { >> unsigned long long cur_time = get_jiffies_64(); >> >> - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; >> + stats->time_in_state[stats->last_index] += cur_time - time; >> stats->last_time = cur_time; >> } >> >> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) >> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) >> { >> unsigned int count = stats->max_state; >> >> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) >> memset(stats->trans_table, 0, count * count * sizeof(int)); >> stats->last_time = get_jiffies_64(); >> stats->total_trans = 0; >> + >> + /* Adjust for the time elapsed since reset was requested */ >> + WRITE_ONCE(stats->reset_pending, 0); > > What if this runs in parallel with store_reset()? > > The latter may update reset_pending to 1 before the below runs. > Conversely, this may clear reset_pending right after store_reset() has > set it to 1, but before it manages to set reset_time. Is that not a > problem? I wonder if we could just drop the reset feature. Is there a tool which uses this file? The 'reset' sysfs would probably have to stay forever, but an empty implementation is not an option? The documentation states: 'This can be useful for evaluating system behaviour under different governors without the need for a reboot.' With the scenario of fast-switch this resetting complicates the implementation and the justification of having it just for experiments avoiding reboot is IMO weak. The real production code would have to pay extra cycles every time. Also, we would probably not experiment with cpufreq different governors, since the SchedUtil is considered the best option. Regards, Lukasz ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-24 9:25 ` Lukasz Luba @ 2020-09-24 10:24 ` Rafael J. Wysocki 2020-09-24 11:00 ` Lukasz Luba 0 siblings, 1 reply; 23+ messages in thread From: Rafael J. Wysocki @ 2020-09-24 10:24 UTC (permalink / raw) To: Lukasz Luba Cc: Rafael J. Wysocki, Viresh Kumar, Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote: > > Hi Rafael, > > On 9/23/20 2:48 PM, Rafael J. Wysocki wrote: > > On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > >> > >> In order to prepare for lock-less stats update, add support to defer any > >> updates to it until cpufreq_stats_record_transition() is called. > > > > This is a bit devoid of details. > > > > I guess you mean reset in particular, but that's not clear from the above. > > > > Also, it would be useful to describe the design somewhat. > > > >> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > >> --- > >> drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- > >> 1 file changed, 56 insertions(+), 19 deletions(-) > >> > >> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c > >> index 94d959a8e954..3e7eee29ee86 100644 > >> --- a/drivers/cpufreq/cpufreq_stats.c > >> +++ b/drivers/cpufreq/cpufreq_stats.c > >> @@ -22,17 +22,22 @@ struct cpufreq_stats { > >> spinlock_t lock; > >> unsigned int *freq_table; > >> unsigned int *trans_table; > >> + > >> + /* Deferred reset */ > >> + unsigned int reset_pending; > >> + unsigned long long reset_time; > >> }; > >> > >> -static void cpufreq_stats_update(struct cpufreq_stats *stats) > >> +static void cpufreq_stats_update(struct cpufreq_stats *stats, > >> + unsigned long long time) > >> { > >> unsigned long long cur_time = get_jiffies_64(); > >> > >> - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; > >> + stats->time_in_state[stats->last_index] += cur_time - time; > >> stats->last_time = cur_time; > >> } > >> > >> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > >> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) > >> { > >> unsigned int count = stats->max_state; > >> > >> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > >> memset(stats->trans_table, 0, count * count * sizeof(int)); > >> stats->last_time = get_jiffies_64(); > >> stats->total_trans = 0; > >> + > >> + /* Adjust for the time elapsed since reset was requested */ > >> + WRITE_ONCE(stats->reset_pending, 0); > > > > What if this runs in parallel with store_reset()? > > > > The latter may update reset_pending to 1 before the below runs. > > Conversely, this may clear reset_pending right after store_reset() has > > set it to 1, but before it manages to set reset_time. Is that not a > > problem? > > I wonder if we could just drop the reset feature. Is there a tool > which uses this file? The 'reset' sysfs would probably have to stay > forever, but an empty implementation is not an option? Well, having an empty sysfs attr would be a bit ugly, but the implementation of it could be simplified. > The documentation states: > 'This can be useful for evaluating system behaviour under different > governors without the need for a reboot.' > With the scenario of fast-switch this resetting complicates the > implementation and the justification of having it just for experiments > avoiding reboot is IMO weak. The real production code would have to pay > extra cycles every time. Also, we would probably not experiment with > cpufreq different governors, since the SchedUtil is considered the best > option. It would still be good to have a way to test it against the other available options, though. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-24 10:24 ` Rafael J. Wysocki @ 2020-09-24 11:00 ` Lukasz Luba 2020-09-24 11:07 ` Rafael J. Wysocki 0 siblings, 1 reply; 23+ messages in thread From: Lukasz Luba @ 2020-09-24 11:00 UTC (permalink / raw) To: Rafael J. Wysocki Cc: Viresh Kumar, Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On 9/24/20 11:24 AM, Rafael J. Wysocki wrote: > On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote: >> >> Hi Rafael, >> >> On 9/23/20 2:48 PM, Rafael J. Wysocki wrote: >>> On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: >>>> >>>> In order to prepare for lock-less stats update, add support to defer any >>>> updates to it until cpufreq_stats_record_transition() is called. >>> >>> This is a bit devoid of details. >>> >>> I guess you mean reset in particular, but that's not clear from the above. >>> >>> Also, it would be useful to describe the design somewhat. >>> >>>> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> >>>> --- >>>> drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- >>>> 1 file changed, 56 insertions(+), 19 deletions(-) >>>> >>>> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c >>>> index 94d959a8e954..3e7eee29ee86 100644 >>>> --- a/drivers/cpufreq/cpufreq_stats.c >>>> +++ b/drivers/cpufreq/cpufreq_stats.c >>>> @@ -22,17 +22,22 @@ struct cpufreq_stats { >>>> spinlock_t lock; >>>> unsigned int *freq_table; >>>> unsigned int *trans_table; >>>> + >>>> + /* Deferred reset */ >>>> + unsigned int reset_pending; >>>> + unsigned long long reset_time; >>>> }; >>>> >>>> -static void cpufreq_stats_update(struct cpufreq_stats *stats) >>>> +static void cpufreq_stats_update(struct cpufreq_stats *stats, >>>> + unsigned long long time) >>>> { >>>> unsigned long long cur_time = get_jiffies_64(); >>>> >>>> - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; >>>> + stats->time_in_state[stats->last_index] += cur_time - time; >>>> stats->last_time = cur_time; >>>> } >>>> >>>> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) >>>> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) >>>> { >>>> unsigned int count = stats->max_state; >>>> >>>> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) >>>> memset(stats->trans_table, 0, count * count * sizeof(int)); >>>> stats->last_time = get_jiffies_64(); >>>> stats->total_trans = 0; >>>> + >>>> + /* Adjust for the time elapsed since reset was requested */ >>>> + WRITE_ONCE(stats->reset_pending, 0); >>> >>> What if this runs in parallel with store_reset()? >>> >>> The latter may update reset_pending to 1 before the below runs. >>> Conversely, this may clear reset_pending right after store_reset() has >>> set it to 1, but before it manages to set reset_time. Is that not a >>> problem? >> >> I wonder if we could just drop the reset feature. Is there a tool >> which uses this file? The 'reset' sysfs would probably have to stay >> forever, but an empty implementation is not an option? > > Well, having an empty sysfs attr would be a bit ugly, but the > implementation of it could be simplified. > >> The documentation states: >> 'This can be useful for evaluating system behaviour under different >> governors without the need for a reboot.' >> With the scenario of fast-switch this resetting complicates the >> implementation and the justification of having it just for experiments >> avoiding reboot is IMO weak. The real production code would have to pay >> extra cycles every time. Also, we would probably not experiment with >> cpufreq different governors, since the SchedUtil is considered the best >> option. > > It would still be good to have a way to test it against the other > available options, though. > Experimenting with different governors would still be possible, just the user-space would have to take a snapshot of the stats when switching to a new governor. Then the values presented in the stats would just need to be calculated in this user tool against the snapshot. The resetting is also not that bad, since nowadays more components maintain some kind of local statistics/history (scheduler, thermal). I would recommend to reset the whole system and repeat the same tests with different governor, just to be sure that everything starts from similar state (utilization, temperature, other devfreq devices frequencies etc). ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-24 11:00 ` Lukasz Luba @ 2020-09-24 11:07 ` Rafael J. Wysocki 2020-09-24 12:39 ` Viresh Kumar 0 siblings, 1 reply; 23+ messages in thread From: Rafael J. Wysocki @ 2020-09-24 11:07 UTC (permalink / raw) To: Lukasz Luba Cc: Rafael J. Wysocki, Viresh Kumar, Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On Thu, Sep 24, 2020 at 1:00 PM Lukasz Luba <lukasz.luba@arm.com> wrote: > > > > On 9/24/20 11:24 AM, Rafael J. Wysocki wrote: > > On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote: > >> > >> Hi Rafael, > >> > >> On 9/23/20 2:48 PM, Rafael J. Wysocki wrote: > >>> On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > >>>> > >>>> In order to prepare for lock-less stats update, add support to defer any > >>>> updates to it until cpufreq_stats_record_transition() is called. > >>> > >>> This is a bit devoid of details. > >>> > >>> I guess you mean reset in particular, but that's not clear from the above. > >>> > >>> Also, it would be useful to describe the design somewhat. > >>> > >>>> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > >>>> --- > >>>> drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- > >>>> 1 file changed, 56 insertions(+), 19 deletions(-) > >>>> > >>>> diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c > >>>> index 94d959a8e954..3e7eee29ee86 100644 > >>>> --- a/drivers/cpufreq/cpufreq_stats.c > >>>> +++ b/drivers/cpufreq/cpufreq_stats.c > >>>> @@ -22,17 +22,22 @@ struct cpufreq_stats { > >>>> spinlock_t lock; > >>>> unsigned int *freq_table; > >>>> unsigned int *trans_table; > >>>> + > >>>> + /* Deferred reset */ > >>>> + unsigned int reset_pending; > >>>> + unsigned long long reset_time; > >>>> }; > >>>> > >>>> -static void cpufreq_stats_update(struct cpufreq_stats *stats) > >>>> +static void cpufreq_stats_update(struct cpufreq_stats *stats, > >>>> + unsigned long long time) > >>>> { > >>>> unsigned long long cur_time = get_jiffies_64(); > >>>> > >>>> - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; > >>>> + stats->time_in_state[stats->last_index] += cur_time - time; > >>>> stats->last_time = cur_time; > >>>> } > >>>> > >>>> -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > >>>> +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) > >>>> { > >>>> unsigned int count = stats->max_state; > >>>> > >>>> @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > >>>> memset(stats->trans_table, 0, count * count * sizeof(int)); > >>>> stats->last_time = get_jiffies_64(); > >>>> stats->total_trans = 0; > >>>> + > >>>> + /* Adjust for the time elapsed since reset was requested */ > >>>> + WRITE_ONCE(stats->reset_pending, 0); > >>> > >>> What if this runs in parallel with store_reset()? > >>> > >>> The latter may update reset_pending to 1 before the below runs. > >>> Conversely, this may clear reset_pending right after store_reset() has > >>> set it to 1, but before it manages to set reset_time. Is that not a > >>> problem? > >> > >> I wonder if we could just drop the reset feature. Is there a tool > >> which uses this file? The 'reset' sysfs would probably have to stay > >> forever, but an empty implementation is not an option? > > > > Well, having an empty sysfs attr would be a bit ugly, but the > > implementation of it could be simplified. > > > >> The documentation states: > >> 'This can be useful for evaluating system behaviour under different > >> governors without the need for a reboot.' > >> With the scenario of fast-switch this resetting complicates the > >> implementation and the justification of having it just for experiments > >> avoiding reboot is IMO weak. The real production code would have to pay > >> extra cycles every time. Also, we would probably not experiment with > >> cpufreq different governors, since the SchedUtil is considered the best > >> option. > > > > It would still be good to have a way to test it against the other > > available options, though. > > > > Experimenting with different governors would still be possible, just > the user-space would have to take a snapshot of the stats when switching > to a new governor. Then the values presented in the stats would just > need to be calculated in this user tool against the snapshot. > > The resetting is also not that bad, since nowadays more components > maintain some kind of local statistics/history (scheduler, thermal). > I would recommend to reset the whole system and repeat the same tests > with different governor, just to be sure that everything starts from > similar state (utilization, temperature, other devfreq devices > frequencies etc). Well, if everyone agrees on removing the reset feature, let's drop the sysfs attr too, as it would be useless going forward. Admittedly, I don't have a strong opinion and since intel_pstate doesn't use a frequency table, this is not relevant for systems using that driver anyway. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-24 11:07 ` Rafael J. Wysocki @ 2020-09-24 12:39 ` Viresh Kumar 2020-09-24 16:10 ` Lukasz Luba 0 siblings, 1 reply; 23+ messages in thread From: Viresh Kumar @ 2020-09-24 12:39 UTC (permalink / raw) To: Rafael J. Wysocki Cc: Lukasz Luba, Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On 24-09-20, 13:07, Rafael J. Wysocki wrote: > On Thu, Sep 24, 2020 at 1:00 PM Lukasz Luba <lukasz.luba@arm.com> wrote: > > On 9/24/20 11:24 AM, Rafael J. Wysocki wrote: > > > On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote: > > >> I wonder if we could just drop the reset feature. Is there a tool > > >> which uses this file? The 'reset' sysfs would probably have to stay > > >> forever, but an empty implementation is not an option? > > > > > > Well, having an empty sysfs attr would be a bit ugly, but the > > > implementation of it could be simplified. > > > > > >> The documentation states: > > >> 'This can be useful for evaluating system behaviour under different > > >> governors without the need for a reboot.' > > >> With the scenario of fast-switch this resetting complicates the > > >> implementation and the justification of having it just for experiments > > >> avoiding reboot is IMO weak. The real production code would have to pay > > >> extra cycles every time. Also, we would probably not experiment with > > >> cpufreq different governors, since the SchedUtil is considered the best > > >> option. > > > > > > It would still be good to have a way to test it against the other > > > available options, though. > > > > > > > Experimenting with different governors would still be possible, just > > the user-space would have to take a snapshot of the stats when switching > > to a new governor. Then the values presented in the stats would just > > need to be calculated in this user tool against the snapshot. > > > > The resetting is also not that bad, since nowadays more components > > maintain some kind of local statistics/history (scheduler, thermal). > > I would recommend to reset the whole system and repeat the same tests > > with different governor, just to be sure that everything starts from > > similar state (utilization, temperature, other devfreq devices > > frequencies etc). > > Well, if everyone agrees on removing the reset feature, let's drop the > sysfs attr too, as it would be useless going forward. > > Admittedly, I don't have a strong opinion and since intel_pstate > doesn't use a frequency table, this is not relevant for systems using > that driver anyway. I added this file sometime back as it made my life a lot easier while testing some scheduler related changes and see how they affect cpufreq updates. IMO this is a useful feature and we don't really need to get rid of it. Lets see where the discussion goes about the feedback you gave. -- viresh ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-24 12:39 ` Viresh Kumar @ 2020-09-24 16:10 ` Lukasz Luba 2020-09-25 6:09 ` Viresh Kumar 0 siblings, 1 reply; 23+ messages in thread From: Lukasz Luba @ 2020-09-24 16:10 UTC (permalink / raw) To: Viresh Kumar Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On 9/24/20 1:39 PM, Viresh Kumar wrote: > On 24-09-20, 13:07, Rafael J. Wysocki wrote: >> On Thu, Sep 24, 2020 at 1:00 PM Lukasz Luba <lukasz.luba@arm.com> wrote: >>> On 9/24/20 11:24 AM, Rafael J. Wysocki wrote: >>>> On Thu, Sep 24, 2020 at 11:25 AM Lukasz Luba <lukasz.luba@arm.com> wrote: > >>>>> I wonder if we could just drop the reset feature. Is there a tool >>>>> which uses this file? The 'reset' sysfs would probably have to stay >>>>> forever, but an empty implementation is not an option? >>>> >>>> Well, having an empty sysfs attr would be a bit ugly, but the >>>> implementation of it could be simplified. >>>> >>>>> The documentation states: >>>>> 'This can be useful for evaluating system behaviour under different >>>>> governors without the need for a reboot.' >>>>> With the scenario of fast-switch this resetting complicates the >>>>> implementation and the justification of having it just for experiments >>>>> avoiding reboot is IMO weak. The real production code would have to pay >>>>> extra cycles every time. Also, we would probably not experiment with >>>>> cpufreq different governors, since the SchedUtil is considered the best >>>>> option. >>>> >>>> It would still be good to have a way to test it against the other >>>> available options, though. >>>> >>> >>> Experimenting with different governors would still be possible, just >>> the user-space would have to take a snapshot of the stats when switching >>> to a new governor. Then the values presented in the stats would just >>> need to be calculated in this user tool against the snapshot. >>> >>> The resetting is also not that bad, since nowadays more components >>> maintain some kind of local statistics/history (scheduler, thermal). >>> I would recommend to reset the whole system and repeat the same tests >>> with different governor, just to be sure that everything starts from >>> similar state (utilization, temperature, other devfreq devices >>> frequencies etc). >> >> Well, if everyone agrees on removing the reset feature, let's drop the >> sysfs attr too, as it would be useless going forward. >> >> Admittedly, I don't have a strong opinion and since intel_pstate >> doesn't use a frequency table, this is not relevant for systems using >> that driver anyway. > > I added this file sometime back as it made my life a lot easier while testing > some scheduler related changes and see how they affect cpufreq updates. IMO this > is a useful feature and we don't really need to get rid of it. > > Lets see where the discussion goes about the feedback you gave. > Because of supporting this reset file, the code is going to be a bit complex and also visited from the scheduler. I don't know if the config for stats is enabled for production kernels but if yes, then forcing all to keep that reset code might be too much. For the engineering kernel version is OK. I would say for most normal checks these sysfs stats are very useful. If there is a need for investigation like you described, the trace event is there, just have to be enabled. Tools like LISA would help with parsing the trace and mapping to some plots or even merging with scheduler context. From time to time some engineers are asking why the stats don't show the values (missing fast-switch tracking). I think they are interested in a simple use case, otherwise they would use the tracing. Regards, Lukasz ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-24 16:10 ` Lukasz Luba @ 2020-09-25 6:09 ` Viresh Kumar 2020-09-25 8:10 ` Lukasz Luba 0 siblings, 1 reply; 23+ messages in thread From: Viresh Kumar @ 2020-09-25 6:09 UTC (permalink / raw) To: Lukasz Luba Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On 24-09-20, 17:10, Lukasz Luba wrote: > Because of supporting this reset file, the code is going to be a bit > complex I will say not very straight forward, but it isn't complex as well. > and also visited from the scheduler. I don't know if the > config for stats is enabled for production kernels but if yes, > then forcing all to keep that reset code might be too much. > For the engineering kernel version is OK. I am not sure either if they are enabled for production kernels, but even if it then also this code won't hit performance. > I would say for most normal checks these sysfs stats are very useful. > If there is a need for investigation like you described, the trace > event is there, just have to be enabled. Tools like LISA would > help with parsing the trace and mapping to some plots or even > merging with scheduler context. Right, but stats is much easier in my opinion and providing this reset functionality does make it easier to track. And that it is already there now. > From time to time some engineers are asking why the stats > don't show the values (missing fast-switch tracking). I think > they are interested in a simple use case, otherwise they would use the > tracing. Right and I completely agree with that and so this patchset. I think there aren't any serious race conditions here that would make things bad for anyone and that this patchset will eventually get in after a little rearrangement. -- viresh ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-25 6:09 ` Viresh Kumar @ 2020-09-25 8:10 ` Lukasz Luba 0 siblings, 0 replies; 23+ messages in thread From: Lukasz Luba @ 2020-09-25 8:10 UTC (permalink / raw) To: Viresh Kumar Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On 9/25/20 7:09 AM, Viresh Kumar wrote: > On 24-09-20, 17:10, Lukasz Luba wrote: >> Because of supporting this reset file, the code is going to be a bit >> complex > > I will say not very straight forward, but it isn't complex as well. > >> and also visited from the scheduler. I don't know if the >> config for stats is enabled for production kernels but if yes, >> then forcing all to keep that reset code might be too much. >> For the engineering kernel version is OK. > > I am not sure either if they are enabled for production kernels, but even if it > then also this code won't hit performance. > >> I would say for most normal checks these sysfs stats are very useful. >> If there is a need for investigation like you described, the trace >> event is there, just have to be enabled. Tools like LISA would >> help with parsing the trace and mapping to some plots or even >> merging with scheduler context. > > Right, but stats is much easier in my opinion and providing this reset > functionality does make it easier to track. And that it is already there now. > >> From time to time some engineers are asking why the stats >> don't show the values (missing fast-switch tracking). I think >> they are interested in a simple use case, otherwise they would use the >> tracing. > > Right and I completely agree with that and so this patchset. I think there > aren't any serious race conditions here that would make things bad for anyone > and that this patchset will eventually get in after a little rearrangement. > Fair enough, let see the final implementation. We can check it with perf the speculative exec and cache util. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-23 13:48 ` Rafael J. Wysocki 2020-09-24 9:25 ` Lukasz Luba @ 2020-09-24 13:15 ` Viresh Kumar 2020-09-25 10:04 ` Rafael J. Wysocki 1 sibling, 1 reply; 23+ messages in thread From: Viresh Kumar @ 2020-09-24 13:15 UTC (permalink / raw) To: Rafael J. Wysocki Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On 23-09-20, 15:48, Rafael J. Wysocki wrote: > On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > > > In order to prepare for lock-less stats update, add support to defer any > > updates to it until cpufreq_stats_record_transition() is called. > > This is a bit devoid of details. > > I guess you mean reset in particular, but that's not clear from the above. We used to update the stats from two places earlier, reset and show_time_in_state, the later got removed with this patch. > Also, it would be useful to describe the design somewhat. Okay. I will work on it. > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > > --- > > drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- > > 1 file changed, 56 insertions(+), 19 deletions(-) > > > > diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c > > index 94d959a8e954..3e7eee29ee86 100644 > > --- a/drivers/cpufreq/cpufreq_stats.c > > +++ b/drivers/cpufreq/cpufreq_stats.c > > @@ -22,17 +22,22 @@ struct cpufreq_stats { > > spinlock_t lock; > > unsigned int *freq_table; > > unsigned int *trans_table; > > + > > + /* Deferred reset */ > > + unsigned int reset_pending; > > + unsigned long long reset_time; > > }; > > > > -static void cpufreq_stats_update(struct cpufreq_stats *stats) > > +static void cpufreq_stats_update(struct cpufreq_stats *stats, > > + unsigned long long time) > > { > > unsigned long long cur_time = get_jiffies_64(); > > > > - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; > > + stats->time_in_state[stats->last_index] += cur_time - time; > > stats->last_time = cur_time; > > } > > > > -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > > +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) > > { > > unsigned int count = stats->max_state; > > > > @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > > memset(stats->trans_table, 0, count * count * sizeof(int)); > > stats->last_time = get_jiffies_64(); > > stats->total_trans = 0; > > + > > + /* Adjust for the time elapsed since reset was requested */ > > + WRITE_ONCE(stats->reset_pending, 0); > > What if this runs in parallel with store_reset()? > > The latter may update reset_pending to 1 before the below runs. > Conversely, this may clear reset_pending right after store_reset() has > set it to 1, but before it manages to set reset_time. Is that not a > problem? What I tried to do here with reset_time is to capture the delta time-in-state that has elapsed since the last time reset was called and reset was actually performed, so we keep showing it in stats. In the first race you mentioned, whatever update we make here won't matter much as reset-pending will be 1 and so below stats will not be used anywhere. In the second race, we may get the delta time-in-state wrong, since this is just stats it shouldn't matter much. But still I think we need to fix the way we do reset as you pointed out. More later. > > + cpufreq_stats_update(stats, stats->reset_time); > > > > spin_unlock(&stats->lock); > > } > > > > static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) > > { > > - return sprintf(buf, "%d\n", policy->stats->total_trans); > > + struct cpufreq_stats *stats = policy->stats; > > + > > + if (READ_ONCE(stats->reset_pending)) > > + return sprintf(buf, "%d\n", 0); > > + else > > + return sprintf(buf, "%d\n", stats->total_trans); > > } > > cpufreq_freq_attr_ro(total_trans); > > > > static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) > > { > > struct cpufreq_stats *stats = policy->stats; > > + bool pending = READ_ONCE(stats->reset_pending); > > + unsigned long long time; > > ssize_t len = 0; > > int i; > > > > if (policy->fast_switch_enabled) > > return 0; > > > > - spin_lock(&stats->lock); > > - cpufreq_stats_update(stats); > > - spin_unlock(&stats->lock); > > - > > for (i = 0; i < stats->state_num; i++) { > > + if (pending) { > > + if (i == stats->last_index) > > + time = get_jiffies_64() - stats->reset_time; > > What if this runs in parallel with store_reset() and reads reset_time > before the latter manages to update it? We should update reset-time first I think. More later. > > + else > > + time = 0; > > + } else { > > + time = stats->time_in_state[i]; > > + if (i == stats->last_index) > > + time += get_jiffies_64() - stats->last_time; > > + } > > + > > len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], > > - (unsigned long long) > > - jiffies_64_to_clock_t(stats->time_in_state[i])); > > + jiffies_64_to_clock_t(time)); > > } > > return len; > > } > > cpufreq_freq_attr_ro(time_in_state); > > > > +/* We don't care what is written to the attribute */ > > static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, > > size_t count) > > { > > - /* We don't care what is written to the attribute. */ > > - cpufreq_stats_clear_table(policy->stats); > > + struct cpufreq_stats *stats = policy->stats; > > + > > + /* > > + * Defer resetting of stats to cpufreq_stats_record_transition() to > > + * avoid races. > > + */ > > + WRITE_ONCE(stats->reset_pending, 1); > > + stats->reset_time = get_jiffies_64(); > > AFAICS, there is nothing to ensure that reset_time will be updated in > one go and even to ensure that it won't be partially updated before > setting reset_pending. > > This should at least be addressed in a comment to explain why it is > not a problem. I propose something like this for it: WRITE_ONCE(stats->reset_time, get_jiffies_64()); WRITE_ONCE(stats->reset_pending, 1); Whatever race happens here, the worst would be like getting time-in-state for one of the frequencies wrong only once and I don't think that should be a real problem as this is just stats and won't make the kernel behave badly. > > + > > return count; > > } > > cpufreq_freq_attr_wo(reset); > > @@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset); > > static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) > > { > > struct cpufreq_stats *stats = policy->stats; > > + bool pending = READ_ONCE(stats->reset_pending); > > ssize_t len = 0; > > - int i, j; > > + int i, j, count; > > > > if (policy->fast_switch_enabled) > > return 0; > > @@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) > > for (j = 0; j < stats->state_num; j++) { > > if (len >= PAGE_SIZE) > > break; > > - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", > > - stats->trans_table[i*stats->max_state+j]); > > + > > + if (pending) > > + count = 0; > > + else > > + count = stats->trans_table[i * stats->max_state + j]; > > + > > + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); > > } > > if (len >= PAGE_SIZE) > > break; > > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, > > struct cpufreq_stats *stats = policy->stats; > > int old_index, new_index; > > > > - if (!stats) { > > - pr_debug("%s: No stats found\n", __func__); > > + if (!stats) > > return; > > - } > > + > > + if (READ_ONCE(stats->reset_pending)) > > + cpufreq_stats_reset_table(stats); > > This is a bit confusing, because cpufreq_stats_reset_table() calls > cpufreq_stats_update() and passes reset_time to it, but it is called > again below with last_time as the second arg. > > It is not particularly clear to me why this needs to be done this way. Because we need to account for the elapsed time-in-state, for the currently running frequency. When reset-stats is called, we note down the reset-time, then if reset was pending then we call reset-table which calls cpufreq_stats_update(reset_time), and so we account for the elapsed time in current state, where we also update last-time. The second call here won't update much in case reset was pending, but will update normally otherwise. -- viresh ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-24 13:15 ` Viresh Kumar @ 2020-09-25 10:04 ` Rafael J. Wysocki 2020-09-25 10:58 ` Viresh Kumar 0 siblings, 1 reply; 23+ messages in thread From: Rafael J. Wysocki @ 2020-09-25 10:04 UTC (permalink / raw) To: Viresh Kumar Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On Thu, Sep 24, 2020 at 3:15 PM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 23-09-20, 15:48, Rafael J. Wysocki wrote: > > On Wed, Sep 16, 2020 at 8:45 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > > > > > In order to prepare for lock-less stats update, add support to defer any > > > updates to it until cpufreq_stats_record_transition() is called. > > > > This is a bit devoid of details. > > > > I guess you mean reset in particular, but that's not clear from the above. > > We used to update the stats from two places earlier, reset and > show_time_in_state, the later got removed with this patch. > > > Also, it would be useful to describe the design somewhat. > > Okay. I will work on it. > > > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > > > --- > > > drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- > > > 1 file changed, 56 insertions(+), 19 deletions(-) > > > > > > diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c > > > index 94d959a8e954..3e7eee29ee86 100644 > > > --- a/drivers/cpufreq/cpufreq_stats.c > > > +++ b/drivers/cpufreq/cpufreq_stats.c > > > @@ -22,17 +22,22 @@ struct cpufreq_stats { > > > spinlock_t lock; > > > unsigned int *freq_table; > > > unsigned int *trans_table; > > > + > > > + /* Deferred reset */ > > > + unsigned int reset_pending; > > > + unsigned long long reset_time; > > > }; > > > > > > -static void cpufreq_stats_update(struct cpufreq_stats *stats) > > > +static void cpufreq_stats_update(struct cpufreq_stats *stats, > > > + unsigned long long time) > > > { > > > unsigned long long cur_time = get_jiffies_64(); > > > > > > - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; > > > + stats->time_in_state[stats->last_index] += cur_time - time; > > > stats->last_time = cur_time; > > > } > > > > > > -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > > > +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) > > > { > > > unsigned int count = stats->max_state; > > > > > > @@ -41,42 +46,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) > > > memset(stats->trans_table, 0, count * count * sizeof(int)); > > > stats->last_time = get_jiffies_64(); > > > stats->total_trans = 0; > > > + > > > + /* Adjust for the time elapsed since reset was requested */ > > > + WRITE_ONCE(stats->reset_pending, 0); > > > > What if this runs in parallel with store_reset()? > > > > The latter may update reset_pending to 1 before the below runs. > > Conversely, this may clear reset_pending right after store_reset() has > > set it to 1, but before it manages to set reset_time. Is that not a > > problem? > > What I tried to do here with reset_time is to capture the delta time-in-state > that has elapsed since the last time reset was called and reset was actually > performed, so we keep showing it in stats. > > In the first race you mentioned, whatever update we make here won't matter much > as reset-pending will be 1 and so below stats will not be used anywhere. > > In the second race, we may get the delta time-in-state wrong, since this is just > stats it shouldn't matter much. But still I think we need to fix the way we do > reset as you pointed out. More later. > > > > + cpufreq_stats_update(stats, stats->reset_time); > > > > > > spin_unlock(&stats->lock); > > > } > > > > > > static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) > > > { > > > - return sprintf(buf, "%d\n", policy->stats->total_trans); > > > + struct cpufreq_stats *stats = policy->stats; > > > + > > > + if (READ_ONCE(stats->reset_pending)) > > > + return sprintf(buf, "%d\n", 0); > > > + else > > > + return sprintf(buf, "%d\n", stats->total_trans); > > > } > > > cpufreq_freq_attr_ro(total_trans); > > > > > > static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) > > > { > > > struct cpufreq_stats *stats = policy->stats; > > > + bool pending = READ_ONCE(stats->reset_pending); > > > + unsigned long long time; > > > ssize_t len = 0; > > > int i; > > > > > > if (policy->fast_switch_enabled) > > > return 0; > > > > > > - spin_lock(&stats->lock); > > > - cpufreq_stats_update(stats); > > > - spin_unlock(&stats->lock); > > > - > > > for (i = 0; i < stats->state_num; i++) { > > > + if (pending) { > > > + if (i == stats->last_index) > > > + time = get_jiffies_64() - stats->reset_time; > > > > What if this runs in parallel with store_reset() and reads reset_time > > before the latter manages to update it? > > We should update reset-time first I think. More later. > > > > + else > > > + time = 0; > > > + } else { > > > + time = stats->time_in_state[i]; > > > + if (i == stats->last_index) > > > + time += get_jiffies_64() - stats->last_time; > > > + } > > > + > > > len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], > > > - (unsigned long long) > > > - jiffies_64_to_clock_t(stats->time_in_state[i])); > > > + jiffies_64_to_clock_t(time)); > > > } > > > return len; > > > } > > > cpufreq_freq_attr_ro(time_in_state); > > > > > > +/* We don't care what is written to the attribute */ > > > static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, > > > size_t count) > > > { > > > - /* We don't care what is written to the attribute. */ > > > - cpufreq_stats_clear_table(policy->stats); > > > + struct cpufreq_stats *stats = policy->stats; > > > + > > > + /* > > > + * Defer resetting of stats to cpufreq_stats_record_transition() to > > > + * avoid races. > > > + */ > > > + WRITE_ONCE(stats->reset_pending, 1); > > > + stats->reset_time = get_jiffies_64(); > > > > AFAICS, there is nothing to ensure that reset_time will be updated in > > one go and even to ensure that it won't be partially updated before > > setting reset_pending. > > > > This should at least be addressed in a comment to explain why it is > > not a problem. > > I propose something like this for it: > > WRITE_ONCE(stats->reset_time, get_jiffies_64()); > WRITE_ONCE(stats->reset_pending, 1); > > Whatever race happens here, the worst would be like getting time-in-state for > one of the frequencies wrong only once and I don't think that should be a > real problem as this is just stats and won't make the kernel behave badly. I'm actually wondering if reset_time is necessary at all. If cpufreq_stats_record_transition() is the only updater of the stats, which will be the case after applying this series IIUC, it may as well simply set the new starting point and discard all of the data collected so far if reset_pending is set. IOW, the time when the reset has been requested isn't particularly relevant IMV (and it is not exact anyway), because the user is basically asking for discarding "history" and that may very well be interpreted to include the current sample. > > > + > > > return count; > > > } > > > cpufreq_freq_attr_wo(reset); > > > @@ -84,8 +114,9 @@ cpufreq_freq_attr_wo(reset); > > > static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) > > > { > > > struct cpufreq_stats *stats = policy->stats; > > > + bool pending = READ_ONCE(stats->reset_pending); > > > ssize_t len = 0; > > > - int i, j; > > > + int i, j, count; > > > > > > if (policy->fast_switch_enabled) > > > return 0; > > > @@ -113,8 +144,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) > > > for (j = 0; j < stats->state_num; j++) { > > > if (len >= PAGE_SIZE) > > > break; > > > - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", > > > - stats->trans_table[i*stats->max_state+j]); > > > + > > > + if (pending) > > > + count = 0; > > > + else > > > + count = stats->trans_table[i * stats->max_state + j]; > > > + > > > + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); > > > } > > > if (len >= PAGE_SIZE) > > > break; > > > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, > > > struct cpufreq_stats *stats = policy->stats; > > > int old_index, new_index; > > > > > > - if (!stats) { > > > - pr_debug("%s: No stats found\n", __func__); > > > + if (!stats) > > > return; > > > - } > > > + > > > + if (READ_ONCE(stats->reset_pending)) > > > + cpufreq_stats_reset_table(stats); > > > > This is a bit confusing, because cpufreq_stats_reset_table() calls > > cpufreq_stats_update() and passes reset_time to it, but it is called > > again below with last_time as the second arg. > > > > It is not particularly clear to me why this needs to be done this way. > > Because we need to account for the elapsed time-in-state, for the currently > running frequency. > > When reset-stats is called, we note down the reset-time, then if reset was > pending then we call reset-table which calls cpufreq_stats_update(reset_time), > and so we account for the elapsed time in current state, where we also update > last-time. > > The second call here won't update much in case reset was pending, but will > update normally otherwise. > > -- > viresh ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-25 10:04 ` Rafael J. Wysocki @ 2020-09-25 10:58 ` Viresh Kumar 2020-09-25 11:09 ` Rafael J. Wysocki 0 siblings, 1 reply; 23+ messages in thread From: Viresh Kumar @ 2020-09-25 10:58 UTC (permalink / raw) To: Rafael J. Wysocki Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On 25-09-20, 12:04, Rafael J. Wysocki wrote: > I'm actually wondering if reset_time is necessary at all. > > If cpufreq_stats_record_transition() is the only updater of the stats, > which will be the case after applying this series IIUC, it may as well > simply set the new starting point and discard all of the data > collected so far if reset_pending is set. > > IOW, the time when the reset has been requested isn't particularly > relevant IMV (and it is not exact anyway), because the user is > basically asking for discarding "history" and that may very well be > interpreted to include the current sample. There are times when this would be visible to userspace and won't look nice. Like, set governor to performance, reset the stats and after 10 seconds, read the stats again, everything will be 0. Because cpufreq_stats_record_transition() doesn't get called at all here, we would never clear them until the time governor is changed and so we need to keep a track of reset-time. -- viresh ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-25 10:58 ` Viresh Kumar @ 2020-09-25 11:09 ` Rafael J. Wysocki 2020-09-25 11:26 ` Viresh Kumar 0 siblings, 1 reply; 23+ messages in thread From: Rafael J. Wysocki @ 2020-09-25 11:09 UTC (permalink / raw) To: Viresh Kumar Cc: Rafael J. Wysocki, Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On Fri, Sep 25, 2020 at 12:58 PM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 25-09-20, 12:04, Rafael J. Wysocki wrote: > > I'm actually wondering if reset_time is necessary at all. > > > > If cpufreq_stats_record_transition() is the only updater of the stats, > > which will be the case after applying this series IIUC, it may as well > > simply set the new starting point and discard all of the data > > collected so far if reset_pending is set. > > > > IOW, the time when the reset has been requested isn't particularly > > relevant IMV (and it is not exact anyway), because the user is > > basically asking for discarding "history" and that may very well be > > interpreted to include the current sample. > > There are times when this would be visible to userspace and won't look nice. > > Like, set governor to performance, reset the stats and after 10 seconds, read > the stats again, everything will be 0. Unless I'm missing something, the real reset happens when cpufreq_stats_record_transition() runs next time, so the old stats will still be visible at that point, won't they? > Because cpufreq_stats_record_transition() > doesn't get called at all here, we would never clear them until the time > governor is changed and so we need to keep a track of reset-time. Or trigger a forced update. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-25 11:09 ` Rafael J. Wysocki @ 2020-09-25 11:26 ` Viresh Kumar 0 siblings, 0 replies; 23+ messages in thread From: Viresh Kumar @ 2020-09-25 11:26 UTC (permalink / raw) To: Rafael J. Wysocki Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, Cristian Marussi, Sudeep Holla, Linux Kernel Mailing List On Fri, 25 Sep 2020 at 16:39, Rafael J. Wysocki <rafael@kernel.org> wrote: > > On Fri, Sep 25, 2020 at 12:58 PM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > > > On 25-09-20, 12:04, Rafael J. Wysocki wrote: > > > I'm actually wondering if reset_time is necessary at all. > > > > > > If cpufreq_stats_record_transition() is the only updater of the stats, > > > which will be the case after applying this series IIUC, it may as well > > > simply set the new starting point and discard all of the data > > > collected so far if reset_pending is set. > > > > > > IOW, the time when the reset has been requested isn't particularly > > > relevant IMV (and it is not exact anyway), because the user is > > > basically asking for discarding "history" and that may very well be > > > interpreted to include the current sample. > > > > There are times when this would be visible to userspace and won't look nice. > > > > Like, set governor to performance, reset the stats and after 10 seconds, read > > the stats again, everything will be 0. > > Unless I'm missing something, the real reset happens when > cpufreq_stats_record_transition() runs next time, so the old stats > will still be visible at that point, won't they? For userspace the stats shouldn't be visible after reset is requested by it and so with this series, we check for reset-pending in all the show_*() helpers and print stats since the time reset was requested. > > Because cpufreq_stats_record_transition() > > doesn't get called at all here, we would never clear them until the time > > governor is changed and so we need to keep a track of reset-time. > > Or trigger a forced update. That would add races while updating the actual stats. And so I found the current way to be more reliable. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-16 6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar 2020-09-23 13:48 ` Rafael J. Wysocki @ 2020-09-25 8:21 ` Lukasz Luba 2020-09-25 10:46 ` Viresh Kumar 1 sibling, 1 reply; 23+ messages in thread From: Lukasz Luba @ 2020-09-25 8:21 UTC (permalink / raw) To: Viresh Kumar, Rafael Wysocki Cc: linux-pm, Vincent Guittot, cristian.marussi, sudeep.holla, linux-kernel On 9/16/20 7:45 AM, Viresh Kumar wrote: > In order to prepare for lock-less stats update, add support to defer any > updates to it until cpufreq_stats_record_transition() is called. > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > --- > drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- > 1 file changed, 56 insertions(+), 19 deletions(-) > [snip] > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, > struct cpufreq_stats *stats = policy->stats; > int old_index, new_index; > > - if (!stats) { > - pr_debug("%s: No stats found\n", __func__); > + if (!stats) > return; > - } > + > + if (READ_ONCE(stats->reset_pending)) > + cpufreq_stats_reset_table(stats); > This is in the hot path code, called from the scheduler. I wonder if we avoid it or make that branch 'unlikely'? if (unlikely(READ_ONCE(stats->reset_pending))) Probably the CPU (when it has good prefetcher) would realize about it, but maybe we can help a bit here. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() 2020-09-25 8:21 ` Lukasz Luba @ 2020-09-25 10:46 ` Viresh Kumar 0 siblings, 0 replies; 23+ messages in thread From: Viresh Kumar @ 2020-09-25 10:46 UTC (permalink / raw) To: Lukasz Luba Cc: Rafael Wysocki, linux-pm, Vincent Guittot, cristian.marussi, sudeep.holla, linux-kernel On 25-09-20, 09:21, Lukasz Luba wrote: > > > On 9/16/20 7:45 AM, Viresh Kumar wrote: > > In order to prepare for lock-less stats update, add support to defer any > > updates to it until cpufreq_stats_record_transition() is called. > > > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > > --- > > drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- > > 1 file changed, 56 insertions(+), 19 deletions(-) > > > > [snip] > > > @@ -228,10 +264,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, > > struct cpufreq_stats *stats = policy->stats; > > int old_index, new_index; > > - if (!stats) { > > - pr_debug("%s: No stats found\n", __func__); > > + if (!stats) > > return; > > - } > > + > > + if (READ_ONCE(stats->reset_pending)) > > + cpufreq_stats_reset_table(stats); > > This is in the hot path code, called from the scheduler. I wonder if we > avoid it or make that branch 'unlikely'? > > if (unlikely(READ_ONCE(stats->reset_pending))) > > Probably the CPU (when it has good prefetcher) would realize about it, > but maybe we can help a bit here. Sure. -- viresh ^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH V2 2/4] cpufreq: stats: Remove locking 2020-09-16 6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar @ 2020-09-16 6:45 ` Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core Viresh Kumar 3 siblings, 0 replies; 23+ messages in thread From: Viresh Kumar @ 2020-09-16 6:45 UTC (permalink / raw) To: Rafael Wysocki, Viresh Kumar Cc: linux-pm, Vincent Guittot, Lukasz Luba, cristian.marussi, sudeep.holla, linux-kernel The locking isn't required anymore as stats can get updated only from one place at a time. Remove it. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/cpufreq_stats.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 3e7eee29ee86..314fa1d506d0 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -19,7 +19,6 @@ struct cpufreq_stats { unsigned int state_num; unsigned int last_index; u64 *time_in_state; - spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; @@ -41,7 +40,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - spin_lock(&stats->lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); @@ -50,7 +48,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) /* Adjust for the time elapsed since reset was requested */ WRITE_ONCE(stats->reset_pending, 0); cpufreq_stats_update(stats, stats->reset_time); - spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) @@ -244,7 +241,6 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->state_num = i; stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); - spin_lock_init(&stats->lock); policy->stats = stats; ret = sysfs_create_group(&policy->kobj, &stats_attr_group); @@ -277,11 +273,9 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (old_index == -1 || new_index == -1 || old_index == new_index) return; - spin_lock(&stats->lock); cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; - spin_unlock(&stats->lock); } -- 2.25.0.rc1.19.g042ed3e048af ^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well 2020-09-16 6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 2/4] cpufreq: stats: Remove locking Viresh Kumar @ 2020-09-16 6:45 ` Viresh Kumar 2020-09-23 15:14 ` Rafael J. Wysocki 2020-09-16 6:45 ` [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core Viresh Kumar 3 siblings, 1 reply; 23+ messages in thread From: Viresh Kumar @ 2020-09-16 6:45 UTC (permalink / raw) To: Rafael Wysocki, Viresh Kumar Cc: linux-pm, Vincent Guittot, Lukasz Luba, cristian.marussi, sudeep.holla, linux-kernel Now that all the blockers are gone for enabling stats in fast-switching case, enable it. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/cpufreq.c | 6 +++++- drivers/cpufreq/cpufreq_stats.c | 6 ------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 47aa90f9a7c2..d5fe64e96be9 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2057,8 +2057,12 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { target_freq = clamp_val(target_freq, policy->min, policy->max); + target_freq = cpufreq_driver->fast_switch(policy, target_freq); - return cpufreq_driver->fast_switch(policy, target_freq); + if (target_freq) + cpufreq_stats_record_transition(policy, target_freq); + + return target_freq; } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 314fa1d506d0..daea17f0c36c 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -69,9 +69,6 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i; - if (policy->fast_switch_enabled) - return 0; - for (i = 0; i < stats->state_num; i++) { if (pending) { if (i == stats->last_index) @@ -115,9 +112,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i, j, count; - if (policy->fast_switch_enabled) - return 0; - len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += scnprintf(buf + len, PAGE_SIZE - len, " : "); for (i = 0; i < stats->state_num; i++) { -- 2.25.0.rc1.19.g042ed3e048af ^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well 2020-09-16 6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar @ 2020-09-23 15:14 ` Rafael J. Wysocki 2020-09-23 15:17 ` Rafael J. Wysocki 0 siblings, 1 reply; 23+ messages in thread From: Rafael J. Wysocki @ 2020-09-23 15:14 UTC (permalink / raw) To: Viresh Kumar Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On Wed, Sep 16, 2020 at 8:46 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > Now that all the blockers are gone for enabling stats in fast-switching > case, enable it. > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > --- > drivers/cpufreq/cpufreq.c | 6 +++++- > drivers/cpufreq/cpufreq_stats.c | 6 ------ > 2 files changed, 5 insertions(+), 7 deletions(-) > > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > index 47aa90f9a7c2..d5fe64e96be9 100644 > --- a/drivers/cpufreq/cpufreq.c > +++ b/drivers/cpufreq/cpufreq.c > @@ -2057,8 +2057,12 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, > unsigned int target_freq) > { > target_freq = clamp_val(target_freq, policy->min, policy->max); > + target_freq = cpufreq_driver->fast_switch(policy, target_freq); > > - return cpufreq_driver->fast_switch(policy, target_freq); > + if (target_freq) > + cpufreq_stats_record_transition(policy, target_freq); So this adds two extra branches in the scheduler path for the cases when the stats are not used at all which seems avoidable to some extent. Can we check policy->stats upfront here and bail out right away if it is not set, for example? > + > + return target_freq; > } > EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); > > diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c > index 314fa1d506d0..daea17f0c36c 100644 > --- a/drivers/cpufreq/cpufreq_stats.c > +++ b/drivers/cpufreq/cpufreq_stats.c > @@ -69,9 +69,6 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) > ssize_t len = 0; > int i; > > - if (policy->fast_switch_enabled) > - return 0; > - > for (i = 0; i < stats->state_num; i++) { > if (pending) { > if (i == stats->last_index) > @@ -115,9 +112,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) > ssize_t len = 0; > int i, j, count; > > - if (policy->fast_switch_enabled) > - return 0; > - > len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); > len += scnprintf(buf + len, PAGE_SIZE - len, " : "); > for (i = 0; i < stats->state_num; i++) { > -- > 2.25.0.rc1.19.g042ed3e048af > ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well 2020-09-23 15:14 ` Rafael J. Wysocki @ 2020-09-23 15:17 ` Rafael J. Wysocki 0 siblings, 0 replies; 23+ messages in thread From: Rafael J. Wysocki @ 2020-09-23 15:17 UTC (permalink / raw) To: Viresh Kumar Cc: Rafael Wysocki, Linux PM, Vincent Guittot, Lukasz Luba, cristian.marussi, Sudeep Holla, Linux Kernel Mailing List On Wed, Sep 23, 2020 at 5:14 PM Rafael J. Wysocki <rafael@kernel.org> wrote: > > On Wed, Sep 16, 2020 at 8:46 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > > > Now that all the blockers are gone for enabling stats in fast-switching > > case, enable it. > > > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > > --- > > drivers/cpufreq/cpufreq.c | 6 +++++- > > drivers/cpufreq/cpufreq_stats.c | 6 ------ > > 2 files changed, 5 insertions(+), 7 deletions(-) > > > > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > > index 47aa90f9a7c2..d5fe64e96be9 100644 > > --- a/drivers/cpufreq/cpufreq.c > > +++ b/drivers/cpufreq/cpufreq.c > > @@ -2057,8 +2057,12 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, > > unsigned int target_freq) > > { > > target_freq = clamp_val(target_freq, policy->min, policy->max); > > + target_freq = cpufreq_driver->fast_switch(policy, target_freq); > > > > - return cpufreq_driver->fast_switch(policy, target_freq); > > + if (target_freq) > > + cpufreq_stats_record_transition(policy, target_freq); > > So this adds two extra branches in the scheduler path for the cases > when the stats are not used at all which seems avoidable to some > extent. > > Can we check policy->stats upfront here and bail out right away if it > is not set, for example? Well, scratch this, the next patch fixes it up. Cheers! ^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core 2020-09-16 6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar ` (2 preceding siblings ...) 2020-09-16 6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar @ 2020-09-16 6:45 ` Viresh Kumar 3 siblings, 0 replies; 23+ messages in thread From: Viresh Kumar @ 2020-09-16 6:45 UTC (permalink / raw) To: Rafael Wysocki, Viresh Kumar, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman Cc: linux-pm, Lukasz Luba, cristian.marussi, sudeep.holla, linux-kernel The cpufreq core handles the updates to policy->cur and recording of cpufreq trace events for all the governors except schedutil's fast switch case. Move that as well to cpufreq core for consistency and readability. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/cpufreq/cpufreq.c | 14 ++++++++++++-- kernel/sched/cpufreq_schedutil.c | 12 +----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d5fe64e96be9..bc930f6ecff6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2056,11 +2056,21 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { + int cpu; + target_freq = clamp_val(target_freq, policy->min, policy->max); target_freq = cpufreq_driver->fast_switch(policy, target_freq); - if (target_freq) - cpufreq_stats_record_transition(policy, target_freq); + if (!target_freq) + return 0; + + policy->cur = target_freq; + cpufreq_stats_record_transition(policy, target_freq); + + if (trace_cpu_frequency_enabled()) { + for_each_cpu(cpu, policy->cpus) + trace_cpu_frequency(target_freq, cpu); + } return target_freq; } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e39008242cf4..28f6d1ad608b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -115,21 +115,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { struct cpufreq_policy *policy = sg_policy->policy; - int cpu; if (!sugov_update_next_freq(sg_policy, time, next_freq)) return; - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - - if (trace_cpu_frequency_enabled()) { - for_each_cpu(cpu, policy->cpus) - trace_cpu_frequency(next_freq, cpu); - } + cpufreq_driver_fast_switch(policy, next_freq); } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, -- 2.25.0.rc1.19.g042ed3e048af ^ permalink raw reply related [flat|nested] 23+ messages in thread
end of thread, other threads:[~2020-09-25 11:27 UTC | newest] Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2020-09-16 6:45 [PATCH V2 0/4] cpufreq: Record stats with fast-switching Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 1/4] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() Viresh Kumar 2020-09-23 13:48 ` Rafael J. Wysocki 2020-09-24 9:25 ` Lukasz Luba 2020-09-24 10:24 ` Rafael J. Wysocki 2020-09-24 11:00 ` Lukasz Luba 2020-09-24 11:07 ` Rafael J. Wysocki 2020-09-24 12:39 ` Viresh Kumar 2020-09-24 16:10 ` Lukasz Luba 2020-09-25 6:09 ` Viresh Kumar 2020-09-25 8:10 ` Lukasz Luba 2020-09-24 13:15 ` Viresh Kumar 2020-09-25 10:04 ` Rafael J. Wysocki 2020-09-25 10:58 ` Viresh Kumar 2020-09-25 11:09 ` Rafael J. Wysocki 2020-09-25 11:26 ` Viresh Kumar 2020-09-25 8:21 ` Lukasz Luba 2020-09-25 10:46 ` Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 2/4] cpufreq: stats: Remove locking Viresh Kumar 2020-09-16 6:45 ` [PATCH V2 3/4] cpufreq: stats: Enable stats for fast-switch as well Viresh Kumar 2020-09-23 15:14 ` Rafael J. Wysocki 2020-09-23 15:17 ` Rafael J. Wysocki 2020-09-16 6:45 ` [PATCH V2 4/4] cpufreq: Move traces and update to policy->cur to cpufreq core Viresh Kumar
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).