* [PATCH 2/2] Update cpupower to use the RDPRU instruction
2019-09-18 16:34 [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from Natarajan, Janakarajan
@ 2019-09-18 16:34 ` Natarajan, Janakarajan
2019-09-27 16:07 ` [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from Natarajan, Janakarajan
2019-09-27 18:59 ` shuah
2 siblings, 0 replies; 11+ messages in thread
From: Natarajan, Janakarajan @ 2019-09-18 16:34 UTC (permalink / raw)
To: linux-pm, linux-kernel
Cc: Thomas Renninger, Shuah Khan, Pu Wen, Borislav Petkov,
Allison Randal, Thomas Gleixner, Kate Stewart,
Greg Kroah-Hartman, Richard Fontana, Natarajan, Janakarajan
AMD Zen 2 introduces the RDPRU instruction which can be used to access some
processor registers which are typically only accessible in privilege level
0. ECX specifies the register to read and EDX:EAX will contain the value read.
ECX: 0 - Register MPERF
1 - Register APERF
This has the added advantage of not having to use the msr module, since the
userspace to kernel transitions which occur during each read_msr() might
cause APERF and MPERF to go out of sync.
Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
---
tools/power/cpupower/utils/helpers/cpuid.c | 4 ++++
tools/power/cpupower/utils/helpers/helpers.h | 1 +
.../utils/idle_monitor/mperf_monitor.c | 20 +++++++++++++++++++
3 files changed, 25 insertions(+)
diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c
index 5cc39d4e23ed..73bfafc60e9b 100644
--- a/tools/power/cpupower/utils/helpers/cpuid.c
+++ b/tools/power/cpupower/utils/helpers/cpuid.c
@@ -131,6 +131,10 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info)
if (ext_cpuid_level >= 0x80000007 &&
(cpuid_edx(0x80000007) & (1 << 9)))
cpu_info->caps |= CPUPOWER_CAP_AMD_CBP;
+
+ if (ext_cpuid_level >= 0x80000008 &&
+ cpuid_ebx(0x80000008) & (1 << 4))
+ cpu_info->caps |= CPUPOWER_CAP_AMD_RDPRU;
}
if (cpu_info->vendor == X86_VENDOR_INTEL) {
diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h
index 357b19bb136e..c258eeccd05f 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -69,6 +69,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL,
#define CPUPOWER_CAP_HAS_TURBO_RATIO 0x00000010
#define CPUPOWER_CAP_IS_SNB 0x00000020
#define CPUPOWER_CAP_INTEL_IDA 0x00000040
+#define CPUPOWER_CAP_AMD_RDPRU 0x00000080
#define CPUPOWER_AMD_CPBDIS 0x02000000
diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
index 8b072e39c897..faee00c078b0 100644
--- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
+++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
@@ -20,6 +20,10 @@
#define MSR_APERF 0xE8
#define MSR_MPERF 0xE7
+#define RDPRU ".byte 0x0f, 0x01, 0xfd"
+#define RDPRU_ECX_MPERF 0
+#define RDPRU_ECX_APERF 1
+
#define MSR_TSC 0x10
#define MSR_AMD_HWCR 0xc0010015
@@ -90,6 +94,8 @@ static int mperf_get_tsc(unsigned long long *tsc)
static int get_aperf_mperf(int cpu, unsigned long long *aval,
unsigned long long *mval)
{
+ unsigned long low_a, high_a;
+ unsigned long low_m, high_m;
cpu_set_t set;
int ret;
@@ -100,6 +106,20 @@ static int get_aperf_mperf(int cpu, unsigned long long *aval,
return 1;
}
+ if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_RDPRU) {
+ asm volatile(RDPRU
+ : "=a" (low_a), "=d" (high_a)
+ : "c" (RDPRU_ECX_APERF));
+ asm volatile(RDPRU
+ : "=a" (low_m), "=d" (high_m)
+ : "c" (RDPRU_ECX_MPERF));
+
+ *aval = ((low_a) | (high_a) << 32);
+ *mval = ((low_m) | (high_m) << 32);
+
+ return 0;
+ }
+
ret = read_msr(cpu, MSR_APERF, aval);
ret |= read_msr(cpu, MSR_MPERF, mval);
--
2.17.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-09-18 16:34 [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from Natarajan, Janakarajan
2019-09-18 16:34 ` [PATCH 2/2] Update cpupower to use the RDPRU instruction Natarajan, Janakarajan
@ 2019-09-27 16:07 ` Natarajan, Janakarajan
2019-09-27 21:48 ` Thomas Renninger
2019-09-27 18:59 ` shuah
2 siblings, 1 reply; 11+ messages in thread
From: Natarajan, Janakarajan @ 2019-09-27 16:07 UTC (permalink / raw)
To: linux-pm, linux-kernel
Cc: Thomas Renninger, Shuah Khan, Pu Wen, Borislav Petkov,
Allison Randal, Thomas Gleixner, Kate Stewart,
Greg Kroah-Hartman, Richard Fontana
On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
> Modify cpupower to schedule itself on each of the cpus in the system and
> then get the APERF/MPERF register values.
>
> This is advantageous because an IPI is not generated when a read_msr() is
> executed on the local logical CPU thereby reducing the chance of having
> APERF and MPERF being out of sync.
>
> Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
Any concerns regarding this patchset?
Thanks.
> ---
> .../utils/idle_monitor/mperf_monitor.c | 38 ++++++++++++++-----
> 1 file changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> index 44806a6dae11..8b072e39c897 100644
> --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> @@ -10,6 +10,7 @@
> #include <stdlib.h>
> #include <string.h>
> #include <limits.h>
> +#include <sched.h>
>
> #include <cpufreq.h>
>
> @@ -86,15 +87,33 @@ static int mperf_get_tsc(unsigned long long *tsc)
> return ret;
> }
>
> +static int get_aperf_mperf(int cpu, unsigned long long *aval,
> + unsigned long long *mval)
> +{
> + cpu_set_t set;
> + int ret;
> +
> + CPU_ZERO(&set);
> + CPU_SET(cpu, &set);
> + if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
> + dprint("Could not migrate to cpu: %d\n", cpu);
> + return 1;
> + }
> +
> + ret = read_msr(cpu, MSR_APERF, aval);
> + ret |= read_msr(cpu, MSR_MPERF, mval);
> +
> + return ret;
> +}
> +
> static int mperf_init_stats(unsigned int cpu)
> {
> - unsigned long long val;
> + unsigned long long aval, mval;
> int ret;
>
> - ret = read_msr(cpu, MSR_APERF, &val);
> - aperf_previous_count[cpu] = val;
> - ret |= read_msr(cpu, MSR_MPERF, &val);
> - mperf_previous_count[cpu] = val;
> + ret = get_aperf_mperf(cpu, &aval, &mval);
> + aperf_previous_count[cpu] = aval;
> + mperf_previous_count[cpu] = mval;
> is_valid[cpu] = !ret;
>
> return 0;
> @@ -102,13 +121,12 @@ static int mperf_init_stats(unsigned int cpu)
>
> static int mperf_measure_stats(unsigned int cpu)
> {
> - unsigned long long val;
> + unsigned long long aval, mval;
> int ret;
>
> - ret = read_msr(cpu, MSR_APERF, &val);
> - aperf_current_count[cpu] = val;
> - ret |= read_msr(cpu, MSR_MPERF, &val);
> - mperf_current_count[cpu] = val;
> + ret = get_aperf_mperf(cpu, &aval, &mval);
> + aperf_current_count[cpu] = aval;
> + mperf_current_count[cpu] = mval;
> is_valid[cpu] = !ret;
>
> return 0;
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-09-27 16:07 ` [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from Natarajan, Janakarajan
@ 2019-09-27 21:48 ` Thomas Renninger
2019-10-02 14:45 ` Natarajan, Janakarajan
0 siblings, 1 reply; 11+ messages in thread
From: Thomas Renninger @ 2019-09-27 21:48 UTC (permalink / raw)
To: Natarajan, Janakarajan
Cc: linux-kernel, linux-pm, Pu Wen, Shuah Khan, Thomas Gleixner,
Greg Kroah-Hartman, Kate Stewart, Allison Randal,
Richard Fontana, Thomas Renninger, Borislav Petkov
On Friday, September 27, 2019 6:07:56 PM CEST Natarajan, Janakarajan wrote:
> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
> > This is advantageous because an IPI is not generated when a read_msr() is
> > executed on the local logical CPU thereby reducing the chance of having
> > APERF and MPERF being out of sync.
> > + if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
> > + dprint("Could not migrate to cpu: %d\n", cpu);
> > + return 1;
On a 80 core cpu the process would be pushed around through the
system quite a lot.
This might affect what you are measuring or the other measure values?
Otherwise it's the kernel's MSR read only, not the whole cpupower process,
right? No idea about the exact overhead, though. Others in CC list should
know.
Afaik msr reads through msr module should be avoided anyway?
Those which are worth it are abstracted through sysfs nowadays?
For aperf/mperf it might make sense to define a sysfs file where you
can read both, as this is what you always need?
It would take a while, but could be a longterm solution which is also
usable in secure boot or without msr module case.
Thomas
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-09-27 21:48 ` Thomas Renninger
@ 2019-10-02 14:45 ` Natarajan, Janakarajan
2019-10-05 12:40 ` Thomas Renninger
0 siblings, 1 reply; 11+ messages in thread
From: Natarajan, Janakarajan @ 2019-10-02 14:45 UTC (permalink / raw)
To: Thomas Renninger, Natarajan, Janakarajan
Cc: linux-kernel, linux-pm, Pu Wen, Shuah Khan, Thomas Gleixner,
Greg Kroah-Hartman, Kate Stewart, Allison Randal,
Richard Fontana, Thomas Renninger, Borislav Petkov
On 9/27/19 4:48 PM, Thomas Renninger wrote:
> On Friday, September 27, 2019 6:07:56 PM CEST Natarajan, Janakarajan wrote:
>> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
>>> This is advantageous because an IPI is not generated when a read_msr() is
>>> executed on the local logical CPU thereby reducing the chance of having
>>> APERF and MPERF being out of sync.
>>> + if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
>>> + dprint("Could not migrate to cpu: %d\n", cpu);
>>> + return 1;
> On a 80 core cpu the process would be pushed around through the
> system quite a lot.
> This might affect what you are measuring or the other measure values?
> Otherwise it's the kernel's MSR read only, not the whole cpupower process,
> right? No idea about the exact overhead, though. Others in CC list should
> know.
On a 256 logical-cpu Rome system we see C0 value from cpupower output go
from 0.01 to ~(0.1 to 1.00)
for all cpus with the 1st patch.
However, this goes down to ~0.01 when we use the RDPRU instruction
(which can be used to get
APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
> Afaik msr reads through msr module should be avoided anyway?
> Those which are worth it are abstracted through sysfs nowadays?
>
> For aperf/mperf it might make sense to define a sysfs file where you
> can read both, as this is what you always need?
>
> It would take a while, but could be a longterm solution which is also
> usable in secure boot or without msr module case.
Yes. That is a good long term idea. An interface which could be used to
query APERF, MPERF
for a logical cpu in one go.
However, for systems that provide an instruction to get register values
from userspace, would a
command-line parameter be acceptable?
i.e. p: precise measurement.
When this is set, the cpupower process can migrate to each cpu and, if
an instruction is available
which can get the APERF/MPERF from CPL > 0, use it. That would cut out
the msr module and the
overhead can be reduced.
Thanks.
> Thomas
>
>
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-10-02 14:45 ` Natarajan, Janakarajan
@ 2019-10-05 12:40 ` Thomas Renninger
2019-10-07 21:11 ` Natarajan, Janakarajan
0 siblings, 1 reply; 11+ messages in thread
From: Thomas Renninger @ 2019-10-05 12:40 UTC (permalink / raw)
To: Natarajan, Janakarajan
Cc: linux-kernel, linux-pm, Pu Wen, Shuah Khan, Thomas Gleixner,
Greg Kroah-Hartman, Kate Stewart, Allison Randal,
Richard Fontana, Thomas Renninger, Borislav Petkov
Hi,
On Wednesday, October 2, 2019 4:45:03 PM CEST Natarajan, Janakarajan wrote:
> On 9/27/19 4:48 PM, Thomas Renninger wrote:
>
> > On Friday, September 27, 2019 6:07:56 PM CEST Natarajan, Janakarajan
> > wrote:
>
> >> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
> On a 256 logical-cpu Rome system we see C0 value from cpupower output go
> from 0.01 to ~(0.1 to 1.00)
>
> for all cpus with the 1st patch.
>
> However, this goes down to ~0.01 when we use the RDPRU instruction
> (which can be used to get
>
> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
And this one only exists on latest AMD cpus, right?
> However, for systems that provide an instruction to get register values
> from userspace, would a command-line parameter be acceptable?
Parameter sounds like a good idea. In fact, there already is such a parameter.
cpupower monitor --help
-c
Schedule the process on every core before starting and ending
measuring. This could be needed for the Idle_Stats monitor when no other MSR
based monitor (has to be run on the core that is measured) is run in parallel.
This is to wake up the processors from deeper sleep states and let the kernel
reaccount its cpuidle (C-state) information before reading the cpuidle timings
from sysfs.
Best is you exchange the order of your patches. The 2nd looks rather straight
forward and you can add my reviewed-by.
If you still need adjustings with -c param, they can be discussed separately.
It would also be nice to mention in which case it makes sense to use it in the
manpage or advantages/drawbacks if you don't.
Thanks!
Thomas
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-10-05 12:40 ` Thomas Renninger
@ 2019-10-07 21:11 ` Natarajan, Janakarajan
2019-10-10 11:22 ` Thomas Renninger
0 siblings, 1 reply; 11+ messages in thread
From: Natarajan, Janakarajan @ 2019-10-07 21:11 UTC (permalink / raw)
To: Thomas Renninger
Cc: linux-kernel, linux-pm, Pu Wen, Shuah Khan, Thomas Gleixner,
Greg Kroah-Hartman, Kate Stewart, Allison Randal,
Richard Fontana, Thomas Renninger, Borislav Petkov
On 10/5/2019 7:40 AM, Thomas Renninger wrote:
> Hi,
>
> On Wednesday, October 2, 2019 4:45:03 PM CEST Natarajan, Janakarajan wrote:
>> On 9/27/19 4:48 PM, Thomas Renninger wrote:
>>
>>> On Friday, September 27, 2019 6:07:56 PM CEST Natarajan, Janakarajan
>>> wrote:
>>>> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
>
>> On a 256 logical-cpu Rome system we see C0 value from cpupower output go
>> from 0.01 to ~(0.1 to 1.00)
>>
>> for all cpus with the 1st patch.
>>
>> However, this goes down to ~0.01 when we use the RDPRU instruction
>> (which can be used to get
>>
>> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
> And this one only exists on latest AMD cpus, right?
Yes. The RDPRU instruction exists only on AMD cpus.
>
>> However, for systems that provide an instruction to get register values
>> from userspace, would a command-line parameter be acceptable?
> Parameter sounds like a good idea. In fact, there already is such a parameter.
> cpupower monitor --help
> -c
> Schedule the process on every core before starting and ending
> measuring. This could be needed for the Idle_Stats monitor when no other MSR
> based monitor (has to be run on the core that is measured) is run in parallel.
> This is to wake up the processors from deeper sleep states and let the kernel
> reaccount its cpuidle (C-state) information before reading the cpuidle timings
> from sysfs.
>
> Best is you exchange the order of your patches. The 2nd looks rather straight
> forward and you can add my reviewed-by.
The RDPRU instruction reads the APERF/MPERF of the cpu on which it is
running. If we do
not schedule it on each cpu specifically, it will read the APERF/MPERF
of whichever cpu it happens to be running on, which will not be the
correct behavior.
>
> If you still need adjustings with -c param, they can be discussed separately.
The -c parameter causes cpupower to schedule itself on each of the cpus
of the system in a loop.
After the loop the cpupower starts the measurement of APERF/MPERF of
each cpu.
This doesn't offer the behavior needed to use RDPRU, which requires
cpupower to execute on the cpu
whose APERF/MPERF values we are interested in.
Thanks,
Janak
> It would also be nice to mention in which case it makes sense to use it in the
> manpage or advantages/drawbacks if you don't.
>
> Thanks!
>
> Thomas
>
>
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-10-07 21:11 ` Natarajan, Janakarajan
@ 2019-10-10 11:22 ` Thomas Renninger
2019-10-11 16:58 ` Natarajan, Janakarajan
0 siblings, 1 reply; 11+ messages in thread
From: Thomas Renninger @ 2019-10-10 11:22 UTC (permalink / raw)
To: Natarajan, Janakarajan
Cc: linux-kernel, linux-pm, Pu Wen, Shuah Khan, Thomas Gleixner,
Greg Kroah-Hartman, Kate Stewart, Allison Randal,
Richard Fontana, Borislav Petkov
On Monday, October 7, 2019 11:11:30 PM CEST Natarajan, Janakarajan wrote:
> On 10/5/2019 7:40 AM, Thomas Renninger wrote:
>
...
> >>
> >> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
> >
> > And this one only exists on latest AMD cpus, right?
>
> Yes. The RDPRU instruction exists only on AMD cpus.
> >
> >> However, for systems that provide an instruction to get register values
> >> from userspace, would a command-line parameter be acceptable?
> >
> > Parameter sounds like a good idea. In fact, there already is such a
> > parameter.
cpupower monitor --help
> >
> > -c
> >
> > Schedule the process on every core before starting and
> > ending
> >
> > measuring. This could be needed for the Idle_Stats monitor when no other
> > MSR based monitor (has to be run on the core that is measured) is run in
> > parallel. This is to wake up the processors from deeper sleep states and
> > let the kernel reaccount its cpuidle (C-state) information before reading
> > the cpuidle timings from sysfs.
> >
> > Best is you exchange the order of your patches. The 2nd looks rather
> > straight forward and you can add my reviewed-by.
>
> The RDPRU instruction reads the APERF/MPERF of the cpu on which it is
> running. If we do not schedule it on each cpu specifically, it will read the APERF/MPERF
> of the cpu in which it runs/might happen to run on, which will not be the correct behavior.
Got it. And I also didn't fully read -c. I now remember.. For C-states accounting
you want to have each CPU woken up at measure start and end for accurate measuring.
It's a pity that the monitors do the per_cpu calls themselves.
So a general idle-monitor param is not possible, or can only be done by, for example,
adding a flag to the cpuidle_monitor struct:
struct cpuidle_monitor
unsigned int needs_root:1
unsigned int per_cpu_schedule:1
not sure whether a:
struct {
unsigned int needs_root:1
unsigned int per_cpu_schedule:1
} flags
should/must be put around in a separate cleanup patch (and needs_root users adjusted).
You (and other monitors for which this might make sense) can then implement
the per_cpu_schedule flag. In AMD case you might want (you have to)
directly set it.
All around a -b/-u (--bind-measure-to-cpu, --unbind-measure-to-cpu)
parameter could be added at some point of time if it matters. And monitors
having this could bind or not.
This possibly could nuke out -c param. Or at least the idle state counter
monitor could do it itself. But don't mind about this.
What do you think?
And you should be able to re-use the bind_cpu function used in -c case?
Thomas
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-10-10 11:22 ` Thomas Renninger
@ 2019-10-11 16:58 ` Natarajan, Janakarajan
0 siblings, 0 replies; 11+ messages in thread
From: Natarajan, Janakarajan @ 2019-10-11 16:58 UTC (permalink / raw)
To: Thomas Renninger
Cc: linux-kernel, linux-pm, Pu Wen, Shuah Khan, Thomas Gleixner,
Greg Kroah-Hartman, Kate Stewart, Allison Randal,
Richard Fontana, Borislav Petkov
On 10/10/2019 6:22 AM, Thomas Renninger wrote:
> On Monday, October 7, 2019 11:11:30 PM CEST Natarajan, Janakarajan wrote:
>> On 10/5/2019 7:40 AM, Thomas Renninger wrote:
>>
> ...
>>>> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
>>> And this one only exists on latest AMD cpus, right?
>> Yes. The RDPRU instruction exists only on AMD cpus.
>>>> However, for systems that provide an instruction to get register values
>>>> from userspace, would a command-line parameter be acceptable?
>>> Parameter sounds like a good idea. In fact, there already is such a
>>> parameter.
> cpupower monitor --help
>>> -c
>>>
>>> Schedule the process on every core before starting and
>>> ending
>>>
>>> measuring. This could be needed for the Idle_Stats monitor when no other
>>> MSR based monitor (has to be run on the core that is measured) is run in
>>> parallel. This is to wake up the processors from deeper sleep states and
>>> let the kernel reaccount its cpuidle (C-state) information before reading
>>> the cpuidle timings from sysfs.
>>>
>>> Best is you exchange the order of your patches. The 2nd looks rather
>>> straight forward and you can add my reviewed-by.
>> The RDPRU instruction reads the APERF/MPERF of the cpu on which it is
>> running. If we do not schedule it on each cpu specifically, it will read the APERF/MPERF
>> of the cpu in which it runs/might happen to run on, which will not be the correct behavior.
> Got it. And I also didn't fully read -c. I now remember.. For C-states accounting
> you want to have each CPU woken up at measure start and end for accurate measuring.
>
> It's a pity that the monitors do the per_cpu calls themselves.
> So a general idle-monitor param is not possible, or can only be done by, for example,
> adding a flag to the cpuidle_monitor struct:
>
> struct cpuidle_monitor
>
> unsigned int needs_root:1
> unsigned int per_cpu_schedule:1
>
> not sure whether a:
> struct {
> unsigned int needs_root:1
> unsigned int per_cpu_schedule:1
> } flags
>
> should/must be put around in a separate cleanup patch (and needs_root users adjusted).
>
> You (and other monitors for which this might make sense) can then implement
> the per_cpu_schedule flag. In AMD case you might want (you have to)
> directly set it.
>
> All around a -b/-u (--bind-measure-to-cpu, --unbind-measure-to-cpu)
> parameter could be added at some point of time if it matters. And monitors
> having this could bind or not.
> This possibly could nuke out -c param. Or at least the idle state counter
> monitor could do it itself. But don't mind about this.
>
> What do you think?
This is a good suggestion. I can submit a v2 with:
a) a patch to readjust the needs_root variable
b) a patch to introduce and use the per_cpu_schedule
c) a patch to introduce and use the RDPRU instruction
>
> And you should be able to re-use the bind_cpu function used in -c case?
Yes. I noticed that bind_cpu() is doing what I need. I will use that.
Thanks,
Janak
>
> Thomas
>
>
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-09-18 16:34 [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from Natarajan, Janakarajan
2019-09-18 16:34 ` [PATCH 2/2] Update cpupower to use the RDPRU instruction Natarajan, Janakarajan
2019-09-27 16:07 ` [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from Natarajan, Janakarajan
@ 2019-09-27 18:59 ` shuah
2019-09-30 15:34 ` Natarajan, Janakarajan
2 siblings, 1 reply; 11+ messages in thread
From: shuah @ 2019-09-27 18:59 UTC (permalink / raw)
To: Natarajan, Janakarajan, linux-pm, linux-kernel
Cc: Thomas Renninger, Pu Wen, Borislav Petkov, Allison Randal,
Thomas Gleixner, Kate Stewart, Greg Kroah-Hartman,
Richard Fontana, shuah
On 9/18/19 10:34 AM, Natarajan, Janakarajan wrote:
> Modify cpupower to schedule itself on each of the cpus in the system and
> then get the APERF/MPERF register values.
>
> This is advantageous because an IPI is not generated when a read_msr() is
> executed on the local logical CPU thereby reducing the chance of having
> APERF and MPERF being out of sync.
Somehow this doesn't read right. Is it that you are trying to avoid
APERF and MPERF being out of sync with this change?
This description is rather confusing.
>
> Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
> ---
> .../utils/idle_monitor/mperf_monitor.c | 38 ++++++++++++++-----
> 1 file changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> index 44806a6dae11..8b072e39c897 100644
> --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> @@ -10,6 +10,7 @@
> #include <stdlib.h>
> #include <string.h>
> #include <limits.h>
> +#include <sched.h>
>
> #include <cpufreq.h>
>
> @@ -86,15 +87,33 @@ static int mperf_get_tsc(unsigned long long *tsc)
> return ret;
> }
>
> +static int get_aperf_mperf(int cpu, unsigned long long *aval,
> + unsigned long long *mval)
> +{
> + cpu_set_t set;
> + int ret;
> +
> + CPU_ZERO(&set);
> + CPU_SET(cpu, &set);
> + if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
> + dprint("Could not migrate to cpu: %d\n", cpu);
> + return 1;
> + }
> +
> + ret = read_msr(cpu, MSR_APERF, aval);
> + ret |= read_msr(cpu, MSR_MPERF, mval);
> +
> + return ret;
> +}
> +
> static int mperf_init_stats(unsigned int cpu)
> {
> - unsigned long long val;
> + unsigned long long aval, mval;
> int ret;
>
> - ret = read_msr(cpu, MSR_APERF, &val);
> - aperf_previous_count[cpu] = val;
> - ret |= read_msr(cpu, MSR_MPERF, &val);
> - mperf_previous_count[cpu] = val;
> + ret = get_aperf_mperf(cpu, &aval, &mval);
get_aperf_mperf() could return error right? It returns 1 when
sched_setaffinity() fails. Shouldn't the return value be checked,
instead of using aval and mval?
> + aperf_previous_count[cpu] = aval;
> + mperf_previous_count[cpu] = mval;
> is_valid[cpu] = !ret;
>
> return 0;
> @@ -102,13 +121,12 @@ static int mperf_init_stats(unsigned int cpu)
>
> static int mperf_measure_stats(unsigned int cpu)
> {
> - unsigned long long val;
> + unsigned long long aval, mval;
> int ret;
>
> - ret = read_msr(cpu, MSR_APERF, &val);
> - aperf_current_count[cpu] = val;
> - ret |= read_msr(cpu, MSR_MPERF, &val);
> - mperf_current_count[cpu] = val;
> + ret = get_aperf_mperf(cpu, &aval, &mval);
Same comments as above here.
> + aperf_current_count[cpu] = aval;
> + mperf_current_count[cpu] = mval;
> is_valid[cpu] = !ret;
>
> return 0;
>
thanks,
-- Shuah
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Modify cpupower to schedule itself on cores it is reading MSRs from
2019-09-27 18:59 ` shuah
@ 2019-09-30 15:34 ` Natarajan, Janakarajan
0 siblings, 0 replies; 11+ messages in thread
From: Natarajan, Janakarajan @ 2019-09-30 15:34 UTC (permalink / raw)
To: shuah, linux-pm, linux-kernel
Cc: Thomas Renninger, Pu Wen, Borislav Petkov, Allison Randal,
Thomas Gleixner, Kate Stewart, Greg Kroah-Hartman,
Richard Fontana
On 9/27/2019 1:59 PM, shuah wrote:
> On 9/18/19 10:34 AM, Natarajan, Janakarajan wrote:
>> Modify cpupower to schedule itself on each of the cpus in the system and
>> then get the APERF/MPERF register values.
>>
>> This is advantageous because an IPI is not generated when a
>> read_msr() is
>> executed on the local logical CPU thereby reducing the chance of having
>> APERF and MPERF being out of sync.
>
> Somehow this doesn't read right. Is it that you are trying to avoid
> APERF and MPERF being out of sync with this change?
>
> This description is rather confusing.
We are trying to avoid a separate IPI for APERF and MPERF. We have seen
that a separate IPI for
each can cause APERF and MPERF to go out of sync.
If the cpupower is already executing on the core it wants to get the
APERF/MPERF from, then
generic_exec_single() does not generate the IPI to execute the rdmsr().
Rather it just executes
on the current cpu.
I can change the description to add more detail.
>
>>
>> Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
>> ---
>> .../utils/idle_monitor/mperf_monitor.c | 38 ++++++++++++++-----
>> 1 file changed, 28 insertions(+), 10 deletions(-)
>>
>> diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
>> b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
>> index 44806a6dae11..8b072e39c897 100644
>> --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
>> +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
>> @@ -10,6 +10,7 @@
>> #include <stdlib.h>
>> #include <string.h>
>> #include <limits.h>
>> +#include <sched.h>
>> #include <cpufreq.h>
>> @@ -86,15 +87,33 @@ static int mperf_get_tsc(unsigned long long *tsc)
>> return ret;
>> }
>> +static int get_aperf_mperf(int cpu, unsigned long long *aval,
>> + unsigned long long *mval)
>> +{
>> + cpu_set_t set;
>> + int ret;
>> +
>> + CPU_ZERO(&set);
>> + CPU_SET(cpu, &set);
>> + if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
>> + dprint("Could not migrate to cpu: %d\n", cpu);
>> + return 1;
>> + }
>> +
>> + ret = read_msr(cpu, MSR_APERF, aval);
>> + ret |= read_msr(cpu, MSR_MPERF, mval);
>> +
>> + return ret;
>> +}
>> +
>> static int mperf_init_stats(unsigned int cpu)
>> {
>> - unsigned long long val;
>> + unsigned long long aval, mval;
>> int ret;
>> - ret = read_msr(cpu, MSR_APERF, &val);
>> - aperf_previous_count[cpu] = val;
>> - ret |= read_msr(cpu, MSR_MPERF, &val);
>> - mperf_previous_count[cpu] = val;
>> + ret = get_aperf_mperf(cpu, &aval, &mval);
>
> get_aperf_mperf() could return error right? It returns 1 when
> sched_setaffinity() fails. Shouldn't the return value be checked,
> instead of using aval and mval?
We set the is_valid[cpu] to the return value. Later on the is_valid[cpu]
is checked before proceeding to calculate the effective freq.
Thanks.
>
>> + aperf_previous_count[cpu] = aval;
>> + mperf_previous_count[cpu] = mval;
>> is_valid[cpu] = !ret;
>> return 0;
>> @@ -102,13 +121,12 @@ static int mperf_init_stats(unsigned int cpu)
>> static int mperf_measure_stats(unsigned int cpu)
>> {
>> - unsigned long long val;
>> + unsigned long long aval, mval;
>> int ret;
>> - ret = read_msr(cpu, MSR_APERF, &val);
>> - aperf_current_count[cpu] = val;
>> - ret |= read_msr(cpu, MSR_MPERF, &val);
>> - mperf_current_count[cpu] = val;
>> + ret = get_aperf_mperf(cpu, &aval, &mval);
>
> Same comments as above here.
>
>> + aperf_current_count[cpu] = aval;
>> + mperf_current_count[cpu] = mval;
>> is_valid[cpu] = !ret;
>> return 0;
>>
>
> thanks,
> -- Shuah
^ permalink raw reply [flat|nested] 11+ messages in thread