Linux-PM Archive on lore.kernel.org
 help / color / Atom feed
* [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
@ 2020-05-26 18:20 Rafael J. Wysocki
  2020-05-31 16:39 ` Doug Smythies
  2020-06-06 15:21 ` Doug Smythies
  0 siblings, 2 replies; 11+ messages in thread
From: Rafael J. Wysocki @ 2020-05-26 18:20 UTC (permalink / raw)
  To: Linux PM, Doug Smythies
  Cc: LKML, Len Brown, Srinivas Pandruvada, Peter Zijlstra,
	Giovanni Gherdovich, Francisco Jerez

From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Allow intel_pstate to work in the passive mode with HWP enabled and
make it set the HWP minimum performance limit to 75% of the P-state
value corresponding to the target frequency supplied by the cpufreq
governor, so as to prevent the HWP algorithm and the CPU scheduler
from working against each other at least when the schedutil governor
is in use.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---

This is a replacement for https://patchwork.kernel.org/patch/11563615/ that
uses the HWP floor (minimum performance limit) as the feedback to the HWP
algorithm (instead of the EPP).

The INTEL_CPUFREQ_TRANSITION_DELAY_HWP is still 5000 and the previous comments
still apply to it.

In addition to that, the 75% fraction used in intel_cpufreq_adjust_hwp() can be
adjusted too, but I would like to use a value with a power-of-2 denominator for
that (so the next candidate would be 7/8).

Everyone who can do that is kindly requested to test this and let me know
the outcome.

Of course, the documentation still needs to be updated.  Also, the EPP can be
handled in analogy with the active mode now, but that part can be added in a
separate patch on top of this one.

Thanks!

---
 drivers/cpufreq/intel_pstate.c |  119 ++++++++++++++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 31 deletions(-)

Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -36,6 +36,7 @@
 #define INTEL_PSTATE_SAMPLING_INTERVAL	(10 * NSEC_PER_MSEC)
 
 #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
+#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP	5000
 #define INTEL_CPUFREQ_TRANSITION_DELAY		500
 
 #ifdef CONFIG_ACPI
@@ -2175,7 +2176,10 @@ static int intel_pstate_verify_policy(st
 
 static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 {
-	intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
+	if (hwp_active)
+		intel_pstate_hwp_force_min_perf(policy->cpu);
+	else
+		intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
 }
 
 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
@@ -2183,12 +2187,10 @@ static void intel_pstate_stop_cpu(struct
 	pr_debug("CPU %d exiting\n", policy->cpu);
 
 	intel_pstate_clear_update_util_hook(policy->cpu);
-	if (hwp_active) {
+	if (hwp_active)
 		intel_pstate_hwp_save_state(policy);
-		intel_pstate_hwp_force_min_perf(policy->cpu);
-	} else {
-		intel_cpufreq_stop_cpu(policy);
-	}
+
+	intel_cpufreq_stop_cpu(policy);
 }
 
 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
@@ -2318,13 +2320,58 @@ static void intel_cpufreq_trace(struct c
 		fp_toint(cpu->iowait_boost * 100));
 }
 
+static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u32 min_perf)
+{
+	u64 value, prev;
+
+	rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &prev);
+	value = prev;
+
+	value &= ~HWP_MIN_PERF(~0L);
+	value |= HWP_MIN_PERF(min_perf);
+
+	/*
+	 * The entire MSR needs to be updated in order to update the HWP min
+	 * field in it, so opportunistically update the max too if needed.
+	 */
+	value &= ~HWP_MAX_PERF(~0L);
+	value |= HWP_MAX_PERF(cpu->max_perf_ratio);
+
+	if (value != prev)
+		wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
+}
+
+/**
+ * intel_cpufreq_adjust_hwp - Adjust the HWP request register.
+ * @cpu: Target CPU.
+ * @target_pstate: P-state corresponding to the target frequency.
+ *
+ * Set the HWP minimum performance limit to 75% of @target_pstate taking the
+ * global min and max policy limits into account.
+ *
+ * The purpose of this is to avoid situations in which the kernel and the HWP
+ * algorithm work against each other by giving a hint about the expectations of
+ * the former to the latter.
+ */
+static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate)
+{
+	u32 min_perf;
+
+	min_perf = max_t(u32, (3 * target_pstate) / 4, cpu->min_perf_ratio);
+	min_perf = min_t(u32, min_perf, cpu->max_perf_ratio);
+	if (min_perf != cpu->pstate.current_pstate) {
+		cpu->pstate.current_pstate = min_perf;
+		intel_cpufreq_update_hwp_request(cpu, min_perf);
+	}
+}
+
 static int intel_cpufreq_target(struct cpufreq_policy *policy,
 				unsigned int target_freq,
 				unsigned int relation)
 {
 	struct cpudata *cpu = all_cpu_data[policy->cpu];
+	int target_pstate, old_pstate = cpu->pstate.current_pstate;
 	struct cpufreq_freqs freqs;
-	int target_pstate, old_pstate;
 
 	update_turbo_state();
 
@@ -2332,26 +2379,33 @@ static int intel_cpufreq_target(struct c
 	freqs.new = target_freq;
 
 	cpufreq_freq_transition_begin(policy, &freqs);
+
 	switch (relation) {
 	case CPUFREQ_RELATION_L:
-		target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
+		target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
 		break;
 	case CPUFREQ_RELATION_H:
-		target_pstate = freqs.new / cpu->pstate.scaling;
+		target_pstate = target_freq / cpu->pstate.scaling;
 		break;
 	default:
-		target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
+		target_pstate = DIV_ROUND_CLOSEST(target_freq, cpu->pstate.scaling);
 		break;
 	}
-	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
-	old_pstate = cpu->pstate.current_pstate;
-	if (target_pstate != cpu->pstate.current_pstate) {
-		cpu->pstate.current_pstate = target_pstate;
-		wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
-			      pstate_funcs.get_val(cpu, target_pstate));
+
+	if (hwp_active) {
+		intel_cpufreq_adjust_hwp(cpu, target_pstate);
+	} else {
+		target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
+		if (target_pstate != old_pstate) {
+			cpu->pstate.current_pstate = target_pstate;
+			wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
+				      pstate_funcs.get_val(cpu, target_pstate));
+		}
 	}
-	freqs.new = target_pstate * cpu->pstate.scaling;
 	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate);
+
+	freqs.new = target_pstate * cpu->pstate.scaling;
+
 	cpufreq_freq_transition_end(policy, &freqs, false);
 
 	return 0;
@@ -2361,14 +2415,19 @@ static unsigned int intel_cpufreq_fast_s
 					      unsigned int target_freq)
 {
 	struct cpudata *cpu = all_cpu_data[policy->cpu];
-	int target_pstate, old_pstate;
+	int target_pstate, old_pstate = cpu->pstate.current_pstate;
 
 	update_turbo_state();
 
 	target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
-	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
-	old_pstate = cpu->pstate.current_pstate;
-	intel_pstate_update_pstate(cpu, target_pstate);
+
+	if (hwp_active) {
+		intel_cpufreq_adjust_hwp(cpu, target_pstate);
+	} else {
+		target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
+		intel_pstate_update_pstate(cpu, target_pstate);
+	}
+
 	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
 	return target_pstate * cpu->pstate.scaling;
 }
@@ -2389,7 +2448,6 @@ static int intel_cpufreq_cpu_init(struct
 		return ret;
 
 	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
-	policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
 	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
 	policy->cur = policy->cpuinfo.min_freq;
 
@@ -2401,10 +2459,13 @@ static int intel_cpufreq_cpu_init(struct
 
 	cpu = all_cpu_data[policy->cpu];
 
-	if (hwp_active)
+	if (hwp_active) {
 		intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state);
-	else
+		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
+	} else {
 		turbo_max = cpu->pstate.turbo_pstate;
+		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
+	}
 
 	min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
 	min_freq *= cpu->pstate.scaling;
@@ -2505,9 +2566,6 @@ static int intel_pstate_register_driver(
 
 static int intel_pstate_unregister_driver(void)
 {
-	if (hwp_active)
-		return -EBUSY;
-
 	cpufreq_unregister_driver(intel_pstate_driver);
 	intel_pstate_driver_cleanup();
 
@@ -2815,12 +2873,11 @@ static int __init intel_pstate_setup(cha
 	if (!str)
 		return -EINVAL;
 
-	if (!strcmp(str, "disable")) {
+	if (!strcmp(str, "disable"))
 		no_load = 1;
-	} else if (!strcmp(str, "passive")) {
+	else if (!strcmp(str, "passive"))
 		default_driver = &intel_cpufreq;
-		no_hwp = 1;
-	}
+
 	if (!strcmp(str, "no_hwp")) {
 		pr_info("HWP disabled\n");
 		no_hwp = 1;




^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-26 18:20 [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled Rafael J. Wysocki
@ 2020-05-31 16:39 ` Doug Smythies
  2020-05-31 16:54   ` Srinivas Pandruvada
                     ` (2 more replies)
  2020-06-06 15:21 ` Doug Smythies
  1 sibling, 3 replies; 11+ messages in thread
From: Doug Smythies @ 2020-05-31 16:39 UTC (permalink / raw)
  To: 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown',
	'Srinivas Pandruvada', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

The content of this e-mail is also at [1],
with annotated graphs.

Hi Rafael,

Hmmm... I think the most important takeaway from
my previous e-mail might have been missed!

HWP does not work properly on my i5-9600K test computer.

For those that don't have to read all this, my upgraded
assertion is:

With HWP enabled, if idle state 2 is used, there is a probability
that the CPU frequency might unexpectedly drop significantly.

Detail (see [1] for supporting graphs and links):

I can not proceed with testing this.
Why not?
Because I do not have a stable good system on top of which to add this patch.
I do not know what is wrong such that HWP appears broken.

For my part of it, I had to stop and dig into
why HWP doesn't seem to work properly on my newer test computer.

Notes:
I have never used HWP before, and have had it disabled so far on this
newer test computer. This patch seemed like a great opportunity to try it.
Why (in addition to helping via review/test)? Because trace now works,
whereas it doesn't in active mode with HWP.

It is on my list to explore viability of a mode for trace where it monitors
what the processor is doing via HWP, rather than reporting what the
processor is being told to do. However, in the meantime, this is great.

Example of what is wrong on my system (repeated, but new example trace data,
from earlier e-mail):

Kernel: 5.7-rc6 and + this version of patch when trace data required.
Patch config: DELAY_HWP 10000 ; 87.5% fraction (7/8).
Keep in mind that the trace entry is the scaled min value, not the target p-state.

Load:
Single thread, forced CPU affinity.
fixed work packet per work period, i.e. the amount of work to do is independent of CPU frequency.
347 hertz work / sleep frequency.
To reveal any hysteresis (i.e. with conservative governor) load ramps up from none
to 100% and then back down to none at 3 seconds per step (step size is uncalibrated).

Processor:
Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz

What do I see?

Unexpected frequency drops at around 70% load.
Example, from trace:

Event begins at 17.456 seconds elapsed time.
Previous event was about 107 milliseconds ago.

Old min ; new min ; freq GHz; load % ; duration mS
27      ; 28      ; 4.60    ; 68.17  ; 10.226
28      ; 26      ; 4.53    ; 57.47  ; 10.005
26      ; 40      ; 2.87    ; 100.00 ; 10.996  <<<< What? Why freq down? (note: it is always above old min)
40      ; 29      ; 4.25    ; 69.67  ; 10.002  <<<< O.K. recovering, but damage done.
29      ; 26      ; 4.60    ; 59.14  ; 10.652  <<<< Event ends. Next event in 128 milliseconds.

I can not think of any reason why the frequency would have been reduced so far by HWP.

O.K., the above had to be done with the patch so that trace could be used.
Ondemand was the governor, because its response curve looks the same as
active/powersave in the area of concern.

The below was done with kernel 5.7-rc6 and only turbostat, at a low sample rate of 15 seconds per sample,
in addition to the test load. 100 seconds each:

intel_pstate/powersave hwp:

Overruns: 11327
Ave. work percent: 77.952091
Processor package power: ~13 watts.
Average CPU frequency: 3.8 gigahertz 

intel_pstate/powersave no-hwp:

Overruns: 7 (task start ramp up related. I have trace proof.)
Ave. work percent: 74.932603
Processor package power: ~11.5 watts.
Average CPU frequency: 4.0 gigahertz

Question: What is the relevance of the 347 hertz and
the >=70% load for this issue? Why does it appear
to be such a sharp works fine/doesn't work issue?

Answer: The issue seems to be related to when the
sleep portion of a work/sleep periodic workflow
approaches and goes below 1 millisecond (1 jiffy).

Some tests were done, varying the work/sleep frequency,
and idle states and overruns and such, the web version
of this e-mail has graphs.

Notes:

The breakpoint between no-overruns/overruns is around 950 microseconds.

A 250 hertz kernel was tested, and it did not have this issue in this area.
Perhaps elsewhere, I didn't look.

1000 hertz kernels were tested back to kernel 5.2, all failed.

If the issue is jiffy related (a race condition?) then a work/sleep frequency
of 333.3333 hertz should behave in binary way, either lots of overruns or none
as a function of the task start time. (Preliminary) It does.

If the issue is jiffy related (a race condition?) then a work/sleep frequency
of 500.00 hertz should behave in binary way, either lots of overruns or none
as a function of the task start time. It does. There are occurrences when idle
state 2 is used somewhat without overruns.

Both teo and menu idle governors were tested, and while both suffer from
unexpected CPU frequency drop, teo seems much worse. However failure points
for both governors are repeatable.

There was new BIOS for this test computer a week ago.
System now upgraded and tested with default BIOS settings (by accident)
and my settings. (Note: slight changes in processor package power for
system idle and one CPU loaded with new BIOS, and earlier tests NOT re-done).

Is the processor using the latest microcode? Currently 0xcc. Can not figure out if there is anything newer.

Leaving out the details, but all the tests and results are available, a mess but available, the summary is:

With HWP enabled, if idle state 2 is used, there is a probability that the CPU frequency might unexpectedly drop significantly.
If the processor does this by itself, or by being told to via sources outside of the intel_pstate CPU frequency driver, I don't
know.

The load sweep test was run at 6 seconds per step during increasing load and 3 seconds per step decreasing
(by mistake, if you must know), while monitoring the idle statistics.
The test was done in a hurry, so many above/below statistics are 0%, due to insufficient sample size.
The overruns and use of idle state 0 are exactly correlated.
There are a lot of graphs on the idle statistics web page, but the idle state 2 usage correlates exactly with
undesired low CPU frequency and overruns.

Side note: Even in the areas where HWP appears to behave, the no-hwp power use is much better.

O.K., so now, do a couple of more turbostat samples:

intel_pstate/powersave hwp idle state 2 disabled:

Overruns: 3
Ave. work percent: 66.647895
Processor package power: ~16.8 watts.
Average CPU frequency: 4.6 gigahertz

intel_pstate/powersave hwp idle state 3 disabled:

Overruns: 22
Ave. work percent: 66.647895
Processor package power: ~16.2 watts.
Average CPU frequency: 4.6 gigahertz

To prevent all the bots that burden my site, the link is coded:
[1] double u double u double u dot smythies dot com /~doug/linux/s18/hwp/index.html 

... Doug

> -----Original Message-----
> From: Rafael J. Wysocki [mailto:rjw@rjwysocki.net]
> Sent: May 26, 2020 11:21 AM
> To: Linux PM; Doug Smythies
> Cc: LKML; Len Brown; Srinivas Pandruvada; Peter Zijlstra; Giovanni Gherdovich; Francisco Jerez
> Subject: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
> 
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> Allow intel_pstate to work in the passive mode with HWP enabled and
> make it set the HWP minimum performance limit to 75% of the P-state
> value corresponding to the target frequency supplied by the cpufreq
> governor, so as to prevent the HWP algorithm and the CPU scheduler
> from working against each other at least when the schedutil governor
> is in use.
> 
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
> 
> This is a replacement for https://patchwork.kernel.org/patch/11563615/ that
> uses the HWP floor (minimum performance limit) as the feedback to the HWP
> algorithm (instead of the EPP).
> 
> The INTEL_CPUFREQ_TRANSITION_DELAY_HWP is still 5000 and the previous comments
> still apply to it.
> 
> In addition to that, the 75% fraction used in intel_cpufreq_adjust_hwp() can be
> adjusted too, but I would like to use a value with a power-of-2 denominator for
> that (so the next candidate would be 7/8).
> 
> Everyone who can do that is kindly requested to test this and let me know
> the outcome.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-31 16:39 ` Doug Smythies
@ 2020-05-31 16:54   ` Srinivas Pandruvada
  2020-05-31 18:06     ` Doug Smythies
  2020-05-31 17:15   ` [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled Doug Smythies
  2020-06-06 15:21   ` Doug Smythies
  2 siblings, 1 reply; 11+ messages in thread
From: Srinivas Pandruvada @ 2020-05-31 16:54 UTC (permalink / raw)
  To: Doug Smythies, 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'


Hi Doug,

On Sun, 2020-05-31 at 09:39 -0700, Doug Smythies wrote:
> The content of this e-mail is also at [1],
> with annotated graphs.
> 
> Hi Rafael,
> 
> Hmmm... I think the most important takeaway from
> my previous e-mail might have been missed!
> 
> HWP does not work properly on my i5-9600K test computer.
> 
> For those that don't have to read all this, my upgraded
> assertion is:
> 
> With HWP enabled, if idle state 2 is used, there is a probability
> that the CPU frequency might unexpectedly drop significantly.
> 
> Detail (see [1] for supporting graphs and links):
> 
> I can not proceed with testing this.
> Why not?
> Because I do not have a stable good system on top of which to add
> this patch.
> I do not know what is wrong such that HWP appears broken.
> 
> For my part of it, I had to stop and dig into
> why HWP doesn't seem to work properly on my newer test computer.
> 
> Notes:
> I have never used HWP before, and have had it disabled so far on this
> newer test computer. This patch seemed like a great opportunity to
> try it.
> Why (in addition to helping via review/test)? Because trace now
> works,
> whereas it doesn't in active mode with HWP.
> 
> It is on my list to explore viability of a mode for trace where it
> monitors
> what the processor is doing via HWP, rather than reporting what the
> processor is being told to do. However, in the meantime, this is
> great.
> 
> Example of what is wrong on my system (repeated, but new example
> trace data,
> from earlier e-mail):
> 
> Kernel: 5.7-rc6 and + this version of patch when trace data required.
> Patch config: DELAY_HWP 10000 ; 87.5% fraction (7/8).
> Keep in mind that the trace entry is the scaled min value, not the
> target p-state.
> 
> Load:
> Single thread, forced CPU affinity.
> fixed work packet per work period, i.e. the amount of work to do is
> independent of CPU frequency.
> 347 hertz work / sleep frequency.
> To reveal any hysteresis (i.e. with conservative governor) load ramps
> up from none
> to 100% and then back down to none at 3 seconds per step (step size
> is uncalibrated).
> 
> Processor:
> Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz
> 
> What do I see?
> 
> Unexpected frequency drops at around 70% load.
> Example, from trace:
> 
> Event begins at 17.456 seconds elapsed time.
> Previous event was about 107 milliseconds ago.
> 
> Old min ; new min ; freq GHz; load % ; duration mS
> 27      ; 28      ; 4.60    ; 68.17  ; 10.226
> 28      ; 26      ; 4.53    ; 57.47  ; 10.005

Seems you hit power/thermal limit

Is this some Lenovo system?

If you disable HWP you don't see that?

What is the value of
cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-mmio:0/constraint_0_power_limit_uw
 
You may want to try running dptfxtract once.

Then try to get again

cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-mmio:0/constraint_0_power_limit_uw


Thanks,
Srinivas

> 26      ; 40      ; 2.87    ; 100.00 ; 10.996  <<<< What? Why freq
> down? (note: it is always above old min)
> 40      ; 29      ; 4.25    ; 69.67  ; 10.002  <<<< O.K. recovering,
> but damage done.
> 29      ; 26      ; 4.60    ; 59.14  ; 10.652  <<<< Event ends. Next
> event in 128 milliseconds.
> 
> I can not think of any reason why the frequency would have been
> reduced so far by HWP.
> 
> O.K., the above had to be done with the patch so that trace could be
> used.
> Ondemand was the governor, because its response curve looks the same
> as
> active/powersave in the area of concern.
> 
> The below was done with kernel 5.7-rc6 and only turbostat, at a low
> sample rate of 15 seconds per,
> in addition to the test load. 100 seconds each:
> 
> intel_pstate/powersave hwp:
> 
> Overruns: 11327
> Ave. work percent: 77.952091
> Processor package power: ~13 watts.
> Average CPU frequency: 3.8 gigahertz 
> 
> intel_pstate/powersave no-hwp:
> 
> Overruns: 7 (task start ramp up related. I have trace proof.)
> Ave. work percent: 74.932603
> Processor package power: ~11.5 watts.
> Average CPU frequency: 4.0 gigahertz
> 
> Question: What is the relevance of the 347 hertz and
> the >=70% load for this issue? Why does it appear
> to be such a sharp works/fine doesn't work issue?
> 
> Answer: The issue seems to be related to when the
> sleep portion of a work/sleep periodic workflow
> approaches and goes below 1 millisecond (1 jiffy).
> 
> some tests were done, varying the work/sleep frequency,
> and idle states and overruns and such, the web version
> of this e-mail has graphs.
> 
> Notes:
> 
> The breakpoint between no-overruns/overruns is around 950
> microseconds.
> 
> A 250 hertz kernel was tested, and it did not have this issue in this
> area.
> Perhaps elsewhere, I didn't look.
> 
> 1000 hertz kernels were tested back to kernel 5.2, all failed.
> 
> If the issue is jiffy related (a race condition?) then a work/sleep
> frequency
> of 333.3333 hertz should behave in binary way, either lots of
> overruns or none
> as a function of the task start time. (Preliminary) It does.
> 
> If the issue is jiffy related (a race condition?) then a work/sleep
> frequency
> of 500.00 hertz should behave in binary way, either lots of overruns
> or none
> as a function of the task start time. It does. There are occurrences
> when idle
> state 2 is used somewhat without overruns.
> 
> Both teo and menu idle governors were tested, and while both suffer
> from
> unexpected CPU frequency drop, teo seems much worse. However failure
> points
> for both governors are repeatable.
> 
> There was new BIOS for this test computer a week ago.
> System now upgraded and tested with default BIOS settings (by
> accident)
> and my settings. (Note: slight changes in processor package power for
> system idle and one CPU loaded with new BIOS, and earlier tests NOT
> re-done).
> 
> Is the processor using the latest microcode? Currently 0xcc. Can not
> figure out if there is anything newer.
> 
> Leaving out the details, but all the tests and results are available,
> a mess but available, the summary is:
> 
> With HWP enabled, if idle state 2 is used, there is a probability
> that the CPU frequency might unexpectedly drop significantly.
> If the processor does this by itself, or by being told to via sources
> outside of the intel_pstate CPU frequency driver, I don't
> know.
> 
> The load sweep test was run at 6 seconds per step during increasing
> load and 3 seconds per step decreasing
> (by mistake, if you must know), while monitoring the idle statistics.
> The test was done in a hurry, so many above/below statistics are 0%,
> due to insufficient sample size.
> The overruns and use of idle state 0 are exactly correlated.
> There are a lot of graphs on the idle statistics web page, but the
> idle state 2 usage correlates exactly with
> undesired low CPU frequency and overruns.
> 
> Side note: Even in the areas where HWP appears to behave, the no-hwp
> power use is much better.
> 
> O.K., so now, do a couple of more turbostat samples:
> 
> intel_pstate/powersave hwp idle state 2 disabled:
> 
> Overruns: 3
> Ave. work percent: 66.647895
> Processor package power: ~16.8 watts.
> Average CPU frequency: 4.6 gigahertz
> 
> intel_pstate/powersave hwp idle state 3 disabled:
> 
> Overruns: 22
> Ave. work percent: 66.647895
> Processor package power: ~16.2 watts.
> Average CPU frequency: 4.6 gigahertz
> 
> To prevent all the bots that burden my site, the link is coded:
> [1] double u double u double u dot smythies dot com
> /~doug/linux/s18/hwp/index.html 
> 
> ... Doug
> 
> > -----Original Message-----
> > From: Rafael J. Wysocki [mailto:rjw@rjwysocki.net]
> > Sent: May 26, 2020 11:21 AM
> > To: Linux PM; Doug Smythies
> > Cc: LKML; Len Brown; Srinivas Pandruvada; Peter Zijlstra; Giovanni
> > Gherdovich; Francisco Jerez
> > Subject: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive
> > mode with HWP enabled
> > 
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > 
> > Allow intel_pstate to work in the passive mode with HWP enabled and
> > make it set the HWP minimum performance limit to 75% of the P-state
> > value corresponding to the target frequency supplied by the cpufreq
> > governor, so as to prevent the HWP algorithm and the CPU scheduler
> > from working against each other at least when the schedutil
> > governor
> > is in use.
> > 
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > ---
> > 
> > This is a replacement for 
> > https://patchwork.kernel.org/patch/11563615/ that
> > uses the HWP floor (minimum performance limit) as the feedback to
> > the HWP
> > algorithm (instead of the EPP).
> > 
> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP is still 5000 and the
> > previous comments
> > still apply to it.
> > 
> > In addition to that, the 75% fraction used in
> > intel_cpufreq_adjust_hwp() can be
> > adjusted too, but I would like to use a value with a power-of-2
> > denominator for
> > that (so the next candidate would be 7/8).
> > 
> > Everyone who can do that is kindly requested to test this and let
> > me know
> > the outcome.
> 
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-31 16:39 ` Doug Smythies
  2020-05-31 16:54   ` Srinivas Pandruvada
@ 2020-05-31 17:15   ` Doug Smythies
  2020-06-06 15:21   ` Doug Smythies
  2 siblings, 0 replies; 11+ messages in thread
From: Doug Smythies @ 2020-05-31 17:15 UTC (permalink / raw)
  To: 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown',
	'Srinivas Pandruvada', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

Correction:

On 2020.05.31 09:39 Doug Smythies wrote:

> The overruns and use of idle state 0 are exactly correlated.

Should have been "idle state 2":

The overruns and use of idle state 2 are exactly correlated.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-31 16:54   ` Srinivas Pandruvada
@ 2020-05-31 18:06     ` Doug Smythies
  2020-05-31 18:59       ` Srinivas Pandruvada
  2020-06-30 19:10       ` cpufreq: intel_pstate: HWP mode issue Doug Smythies
  0 siblings, 2 replies; 11+ messages in thread
From: Doug Smythies @ 2020-05-31 18:06 UTC (permalink / raw)
  To: 'Srinivas Pandruvada', 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

Hi Srinivas,

Thank you for your quick reply.

On 2020.05.31 09:54 Srinivas Pandruvada wrote
> On Sun, 2020-05-31 at 09:39 -0700, Doug Smythies wrote:

>> Event begins at 17.456 seconds elapsed time.
>> Previous event was about 107 milliseconds ago.
>>
>> Old min ; new min ; freq GHz; load % ; duration mS
>> 27      ; 28      ; 4.60    ; 68.17  ; 10.226
>> 28      ; 26      ; 4.53    ; 57.47  ; 10.005
> 
> Seems you hit power/thermal limit

No.

I am nowhere near any power limit at all.
I have meticulously configured and tested the thermal management of this computer.
I never ever hit a thermal limit and have TDP set such that the processor
temperature never exceeds about 75 degrees centigrade.

There should never be throttling involved in these experiments.
I can achieve throttling when compiling the kernel and with
torture test mode on the mprime test (other CPU stressors,
including my own, are not as good at generating heat as
mprime).

This system can run indefinitely at 99.9 watts processor package power.
Example (turbostat, steady state, CPU freq throttled to 4.04 GHz):

doug@s18:~$ sudo ~/turbostat --Summary --quiet --show Busy%,Bzy_MHz,PkgTmp,PkgWatt,GFXWatt,IRQ --interval 12
Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
100.21  4045    72231   66      99.93   0.00
100.21  4043    72239   65      99.92   0.00

> 
> Is this some Lenovo system?

No. The web page version of my original e-mail has
a link to the test computer hardware profile.

The motherboard is ASUS PRIME Z390-P.

> 
> If you disable HWP you don't see that?

Correct.

> 
> What is the value of
> cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius

? "No such file or directory"

> cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-
> mmio:0/constraint_0_power_limit_uw

? "No such file or directory"
 
> You may want to run
> Try running dptfxtract once.

No, I am not going to.

I am not running thermald. Eventually I will, as a backup
in case of cooling failure, so as not to hit the processor limit
shutdown. I just haven't done it yet.

> 
> Then try to get again
> 
> cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
> cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-
> mmio:0/constraint_0_power_limit_uw
> 
> 
> Thanks,
> Srinivas



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-31 18:06     ` Doug Smythies
@ 2020-05-31 18:59       ` Srinivas Pandruvada
  2020-05-31 19:28         ` Srinivas Pandruvada
  2020-06-30 19:10       ` cpufreq: intel_pstate: HWP mode issue Doug Smythies
  1 sibling, 1 reply; 11+ messages in thread
From: Srinivas Pandruvada @ 2020-05-31 18:59 UTC (permalink / raw)
  To: Doug Smythies, 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

On Sun, 2020-05-31 at 11:06 -0700, Doug Smythies wrote:
> Hi Srinivas,
> 
> Thanks you for your quick reply.
> 
> On 2020.05.31 09:54 Srinivas Pandruvada wrote
> > On Sun, 2020-05-31 at 09:39 -0700, Doug Smythies wrote:
> > > Event begins at 17.456 seconds elapsed time.
> > > Previous event was about 107 milliseconds ago.
> > > 
> > > Old min ; new min ; freq GHz; load % ; duration mS
> > > 27      ; 28      ; 4.60    ; 68.17  ; 10.226
> > > 28      ; 26      ; 4.53    ; 57.47  ; 10.005
> > 
> > Seems you hit power/thermal limit
> 
> No.
> 
> I am nowhere near any power limit at all.
> I have meticulously configured and tested the thermal management of
> this computer.
> I never ever hit a thermal limit and have TDP set such that the
> processor
> temperature never exceeds about 75 degrees centigrade.
> 
> There should never be throttling involved in these experiments.
> I can achieve throttling when compiling the kernel and with
> torture test mode on the mprime test (other CPU stressors,
> including my own, are not as good at generating heat as
> mprime).
> 
> This system can run indefinitely at 99.9 watts processor package
> power.
> Example (turbostat, steady state, CPU freq throttled to 4.04 GHz):
> 
> doug@s18:~$ sudo ~/turbostat --Summary --quiet --show
> Busy%,Bzy_MHz,PkgTmp,PkgWatt,GFXWatt,IRQ --interval 12
> Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> 100.21  4045    72231   66      99.93   0.00
> 100.21  4043    72239   65      99.92   0.00
> 
> > Is this some Lenovo system?
> 
> No. The web page version of my original e-mail has
> a link to the test computer hardware profile.
> 
> The motherboard is ASUS PRIME Z390-P.
> 

OK, this seems a desktop system.

> > If you disable HWP you don't see that?
> 
> Correct.
> 
> > What is the value of
> > cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
> 
> ? "No such file or directory"
> 

> > cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-
> > mmio:0/constraint_0_power_limit_uw
> 
You may not have
CONFIG_INT340X_THERMAL=y

What is
#rdmsr 0x1a2

Try changing energy_perf_bias and see if it helps here.

Thanks,
Srinivas


> ? "No such file or directory"
>  
> > You may want to run
> > Try running dptfxtract once.
> 
> No, I am not going to.
> 
> I am not running thermald. Eventually I will, as a backup
> in case of cooling failure, so as not to hit the processor limit
> shutdown. I just haven't done it yet.
> 
> > Then try to get again
> > 
> > cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
> > cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-
> > mmio:0/constraint_0_power_limit_uw
> > 
> > 
> > Thanks,
> > Srinivas
> 
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-31 18:59       ` Srinivas Pandruvada
@ 2020-05-31 19:28         ` Srinivas Pandruvada
  2020-05-31 21:38           ` Doug Smythies
  0 siblings, 1 reply; 11+ messages in thread
From: Srinivas Pandruvada @ 2020-05-31 19:28 UTC (permalink / raw)
  To: Doug Smythies, 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

On Sun, 2020-05-31 at 11:59 -0700, Srinivas Pandruvada wrote:
> On Sun, 2020-05-31 at 11:06 -0700, Doug Smythies wrote:
> > Hi Srinivas,
> > 
> > Thanks you for your quick reply.
> > 
> > On 2020.05.31 09:54 Srinivas Pandruvada wrote
> > > On Sun, 2020-05-31 at 09:39 -0700, Doug Smythies wrote:
> > > > Event begins at 17.456 seconds elapsed time.
> > > > Previous event was about 107 milliseconds ago.
> > > > 
> > > > Old min ; new min ; freq GHz; load % ; duration mS
> > > > 27      ; 28      ; 4.60    ; 68.17  ; 10.226
> > > > 28      ; 26      ; 4.53    ; 57.47  ; 10.005
> > > 
> > > Seems you hit power/thermal limit
> > 
> > No.
> > 
> > I am nowhere near any power limit at all.
> > I have meticulously configured and tested the thermal management of
> > this computer.
> > I never ever hit a thermal limit and have TDP set such that the
> > processor
> > temperature never exceeds about 75 degrees centigrade.
> > 
> > There should never be throttling involved in these experiments.
> > I can achieve throttling when compiling the kernel and with
> > torture test mode on the mprime test (other CPU stressors,
> > including my own, are not as good at generating heat as
> > mprime).
> > 
> > This system can run indefinitely at 99.9 watts processor package
> > power.
> > Example (turbostat, steady state, CPU freq throttled to 4.04 GHz):
> > 
> > doug@s18:~$ sudo ~/turbostat --Summary --quiet --show
> > Busy%,Bzy_MHz,PkgTmp,PkgWatt,GFXWatt,IRQ --interval 12
> > Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> > 100.21  4045    72231   66      99.93   0.00
> > 100.21  4043    72239   65      99.92   0.00
> > 
> > > Is this some Lenovo system?
> > 
> > No. The web page version of my original e-mail has
> > a link to the test computer hardware profile.
> > 
> > The motherboard is ASUS PRIME Z390-P.
> > 
> 
> OK, this seems a desktop system.
> 
> > > If you disable HWP you don't see that?
> > 
> > Correct.
> > 
> > > What is the value of
> > > cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
> > 
> > ? "No such file or directory"
> > 
> > > cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-
> > > mmio:0/constraint_0_power_limit_uw
> You may not have
> CONFIG_INT340X_THERMAL=y
> 
> What is
> #rdmsr 0x1a2
> 
> Try changing energy_perf_bias and see if it helps here.
> 
Also if
MSR 0x1FC bit 19 is 0, change to 1.

Thanks,
Srinivas

> Thanks,
> Srinivas
> 
> 
> > ? "No such file or directory"
> >  
> > > You may want to run
> > > Try running dptfxtract once.
> > 
> > No, I am not going to.
> > 
> > I am not running thermald. Eventually I will, as a backup
> > in case of cooling failure, so as not to hit the processor limit
> > shutdown. I just haven't done it yet.
> > 
> > > Then try to get again
> > > 
> > > cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
> > > cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-
> > > mmio:0/constraint_0_power_limit_uw
> > > 
> > > 
> > > Thanks,
> > > Srinivas


^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-31 19:28         ` Srinivas Pandruvada
@ 2020-05-31 21:38           ` Doug Smythies
  0 siblings, 0 replies; 11+ messages in thread
From: Doug Smythies @ 2020-05-31 21:38 UTC (permalink / raw)
  To: 'Srinivas Pandruvada', 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

On 2020.05.31 12:29 Srinivas Pandruvada wrote:
> On Sun, 2020-05-31 at 11:59 -0700, Srinivas Pandruvada wrote:
>> On Sun, 2020-05-31 at 11:06 -0700, Doug Smythies wrote:
>> > Hi Srinivas,
>> >
>> > Thanks you for your quick reply.
>> >
>> > On 2020.05.31 09:54 Srinivas Pandruvada wrote
>> > > On Sun, 2020-05-31 at 09:39 -0700, Doug Smythies wrote:
>> > > > Event begins at 17.456 seconds elapsed time.
>> > > > Previous event was about 107 milliseconds ago.
>> > > >
>> > > > Old min ; new min ; freq GHz; load % ; duration mS
>> > > > 27      ; 28      ; 4.60    ; 68.17  ; 10.226
>> > > > 28      ; 26      ; 4.53    ; 57.47  ; 10.005
>> > >
>> > > Seems you hit power/thermal limit
>> >
>> > No.
>> >
>> > I am nowhere near any power limit at all.
>> > I have meticulously configured and tested the thermal management of
>> > this computer.
>> > I never ever hit a thermal limit and have TDP set such that the
>> > processor
>> > temperature never exceeds about 75 degrees centigrade.
>> >
>> > There should never be throttling involved in these experiments.
>> > I can achieve throttling when compiling the kernel and with
>> > torture test mode on the mprime test (other CPU stressors,
>> > including my own, are not as good at generating heat as
>> > mprime).
>> >
>> > This system can run indefinitely at 99.9 watts processor package
>> > power.
>> > Example (turbostat, steady state, CPU freq throttled to 4.04 GHz):
>> >
>> > doug@s18:~$ sudo ~/turbostat --Summary --quiet --show
>> > Busy%,Bzy_MHz,PkgTmp,PkgWatt,GFXWatt,IRQ --interval 12
>> > Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
>> > 100.21  4045    72231   66      99.93   0.00
>> > 100.21  4043    72239   65      99.92   0.00
>> >
>> > > Is this some Lenovo system?
>> >
>> > No. The web page version of my original e-mail has
>> > a link to the test computer hardware profile.
>> >
>> > The motherboard is ASUS PRIME Z390-P.
>> >
>>
>> OK, this seems a desktop system.
>>
>> > > If you disable HWP you don't see that?
>> >
>> > Correct.
>> >
>> > > What is the value of
>> > > cat /sys/bus/pci/devices/0000\:00\:04.0/tcc_offset_degree_celsius
>> >
>> > ? "No such file or directory"
>> >
>> > > cat /sys/class/powercap/intel-rapl-mmio/intel-rapl-
>> > > mmio:0/constraint_0_power_limit_uw
>> You may not have
>> CONFIG_INT340X_THERMAL=y

I have:
CONFIG_INT340X_THERMAL=m

>>
>> What is
>> #rdmsr 0x1a2

From the turbostat startup spew of stuff
(also in a link on the web page version of the e-mail):

MSR_IA32_TEMPERATURE_TARGET: 0x0064100d (100 C)

Or manually now:

root@s18:/home/doug/prime95# rdmsr 0x1a2
64100d

>>
>> Try changing energy_perf_bias and see if it helps here.

Yes, that is a test I meant to do, and should have done.

No, it doesn't help.

>>
> Also if
> MSR 0x1FC bit 19 is 0, change to 1.
> 

Ya, I have always found documentation on 0x1FC somewhat lacking.
Anyway, the bit is already 1.
Is the bit EE_TURBO_DISABLE?
Anyway, I tried that bit as 0, and it didn't help.

> Thanks,
> Srinivas
> 

... Doug



^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-31 16:39 ` Doug Smythies
  2020-05-31 16:54   ` Srinivas Pandruvada
  2020-05-31 17:15   ` [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled Doug Smythies
@ 2020-06-06 15:21   ` Doug Smythies
  2 siblings, 0 replies; 11+ messages in thread
From: Doug Smythies @ 2020-06-06 15:21 UTC (permalink / raw)
  To: 'Rafael J. Wysocki', 'Srinivas Pandruvada'
  Cc: 'LKML', 'Len Brown', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

Note 1: I have gone back a few e-mails in the thread for this reply.
Note 2: Srinivas and I have exchanged several e-mails off-list. Thanks.

On 2020.05.31 09:39 Doug wrote:

> 
> The content of this e-mail is also at [1],
> with annotated graphs.

That reference is now a few days out of date.

> 
> Hi Rafael,
> 
> Hmmm... I think the most important takeaway from
> my previous e-mail might have been missed!
> 
> HWP does not work properly on my i5-9600K test computer.
> 
> For those that don't have to read all this, my upgraded
> assertion is:
> 
> With HWP enabled, if idle state 2 is used, there is a probability
> that the CPU frequency might unexpectedly drop significantly.
> 

After another week of investigation, my refined assertion is:

For any workload from periodic to 100% busy but with infrequent small gaps,
there is a significant probability that the processor will lock
at minimum CPU frequency for a period of up to 30 milliseconds.

Gap definition:
lower limit not known, but < 747 uSeconds.
Upper limit is between 952 and 955 uSeconds (there will be some overhead uncertainties).
Must be preceded by busy time spanning a couple of HWP sampling boundaries
or jiffy boundaries or something (I don't actually know how HWP does stuff).

My long e-mail from last week was entirely based on periodic work in the 347 hertz area.
This e-mail is about the same fundamental issue, but a completely different approach.

Workflow: long busy, short gap, busy but taking loop time samples so as to estimate CPU frequency.
I am calling it an inverse impulse response test.

Results:

If 747 uSec < gap < 950 uSeconds then there is approximately a 17% probability
that upon exit from the gap back to busy the CPU will be held at 800 MHz (or whatever minimum)
for about 30 milliseconds before ramping up. The expectation is that the CPU frequency
ramp up time would not even be detectable by my test program, and that is the case for
the other 83% of test runs.

Sanity check (842 uSec gap time):

O.K. so I should be able to observe this with the workflow running in
a loop if I run turbostat at the same time. If I use a 3 second sample time,
sometimes a frequency drop from 4600 MHz to 4573 is shown. If there is an
event during a turbostat sample, there can be only one. So let's reverse
calculate the frequency from the data:

ave MHz = (4600 * time at 4600 MHz + 0 * time in the gap + 800 * time at 800 MHz)/3
let the time at 800 MHz be Y.

4573 = (4600 * ((3 Sec - gap) - Y) + 800 * Y)/3
4573 = (13796.1268 - 3800 Y)/3
13719 = 13796.1268 -  3800 Y
Y = 20.3 milliseconds.

Note: the transition time from 800 MHz to 4600 MHz
is not really 0, so the real Y should be longer.

Real data:

Test runs: 511
Bad runs: 86 (17%)
Average time at suppressed CPU frequency: ~ 29.8 milliseconds.

Overall I have run this test about 6,500 times.
So far, nohwp, passive, schedutil has failed the test 0 times
out of 361 runs.

Note: under these conditions this computer should never
throttle back from 4600 MHz, because (from turbostat):

cpu2: MSR_TURBO_RATIO_LIMIT: 0x2b2b2e2e2e2e2e2e
43 * 100.0 = 4300.0 MHz max turbo 8 active cores << I don't have 8 cores
43 * 100.0 = 4300.0 MHz max turbo 7 active cores << I don't have 7 cores
46 * 100.0 = 4600.0 MHz max turbo 6 active cores << I do have 6
46 * 100.0 = 4600.0 MHz max turbo 5 active cores
46 * 100.0 = 4600.0 MHz max turbo 4 active cores
46 * 100.0 = 4600.0 MHz max turbo 3 active cores
46 * 100.0 = 4600.0 MHz max turbo 2 active cores
46 * 100.0 = 4600.0 MHz max turbo 1 active cores

> Detail (see [1] for supporting graphs and links):
> 
> I can not proceed with testing this.
> Why not?
> Because I do not have a stable good system on top of which to add this patch.
> I do not know what is wrong such that HWP appears broken.
> 
> For my part of it, I had to stop and dig into
> why HWP doesn't seem to work properly on my newer test computer.
> 
> Notes:
> I have never used HWP before, and have had it disabled so far on this
> newer test computer. This patch seemed like a great opportunity to try it.
> Why (in addition to helping via review/test)? Because trace now works,
> whereas it doesn't in active mode with HWP.
> 
> It is on my list to explore viability of a mode for trace where it monitors
> what the processor is doing via HWP, rather than reporting what the
> processor is being told to do. However, in the meantime, this is great.
> 
> Example of what is wrong on my system (repeated, but new example trace data,
> from earlier e-mail):
> 
> Kernel: 5.7-rc6 and + this version of patch when trace data required.
> Patch config: DELAY_HWP 10000 ; 87.5% fraction (7/8).
> Keep in mind that the trace entry is the scaled min value, not the target p-state.
> 
> Load:
> Single thread, forced CPU affinity.
> fixed work packet per work period, i.e. the amount of work to do is independent of CPU frequency.
> 347 hertz work / sleep frequency.
> To reveal any hysteresis (i.e. with conservative governor) load ramps up from none
> to 100% and then back down to none at 3 seconds per step (step size is uncalibrated).
> 
> Processor:
> Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz
> 
> What do I see?
> 
> Unexpected frequency drops at around 70% load.
> Example, from trace:
> 
> Event begins at 17.456 seconds elapsed time.
> Previous event was about 107 milliseconds ago.
> 
> Old min ; new min ; freq GHz; load % ; duration mS
> 27      ; 28      ; 4.60    ; 68.17  ; 10.226
> 28      ; 26      ; 4.53    ; 57.47  ; 10.005
> 26      ; 40      ; 2.87    ; 100.00 ; 10.996  <<<< What? Why freq down? (note: it is always above old
> min)
> 40      ; 29      ; 4.25    ; 69.67  ; 10.002  <<<< O.K. recovering, but damage done.
> 29      ; 26      ; 4.60    ; 59.14  ; 10.652  <<<< Event ends. Next event in 128 milliseconds.
> 
> I can not think of any reason why the frequency would have been reduced so far by HWP.
> 
> O.K., the above had to be done with the patch so that trace could be used.
> Ondemand was the governor, because its response curve looks the same as
> active/powersave in the area of concern.
> 
> The below was done with kernel 5.7-rc6 and only turbostat, at a low sample rate of 15 seconds per,
> in addition to the test load. 100 seconds each:
> 
> intel_pstate/powersave hwp:
> 
> Overruns: 11327
> Ave. work percent: 77.952091
> Processor package power: ~13 watts.
> Average CPU frequency: 3.8 gigahertz
> 
> intel_pstate/powersave no-hwp:
> 
> Overruns: 7 (task start ramp up related. I have trace proof.)
> Ave. work percent: 74.932603
> Processor package power: ~11.5 watts.
> Average CPU frequency: 4.0 gigahertz
> 
> Question: What is the relevance of the 347 hertz and
> the >=70% load for this issue? Why does it appear
> to be such a sharp works/fine doesn't work issue?
> 
> Answer: The issue seems to be related to when the
> sleep portion of a work/sleep periodic workflow
> approaches and goes below 1 millisecond (1 jiffy).
> 
> some tests were done, varying the work/sleep frequency,
> and idle states and overruns and such, the web version
> of this e-mail has graphs.
> 
> Notes:
> 
> The breakpoint between no-overruns/overruns is around 950 microseconds.
> 
> A 250 hertz kernel was tested, and it did not have this issue in this area.
> Perhaps elsewhere, I didn't look.
> 
> 1000 hertz kernels were tested back to kernel 5.2, all failed.
> 
> If the issue is jiffy related (a race condition?) then a work/sleep frequency
> of 333.3333 hertz should behave in binary way, either lots of overruns or none
> as a function of the task start time. (Preliminary) It does.
> 
> If the issue is jiffy related (a race condition?) then a work/sleep frequency
> of 500.00 hertz should behave in binary way, either lots of overruns or none
> as a function of the task start time. It does. There are occurrences when idle
> state 2 is used somewhat without overruns.
> 
> Both teo and menu idle governors were tested, and while both suffer from
> unexpected CPU frequency drop, teo seems much worse. However failure points
> for both governors are repeatable.
> 
> There was new BIOS for this test computer a week ago.
> System now upgraded and tested with default BIOS settings (by accident)
> and my settings. (Note: slight changes in processor package power for
> system idle and one CPU loaded with new BIOS, and earlier tests NOT re-done).
> 
> Is the processor using the latest microcode? Currently 0xcc. Can not figure out if there is anything
> newer.
> 
> Leaving out the details, but all the tests and results are available, a mess but available, the
> summary is:
> 
> With HWP enabled, if idle state 2 is used, there is a probability that the CPU frequency might
> unexpectedly drop significantly.
> If the processor does this by itself, or by being told to via sources outside of the intel_pstate CPU
> frequency driver, I don't
> know.
> 
> The load sweep test was run at 6 seconds per step during increasing load and 3 seconds per step
> decreasing
> (by mistake, if you must know), while monitoring the idle statistics.
> The test was done in a hurry, so many above/below statistics are 0%, due to insufficient sample size.
> The overruns and use of idle state 0 are exactly correlated.
> There are a lot of graphs on the idle statistics web page, but the idle state 2 usage correlates
> exactly with
> undesired low CPU frequency and overruns.
> 
> Side note: Even in the areas where HWP appears to behave, the no-hwp power use is much better.
> 
> O.K., so now, do a couple of more turbostat samples:
> 
> intel_pstate/powersave hwp idle state 2 disabled:
> 
> Overruns: 3
> Ave. work percent: 66.647895
> Processor package power: ~16.8 watts.
> Average CPU frequency: 4.6 gigahertz
> 
> intel_pstate/powersave hwp idle state 3 disabled:
> 
> Overruns: 22
> Ave. work percent: 66.647895
> Processor package power: ~16.2 watts.
> Average CPU frequency: 4.6 gigahertz
> 
> To prevent all the bots that burden my site, the link is coded:
> [1] double u double u double u dot smythies dot com /~doug/linux/s18/hwp/index.html
> 
> ... Doug
> 
> > -----Original Message-----
> > From: Rafael J. Wysocki [mailto:rjw@rjwysocki.net]
> > Sent: May 26, 2020 11:21 AM
> > To: Linux PM; Doug Smythies
> > Cc: LKML; Len Brown; Srinivas Pandruvada; Peter Zijlstra; Giovanni Gherdovich; Francisco Jerez
> > Subject: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
> >
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >
> > Allow intel_pstate to work in the passive mode with HWP enabled and
> > make it set the HWP minimum performance limit to 75% of the P-state
> > value corresponding to the target frequency supplied by the cpufreq
> > governor, so as to prevent the HWP algorithm and the CPU scheduler
> > from working against each other at least when the schedutil governor
> > is in use.
> >
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > ---
> >
> > This is a replacement for https://patchwork.kernel.org/patch/11563615/ that
> > uses the HWP floor (minimum performance limit) as the feedback to the HWP
> > algorithm (instead of the EPP).
> >
> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP is still 5000 and the previous comments
> > still apply to it.
> >
> > In addition to that, the 75% fraction used in intel_cpufreq_adjust_hwp() can be
> > adjusted too, but I would like to use a value with a power-of-2 denominator for
> > that (so the next candidate would be 7/8).
> >
> > Everyone who can do that is kindly requested to test this and let me know
> > the outcome.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled
  2020-05-26 18:20 [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled Rafael J. Wysocki
  2020-05-31 16:39 ` Doug Smythies
@ 2020-06-06 15:21 ` Doug Smythies
  1 sibling, 0 replies; 11+ messages in thread
From: Doug Smythies @ 2020-06-06 15:21 UTC (permalink / raw)
  To: 'Rafael J. Wysocki'
  Cc: 'LKML', 'Len Brown',
	'Srinivas Pandruvada', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM'

Hi Rafael,

As you well know, I often test first and
ask questions and review code later.

I think I should have questioned this first.

To the best of my ability/availability, I am
committed to follow up on the hwp issue raised on
the other branch of this thread. However, moving
forward the typical CPU frequency scaling
configuration for my test system will be:

driver: intel-cpufreq, forced at boot.
governor: schedutil
hwp: forced off at boot.

On 2020.05.26 11:21 Rafael J. Wysocki wrote:
> 
> Allow intel_pstate to work in the passive mode with HWP enabled and
> make it set the HWP minimum performance limit to 75% of the P-state
> value corresponding to the target frequency supplied by the cpufreq
> governor, so as to prevent the HWP algorithm and the CPU scheduler
> from working against each other at least when the schedutil governor
> is in use.

I think we need to define what "passive" mode is.
I have always interpreted it to mean "I would like
this pstate please. It has been requested by some higher level
servo". The name intel_cpufreq makes sense.

I have always interpreted "active" to mean "I would like
the intel_pstate CPU frequency driver to decide what pstate
I need".

As mentioned on the other branch of this thread, I don't have
a stable test baseline, but the servos are still fighting each other
with this version of the patch.

> 
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
> 
> This is a replacement for https://patchwork.kernel.org/patch/11563615/ that
> uses the HWP floor (minimum performance limit) as the feedback to the HWP
> algorithm (instead of the EPP).
> 
> The INTEL_CPUFREQ_TRANSITION_DELAY_HWP is still 5000 and the previous comments
> still apply to it.
> 
> In addition to that, the 75% fraction used in intel_cpufreq_adjust_hwp() can be
> adjusted too, but I would like to use a value with a power-of-2 denominator for
> that (so the next candidate would be 7/8).

The issue here is that the lag of the CPU frequency is not a constant, but rather
a function of the task work/sleep timing versus whatever else is going on. One has
to allow for the worst case. From thousands of seconds of intel_pstate trace data,
that limit needs to be about 3% (31/32).

Disclaimer: Done with no-hwp, active/powersave. The results might not be transferrable
to hwp enabled.

> 
> Everyone who can do that is kindly requested to test this and let me know
> the outcome.
> 
> Of course, the documentation still needs to be updated.  Also, the EPP can be
> handled in analogy with the active mode now, but that part can be added in a
> separate patch on top of this one.
> 
> Thanks!
> 
> ---
>  drivers/cpufreq/intel_pstate.c |  119 ++++++++++++++++++++++++++++++-----------
>  1 file changed, 88 insertions(+), 31 deletions(-)
> 
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -36,6 +36,7 @@
>  #define INTEL_PSTATE_SAMPLING_INTERVAL	(10 * NSEC_PER_MSEC)
> 
>  #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
> +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP	5000
>  #define INTEL_CPUFREQ_TRANSITION_DELAY		500
> 
>  #ifdef CONFIG_ACPI
> @@ -2175,7 +2176,10 @@ static int intel_pstate_verify_policy(st
> 
>  static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
>  {
> -	intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
> +	if (hwp_active)
> +		intel_pstate_hwp_force_min_perf(policy->cpu);
> +	else
> +		intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
>  }
> 
>  static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
> @@ -2183,12 +2187,10 @@ static void intel_pstate_stop_cpu(struct
>  	pr_debug("CPU %d exiting\n", policy->cpu);
> 
>  	intel_pstate_clear_update_util_hook(policy->cpu);
> -	if (hwp_active) {
> +	if (hwp_active)
>  		intel_pstate_hwp_save_state(policy);
> -		intel_pstate_hwp_force_min_perf(policy->cpu);
> -	} else {
> -		intel_cpufreq_stop_cpu(policy);
> -	}
> +
> +	intel_cpufreq_stop_cpu(policy);
>  }
> 
>  static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
> @@ -2318,13 +2320,58 @@ static void intel_cpufreq_trace(struct c
>  		fp_toint(cpu->iowait_boost * 100));
>  }
> 
> +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u32 min_perf)
> +{
> +	u64 value, prev;
> +
> +	rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &prev);
> +	value = prev;
> +
> +	value &= ~HWP_MIN_PERF(~0L);
> +	value |= HWP_MIN_PERF(min_perf);
> +
> +	/*
> +	 * The entire MSR needs to be updated in order to update the HWP min
> +	 * field in it, so opportunistically update the max too if needed.
> +	 */
> +	value &= ~HWP_MAX_PERF(~0L);
> +	value |= HWP_MAX_PERF(cpu->max_perf_ratio);
> +
> +	if (value != prev)
> +		wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
> +}
> +
> +/**
> + * intel_cpufreq_adjust_hwp - Adjust the HWP reuqest register.
                                                ^^^^^^^
s/reuqest/request/

> + * @cpu: Target CPU.
> + * @target_pstate: P-state corresponding to the target frequency.
> + *
> + * Set the HWP minimum performance limit to 75% of @target_pstate taking the
> + * global min and max policy limits into account.
> + *
> + * The purpose of this is to avoid situations in which the kernel and the HWP
> + * algorithm work against each other by giving a hint about the expectations of



> + * the former to the latter.
> + */
> +static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate)
> +{
> +	u32 min_perf;
> +
> +	min_perf = max_t(u32, (3 * target_pstate) / 4, cpu->min_perf_ratio);
> +	min_perf = min_t(u32, min_perf, cpu->max_perf_ratio);
> +	if (min_perf != cpu->pstate.current_pstate) {
> +		cpu->pstate.current_pstate = min_perf;
> +		intel_cpufreq_update_hwp_request(cpu, min_perf);
> +	}
> +}
> +
>  static int intel_cpufreq_target(struct cpufreq_policy *policy,
>  				unsigned int target_freq,
>  				unsigned int relation)
>  {
>  	struct cpudata *cpu = all_cpu_data[policy->cpu];
> +	int target_pstate, old_pstate = cpu->pstate.current_pstate;
>  	struct cpufreq_freqs freqs;
> -	int target_pstate, old_pstate;
> 
>  	update_turbo_state();
> 
> @@ -2332,26 +2379,33 @@ static int intel_cpufreq_target(struct c
>  	freqs.new = target_freq;
> 
>  	cpufreq_freq_transition_begin(policy, &freqs);
> +
>  	switch (relation) {
>  	case CPUFREQ_RELATION_L:
> -		target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
> +		target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
>  		break;
>  	case CPUFREQ_RELATION_H:
> -		target_pstate = freqs.new / cpu->pstate.scaling;
> +		target_pstate = target_freq / cpu->pstate.scaling;
>  		break;
>  	default:
> -		target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
> +		target_pstate = DIV_ROUND_CLOSEST(target_freq, cpu->pstate.scaling);
>  		break;
>  	}
> -	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
> -	old_pstate = cpu->pstate.current_pstate;
> -	if (target_pstate != cpu->pstate.current_pstate) {
> -		cpu->pstate.current_pstate = target_pstate;
> -		wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
> -			      pstate_funcs.get_val(cpu, target_pstate));
> +
> +	if (hwp_active) {
> +		intel_cpufreq_adjust_hwp(cpu, target_pstate);
> +	} else {
> +		target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
> +		if (target_pstate != old_pstate) {
> +			cpu->pstate.current_pstate = target_pstate;
> +			wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
> +				      pstate_funcs.get_val(cpu, target_pstate));
> +		}
>  	}
> -	freqs.new = target_pstate * cpu->pstate.scaling;
>  	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate);
> +
> +	freqs.new = target_pstate * cpu->pstate.scaling;
> +
>  	cpufreq_freq_transition_end(policy, &freqs, false);
> 
>  	return 0;
> @@ -2361,14 +2415,19 @@ static unsigned int intel_cpufreq_fast_s
>  					      unsigned int target_freq)
>  {
>  	struct cpudata *cpu = all_cpu_data[policy->cpu];
> -	int target_pstate, old_pstate;
> +	int target_pstate, old_pstate = cpu->pstate.current_pstate;
> 
>  	update_turbo_state();
> 
>  	target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
> -	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
> -	old_pstate = cpu->pstate.current_pstate;
> -	intel_pstate_update_pstate(cpu, target_pstate);
> +
> +	if (hwp_active) {
> +		intel_cpufreq_adjust_hwp(cpu, target_pstate);
> +	} else {
> +		target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
> +		intel_pstate_update_pstate(cpu, target_pstate);
> +	}
> +
>  	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
>  	return target_pstate * cpu->pstate.scaling;
>  }
> @@ -2389,7 +2448,6 @@ static int intel_cpufreq_cpu_init(struct
>  		return ret;
> 
>  	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
> -	policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
>  	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
>  	policy->cur = policy->cpuinfo.min_freq;
> 
> @@ -2401,10 +2459,13 @@ static int intel_cpufreq_cpu_init(struct
> 
>  	cpu = all_cpu_data[policy->cpu];
> 
> -	if (hwp_active)
> +	if (hwp_active) {
>  		intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state);
> -	else
> +		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
> +	} else {
>  		turbo_max = cpu->pstate.turbo_pstate;
> +		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
> +	}
> 
>  	min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
>  	min_freq *= cpu->pstate.scaling;
> @@ -2505,9 +2566,6 @@ static int intel_pstate_register_driver(
> 
>  static int intel_pstate_unregister_driver(void)
>  {
> -	if (hwp_active)
> -		return -EBUSY;
> -
>  	cpufreq_unregister_driver(intel_pstate_driver);
>  	intel_pstate_driver_cleanup();
> 
> @@ -2815,12 +2873,11 @@ static int __init intel_pstate_setup(cha
>  	if (!str)
>  		return -EINVAL;
> 
> -	if (!strcmp(str, "disable")) {
> +	if (!strcmp(str, "disable"))
>  		no_load = 1;
> -	} else if (!strcmp(str, "passive")) {
> +	else if (!strcmp(str, "passive"))
>  		default_driver = &intel_cpufreq;
> -		no_hwp = 1;
> -	}
> +
>  	if (!strcmp(str, "no_hwp")) {
>  		pr_info("HWP disabled\n");
>  		no_hwp = 1;
> 



^ permalink raw reply	[flat|nested] 11+ messages in thread

* cpufreq: intel_pstate: HWP mode issue
  2020-05-31 18:06     ` Doug Smythies
  2020-05-31 18:59       ` Srinivas Pandruvada
@ 2020-06-30 19:10       ` Doug Smythies
  1 sibling, 0 replies; 11+ messages in thread
From: Doug Smythies @ 2020-06-30 19:10 UTC (permalink / raw)
  To: 'Srinivas Pandruvada'
  Cc: 'Len Brown', 'Peter Zijlstra',
	'Giovanni Gherdovich', 'Francisco Jerez',
	'Linux PM', 'Rafael J. Wysocki'

This is a re-send without the attachments, because
I suspect the original was deleted entirely from the list
because of them.
Sorry if you got this twice.

On 2020.06.30 11:41 Doug Smythies wrote:
> 
> Hi Srinivas,
> 
> O.K. let's try this again, starting a new thread, with address list similar to a few weeks ago.
> I believe I have untangled my multiple issues, such that this e-mail should be only about
> the single issue of HWP capable processors incorrectly deciding to lower the CPU frequency
> under some conditions. Also, my previous assertion as to the issue was indeed incorrect.
> 
> I now:
> . never use x86_energy_perf_policy.
> . For HWP disabled: never change from active to passive or vice versa, but rather do it via boot.
> . after boot always check and reset the various power limit log bits that are set.
> . never compile the kernel (well, until after any tests), which will set those bits again.
> . never run prime95 high heat torture test, which will set those bits again.
> . Note that the tests done for this e-mail never ever set those bits again.
> . Invented an entirely new way to manifest, demonstrate, and exploit the issue (also mentioned June
> 6th).
> . All tests were repeated on another HWP capable computer, so an i5-9600K and an i5-6200U.
> 
> New method (old was periodic workflow):
> 
> Long busy, short gap, busy but taking loop time samples so as to estimate CPU frequency.
> I am calling it an inverse impulse response test.
> 
> Assertion:
> 
> If the short sleep is somehow simultaneous with some sort of 5.0 millisecond (200 Hertz)
> periodic event (either in HWP itself, or via the driver, I am unable to determine which,
> but think it is inside the black box that is HWP), then there is a possibility that the
> CPU frequency will drop significantly and will take an excessive amount of time to recover.
> Frequency step ups are exactly on 5.0 millisecond boundaries +/- the short gap time.
> 
> . The probability is somewhat inconsistent and a function of whatever else the computer is doing.
> . The time to recover is a function of EPP, and if EPP is low enough my test never fails.
> . These tests were all done with default settings.
> . The "5.0" mSec is only for those default settings, it actually depends on EPP.
>   . Crude step boundaries, mSec: EPP=32, 2; EPP=64, 4; EPP=128, 5.00; EPP=196, 9
> . High level: i5-9600K: 2453 tests, 60 failures, 2.45% fail rate. (HWP - powersave)
> . High level: i5-6200U: 4134 tests, 128 failures, 3.1% fail rate. (HWP - powersave)
> . Low level (capture waveforms): i5-9600K: 1842 captured failure waveforms. See graph.
> . Low level (capture waveforms): i5-6200U: 458 captured failure waveforms. See graph.
> . Verify acpi-cpufreq/ondemand works fine: i5-9600K: 8975 tests. 0 failures.
> . Verify acpi-cpufreq/ondemand works fine: i5-6200U: 8575 tests. 0 failures.
> 
> The short gap was 842 uSeconds for all these tests, and for no particular reason.
> 
> While I have not re-done the bounds investigation, I have no reason to doubt
> my previous work, re-stated below:
> 
> > Gap definition:
> > lower limit not known, but < 747 uSeconds.
> > Upper limit is between 952 and 955 uSeconds (there will be some overhead uncertainties).
> > Must be preceded by busy time spanning a couple of HWP sampling boundaries
> > or jiffy boundaries or something (I don't actually know how HWP does stuff).
> 
> Rather than point to graphs, which nobody seems to look at, they are attached,
> and so might get stripped for some of you.
> 
> ... Doug
> 
> Addendum: Some of the MSRs you have requested in the past:
> 
> i5-9600K (HWP - powersave after test):
> 
> root@s18:/home/doug# /home/doug/c/msr-decoder
> 8.) 0x198: IA32_PERF_STATUS     : CPU 0-5 :   8 :   8 :   8 :   8 :   8 :   8 :
> B.) 0x770: IA32_PM_ENABLE: 1 : HWP enable
> 1.) 0x19C: IA32_THERM_STATUS: 88480000
> 2.) 0x1AA: MSR_MISC_PWR_MGMT: 401CC0 EIST enabled Coordination enabled OOB Bit 8 reset OOB Bit 18
> reset
> 3.) 0x1B1: IA32_PACKAGE_THERM_STATUS: 88460000
> 4.) 0x64F: MSR_CORE_PERF_LIMIT_REASONS: 0
> A.) 0x1FC: MSR_POWER_CTL: 3C005D : C1E disable : EEO disable : RHO disable
> 5.) 0x771: IA32_HWP_CAPABILITIES (performance): 108252E : high 46 : guaranteed 37 : efficient 8 :
> lowest 1
> 6.) 0x774: IA32_HWP_REQUEST:    CPU 0-5 :
>     raw: 80002E08 : 80002E08 : 80002E08 : 80002E08 : 80002E08 : 80002E08 :
>     min:        8 :        8 :        8 :        8 :        8 :        8 :
>     max:       46 :       46 :       46 :       46 :       46 :       46 :
>     des:        0 :        0 :        0 :        0 :        0 :        0 :
>     epp:      128 :      128 :      128 :      128 :      128 :      128 :
>     act:        0 :        0 :        0 :        0 :        0 :        0 :
> 7.) 0x777: IA32_HWP_STATUS: 0 : high 0 : guaranteed 0 : efficient 0 : lowest 0
> 
> i5-9600K (no HWP - acpi-cpufreq/ondemand after test):
> 
> root@s18:/home/doug/c# /home/doug/c/msr-decoder
> 8.) 0x198: IA32_PERF_STATUS     : CPU 0-5 :   8 :   8 :   8 :   8 :   8 :   8 :
> B.) 0x770: IA32_PM_ENABLE: 0 : HWP disable
> 9.) 0x199: IA32_PERF_CTL        : CPU 0-5 :   8 :   8 :   8 :   8 :   8 :   8 :
> C.) 0x1B0: IA32_ENERGY_PERF_BIAS: CPU 0-5 :   6 :   6 :   6 :   6 :   6 :   6 :
> 1.) 0x19C: IA32_THERM_STATUS: 88480000
> 2.) 0x1AA: MSR_MISC_PWR_MGMT: 401CC0 EIST enabled Coordination enabled OOB Bit 8 reset OOB Bit 18
> reset
> 3.) 0x1B1: IA32_PACKAGE_THERM_STATUS: 88460000
> 4.) 0x64F: MSR_CORE_PERF_LIMIT_REASONS: 0
> A.) 0x1FC: MSR_POWER_CTL: 3C005D : C1E disable : EEO disable : RHO disable
> 
> i5-6200U (HWP - powersave after test):
> 
> 8.) 0x198: IA32_PERF_STATUS : CPU 0-3 : 19 : 19 : 19 : 19 :
> B.) 0x770: IA32_PM_ENABLE: 1 : HWP enable
> 1.) 0x19C: IA32_THERM_STATUS: 88430000
> 2.) 0x1AA: MSR_MISC_PWR_MGMT: 4018C0 EIST enabled Coordination enabled OOB Bit 8 reset OOB Bit 18
> reset
> 3.) 0x1B1: IA32_PACKAGE_THERM_STATUS: 88420000
> 4.) 0x64F: MSR_CORE_PERF_LIMIT_REASONS: 0
> A.) 0x1FC: MSR_POWER_CTL: 24005D : C1E disable : EEO enable : RHO enable
> 5.) 0x771: IA32_HWP_CAPABILITIES (performance): 105171C : high 28 : guaranteed 23 : efficient 5 :
> lowest 1
> 6.) 0x774: IA32_HWP_REQUEST: CPU 0-3 :
>     raw: 80001B04 : 80001B04 : 80001B04 : 80001B04 :
>     min:        4 :        4 :        4 :        4 :
>     max:       27 :       27 :       27 :       27 :
>     des:        0 :        0 :        0 :        0 :
>     epp:      128 :      128 :      128 :      128 :
>     act:        0 :        0 :        0 :        0 :
> 7.) 0x777: IA32_HWP_STATUS: 4 : high 4 : guaranteed 0 : efficient 0 : lowest 0
> 
> i5-6200U (no HWP - acpi-cpufreq/ondemand after test):
> 
> 8.) 0x198: IA32_PERF_STATUS     : CPU 0-3 :  23 :  23 :  23 :  23 :
> B.) 0x770: IA32_PM_ENABLE: 0 : HWP disable
> 9.) 0x199: IA32_PERF_CTL        : CPU 0-3 :  11 :   5 :   5 :   5 :
> C.) 0x1B0: IA32_ENERGY_PERF_BIAS: CPU 0-3 :   6 :   6 :   6 :   6 :
> 1.) 0x19C: IA32_THERM_STATUS: 88440000
> 2.) 0x1AA: MSR_MISC_PWR_MGMT: 4018C0 EIST enabled Coordination enabled OOB Bit 8 reset OOB Bit 18
> reset
> 3.) 0x1B1: IA32_PACKAGE_THERM_STATUS: 88430000
> 4.) 0x64F: MSR_CORE_PERF_LIMIT_REASONS: 0
> A.) 0x1FC: MSR_POWER_CTL: 24005D : C1E disable : EEO enable : RHO enable



^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, back to index

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-26 18:20 [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled Rafael J. Wysocki
2020-05-31 16:39 ` Doug Smythies
2020-05-31 16:54   ` Srinivas Pandruvada
2020-05-31 18:06     ` Doug Smythies
2020-05-31 18:59       ` Srinivas Pandruvada
2020-05-31 19:28         ` Srinivas Pandruvada
2020-05-31 21:38           ` Doug Smythies
2020-06-30 19:10       ` cpufreq: intel_pstate: HWP mode issue Doug Smythies
2020-05-31 17:15   ` [RFC/RFT][PATCH] cpufreq: intel_pstate: Accept passive mode with HWP enabled Doug Smythies
2020-06-06 15:21   ` Doug Smythies
2020-06-06 15:21 ` Doug Smythies

Linux-PM Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-pm/0 linux-pm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-pm linux-pm/ https://lore.kernel.org/linux-pm \
		linux-pm@vger.kernel.org
	public-inbox-index linux-pm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-pm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git