linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
@ 2015-08-27  3:18 Chen Yu
  2015-09-17  5:30 ` Pavel Machek
  2015-10-09 21:50 ` Rafael J. Wysocki
  0 siblings, 2 replies; 14+ messages in thread
From: Chen Yu @ 2015-08-27  3:18 UTC (permalink / raw)
  To: rjw, pavel, tglx, mingo, hpa, bp
  Cc: rui.zhang, linux-pm, x86, linux-kernel, Chen Yu

A bug is reported(https://bugzilla.redhat.com/show_bug.cgi?id=1227208)
that, after resumed from S3, CPU is running at a low speed.
After investigation, it is found that, BIOS has modified the value
of THERM_CONTROL register during S3, and changes it from 0 to 0x10,
since value of 0x10 means CPU can only get 25% of the Duty Cycle,
this triggers the problem.

Here is a simple scenario to reproduce the issue:
1.Boot up the system
2.Get MSR with address 0x19a, it should be 0
3.Put the system into sleep, then wake it up
4.Get MSR with address 0x19a, it should be 0(actually it shows 0x10)

Although this is a BIOS issue, it would be more robust for linux to deal
with this situation. This patch fixes this issue by introducing a framework
to save/restore specified MSR registers(THERM_CONTROL in this case)
for suspend/resume.

When user encounters a problematic platform and needs to protect the
MSRs during suspending, he can simply add a quirk entry in
msr_save_dmi_table, and customizes MSR registers inside the quirk
callback, for example:

u32 msr_id_need_to_save[] = {MSR_ID0, MSR_ID1, MSR_ID2...};

and the quirk mechanism ensures that, once resumed from suspended,
the MSRs indicated by these IDs will be restored to their original values
before suspended.

Since both 64/32-bit kernels are affected, this patch covers 64/32-bit
common code path. And because the MSRs specified by the user might not
be available or readable in any situation, we use rdmsrl_safe to safely
save these MSRs.

Tested-by: Marcin Kaszewski <marcin.kaszewski@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
v4:
 - Revert v3 to v2, and fix some typos in changelog/comments. 
   Use msr_info structure instead of msr_id + msr_value.
   Adjust some codes for better readability.
v3:
 - Simplify the patch to only focus on THERM_CONTROL register.
   This will make things 'just work'.
v2:
 - Cover both 64/32-bit common code path.
   Use rdmsrl_safe to safely read MSR.
   Introduce a quirk framework for save/restore specified MSR on different
   platforms.
---
 arch/x86/include/asm/suspend_32.h | 11 +++++
 arch/x86/include/asm/suspend_64.h | 11 +++++
 arch/x86/power/cpu.c              | 99 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+)

diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index d1793f0..240aaa8 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -9,12 +9,23 @@
 #include <asm/desc.h>
 #include <asm/fpu/api.h>
 
+struct msr_type {
+	bool msr_saved;
+	struct msr_info rv;
+};
+
+struct saved_msr {
+	unsigned short num;
+	struct msr_type *msr_array;
+};
+
 /* image of the saved processor state */
 struct saved_context {
 	u16 es, fs, gs, ss;
 	unsigned long cr0, cr2, cr3, cr4;
 	u64 misc_enable;
 	bool misc_enable_saved;
+	struct saved_msr msr_for_save;
 	struct desc_ptr gdt_desc;
 	struct desc_ptr idt;
 	u16 ldt;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 7ebf0eb..40a7a00 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -9,6 +9,16 @@
 #include <asm/desc.h>
 #include <asm/fpu/api.h>
 
+struct msr_type {
+	bool msr_saved;
+	struct msr_info rv;
+};
+
+struct saved_msr {
+	unsigned short num;
+	struct msr_type *msr_array;
+};
+
 /*
  * Image of the saved processor state, used by the low level ACPI suspend to
  * RAM code and by the low level hibernation code.
@@ -24,6 +34,7 @@ struct saved_context {
 	unsigned long cr0, cr2, cr3, cr4, cr8;
 	u64 misc_enable;
 	bool misc_enable_saved;
+	struct saved_msr msr_for_save;
 	unsigned long efer;
 	u16 gdt_pad; /* Unused */
 	struct desc_ptr gdt_desc;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 9ab5279..8442473 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -23,6 +23,7 @@
 #include <asm/debugreg.h>
 #include <asm/cpu.h>
 #include <asm/mmu_context.h>
+#include <linux/dmi.h>
 
 #ifdef CONFIG_X86_32
 __visible unsigned long saved_context_ebx;
@@ -32,6 +33,30 @@ __visible unsigned long saved_context_eflags;
 #endif
 struct saved_context saved_context;
 
+static void msr_save_context(struct saved_context *ctxt)
+{
+	int i = 0;
+
+	for (i = 0; i < ctxt->msr_for_save.num; i++) {
+		struct msr_type *msr = &ctxt->msr_for_save.msr_array[i];
+
+		msr->msr_saved = !rdmsrl_safe(msr->rv.msr_no,
+			&msr->rv.reg.q);
+	}
+}
+
+static void msr_restore_context(struct saved_context *ctxt)
+{
+	int i = 0;
+
+	for (i = 0; i < ctxt->msr_for_save.num; i++) {
+		struct msr_type *msr = &ctxt->msr_for_save.msr_array[i];
+
+		if (msr->msr_saved)
+			wrmsrl(msr->rv.msr_no, msr->rv.reg.q);
+	}
+}
+
 /**
  *	__save_processor_state - save CPU registers before creating a
  *		hibernation image and before restoring the memory state from it
@@ -111,6 +136,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 #endif
 	ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
 					       &ctxt->misc_enable);
+	msr_save_context(ctxt);
 }
 
 /* Needed by apm.c */
@@ -229,6 +255,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
 	x86_platform.restore_sched_clock_state();
 	mtrr_bp_restore();
 	perf_restore_debug_store();
+	msr_restore_context(ctxt);
 }
 
 /* Needed by apm.c */
@@ -320,3 +347,75 @@ static int __init bsp_pm_check_init(void)
 }
 
 core_initcall(bsp_pm_check_init);
+
+/* We constrain the number of MSRs to 64. */
+#define MAX_MSR_SAVED	64
+
+static struct msr_type msr_context_array[MAX_MSR_SAVED];
+
+/*
+ * Following section is a quirk framework for problematic BIOS:
+ * Sometimes MSRs are modified by BIOS after suspended to
+ * ram, this might cause unexpected behavior after resumed.
+ * Thus we save/restore these specified MSRs during suspending
+ * in order to work around it.
+ * A typical bug is reported at:
+ * https://bugzilla.redhat.com/show_bug.cgi?id=1227208
+ */
+static int msr_set_info(const u32 *msr_id, const int total_num)
+{
+	int i = 0;
+
+	if (total_num > MAX_MSR_SAVED) {
+		pr_err("PM: too many MSRs need to be saved.\n");
+		return -EINVAL;
+	}
+	if ((NULL != saved_context.msr_for_save.msr_array) ||
+	     0 != saved_context.msr_for_save.num) {
+		pr_err("PM: quirk already applied, please check your dmi match table.\n");
+		return -EINVAL;
+	}
+	for (i = 0; i < total_num; i++) {
+		msr_context_array[i].rv.msr_no = msr_id[i];
+		msr_context_array[i].msr_saved = false;
+		msr_context_array[i].rv.reg.q = 0;
+	}
+	saved_context.msr_for_save.num = total_num;
+	saved_context.msr_for_save.msr_array = msr_context_array;
+	return 0;
+}
+
+/*
+ * For any further problematic BIOS/platforms,
+ * please add your own function similar to msr_initialize_bdw.
+ */
+static int msr_initialize_bdw(const struct dmi_system_id *d)
+{
+	/* Add any extra MSR ids into this array. */
+	u32 bdw_msr_id[] = {MSR_IA32_THERM_CONTROL};
+
+	pr_info("PM: %s detected, MSR saving is needed during suspending.\n",
+		d->ident);
+	return msr_set_info(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
+}
+
+static struct dmi_system_id msr_save_dmi_table[] = {
+	{
+	 .callback = msr_initialize_bdw,
+	 .ident = "BROADWELL BDX_EP",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "Intel Corporation"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"),
+		DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"),
+		},
+	},
+	{}
+};
+
+static int pm_check_save_msr(void)
+{
+	dmi_check_system(msr_save_dmi_table);
+	return 0;
+}
+
+late_initcall(pm_check_save_msr);
-- 
1.8.4.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-08-27  3:18 [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend Chen Yu
@ 2015-09-17  5:30 ` Pavel Machek
  2015-10-09  9:39   ` Chen, Yu C
  2015-10-09 21:50 ` Rafael J. Wysocki
  1 sibling, 1 reply; 14+ messages in thread
From: Pavel Machek @ 2015-09-17  5:30 UTC (permalink / raw)
  To: Chen Yu; +Cc: rjw, tglx, mingo, hpa, bp, rui.zhang, linux-pm, x86, linux-kernel

On Thu 2015-08-27 11:18:27, Chen Yu wrote:
> A bug is reported(https://bugzilla.redhat.com/show_bug.cgi?id=1227208)
> that, after resumed from S3, CPU is running at a low speed.
> After investigation, it is found that, BIOS has modified the value
> of THERM_CONTROL register during S3, and changes it from 0 to 0x10,
> since value of 0x10 means CPU can only get 25% of the Duty Cycle,
> this triggers the problem.
> 
> Here is a simple scenario to reproduce the issue:
> 1.Boot up the system
> 2.Get MSR with address 0x19a, it should be 0
> 3.Put the system into sleep, then wake it up
> 4.Get MSR with address 0x19a, it should be 0(actually it shows 0x10)
> 
> Although this is a BIOS issue, it would be more robust for linux to deal
> with this situation. This patch fixes this issue by introducing a framework
> to save/restore specified MSR registers(THERM_CONTROL in this case)
> for suspend/resume.
> 
> When user encounters a problematic platform and needs to protect the
> MSRs during suspending, he can simply add a quirk entry in
> msr_save_dmi_table, and customizes MSR registers inside the quirk
> callback, for example:
> 
> u32 msr_id_need_to_save[] = {MSR_ID0, MSR_ID1, MSR_ID2...};
> 
> and the quirk mechanism ensures that, once resumed from suspended,
> the MSRs indicated by these IDs will be restored to their original values
> before suspended.
> 
> Since both 64/32-bit kernels are affected, this patch covers 64/32-bit
> common code path. And because the MSRs specified by the user might not
> be available or readable in any situation, we use rdmsrl_safe to safely
> save these MSRs.
> 
> Tested-by: Marcin Kaszewski <marcin.kaszewski@intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>

Acked-by: Pavel Machek <pavel@ucw.cz>

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-09-17  5:30 ` Pavel Machek
@ 2015-10-09  9:39   ` Chen, Yu C
  2015-10-09 18:55     ` Doug Smythies
  0 siblings, 1 reply; 14+ messages in thread
From: Chen, Yu C @ 2015-10-09  9:39 UTC (permalink / raw)
  To: Pavel Machek, Ingo Molnar
  Cc: Wysocki, Rafael J, tglx, hpa, bp, Zhang, Rui, linux-pm, x86,
	linux-kernel, Brown, Len

Thanks Pavel !

Hi, Ingo,
do you have a bandwidth to help look at this version, 
since this bug has been on bugzilla for a while.
Thanks a lot.

Best Regards,
Yu

> -----Original Message-----
> From: Pavel Machek [mailto:pavel@ucw.cz]
> Sent: Thursday, September 17, 2015 1:30 PM
> To: Chen, Yu C
> Cc: rjw@rjwysocki.net; tglx@linutronix.de; mingo@redhat.com;
> hpa@zytor.com; bp@alien8.de; Zhang, Rui; linux-pm@vger.kernel.org;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for
> suspend
> 
> On Thu 2015-08-27 11:18:27, Chen Yu wrote:
> > A bug is reported(https://bugzilla.redhat.com/show_bug.cgi?id=1227208)
> > that, after resumed from S3, CPU is running at a low speed.
> > After investigation, it is found that, BIOS has modified the value of
> > THERM_CONTROL register during S3, and changes it from 0 to 0x10, since
> > value of 0x10 means CPU can only get 25% of the Duty Cycle, this
> > triggers the problem.
> >
> > Here is a simple scenario to reproduce the issue:
> > 1.Boot up the system
> > 2.Get MSR with address 0x19a, it should be 0 3.Put the system into
> > sleep, then wake it up 4.Get MSR with address 0x19a, it should be
> > 0(actually it shows 0x10)
> >
> > Although this is a BIOS issue, it would be more robust for linux to
> > deal with this situation. This patch fixes this issue by introducing a
> > framework to save/restore specified MSR registers(THERM_CONTROL in
> > this case) for suspend/resume.
> >
> > When user encounters a problematic platform and needs to protect the
> > MSRs during suspending, he can simply add a quirk entry in
> > msr_save_dmi_table, and customizes MSR registers inside the quirk
> > callback, for example:
> >
> > u32 msr_id_need_to_save[] = {MSR_ID0, MSR_ID1, MSR_ID2...};
> >
> > and the quirk mechanism ensures that, once resumed from suspended,
> the
> > MSRs indicated by these IDs will be restored to their original values
> > before suspended.
> >
> > Since both 64/32-bit kernels are affected, this patch covers 64/32-bit
> > common code path. And because the MSRs specified by the user might not
> > be available or readable in any situation, we use rdmsrl_safe to
> > safely save these MSRs.
> >
> > Tested-by: Marcin Kaszewski <marcin.kaszewski@intel.com>
> > Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> 
> Acked-by: Pavel Machek <pavel@ucw.cz>
> 
> --


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-10-09  9:39   ` Chen, Yu C
@ 2015-10-09 18:55     ` Doug Smythies
  2015-10-11  2:26       ` Chen, Yu C
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Smythies @ 2015-10-09 18:55 UTC (permalink / raw)
  To: 'Chen, Yu C'
  Cc: 'Wysocki, Rafael J', tglx, hpa, bp, 'Zhang, Rui',
	linux-pm, x86, linux-kernel, 'Brown, Len',
	'Ingo Molnar', 'Pavel Machek',
	'Kristen Carlson Accardi'

Note: This reply is more for general information than anything else.

Since Yu's original e-mail I have searched all forums and bug reports
that I would find in an attempt to gain knowledge as to the extent
of Clock Modulation becoming enabled issues.

It is common, but particularly with some types of Dell LapTops
upon resume from suspend on battery power.

For the most part, users simply disable the intel_pstate driver, and
move on with their lives using the acpi-cpufreq driver. And once they
move on we never hear from 90% of them again, even though they agree
to do more tests and report back.

Recall my original reply to this thread, a portion copied below:

> The current version of the intel_pstate driver is incompatible with
> any use of Clock Modulation, always resulting in driving the target
> pstate to the minimum, regardless of load. The result is the apparent
> CPU frequency stuck at minimum * modulation percent.

> The acpi-cpufreq driver works fine with Clock Modulation, resulting in
> desired frequency * modulation percent.

For the most part users don't even notice the reduced performance
(which is typically 75 or 87.5 percent of normal) when using the
acpi-cpufreq driver.
 
Yu's specific example is special, because the Clock Modulation
percent is undefined (reserved), and thus clearly unintentional,
or just plain wrong if it is intentional.

Other comments in-line below:

> On 2015.10.07 02:39 Chen, Yu C wrote:
> On 2105.09.17 13:30 Pavel Machek wrote:
>> On Thu 2015-08-27 11:18:27, Chen Yu wrote:
>>> A bug is reported(https://bugzilla.redhat.com/show_bug.cgi?id=1227208)
>>> that, after resumed from S3, CPU is running at a low speed.
>>> After investigation, it is found that, BIOS has modified the value of
>>> THERM_CONTROL register during S3, and changes it from 0 to 0x10, since
>>> value of 0x10 means CPU can only get 25% of the Duty Cycle, this
>>> triggers the problem.

Note: THERM_CONTROL register = IA32_CLOCK_MODULATION

I suspect the outcome for an undefined Clock Modulation value of 0
is processor dependent. For example on my i7-2600K I get 50% duty cycle
if I manually write 0x10 to the register.

>>>
>>> Here is a simple scenario to reproduce the issue:
>>> 1.Boot up the system
>>> 2.Get MSR with address 0x19a, it should be 0

It doesn't have to be, and in some cases I found it isn't,
but Clock Modulation isn't enabled.
Only bit 4 matters.

>>> 3.Put the system into
>>> sleep, then wake it up 4.Get MSR with address 0x19a, it should be
>>> 0(actually it shows 0x10)

Again, it doesn't have to be 0, only bit 4 matters.

In the problem cases I found, typically the value is 0x1c or 0x1e,
for Clock Modulation percentages of 75% or 87.5%

However, thanks very much for the "how to" I used it a lot.

>>>
>>> Although this is a BIOS issue,

In your specific case, and since the register value is undefined yes.
In the resume from suspend on battery power case, it might
be intentional. (there has been no response from Dell on the Dell
forum where I asked.)

>>> it would be more robust for linux to
>>> deal with this situation. This patch fixes this issue by introducing a
>>> framework to save/restore specified MSR registers(THERM_CONTROL in
>>> this case) for suspend/resume.
>>>
>>> When user encounters a problematic platform and needs to protect the
>>> MSRs during suspending, he can simply add a quirk entry in
>>> msr_save_dmi_table, and customizes MSR registers inside the quirk
>>> callback, for example:

This might be hard to maintain and cause significant delays for actual end
user availability for these battery resume type cases.

Is there a point to be made here?:

Yes, I think the intel_pstate driver should be improved. Currently it is overly
sensitive to non-standard system perturbations, sometimes resulting in driving
down the CPU frequency, as in this case, and sometimes driving up the CPU frequency
unnecessarily. I am saying the driver doesn't pass sensitivity analysis well.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-08-27  3:18 [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend Chen Yu
  2015-09-17  5:30 ` Pavel Machek
@ 2015-10-09 21:50 ` Rafael J. Wysocki
  2015-10-11  2:43   ` Chen, Yu C
  1 sibling, 1 reply; 14+ messages in thread
From: Rafael J. Wysocki @ 2015-10-09 21:50 UTC (permalink / raw)
  To: Chen Yu
  Cc: pavel, tglx, mingo, hpa, bp, rui.zhang, linux-pm, x86, linux-kernel

On Thursday, August 27, 2015 11:18:27 AM Chen Yu wrote:
> A bug is reported(https://bugzilla.redhat.com/show_bug.cgi?id=1227208)
> that, after resumed from S3, CPU is running at a low speed.
> After investigation, it is found that, BIOS has modified the value
> of THERM_CONTROL register during S3, and changes it from 0 to 0x10,
> since value of 0x10 means CPU can only get 25% of the Duty Cycle,
> this triggers the problem.
> 
> Here is a simple scenario to reproduce the issue:
> 1.Boot up the system
> 2.Get MSR with address 0x19a, it should be 0
> 3.Put the system into sleep, then wake it up
> 4.Get MSR with address 0x19a, it should be 0(actually it shows 0x10)
> 
> Although this is a BIOS issue, it would be more robust for linux to deal
> with this situation. This patch fixes this issue by introducing a framework
> to save/restore specified MSR registers(THERM_CONTROL in this case)
> for suspend/resume.
> 
> When user encounters a problematic platform and needs to protect the
> MSRs during suspending, he can simply add a quirk entry in
> msr_save_dmi_table, and customizes MSR registers inside the quirk
> callback, for example:
> 
> u32 msr_id_need_to_save[] = {MSR_ID0, MSR_ID1, MSR_ID2...};
> 
> and the quirk mechanism ensures that, once resumed from suspended,
> the MSRs indicated by these IDs will be restored to their original values
> before suspended.
> 
> Since both 64/32-bit kernels are affected, this patch covers 64/32-bit
> common code path. And because the MSRs specified by the user might not
> be available or readable in any situation, we use rdmsrl_safe to safely
> save these MSRs.
> 
> Tested-by: Marcin Kaszewski <marcin.kaszewski@intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
> v4:
>  - Revert v3 to v2, and fix some typos in changelog/comments. 
>    Use msr_info structure instead of msr_id + msr_value.
>    Adjust some codes for better readability.
> v3:
>  - Simplify the patch to only focus on THERM_CONTROL register.
>    This will make things 'just work'.
> v2:
>  - Cover both 64/32-bit common code path.
>    Use rdmsrl_safe to safely read MSR.
>    Introduce a quirk framework for save/restore specified MSR on different
>    platforms.
> ---
>  arch/x86/include/asm/suspend_32.h | 11 +++++
>  arch/x86/include/asm/suspend_64.h | 11 +++++
>  arch/x86/power/cpu.c              | 99 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 121 insertions(+)
> 
> diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
> index d1793f0..240aaa8 100644
> --- a/arch/x86/include/asm/suspend_32.h
> +++ b/arch/x86/include/asm/suspend_32.h
> @@ -9,12 +9,23 @@
>  #include <asm/desc.h>
>  #include <asm/fpu/api.h>
>  
> +struct msr_type {

I'd call this msr_data.

> +	bool msr_saved;
> +	struct msr_info rv;
> +};
> +
> +struct saved_msr {

And this msr_context.

> +	unsigned short num;
> +	struct msr_type *msr_array;
> +};
> +
>  /* image of the saved processor state */
>  struct saved_context {
>  	u16 es, fs, gs, ss;
>  	unsigned long cr0, cr2, cr3, cr4;
>  	u64 misc_enable;
>  	bool misc_enable_saved;
> +	struct saved_msr msr_for_save;

"msr_to_save"?

>  	struct desc_ptr gdt_desc;
>  	struct desc_ptr idt;
>  	u16 ldt;
> diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
> index 7ebf0eb..40a7a00 100644
> --- a/arch/x86/include/asm/suspend_64.h
> +++ b/arch/x86/include/asm/suspend_64.h
> @@ -9,6 +9,16 @@
>  #include <asm/desc.h>
>  #include <asm/fpu/api.h>
>  
> +struct msr_type {
> +	bool msr_saved;
> +	struct msr_info rv;
> +};
> +
> +struct saved_msr {
> +	unsigned short num;
> +	struct msr_type *msr_array;
> +};

The definitions look the same as the previous ones.

Can we share them somehow?

> +
>  /*
>   * Image of the saved processor state, used by the low level ACPI suspend to
>   * RAM code and by the low level hibernation code.
> @@ -24,6 +34,7 @@ struct saved_context {
>  	unsigned long cr0, cr2, cr3, cr4, cr8;
>  	u64 misc_enable;
>  	bool misc_enable_saved;
> +	struct saved_msr msr_for_save;
>  	unsigned long efer;
>  	u16 gdt_pad; /* Unused */
>  	struct desc_ptr gdt_desc;
> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
> index 9ab5279..8442473 100644
> --- a/arch/x86/power/cpu.c
> +++ b/arch/x86/power/cpu.c
> @@ -23,6 +23,7 @@
>  #include <asm/debugreg.h>
>  #include <asm/cpu.h>
>  #include <asm/mmu_context.h>
> +#include <linux/dmi.h>
>  
>  #ifdef CONFIG_X86_32
>  __visible unsigned long saved_context_ebx;
> @@ -32,6 +33,30 @@ __visible unsigned long saved_context_eflags;
>  #endif
>  struct saved_context saved_context;
>  
> +static void msr_save_context(struct saved_context *ctxt)
> +{
> +	int i = 0;
> +
> +	for (i = 0; i < ctxt->msr_for_save.num; i++) {
> +		struct msr_type *msr = &ctxt->msr_for_save.msr_array[i];
> +
> +		msr->msr_saved = !rdmsrl_safe(msr->rv.msr_no,
> +			&msr->rv.reg.q);
> +	}

If you did something like

	struct msr_type *msr = ctxt->msr_for_save.msr_array;
	struct msr_type *end = msr + ctxt->msr_for_save.num;

	while (msr < end) {
		msr->msr_saved = !rdmsrl_safe(msr->rv.msr_no, &msr->rv.reg.q);
		msr++;
	}

here (and analogously below), it would be somewhat easier to follow IMO.

> +}
> +
> +static void msr_restore_context(struct saved_context *ctxt)
> +{
> +	int i = 0;
> +
> +	for (i = 0; i < ctxt->msr_for_save.num; i++) {
> +		struct msr_type *msr = &ctxt->msr_for_save.msr_array[i];
> +
> +		if (msr->msr_saved)
> +			wrmsrl(msr->rv.msr_no, msr->rv.reg.q);
> +	}
> +}
> +
>  /**
>   *	__save_processor_state - save CPU registers before creating a
>   *		hibernation image and before restoring the memory state from it
> @@ -111,6 +136,7 @@ static void __save_processor_state(struct saved_context *ctxt)
>  #endif
>  	ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
>  					       &ctxt->misc_enable);
> +	msr_save_context(ctxt);
>  }
>  
>  /* Needed by apm.c */
> @@ -229,6 +255,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
>  	x86_platform.restore_sched_clock_state();
>  	mtrr_bp_restore();
>  	perf_restore_debug_store();
> +	msr_restore_context(ctxt);
>  }
>  
>  /* Needed by apm.c */
> @@ -320,3 +347,75 @@ static int __init bsp_pm_check_init(void)
>  }
>  
>  core_initcall(bsp_pm_check_init);
> +
> +/* We constrain the number of MSRs to 64. */

Why 64 in particular?

> +#define MAX_MSR_SAVED	64
> +
> +static struct msr_type msr_context_array[MAX_MSR_SAVED];

I wonder if this array may be allocated dynamically?

We'll waste memory here in the majority of cases.

> +
> +/*
> + * Following section is a quirk framework for problematic BIOS:

"The following ..."

> + * Sometimes MSRs are modified by BIOS after suspended to
> + * ram, this might cause unexpected behavior after resumed.

"RAM" (in capitals) and "during resume" or "after wakeup".

> + * Thus we save/restore these specified MSRs during suspending
> + * in order to work around it.
> + * A typical bug is reported at:
> + * https://bugzilla.redhat.com/show_bug.cgi?id=1227208
> + */
> +static int msr_set_info(const u32 *msr_id, const int total_num)

I'd call it "msr_init_context" or something like that.

> +{
> +	int i = 0;
> +
> +	if (total_num > MAX_MSR_SAVED) {
> +		pr_err("PM: too many MSRs need to be saved.\n");
> +		return -EINVAL;
> +	}
> +	if ((NULL != saved_context.msr_for_save.msr_array) ||

if (saved_context.msr_for_save.msr_array || saved_context.msr_for_save.num > 0) {

> +	     0 != saved_context.msr_for_save.num) {
> +		pr_err("PM: quirk already applied, please check your dmi match table.\n");
> +		return -EINVAL;
> +	}
> +	for (i = 0; i < total_num; i++) {
> +		msr_context_array[i].rv.msr_no = msr_id[i];
> +		msr_context_array[i].msr_saved = false;
> +		msr_context_array[i].rv.reg.q = 0;
> +	}
> +	saved_context.msr_for_save.num = total_num;
> +	saved_context.msr_for_save.msr_array = msr_context_array;
> +	return 0;
> +}
> +
> +/*
> + * For any further problematic BIOS/platforms,
> + * please add your own function similar to msr_initialize_bdw.
> + */
> +static int msr_initialize_bdw(const struct dmi_system_id *d)
> +{
> +	/* Add any extra MSR ids into this array. */
> +	u32 bdw_msr_id[] = {MSR_IA32_THERM_CONTROL};
> +
> +	pr_info("PM: %s detected, MSR saving is needed during suspending.\n",
> +		d->ident);
> +	return msr_set_info(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
> +}
> +
> +static struct dmi_system_id msr_save_dmi_table[] = {
> +	{
> +	 .callback = msr_initialize_bdw,
> +	 .ident = "BROADWELL BDX_EP",
> +	 .matches = {
> +		DMI_MATCH(DMI_SYS_VENDOR, "Intel Corporation"),
> +		DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"),
> +		DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"),
> +		},
> +	},
> +	{}
> +};
> +
> +static int pm_check_save_msr(void)
> +{
> +	dmi_check_system(msr_save_dmi_table);
> +	return 0;
> +}
> +
> +late_initcall(pm_check_save_msr);

Thanks,
Rafael


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-10-09 18:55     ` Doug Smythies
@ 2015-10-11  2:26       ` Chen, Yu C
  2015-10-11 15:46         ` Doug Smythies
  0 siblings, 1 reply; 14+ messages in thread
From: Chen, Yu C @ 2015-10-11  2:26 UTC (permalink / raw)
  To: Doug Smythies
  Cc: Wysocki, Rafael J, tglx, hpa, bp, Zhang, Rui, linux-pm, x86,
	linux-kernel, Brown, Len, 'Ingo Molnar',
	'Pavel Machek', 'Kristen Carlson Accardi'

Hi, Doug, thanks for your comment,

> -----Original Message-----
> From: Doug Smythies [mailto:dsmythies@telus.net]
> Sent: Saturday, October 10, 2015 2:56 AM
> To: Chen, Yu C
> Cc: Wysocki, Rafael J; tglx@linutronix.de; hpa@zytor.com; bp@alien8.de;
> Zhang, Rui; linux-pm@vger.kernel.org; x86@kernel.org; linux-
> kernel@vger.kernel.org; Brown, Len; 'Ingo Molnar'; 'Pavel Machek'; 'Kristen
> Carlson Accardi'
> Subject: RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for
> suspend
> 
> Note: This reply is more for general information than anything else.
> 
> Since Yu's original e-mail I have searched all forums and bug reports that I
> would find in an attempt to gain knowledge as to the extent of Clock
> Modulation becoming enabled issues.
> 
> It is common, but particularly with some types of Dell LapTops upon resume
> from suspend on battery power.
> 
> For the most part, users simply disable the intel_pstate driver, and move on
> with their lives using the acpi-cpufreq driver. And once they move on we
> never hear from 90% of them again, even though they agree to do more
> tests and report back.
> 
> Recall my original reply to this thread, a portion copied below:
> 
> > The current version of the intel_pstate driver is incompatible with
> > any use of Clock Modulation, always resulting in driving the target
> > pstate to the minimum, regardless of load. The result is the apparent
> > CPU frequency stuck at minimum * modulation percent.
> 
> > The acpi-cpufreq driver works fine with Clock Modulation, resulting in
> > desired frequency * modulation percent.
> 

[Yu] Why intel_pstate driver is incompatible with Clock Modulation? 
I search the intel_pstate driver code, but can not find any operation that
controls the value of MSR_IA32_THERM_CONTROL(0x19a),
but other components such as acpi processor throttling may be involved in.
Is there any bugzilla thread tracking this problem?

> For the most part users don't even notice the reduced performance (which is
> typically 75 or 87.5 percent of normal) when using the acpi-cpufreq driver.
> 
> Yu's specific example is special, because the Clock Modulation percent is
> undefined (reserved), and thus clearly unintentional, or just plain wrong if it
> is intentional.
> 
> Other comments in-line below:
> 
> > On 2015.10.07 02:39 Chen, Yu C wrote:
> > On 2105.09.17 13:30 Pavel Machek wrote:
> >> On Thu 2015-08-27 11:18:27, Chen Yu wrote:
> >>> A bug is
> >>> reported(https://bugzilla.redhat.com/show_bug.cgi?id=1227208)
> >>> that, after resumed from S3, CPU is running at a low speed.
> >>> After investigation, it is found that, BIOS has modified the value
> >>> of THERM_CONTROL register during S3, and changes it from 0 to 0x10,
> >>> since value of 0x10 means CPU can only get 25% of the Duty Cycle,
> >>> this triggers the problem.
> 
> Note: THERM_CONTROL register = IA32_CLOCK_MODULATION
> 
> I suspect the outcome for an undefined Clock Modulation value of 0 is
> processor dependent. For example on my i7-2600K I get 50% duty cycle if I
> manually write 0x10 to the register.
> 
BTW, how do you get the number of 50%?

> >>>
> >>> Here is a simple scenario to reproduce the issue:
> >>> 1.Boot up the system
> >>> 2.Get MSR with address 0x19a, it should be 0
> 
> It doesn't have to be, and in some cases I found it isn't, but Clock Modulation
> isn't enabled.
> Only bit 4 matters.
> 
You are right, I should add some descriptions that, this situation only makes sense 
for  the case reported at that bugzilla thread. 

> >>> 3.Put the system into
> >>> sleep, then wake it up 4.Get MSR with address 0x19a, it should be
> >>> 0(actually it shows 0x10)
> 
> Again, it doesn't have to be 0, only bit 4 matters.
> 
> In the problem cases I found, typically the value is 0x1c or 0x1e, for Clock
> Modulation percentages of 75% or 87.5%
> 
> However, thanks very much for the "how to" I used it a lot.
> 
> >>>
> >>> Although this is a BIOS issue,
> 
> In your specific case, and since the register value is undefined yes.
> In the resume from suspend on battery power case, it might be intentional.
> (there has been no response from Dell on the Dell forum where I asked.)
> 
Do you mean, BIOS would modify this value intentionally(before wakeup)  if BIOS found 
the battey power is too low?

> >>> it would be more robust for linux to deal with this situation. This
> >>> patch fixes this issue by introducing a framework to save/restore
> >>> specified MSR registers(THERM_CONTROL in this case) for
> >>> suspend/resume.
> >>>
> >>> When user encounters a problematic platform and needs to protect the
> >>> MSRs during suspending, he can simply add a quirk entry in
> >>> msr_save_dmi_table, and customizes MSR registers inside the quirk
> >>> callback, for example:
> 
> This might be hard to maintain and cause significant delays for actual end
> user availability for these battery resume type cases.
> 
> Is there a point to be made here?:
Well, I think this is why quirk is introduced here, if MSR_IA32_THERM_CONTROL is
modified by BIOS for battery resume cases, this platform should not add his quirk
in this framework.
And can you please provide bugzilla thread related to this battery resume case ?

> 
> Yes, I think the intel_pstate driver should be improved. Currently it is overly
> sensitive to non-standard system perturbations, sometimes resulting in
> driving down the CPU frequency, as in this case, and sometimes driving up
> the CPU frequency unnecessarily. I am saying the driver doesn't pass
> sensitivity analysis well.
> 
I guess the intel_pstate driver is another problem, or do I miss anything?

Thanks!

Yu


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-10-09 21:50 ` Rafael J. Wysocki
@ 2015-10-11  2:43   ` Chen, Yu C
  0 siblings, 0 replies; 14+ messages in thread
From: Chen, Yu C @ 2015-10-11  2:43 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: pavel, tglx, mingo, hpa, bp, Zhang, Rui, linux-pm, x86, linux-kernel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 3504 bytes --]

Hi Rafael, thanks for your comment,

> -----Original Message-----
> From: Rafael J. Wysocki [mailto:rjw@rjwysocki.net]
> Sent: Saturday, October 10, 2015 5:50 AM
> To: Chen, Yu C
> Cc: pavel@ucw.cz; tglx@linutronix.de; mingo@redhat.com; hpa@zytor.com;
> bp@alien8.de; Zhang, Rui; linux-pm@vger.kernel.org; x86@kernel.org; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for
> suspend
> 
> On Thursday, August 27, 2015 11:18:27 AM Chen Yu wrote:
> >
> > +struct msr_type {
> 
> I'd call this msr_data.
> 
OK, will change its name. But msr_data is already defined
in kvm_host.h , so I changed it to msr_save_data in the new patch version.

> > +	bool msr_saved;
> > +	struct msr_info rv;
> > +};
> > +
> > +struct saved_msr {
> 
> And this msr_context.
> 
OK, will do.
> > +	unsigned short num;
> > +	struct msr_type *msr_array;
> > +};
> > +
> >  /* image of the saved processor state */  struct saved_context {
> >  	u16 es, fs, gs, ss;
> >  	unsigned long cr0, cr2, cr3, cr4;
> >  	u64 misc_enable;
> >  	bool misc_enable_saved;
> > +	struct saved_msr msr_for_save;
> 
> "msr_to_save"?
> 
OK, will change.

> > +struct msr_type {
> > +	bool msr_saved;
> > +	struct msr_info rv;
> > +};
> > +
> > +struct saved_msr {
> > +	unsigned short num;
> > +	struct msr_type *msr_array;
> > +};
> 
> The definitions look the same as the previous ones.
> 
> Can we share them somehow?
> 
Yes, I've moved the common code to arch/x86/include/asm/msr.h.
> > +static void msr_save_context(struct saved_context *ctxt) {
> > +	int i = 0;
> > +
> > +	for (i = 0; i < ctxt->msr_for_save.num; i++) {
> > +		struct msr_type *msr = &ctxt->msr_for_save.msr_array[i];
> > +
> > +		msr->msr_saved = !rdmsrl_safe(msr->rv.msr_no,
> > +			&msr->rv.reg.q);
> > +	}
> 
> If you did something like
> 
> 	struct msr_type *msr = ctxt->msr_for_save.msr_array;
> 	struct msr_type *end = msr + ctxt->msr_for_save.num;
> 
> 	while (msr < end) {
> 		msr->msr_saved = !rdmsrl_safe(msr->rv.msr_no, &msr-
> >rv.reg.q);
> 		msr++;
> 	}
> 
> here (and analogously below), it would be somewhat easier to follow IMO.
> 
OK, changed the code.

> > +/* We constrain the number of MSRs to 64. */
> 
> Why 64 in particular?
> 
Once I looked up the SDM and estimate that 64 should be enough for some important
MSR registers. 
> > +#define MAX_MSR_SAVED	64
> > +
> > +static struct msr_type msr_context_array[MAX_MSR_SAVED];
> 
> I wonder if this array may be allocated dynamically?
> 
> We'll waste memory here in the majority of cases.
> 
OK, I've changed it to use kmalloc_array for storing.

> > +
> > +/*
> > + * Following section is a quirk framework for problematic BIOS:
> 
> "The following ..."
> 
OK.
> > + * Sometimes MSRs are modified by BIOS after suspended to
> > + * ram, this might cause unexpected behavior after resumed.
> 
> "RAM" (in capitals) and "during resume" or "after wakeup".
> 
OK. will do.

> > + * Thus we save/restore these specified MSRs during suspending
> > + * in order to work around it.
> > + * A typical bug is reported at:
> > + * https://bugzilla.redhat.com/show_bug.cgi?id=1227208
> > + */
> > +static int msr_set_info(const u32 *msr_id, const int total_num)
> 
> I'd call it "msr_init_context" or something like that.
> 
OK, changed it to msr_init_context.
> Thanks,
> Rafael

Best Regards,
Yu
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-10-11  2:26       ` Chen, Yu C
@ 2015-10-11 15:46         ` Doug Smythies
  2015-11-01 16:49           ` Chen, Yu C
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Smythies @ 2015-10-11 15:46 UTC (permalink / raw)
  To: 'Chen, Yu C'
  Cc: 'Wysocki, Rafael J', tglx, hpa, bp, 'Zhang, Rui',
	linux-pm, x86, linux-kernel, 'Brown, Len',
	'Ingo Molnar', 'Pavel Machek',
	'Kristen Carlson Accardi',
	Doug Smythies

Hi Yu, thanks for your reply.

On 2015.10.10 19:27 Chen, Yu C wrote:
> On 2105.10.10 02:56 Doug Smythies wrote:
> 
>>> The current version of the intel_pstate driver is incompatible with
>>> any use of Clock Modulation, always resulting in driving the target
>>> pstate to the minimum, regardless of load. The result is the apparent
>>> CPU frequency stuck at minimum * modulation percent.
>> 
>>> The acpi-cpufreq driver works fine with Clock Modulation, resulting in
>>> desired frequency * modulation percent.
>> 

> [Yu] Why intel_pstate driver is incompatible with Clock Modulation?

It is simply how the current control algorithm responds to the scenario.

The problem is in intel_pstate_get_scaled_busy, here:

        /*
         * core_busy is the ratio of actual performance to max
         * max_pstate is the max non turbo pstate available
         * current_pstate was the pstate that was requested during
         *      the last sample period.
         *
         * We normalize core_busy, which was our actual percent
         * performance to what we requested during the last sample
         * period. The result will be a percentage of busy at a
         * specified pstate.
         */
        core_busy = cpu->sample.core_pct_busy;
        max_pstate = int_tofp(cpu->pstate.max_pstate);
        current_pstate = int_tofp(cpu->pstate.current_pstate);
        core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));

With Clock Modulation enabled, the actual performance percent will always
be less than what was asked for, basically meaning current_pstate is much less
than what was asked for. Thus the algorithm will drive down the
target pstate regardless of load.

Readers that doubt, should just try it. Here is an example:

. Processor: Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz
. Max pstate: 38 (turbo enabled)
. Min pstate: 16
. Kernel: unmodified 4.3-rc4, freshly booted.
. Never suspended or anything, as there are currently undesired side effects.
. Clock Modulation is set to 87.5% (the maximum duty cycle available for my processor).
. All CPU are loaded about 95%.

Test 1:
. Scaling driver: intel_pstate
. scaling governor: powersave.

. setup: enable Clock Modulation at 87.5% and check it:

# wrmsr -a 0x19a 0x1e
# rdmsr -a 0x19a
1e
1e
1e
1e
1e
1e
1e
1e

. show the load:
# uptime
 21:05:02 up 16 min,  3 users,  load average: 8.00, 3.69, 1.46

. Show the CPU Frequencies:
# grep MHz /proc/cpuinfo
cpu MHz         : 1399.976
cpu MHz         : 1399.976
cpu MHz         : 1399.976
cpu MHz         : 1399.976
cpu MHz         : 1399.976
cpu MHz         : 1399.976
cpu MHz         : 1399.976
cpu MHz         : 1399.976

. The frequencies = minimum * 0.875 = 1600 * 0.875 = 1400.

Test 2:

. Scaling driver: acpi_cpufreq
. scaling governor: ondemand.

. setup: enable Clock Modulation at 87.5% and check it:

# wrmsr -a 0x19a 0x1e
# rdmsr -a 0x19a
1e
1e
1e
1e
1e
1e
1e
1e

. show the load:
# uptime
 23:54:20 up 6 min,  3 users,  load average: 12.47, 5.46, 2.13

. Show the CPU Frequencies (must use turbostat in this case):

 Note: this comes into play: "35 * 100 = 3500 MHz max turbo 4 active cores"

# ./turbostat sleep 10
     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz
       -    2922   95.23    3068    3412
       0    2988   97.20    3074    3414
       4    2908   94.76    3069    3413
       1    2916   95.04    3068    3412
       5    2909   94.85    3067    3411
       2    2920   95.20    3067    3411
       6    2915   95.04    3067    3411
       3    2915   95.06    3067    3411
       7    2906   94.73    3067    3411
10.012572 sec

. The frequencies = maximum * 0.875 = 3500 * 0.875 = 3063.

> I search the intel_pstate driver code, but can not find any operation that
> controls the value of MSR_IA32_THERM_CONTROL(0x19a),

No, there isn't. I was just saying that if, for whatever reason, Clock
Modulation becomes enabled the intel_pstate driver won't work properly with it,
but the acpi-cpufreq driver does.

> but other components such as acpi processor throttling may be involved in.
> Is there any bugzilla thread tracking this problem?

Not specifically, no (that I know of). I have been communicating off-list
with Kristen about it, and she looped in Srinivas Pandruvada one time.

>> I suspect the outcome for an undefined Clock Modulation value of 0 is
>> processor dependent. For example on my i7-2600K I get 50% duty cycle if I
>> manually write 0x10 to the register.
>> 
>BTW, how do you get the number of 50%?

I tested both the intel_pstate and acpi-cpufreq driver with every possible
value of Clock Modulation percent, including the undefined or reserved value.
For the undefined value here is the work:

. Step 1: Enable Clock Modulation at the undefined value and check it:
# wrmsr -a 0x19a 0x10
# rdmsr -a 0x19a
10
10
10
10
10
10
10
10

. Step 2: Observe the resulting CPU frequencies (still using acpi-cpufreq):
# ./turbostat sleep 10
     CPU Avg_MHz   %Busy Bzy_MHz TSC_MHz
       -    1811   95.69    1893    3411
       0    1825   96.48    1892    3410
       4    1816   95.99    1892    3410
       1    1795   94.87    1892    3411
       5    1817   95.96    1893    3411
       2    1816   95.93    1893    3411
       6    1800   95.08    1893    3411
       3    1803   95.25    1893    3411
       7    1817   96.00    1893    3411

. Step 3: Calculate the Clock modulation percentage:
Mod_pct = 1893 / 3500 = 54% (I got 50.05% last time)

. Step 4: Do it another way, observe the registers directly:
(I don't know if this method is actually valid)
0x198 IA32_PRF_STATUS
0x199 IA32_PERF_CTL

. Step 4.1: What is asked for?
# rdmsr --bitfield 15:8 -d -a 0x199
16
16
16
16
16
16
16
16
So pstate 16 is being asked for on all CPUs.

. Step 4.2: What is actually being provided?
# rdmsr --bitfield 15:8 -d -a 0x198
8
8
8
8
8
8
8
8
So 50% of what was asked for is provided.

>>>>
>>>> Although this is a BIOS issue,
>> 
>> In your specific case, and since the register value is undefined yes.
>> In the resume from suspend on battery power case, it might be intentional.
>> (there has been no response from Dell on the Dell forum where I asked.)
>> 
> Do you mean, BIOS would modify this value intentionally(before wakeup)
> if BIOS found the battery power is too low?

I do not know for certain, but that is how it appears. I don't know if
it is battery level dependant.
Recall we also talked off-list one time about the possibility that
nobody, not BIOS or Linux, initializes the register upon resume and the
CPU merely powered with Clock Modulation enabled. You thought no, the register
initializes to 0 on resume from S3 suspend.

>>>> it would be more robust for linux to deal with this situation. This
>>>> patch fixes this issue by introducing a framework to save/restore
>>>> specified MSR registers(THERM_CONTROL in this case) for
>>>> suspend/resume.
>>>>
>>>> When user encounters a problematic platform and needs to protect the
>>>> MSRs during suspending, he can simply add a quirk entry in
>>>> msr_save_dmi_table, and customizes MSR registers inside the quirk
>>>> callback, for example:
>> 
>> This might be hard to maintain and cause significant delays for actual end
>> user availability for these battery resume type cases.
>> 
>> Is there a point to be made here?:
> Well, I think this is why quirk is introduced here, if MSR_IA32_THERM_CONTROL is
> modified by BIOS for battery resume cases, this platform should not add his quirk
> in this framework.
> And can you please provide bugzilla thread related to this battery resume case ?

This is the Dell one:
http://en.community.dell.com/support-forums/laptop/f/3518/t/19634759?pi239031352=1#20824558

This is an Arch Linux one:
https://bbs.archlinux.org/viewtopic.php?id=199922

This issue sometimes comes into play on some otherwise unrelated bug reports. For example:
https://bugzilla.kernel.org/show_bug.cgi?id=80651
starting at comment #24

There are others.

>> 
>> Yes, I think the intel_pstate driver should be improved. Currently it is overly
>> sensitive to non-standard system perturbations, sometimes resulting in
>> driving down the CPU frequency, as in this case, and sometimes driving up
>> the CPU frequency unnecessarily. I am saying the driver doesn't pass
>> sensitivity analysis well.
>> 
> I guess the intel_pstate driver is another problem, or do I miss anything?

You don't miss anything.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-10-11 15:46         ` Doug Smythies
@ 2015-11-01 16:49           ` Chen, Yu C
  2015-11-06 15:33             ` Doug Smythies
  0 siblings, 1 reply; 14+ messages in thread
From: Chen, Yu C @ 2015-11-01 16:49 UTC (permalink / raw)
  To: Doug Smythies
  Cc: Wysocki, Rafael J, tglx, hpa, bp, Zhang, Rui, linux-pm, x86,
	linux-kernel, Brown, Len, 'Ingo Molnar',
	'Pavel Machek', 'Kristen Carlson Accardi',
	Pandruvada, Srinivas

Hi, Doug,
Sorry for my late response, and thanks for your information:

> -----Original Message-----
> From: Doug Smythies [mailto:dsmythies@telus.net]
> Sent: Sunday, October 11, 2015 11:47 PM
> To: Chen, Yu C
> Cc: Wysocki, Rafael J; tglx@linutronix.de; hpa@zytor.com; bp@alien8.de;
> Zhang, Rui; linux-pm@vger.kernel.org; x86@kernel.org; linux-
> kernel@vger.kernel.org; Brown, Len; 'Ingo Molnar'; 'Pavel Machek'; 'Kristen
> Carlson Accardi'; Doug Smythies
> Subject: RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for
> suspend
> 
> Hi Yu, thanks for your reply.
> 
> On 2015.10.10 19:27 Chen, Yu C wrote:
> > On 2105.10.10 02:56 Doug Smythies wrote:
> >
> >>> The current version of the intel_pstate driver is incompatible with
> >>> any use of Clock Modulation, always resulting in driving the target
> >>> pstate to the minimum, regardless of load. The result is the
> >>> apparent CPU frequency stuck at minimum * modulation percent.
> >>
> >>> The acpi-cpufreq driver works fine with Clock Modulation, resulting
> >>> in desired frequency * modulation percent.
> >>
> 
> > [Yu] Why intel_pstate driver is incompatible with Clock Modulation?
> 
> It is simply how the current control algorithm responds to the scenario.
> 
> The problem is in intel_pstate_get_scaled_busy, here:
> 
>         /*
>          * core_busy is the ratio of actual performance to max
>          * max_pstate is the max non turbo pstate available
>          * current_pstate was the pstate that was requested during
>          *      the last sample period.
>          *
>          * We normalize core_busy, which was our actual percent
>          * performance to what we requested during the last sample
>          * period. The result will be a percentage of busy at a
>          * specified pstate.
>          */
>         core_busy = cpu->sample.core_pct_busy;
>         max_pstate = int_tofp(cpu->pstate.max_pstate);
>         current_pstate = int_tofp(cpu->pstate.current_pstate);
>         core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
> 
> With Clock Modulation enabled, the actual performance percent will always
> be less than what was asked for, basically meaning current_pstate is much
> less than what was asked for. Thus the algorithm will drive down the target
> pstate regardless of load.
> 
[Yu] Do you mean, there is some problem with the normalization,and we should use 
the actual pstate rather than the theoretical current_pstate, for example, 
the pseudocode might looked like:

-  current_pstate = int_tofp(cpu->pstate.current_pstate);
+ current_pstate = int_tofp(cpu->pstate.current_pstat)*0.85;

Best Regards,
Yu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-11-01 16:49           ` Chen, Yu C
@ 2015-11-06 15:33             ` Doug Smythies
  2015-11-12  9:42               ` Chen, Yu C
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Smythies @ 2015-11-06 15:33 UTC (permalink / raw)
  To: 'Chen, Yu C'
  Cc: 'Wysocki, Rafael J', tglx, hpa, bp, 'Zhang, Rui',
	linux-pm, x86, linux-kernel, 'Brown, Len',
	'Ingo Molnar', 'Pavel Machek',
	'Kristen Carlson Accardi', 'Pandruvada, Srinivas'


On 2015.11.01 08:50 Chen, Yu C wrote:
>> On 2015.10.10 19:27 Chen, Yu C wrote:
>>> On 2105.10.10 02:56 Doug Smythies wrote:
>>>
>>>>> The current version of the intel_pstate driver is incompatible with
>>>>> any use of Clock Modulation, always resulting in driving the target
>>>>> pstate to the minimum, regardless of load. The result is the
>>>>> apparent CPU frequency stuck at minimum * modulation percent.
>>>>
>>>>> The acpi-cpufreq driver works fine with Clock Modulation, resulting
>>>>> in desired frequency * modulation percent.
>>>>
>>
>>> [Yu] Why intel_pstate driver is incompatible with Clock Modulation?
>> 
>> It is simply how the current control algorithm responds to the scenario.
>> 
>> The problem is in intel_pstate_get_scaled_busy, here:
>> 
>>         /*
>>          * core_busy is the ratio of actual performance to max
>>          * max_pstate is the max non turbo pstate available
>>          * current_pstate was the pstate that was requested during
>>          *      the last sample period.
>>          *
>>          * We normalize core_busy, which was our actual percent
>>          * performance to what we requested during the last sample
>>          * period. The result will be a percentage of busy at a
>>          * specified pstate.
>>          */
>>         core_busy = cpu->sample.core_pct_busy;
>>         max_pstate = int_tofp(cpu->pstate.max_pstate);
>>         current_pstate = int_tofp(cpu->pstate.current_pstate);
>>         core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
>> 
>> With Clock Modulation enabled, the actual performance percent will always
>> be less than what was asked for, basically meaning current_pstate is much
>> less than what was asked for. Thus the algorithm will drive down the target
>> pstate regardless of load.
>> 
> [Yu] Do you mean, there is some problem with the normalization,and we should use 
> the actual pstate rather than the theoretical current_pstate, for example, 
> the pseudocode might looked like:
> 
> -  current_pstate = int_tofp(cpu->pstate.current_pstate);
> + current_pstate = int_tofp(cpu->pstate.current_pstat)*0.85;

I did not think of normalizing / compensating at this point.
That is a good idea.
Just for a test, I tried it and it seems to work well.
Before normalizing / compensating core_busy can be quite a small
for lesser clock modulation duty cycles, and so becomes a little
noisy afterwards. 

For my test, on an otherwise unaltered kernel v4.3 I did this:

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index aa33b92..97a90e1 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -821,6 +821,7 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
        int32_t core_busy, max_pstate, current_pstate, sample_ratio;
        s64 duration_us;
        u32 sample_time;
+       u64 clock_modulation;

        /*
         * core_busy is the ratio of actual performance to max
@@ -836,6 +837,17 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
        core_busy = cpu->sample.core_pct_busy;
        max_pstate = int_tofp(cpu->pstate.max_pstate);
        current_pstate = int_tofp(cpu->pstate.current_pstate);
+
+//     rdmsrl(MSR_IA32_CLOCK_MODULATION, clock_modulation);
+       rdmsrl(MSR_IA32_THERM_CONTROL, clock_modulation);
+       if(clock_modulation && 0X10) {
+               clock_modulation = clock_modulation & 0x0F;
+               if(clock_modulation == 0) clock_modulation = 8;
+               core_busy = mul_fp(core_busy, int_tofp(0x10));
+               core_busy = div_fp(core_busy, int_tofp(clock_modulation));
+       }
+
        core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));

        /*





^ permalink raw reply related	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-11-06 15:33             ` Doug Smythies
@ 2015-11-12  9:42               ` Chen, Yu C
  2015-11-21 16:45                 ` Doug Smythies
  0 siblings, 1 reply; 14+ messages in thread
From: Chen, Yu C @ 2015-11-12  9:42 UTC (permalink / raw)
  To: Doug Smythies
  Cc: Wysocki, Rafael J, tglx, hpa, bp, Zhang, Rui, linux-pm, x86,
	linux-kernel, Brown, Len, 'Ingo Molnar',
	'Pavel Machek', 'Kristen Carlson Accardi',
	Pandruvada, Srinivas

Hi, 
> -----Original Message-----
> From: Doug Smythies [mailto:dsmythies@telus.net]
> Sent: Friday, November 06, 2015 11:34 PM
> To: Chen, Yu C
> Cc: Wysocki, Rafael J; tglx@linutronix.de; hpa@zytor.com; bp@alien8.de;
> Zhang, Rui; linux-pm@vger.kernel.org; x86@kernel.org; linux-
> kernel@vger.kernel.org; Brown, Len; 'Ingo Molnar'; 'Pavel Machek'; 'Kristen
> Carlson Accardi'; Pandruvada, Srinivas
> Subject: RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for
> suspend
> 
> 
> On 2015.11.01 08:50 Chen, Yu C wrote:
> >> On 2015.10.10 19:27 Chen, Yu C wrote:
> >>> On 2105.10.10 02:56 Doug Smythies wrote:
> >>>
> >>>>> The current version of the intel_pstate driver is incompatible
> >>>>> with any use of Clock Modulation, always resulting in driving the
> >>>>> target pstate to the minimum, regardless of load. The result is
> >>>>> the apparent CPU frequency stuck at minimum * modulation percent.
> >>>>
> >>>>> The acpi-cpufreq driver works fine with Clock Modulation,
> >>>>> resulting in desired frequency * modulation percent.
> >>>>
> >>
> >>> [Yu] Why intel_pstate driver is incompatible with Clock Modulation?
> >>
> >> It is simply how the current control algorithm responds to the scenario.
> >>
> >> The problem is in intel_pstate_get_scaled_busy, here:
> >>
> >>         /*
> >>          * core_busy is the ratio of actual performance to max
> >>          * max_pstate is the max non turbo pstate available
> >>          * current_pstate was the pstate that was requested during
> >>          *      the last sample period.
> >>          *
> >>          * We normalize core_busy, which was our actual percent
> >>          * performance to what we requested during the last sample
> >>          * period. The result will be a percentage of busy at a
> >>          * specified pstate.
> >>          */
> >>         core_busy = cpu->sample.core_pct_busy;
> >>         max_pstate = int_tofp(cpu->pstate.max_pstate);
> >>         current_pstate = int_tofp(cpu->pstate.current_pstate);
> >>         core_busy = mul_fp(core_busy, div_fp(max_pstate,
> >> current_pstate));
> >>
> >> With Clock Modulation enabled, the actual performance percent will
> >> always be less than what was asked for, basically meaning
> >> current_pstate is much less than what was asked for. Thus the
> >> algorithm will drive down the target pstate regardless of load.
> >>
> > [Yu] Do you mean, there is some problem with the normalization,and we
> > should use the actual pstate rather than the theoretical
> > current_pstate, for example, the pseudocode might looked like:
> >
> > -  current_pstate = int_tofp(cpu->pstate.current_pstate);
> > + current_pstate = int_tofp(cpu->pstate.current_pstat)*0.85;
> 
> I did not think of normalizing / compensating at this point.
> That is a good idea.
> Just for a test, I tried it and it seems to work well.
> Before normalizing / compensating core_busy can be quite a small for lesser
> clock modulation duty cycles, and so becomes a little noisy afterwards.
> 
> For my test, on an otherwise unaltered kernel v4.3 I did this:
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index aa33b92..97a90e1 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -821,6 +821,7 @@ static inline int32_t
> intel_pstate_get_scaled_busy(struct cpudata *cpu)
>         int32_t core_busy, max_pstate, current_pstate, sample_ratio;
>         s64 duration_us;
>         u32 sample_time;
> +       u64 clock_modulation;
> 
>         /*
>          * core_busy is the ratio of actual performance to max @@ -836,6
> +837,17 @@ static inline int32_t intel_pstate_get_scaled_busy(struct
> cpudata *cpu)
>         core_busy = cpu->sample.core_pct_busy;
>         max_pstate = int_tofp(cpu->pstate.max_pstate);
>         current_pstate = int_tofp(cpu->pstate.current_pstate);
> +
> +//     rdmsrl(MSR_IA32_CLOCK_MODULATION, clock_modulation);
> +       rdmsrl(MSR_IA32_THERM_CONTROL, clock_modulation);
> +       if(clock_modulation && 0X10) {
> +               clock_modulation = clock_modulation & 0x0F;
> +               if(clock_modulation == 0) clock_modulation = 8;
> +               core_busy = mul_fp(core_busy, int_tofp(0x10));
> +               core_busy = div_fp(core_busy, int_tofp(clock_modulation));
> +       }
> +
rdmsr_safe  might be better, you can refer to acpi_throttling_rdmsr ,
and I'm OK with this code, are you planning to send a formal patch? 

Yu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-11-12  9:42               ` Chen, Yu C
@ 2015-11-21 16:45                 ` Doug Smythies
  2015-11-27  3:28                   ` Doug Smythies
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Smythies @ 2015-11-21 16:45 UTC (permalink / raw)
  To: 'Chen, Yu C'
  Cc: 'Wysocki, Rafael J', tglx, hpa, bp, 'Zhang, Rui',
	linux-pm, x86, linux-kernel, 'Brown, Len',
	'Ingo Molnar', 'Pavel Machek',
	'Kristen Carlson Accardi', 'Pandruvada, Srinivas'

On 2015.11.12 01:42 Chen, Yu C wrote:
> On 2015.11.06 11:34 Doug Smythies wrote: 
>> On 2015.11.01 08:50 Chen, Yu C wrote:
>>>> On 2015.10.10 19:27 Chen, Yu C wrote:
>>>>> On 2105.10.10 02:56 Doug Smythies wrote:
>>>>>
>>>>>>> The current version of the intel_pstate driver is incompatible
>>>>>>> with any use of Clock Modulation, always resulting in driving the
>>>>>>> target pstate to the minimum, regardless of load. The result is
>>>>>>> the apparent CPU frequency stuck at minimum * modulation percent.
>>>>>>
>>>>>>> The acpi-cpufreq driver works fine with Clock Modulation,
>>>>>>> resulting in desired frequency * modulation percent.
>>>>>>
>>>>
>>>>> [Yu] Why intel_pstate driver is incompatible with Clock Modulation?
>>>>
>>>> It is simply how the current control algorithm responds to the scenario.
>>>>
>>>> The problem is in intel_pstate_get_scaled_busy, here:
>>>>
>>>>         /*
>>>>          * core_busy is the ratio of actual performance to max
>>>>          * max_pstate is the max non turbo pstate available
>>>>          * current_pstate was the pstate that was requested during
>>>>          *      the last sample period.
>>>>          *
>>>>          * We normalize core_busy, which was our actual percent
>>>>          * performance to what we requested during the last sample
>>>>          * period. The result will be a percentage of busy at a
>>>>          * specified pstate.
>>>>          */
>>>>         core_busy = cpu->sample.core_pct_busy;
>>>>         max_pstate = int_tofp(cpu->pstate.max_pstate);
>>>>         current_pstate = int_tofp(cpu->pstate.current_pstate);
>>>>         core_busy = mul_fp(core_busy, div_fp(max_pstate,
>>>> current_pstate));
>>>>
>>>> With Clock Modulation enabled, the actual performance percent will
>>>> always be less than what was asked for, basically meaning
>>>> current_pstate is much less than what was asked for. Thus the
>>>> algorithm will drive down the target pstate regardless of load.
>>>>
>>> [Yu] Do you mean, there is some problem with the normalization,and we
>>> should use the actual pstate rather than the theoretical
>>> current_pstate, for example, the pseudocode might looked like:
>>>
>>> -  current_pstate = int_tofp(cpu->pstate.current_pstate);
>>> + current_pstate = int_tofp(cpu->pstate.current_pstat)*0.85;
>> 
>> I did not think of normalizing / compensating at this point.
>> That is a good idea.
>> Just for a test, I tried it and it seems to work well.
>> Before normalizing / compensating core_busy can be quite a small for lesser
>> clock modulation duty cycles, and so becomes a little noisy afterwards.
>> 
>> For my test, on an otherwise unaltered kernel v4.3 I did this:
>> 
>> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
>> index aa33b92..97a90e1 100644
>> --- a/drivers/cpufreq/intel_pstate.c
>> +++ b/drivers/cpufreq/intel_pstate.c
>> @@ -821,6 +821,7 @@ static inline int32_t
>> intel_pstate_get_scaled_busy(struct cpudata *cpu)
>>         int32_t core_busy, max_pstate, current_pstate, sample_ratio;
>>         s64 duration_us;
>>         u32 sample_time;
>> +       u64 clock_modulation;
>> 
>>         /*
>          * core_busy is the ratio of actual performance to max @@ -836,6
>> +837,17 @@ static inline int32_t intel_pstate_get_scaled_busy(struct
>> cpudata *cpu)
>>         core_busy = cpu->sample.core_pct_busy;
>>         max_pstate = int_tofp(cpu->pstate.max_pstate);
>>         current_pstate = int_tofp(cpu->pstate.current_pstate);
>> +
>> +//     rdmsrl(MSR_IA32_CLOCK_MODULATION, clock_modulation);
>> +       rdmsrl(MSR_IA32_THERM_CONTROL, clock_modulation);
>> +       if(clock_modulation && 0X10) {
>> +               clock_modulation = clock_modulation & 0x0F;
>> +               if(clock_modulation == 0) clock_modulation = 8;
>> +               core_busy = mul_fp(core_busy, int_tofp(0x10));
>> +               core_busy = div_fp(core_busy, int_tofp(clock_modulation));
>> +       }
>> +
> rdmsr_safe  might be better,

I'll look into it, thanks.

> you can refer to acpi_throttling_rdmsr

I don't understand.

 ,
> and I'm OK with this code, are you planning to send a formal patch? 

The delay here is because I have always thought that some actual load
content needs to be brought back to the intel_pstate driver, which would
(or at least should) eliminate the need for this patch.

Anyway, and at least for the interim, I'll try to make and submit a formal version.

... Doug



^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-11-21 16:45                 ` Doug Smythies
@ 2015-11-27  3:28                   ` Doug Smythies
  2015-11-27  6:01                     ` Yu Chen
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Smythies @ 2015-11-27  3:28 UTC (permalink / raw)
  To: 'Chen, Yu C'
  Cc: 'Wysocki, Rafael J', tglx, hpa, bp, 'Zhang, Rui',
	linux-pm, x86, linux-kernel, 'Brown, Len',
	'Ingo Molnar', 'Pavel Machek',
	'Pandruvada, Srinivas', 'Doug Smythies'

On 2015.11.21 08:45 Doug Smythies wrote:
>On 2015.11.12 01:42 Chen, Yu C wrote:
>> On 2015.11.06 11:34 Doug Smythies wrote: 

[cut]

>> rdmsr_safe  might be better,

> I'll look into it, thanks.

>> you can refer to acpi_throttling_rdmsr

> I don't understand.

>> and I'm OK with this code, are you planning to send a formal patch? 

> The delay here is because I have always thought that some actual load
> content needs to be brought back to the intel_pstate driver, which would
> (or at least should) eliminate the need for this patch.

> Anyway, and at least for the interim, I'll try to make and submit a formal version.

I made a mistake in my initial testing. I put a 100% load on CPU 7 and then
cycled through all the clock modulation values to show that my test version of
a possible patch compensated / normalized the Clock Modulation. Indeed, if the
system is already asking for the maximum pstate, it will stay there. However,
whenever the load drops, the target pstate will drop to minimum and it will
never kick back up again, regardless of load.

I am returning to my initial assertion copied below:

>>>>>>> The current version of the intel_pstate driver is incompatible
>>>>>>> with any use of Clock Modulation, always resulting in driving the
>>>>>>> target pstate to the minimum, regardless of load. The result is
>>>>>>> the apparent CPU frequency stuck at minimum * modulation percent.
>>>>>>
>>>>>>> The acpi-cpufreq driver works fine with Clock Modulation,
>>>>>>> resulting in desired frequency * modulation percent.

Chen,

Thanks though for the suggestion to try normalizing.

... Doug



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend
  2015-11-27  3:28                   ` Doug Smythies
@ 2015-11-27  6:01                     ` Yu Chen
  0 siblings, 0 replies; 14+ messages in thread
From: Yu Chen @ 2015-11-27  6:01 UTC (permalink / raw)
  To: Doug Smythies
  Cc: 'Wysocki, Rafael J', tglx, hpa, bp, 'Zhang, Rui',
	linux-pm, x86, linux-kernel, 'Brown, Len',
	'Ingo Molnar', 'Pavel Machek',
	'Pandruvada, Srinivas'

Hi,
On 11/27/2015 11:28 AM, Doug Smythies wrote:
> On 2015.11.21 08:45 Doug Smythies wrote:
>> On 2015.11.12 01:42 Chen, Yu C wrote:
>>> On 2015.11.06 11:34 Doug Smythies wrote:
>
> [cut]
>
>>> rdmsr_safe  might be better,
>
>> I'll look into it, thanks.
>
>>> you can refer to acpi_throttling_rdmsr
>
>> I don't understand.
>
>>> and I'm OK with this code, are you planning to send a formal patch?
>
>> The delay here is because I have always thought that some actual load
>> content needs to be brought back to the intel_pstate driver, which would
>> (or at least should) eliminate the need for this patch.
>
>> Anyway, and at least for the interim, I'll try to make and submit a formal version.
>
> I made a mistake in my initial testing. I put a 100% load on CPU 7 and then
> cycled through all the clock modulation values to show that my test version of
> a possible patch compensated / normalized the Clock Modulation. Indeed, if the
> system is already asking for the maximum pstate, it will stay there. However,
> whenever the load drops, the target pstate will drop to minimum and it will
> never kick back up again, regardless of load.
>
  Do you mean even with your
  patch applied, the cpufreq policy would choose a smaller target?
  I looked up the SDM, it says in 14.7.3: on Hyper-Threading Technology
  enabled processors, the clock modulation might behave differently:
"if the programmed duty cycle is not identical for all logical
processors in the same core, the
processor core will modulate at the lowest programmed duty cycle "
I dont know if this is related to the problem.
> I am returning to my initial assertion copied below:
>
>>>>>>>> The current version of the intel_pstate driver is incompatible
>>>>>>>> with any use of Clock Modulation, always resulting in driving the
>>>>>>>> target pstate to the minimum, regardless of load. The result is
>>>>>>>> the apparent CPU frequency stuck at minimum * modulation percent.
>>>>>>>
>>>>>>>> The acpi-cpufreq driver works fine with Clock Modulation,
>>>>>>>> resulting in desired frequency * modulation percent.
>
> Chen,
>
> Thanks though for the suggestion to try normalizing.
>
I'll try to reproduce your problem, and let's discuss this offline.
> ... Doug
>
thanks,
Yu


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-11-27  5:57 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-27  3:18 [PATCH] [v4] x86, suspend: Save/restore extra MSR registers for suspend Chen Yu
2015-09-17  5:30 ` Pavel Machek
2015-10-09  9:39   ` Chen, Yu C
2015-10-09 18:55     ` Doug Smythies
2015-10-11  2:26       ` Chen, Yu C
2015-10-11 15:46         ` Doug Smythies
2015-11-01 16:49           ` Chen, Yu C
2015-11-06 15:33             ` Doug Smythies
2015-11-12  9:42               ` Chen, Yu C
2015-11-21 16:45                 ` Doug Smythies
2015-11-27  3:28                   ` Doug Smythies
2015-11-27  6:01                     ` Yu Chen
2015-10-09 21:50 ` Rafael J. Wysocki
2015-10-11  2:43   ` Chen, Yu C

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).