All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] x86/mwait-idle: (mostly) SPR support
@ 2022-04-26 10:03 Jan Beulich
  2022-04-26 10:04 ` [PATCH 1/4] x86/mwait-idle: switch to asm/intel-family.h naming Jan Beulich
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Jan Beulich @ 2022-04-26 10:03 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Fresh from Linux 5.18-rc (and with adjustments to address issues noticed
while porting), except for the 1st patch.

1: switch to asm/intel-family.h naming
2: add SPR support
3: add 'preferred_cstates' module argument
4: add core C6 optimization for SPR

Jan



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/4] x86/mwait-idle: switch to asm/intel-family.h naming
  2022-04-26 10:03 [PATCH 0/4] x86/mwait-idle: (mostly) SPR support Jan Beulich
@ 2022-04-26 10:04 ` Jan Beulich
  2022-04-27 11:21   ` Roger Pau Monné
  2022-04-26 10:05 ` [PATCH 2/4] mwait-idle: add SPR support Jan Beulich
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2022-04-26 10:04 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

This brings us (back) closer to the original Linux source.

While touching mwait_idle_state_table_update() also drop a stray leading
blank.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -61,6 +61,7 @@
 #include <xen/trace.h>
 #include <asm/cpuidle.h>
 #include <asm/hpet.h>
+#include <asm/intel-family.h>
 #include <asm/mwait.h>
 #include <asm/msr.h>
 #include <asm/spec_ctrl.h>
@@ -996,48 +997,49 @@ static const struct idle_cpu idle_cpu_sn
 };
 
 #define ICPU(model, cpu) \
-	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ALWAYS, &idle_cpu_##cpu}
+	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ ## model, X86_FEATURE_ALWAYS, \
+	  &idle_cpu_ ## cpu}
 
 static const struct x86_cpu_id intel_idle_ids[] __initconstrel = {
-	ICPU(0x1a, nehalem),
-	ICPU(0x1e, nehalem),
-	ICPU(0x1f, nehalem),
-	ICPU(0x25, nehalem),
-	ICPU(0x2c, nehalem),
-	ICPU(0x2e, nehalem),
-	ICPU(0x2f, nehalem),
-	ICPU(0x1c, atom),
-	ICPU(0x26, lincroft),
-	ICPU(0x2a, snb),
-	ICPU(0x2d, snb),
-	ICPU(0x36, atom),
-	ICPU(0x37, byt),
-	ICPU(0x4a, tangier),
-	ICPU(0x4c, cht),
-	ICPU(0x3a, ivb),
-	ICPU(0x3e, ivt),
-	ICPU(0x3c, hsw),
-	ICPU(0x3f, hsw),
-	ICPU(0x45, hsw),
-	ICPU(0x46, hsw),
-	ICPU(0x4d, avn),
-	ICPU(0x3d, bdw),
-	ICPU(0x47, bdw),
-	ICPU(0x4f, bdw),
-	ICPU(0x56, bdw),
-	ICPU(0x4e, skl),
-	ICPU(0x5e, skl),
-	ICPU(0x8e, skl),
-	ICPU(0x9e, skl),
-	ICPU(0x55, skx),
-	ICPU(0x6a, icx),
-	ICPU(0x6c, icx),
-	ICPU(0x57, knl),
-	ICPU(0x85, knl),
-	ICPU(0x5c, bxt),
-	ICPU(0x7a, bxt),
-	ICPU(0x5f, dnv),
-	ICPU(0x86, snr),
+	ICPU(NEHALEM_EP,		nehalem),
+	ICPU(NEHALEM,			nehalem),
+	ICPU(NEHALEM_G,			nehalem),
+	ICPU(WESTMERE,			nehalem),
+	ICPU(WESTMERE_EP,		nehalem),
+	ICPU(NEHALEM_EX,		nehalem),
+	ICPU(WESTMERE_EX,		nehalem),
+	ICPU(ATOM_BONNELL,		atom),
+	ICPU(ATOM_BONNELL_MID,		lincroft),
+	ICPU(SANDYBRIDGE,		snb),
+	ICPU(SANDYBRIDGE_X,		snb),
+	ICPU(ATOM_SALTWELL,		atom),
+	ICPU(ATOM_SILVERMONT,		byt),
+	ICPU(ATOM_SILVERMONT_MID,	tangier),
+	ICPU(ATOM_AIRMONT,		cht),
+	ICPU(IVYBRIDGE,			ivb),
+	ICPU(IVYBRIDGE_X,		ivt),
+	ICPU(HASWELL,			hsw),
+	ICPU(HASWELL_X,			hsw),
+	ICPU(HASWELL_L,			hsw),
+	ICPU(HASWELL_G,			hsw),
+	ICPU(ATOM_SILVERMONT_D,		avn),
+	ICPU(BROADWELL,			bdw),
+	ICPU(BROADWELL_G,		bdw),
+	ICPU(BROADWELL_X,		bdw),
+	ICPU(BROADWELL_D,		bdw),
+	ICPU(SKYLAKE_L,			skl),
+	ICPU(SKYLAKE,			skl),
+	ICPU(KABYLAKE_L,		skl),
+	ICPU(KABYLAKE,			skl),
+	ICPU(SKYLAKE_X,			skx),
+	ICPU(ICELAKE_X,			icx),
+	ICPU(ICELAKE_D,			icx),
+	ICPU(XEON_PHI_KNL,		knl),
+	ICPU(XEON_PHI_KNM,		knl),
+	ICPU(ATOM_GOLDMONT,		bxt),
+	ICPU(ATOM_GOLDMONT_PLUS,	bxt),
+	ICPU(ATOM_GOLDMONT_D,		dnv),
+	ICPU(ATOM_TREMONT_D,		snr),
 	{}
 };
 
@@ -1208,20 +1210,20 @@ static void __init skx_idle_state_table_
 static void __init mwait_idle_state_table_update(void)
 {
 	switch (boot_cpu_data.x86_model) {
-	case 0x3e: /* IVT */
+	case INTEL_FAM6_IVYBRIDGE_X:
 		ivt_idle_state_table_update();
 		break;
-	case 0x5c: /* BXT */
-	case 0x7a:
+	case INTEL_FAM6_ATOM_GOLDMONT:
+	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
 		bxt_idle_state_table_update();
 		break;
-	case 0x5e: /* SKL-H */
+	case INTEL_FAM6_SKYLAKE:
 		sklh_idle_state_table_update();
 		break;
-	case 0x55: /* SKL-X */
+	case INTEL_FAM6_SKYLAKE_X:
 		skx_idle_state_table_update();
 		break;
- 	}
+	}
 }
 
 static int __init mwait_idle_probe(void)



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/4] mwait-idle: add SPR support
  2022-04-26 10:03 [PATCH 0/4] x86/mwait-idle: (mostly) SPR support Jan Beulich
  2022-04-26 10:04 ` [PATCH 1/4] x86/mwait-idle: switch to asm/intel-family.h naming Jan Beulich
@ 2022-04-26 10:05 ` Jan Beulich
  2022-04-27 11:39   ` Roger Pau Monné
  2022-04-26 10:05 ` [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument Jan Beulich
  2022-04-26 10:06 ` [PATCH 4/4] mwait-idle: add core C6 optimization for SPR Jan Beulich
  3 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2022-04-26 10:05 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>

Add Sapphire Rapids Xeon support.

Up until very recently, the C1 and C1E C-states were independent, but this
has changed in some new chips, including Sapphire Rapids Xeon (SPR). In these
chips the C1 and C1E states cannot be enabled at the same time. The "C1E
promotion" bit in 'MSR_IA32_POWER_CTL' also has its semantics changed a bit.

Here are the C1, C1E, and "C1E promotion" bit rules on Xeons before SPR.

1. If C1E promotion bit is disabled.
   a. C1  requests end up with C1  C-state.
   b. C1E requests end up with C1E C-state.
2. If C1E promotion bit is enabled.
   a. C1  requests end up with C1E C-state.
   b. C1E requests end up with C1E C-state.

Here are the C1, C1E, and "C1E promotion" bit rules on Sapphire Rapids Xeon.
1. If C1E promotion bit is disabled.
   a. C1  requests end up with C1 C-state.
   b. C1E requests end up with C1 C-state.
2. If C1E promotion bit is enabled.
   a. C1  requests end up with C1E C-state.
   b. C1E requests end up with C1E C-state.

Before SPR Xeon, the 'intel_idle' driver was disabling C1E promotion and was
exposing C1 and C1E as independent C-states. But on SPR, C1 and C1E cannot be
enabled at the same time.

This patch adds both C1 and C1E states. However, C1E is marked as with the
"CPUIDLE_FLAG_UNUSABLE" flag, which means that in won't be registered by
default. The C1E promotion bit will be cleared, which means that by default
only C1 and C6 will be registered on SPR.

The next patch will add an option for enabling C1E and disabling C1 on SPR.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9edf3c0ffef0
Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- unstable.orig/xen/arch/x86/cpu/mwait-idle.c	2022-04-25 16:48:58.000000000 +0200
+++ unstable/xen/arch/x86/cpu/mwait-idle.c	2022-04-25 17:17:05.000000000 +0200
@@ -586,6 +586,38 @@ static const struct cpuidle_state icx_cs
        {}
 };
 
+/*
+ * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
+ * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
+ * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1
+ * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
+ * both C1 and C1E requests end up with C1, so there is effectively no C1E.
+ *
+ * By default we enable C1 and disable C1E by marking it with
+ * 'CPUIDLE_FLAG_DISABLED'.
+ */
+static struct cpuidle_state __read_mostly spr_cstates[] = {
+	{
+		.name = "C1",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 1,
+		.target_residency = 1,
+	},
+	{
+		.name = "C1E",
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_DISABLED,
+		.exit_latency = 2,
+		.target_residency = 4,
+	},
+	{
+		.name = "C6",
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 290,
+		.target_residency = 800,
+	},
+	{}
+};
+
 static const struct cpuidle_state atom_cstates[] = {
 	{
 		.name = "C1E",
@@ -972,6 +1004,11 @@ static const struct idle_cpu idle_cpu_ic
        .disable_promotion_to_c1e = true,
 };
 
+static struct idle_cpu __read_mostly idle_cpu_spr = {
+	.state_table = spr_cstates,
+	.disable_promotion_to_c1e = true,
+};
+
 static const struct idle_cpu idle_cpu_avn = {
 	.state_table = avn_cstates,
 	.disable_promotion_to_c1e = true,
@@ -1034,6 +1071,7 @@ static const struct x86_cpu_id intel_idl
 	ICPU(SKYLAKE_X,			skx),
 	ICPU(ICELAKE_X,			icx),
 	ICPU(ICELAKE_D,			icx),
+	ICPU(SAPPHIRERAPIDS_X,		spr),
 	ICPU(XEON_PHI_KNL,		knl),
 	ICPU(XEON_PHI_KNM,		knl),
 	ICPU(ATOM_GOLDMONT,		bxt),



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-26 10:03 [PATCH 0/4] x86/mwait-idle: (mostly) SPR support Jan Beulich
  2022-04-26 10:04 ` [PATCH 1/4] x86/mwait-idle: switch to asm/intel-family.h naming Jan Beulich
  2022-04-26 10:05 ` [PATCH 2/4] mwait-idle: add SPR support Jan Beulich
@ 2022-04-26 10:05 ` Jan Beulich
  2022-04-27 12:45   ` Roger Pau Monné
  2022-04-26 10:06 ` [PATCH 4/4] mwait-idle: add core C6 optimization for SPR Jan Beulich
  3 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2022-04-26 10:05 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>

On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually
exclusive - only one of them can be enabled. By default, 'intel_idle' driver
enables C1 and disables C1E. However, some users prefer to use C1E instead of
C1, because it saves more energy.

This patch adds a new module parameter ('preferred_cstates') for enabling C1E
and disabling C1. Here is the idea behind it.

1. This option has effect only for "mutually exclusive" C-states like C1 and
   C1E on SPR.
2. It does not have any effect on independent C-states, which do not require
   other C-states to be disabled (most states on most platforms as of today).
3. For mutually exclusive C-states, the 'intel_idle' driver always has a
   reasonable default, such as enabling C1 on SPR by default. On other
   platforms, the default may be different.
4. Users can override the default using the 'preferred_cstates' parameter.
5. The parameter accepts the preferred C-states bit-mask, similarly to the
   existing 'states_off' parameter.
6. This parameter is not limited to C1/C1E, and leaves room for supporting
   other mutually exclusive C-states, if they come in the future.

Today 'intel_idle' can only be compiled-in, which means that on SPR, in order
to disable C1 and enable C1E, users should boot with the following kernel
argument: intel_idle.preferred_cstates=4

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git da0e58c038e6

Enable C1E (if requested) not only on the BSP's socket / package.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- unstable.orig/docs/misc/xen-command-line.pandoc	2022-04-25 17:59:42.123387258 +0200
+++ unstable/docs/misc/xen-command-line.pandoc	2022-04-25 17:36:00.000000000 +0200
@@ -1884,6 +1884,12 @@ paging controls access to usermode addre
 ### ple_window (Intel)
 > `= <integer>`
 
+### preferred-cstates (x86)
+> `= <integer>`
+
+This is a mask of C-states which are to be use preferably.  This option is
+applicable only oh hardware were certain C-states are exlusive of one another.
+
 ### psr (Intel)
 > `= List of ( cmt:<boolean> | rmid_max:<integer> | cat:<boolean> | cos_max:<integer> | cdp:<boolean> )`
 
--- unstable.orig/xen/arch/x86/cpu/mwait-idle.c	2022-04-25 17:17:05.000000000 +0200
+++ unstable/xen/arch/x86/cpu/mwait-idle.c	2022-04-25 17:33:47.000000000 +0200
@@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
 
 static unsigned int mwait_substates;
 
+/*
+ * Some platforms come with mutually exclusive C-states, so that if one is
+ * enabled, the other C-states must not be used. Example: C1 and C1E on
+ * Sapphire Rapids platform. This parameter allows for selecting the
+ * preferred C-states among the groups of mutually exclusive C-states - the
+ * selected C-states will be registered, the other C-states from the mutually
+ * exclusive group won't be registered. If the platform has no mutually
+ * exclusive C-states, this parameter has no effect.
+ */
+static unsigned int __ro_after_init preferred_states_mask;
+integer_param("preferred-cstates", preferred_states_mask);
+
 #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
 /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
 static unsigned int lapic_timer_reliable_states = (1 << 1);
@@ -96,6 +108,7 @@ struct idle_cpu {
 	unsigned long auto_demotion_disable_flags;
 	bool byt_auto_demotion_disable_flag;
 	bool disable_promotion_to_c1e;
+	bool enable_promotion_to_c1e;
 };
 
 static const struct idle_cpu *icpu;
@@ -924,6 +937,15 @@ static void cf_check byt_auto_demotion_d
 	wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0);
 }
 
+static void cf_check c1e_promotion_enable(void *dummy)
+{
+	uint64_t msr_bits;
+
+	rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
+	msr_bits |= 0x2;
+	wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
+}
+
 static void cf_check c1e_promotion_disable(void *dummy)
 {
 	u64 msr_bits;
@@ -1241,6 +1263,26 @@ static void __init skx_idle_state_table_
 }
 
 /*
+ * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
+ */
+static void __init spr_idle_state_table_update(void)
+{
+	/* Check if user prefers C1E over C1. */
+	if (preferred_states_mask & BIT(2, U)) {
+		if (preferred_states_mask & BIT(1, U))
+			/* Both can't be enabled, stick to the defaults. */
+			return;
+
+		spr_cstates[0].flags |= CPUIDLE_FLAG_DISABLED;
+		spr_cstates[1].flags &= ~CPUIDLE_FLAG_DISABLED;
+
+		/* Request enabling C1E using the "C1E promotion" bit. */
+		idle_cpu_spr.disable_promotion_to_c1e = false;
+		idle_cpu_spr.enable_promotion_to_c1e = true;
+	}
+}
+
+/*
  * mwait_idle_state_table_update()
  *
  * Update the default state_table for this CPU-id
@@ -1261,6 +1303,9 @@ static void __init mwait_idle_state_tabl
 	case INTEL_FAM6_SKYLAKE_X:
 		skx_idle_state_table_update();
 		break;
+	case INTEL_FAM6_SAPPHIRERAPIDS_X:
+		spr_idle_state_table_update();
+		break;
 	}
 }
 
@@ -1402,6 +1447,8 @@ static int cf_check mwait_idle_cpu_init(
 
 	if (icpu->disable_promotion_to_c1e)
 		on_selected_cpus(cpumask_of(cpu), c1e_promotion_disable, NULL, 1);
+	else if (icpu->enable_promotion_to_c1e)
+		on_selected_cpus(cpumask_of(cpu), c1e_promotion_enable, NULL, 1);
 
 	return NOTIFY_DONE;
 }



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 4/4] mwait-idle: add core C6 optimization for SPR
  2022-04-26 10:03 [PATCH 0/4] x86/mwait-idle: (mostly) SPR support Jan Beulich
                   ` (2 preceding siblings ...)
  2022-04-26 10:05 ` [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument Jan Beulich
@ 2022-04-26 10:06 ` Jan Beulich
  2022-04-27 12:56   ` Roger Pau Monné
  3 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2022-04-26 10:06 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>

Add a Sapphire Rapids Xeon C6 optimization, similar to what we have for Sky Lake
Xeon: if package C6 is disabled, adjust C6 exit latency and target residency to
match core C6 values, instead of using the default package C6 values.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 3a9cf77b60dc

Make sure a contradictory "preferred-cstates" wouldn't cause bypassing
of the added logic.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -1267,12 +1267,12 @@ static void __init skx_idle_state_table_
  */
 static void __init spr_idle_state_table_update(void)
 {
-	/* Check if user prefers C1E over C1. */
-	if (preferred_states_mask & BIT(2, U)) {
-		if (preferred_states_mask & BIT(1, U))
-			/* Both can't be enabled, stick to the defaults. */
-			return;
+	uint64_t msr;
 
+	/* Check if user prefers C1E over C1. */
+	if (preferred_states_mask & BIT(2, U) &&
+	    /* Both can't be enabled, stick to the defaults. */
+	    !(preferred_states_mask & BIT(1, U))) {
 		spr_cstates[0].flags |= CPUIDLE_FLAG_DISABLED;
 		spr_cstates[1].flags &= ~CPUIDLE_FLAG_DISABLED;
 
@@ -1280,6 +1280,19 @@ static void __init spr_idle_state_table_
 		idle_cpu_spr.disable_promotion_to_c1e = false;
 		idle_cpu_spr.enable_promotion_to_c1e = true;
 	}
+
+	/*
+	 * By default, the C6 state assumes the worst-case scenario of package
+	 * C6. However, if PC6 is disabled, we update the numbers to match
+	 * core C6.
+	 */
+	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
+
+	/* Limit value 2 and above allow for PC6. */
+	if ((msr & 0x7) < 2) {
+		spr_cstates[2].exit_latency = 190;
+		spr_cstates[2].target_residency = 600;
+	}
 }
 
 /*



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] x86/mwait-idle: switch to asm/intel-family.h naming
  2022-04-26 10:04 ` [PATCH 1/4] x86/mwait-idle: switch to asm/intel-family.h naming Jan Beulich
@ 2022-04-27 11:21   ` Roger Pau Monné
  0 siblings, 0 replies; 16+ messages in thread
From: Roger Pau Monné @ 2022-04-27 11:21 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Apr 26, 2022 at 12:04:45PM +0200, Jan Beulich wrote:
> This brings us (back) closer to the original Linux source.
> 
> While touching mwait_idle_state_table_update() also drop a stray leading
> blank.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] mwait-idle: add SPR support
  2022-04-26 10:05 ` [PATCH 2/4] mwait-idle: add SPR support Jan Beulich
@ 2022-04-27 11:39   ` Roger Pau Monné
  0 siblings, 0 replies; 16+ messages in thread
From: Roger Pau Monné @ 2022-04-27 11:39 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Apr 26, 2022 at 12:05:07PM +0200, Jan Beulich wrote:
> From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> 
> Add Sapphire Rapids Xeon support.
> 
> Up until very recently, the C1 and C1E C-states were independent, but this
> has changed in some new chips, including Sapphire Rapids Xeon (SPR). In these
> chips the C1 and C1E states cannot be enabled at the same time. The "C1E
> promotion" bit in 'MSR_IA32_POWER_CTL' also has its semantics changed a bit.
> 
> Here are the C1, C1E, and "C1E promotion" bit rules on Xeons before SPR.
> 
> 1. If C1E promotion bit is disabled.
>    a. C1  requests end up with C1  C-state.
>    b. C1E requests end up with C1E C-state.
> 2. If C1E promotion bit is enabled.
>    a. C1  requests end up with C1E C-state.
>    b. C1E requests end up with C1E C-state.
> 
> Here are the C1, C1E, and "C1E promotion" bit rules on Sapphire Rapids Xeon.
> 1. If C1E promotion bit is disabled.
>    a. C1  requests end up with C1 C-state.
>    b. C1E requests end up with C1 C-state.
> 2. If C1E promotion bit is enabled.
>    a. C1  requests end up with C1E C-state.
>    b. C1E requests end up with C1E C-state.
> 
> Before SPR Xeon, the 'intel_idle' driver was disabling C1E promotion and was
> exposing C1 and C1E as independent C-states. But on SPR, C1 and C1E cannot be
> enabled at the same time.
> 
> This patch adds both C1 and C1E states. However, C1E is marked as with the
> "CPUIDLE_FLAG_UNUSABLE" flag, which means that in won't be registered by
> default. The C1E promotion bit will be cleared, which means that by default
> only C1 and C6 will be registered on SPR.
> 
> The next patch will add an option for enabling C1E and disabling C1 on SPR.
> 
> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9edf3c0ffef0
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-26 10:05 ` [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument Jan Beulich
@ 2022-04-27 12:45   ` Roger Pau Monné
  2022-04-27 13:41     ` Jan Beulich
  0 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2022-04-27 12:45 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
> From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> 
> On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually
> exclusive - only one of them can be enabled. By default, 'intel_idle' driver
> enables C1 and disables C1E. However, some users prefer to use C1E instead of
> C1, because it saves more energy.
> 
> This patch adds a new module parameter ('preferred_cstates') for enabling C1E
> and disabling C1. Here is the idea behind it.
> 
> 1. This option has effect only for "mutually exclusive" C-states like C1 and
>    C1E on SPR.
> 2. It does not have any effect on independent C-states, which do not require
>    other C-states to be disabled (most states on most platforms as of today).
> 3. For mutually exclusive C-states, the 'intel_idle' driver always has a
>    reasonable default, such as enabling C1 on SPR by default. On other
>    platforms, the default may be different.
> 4. Users can override the default using the 'preferred_cstates' parameter.
> 5. The parameter accepts the preferred C-states bit-mask, similarly to the
>    existing 'states_off' parameter.
> 6. This parameter is not limited to C1/C1E, and leaves room for supporting
>    other mutually exclusive C-states, if they come in the future.
> 
> Today 'intel_idle' can only be compiled-in, which means that on SPR, in order
> to disable C1 and enable C1E, users should boot with the following kernel
> argument: intel_idle.preferred_cstates=4
> 
> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git da0e58c038e6
> 
> Enable C1E (if requested) not only on the BSP's socket / package.

Maybe we should also add a note here that the command line option for
Xen is preferred-cstates instead of intel_idle.preferred_cstates?

I think this is a bad interface however, we should have a more generic
option (ie: cstate-mode = 'performance | powersave') so that users
don't have to fiddle with model specific C state masks.

> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- unstable.orig/docs/misc/xen-command-line.pandoc	2022-04-25 17:59:42.123387258 +0200
> +++ unstable/docs/misc/xen-command-line.pandoc	2022-04-25 17:36:00.000000000 +0200
> @@ -1884,6 +1884,12 @@ paging controls access to usermode addre
>  ### ple_window (Intel)
>  > `= <integer>`
>  
> +### preferred-cstates (x86)
> +> `= <integer>`
> +
> +This is a mask of C-states which are to be use preferably.  This option is
> +applicable only oh hardware were certain C-states are exlusive of one another.
> +
>  ### psr (Intel)
>  > `= List of ( cmt:<boolean> | rmid_max:<integer> | cat:<boolean> | cos_max:<integer> | cdp:<boolean> )`
>  
> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c	2022-04-25 17:17:05.000000000 +0200
> +++ unstable/xen/arch/x86/cpu/mwait-idle.c	2022-04-25 17:33:47.000000000 +0200
> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
>  
>  static unsigned int mwait_substates;
>  
> +/*
> + * Some platforms come with mutually exclusive C-states, so that if one is
> + * enabled, the other C-states must not be used. Example: C1 and C1E on
> + * Sapphire Rapids platform. This parameter allows for selecting the
> + * preferred C-states among the groups of mutually exclusive C-states - the
> + * selected C-states will be registered, the other C-states from the mutually
> + * exclusive group won't be registered. If the platform has no mutually
> + * exclusive C-states, this parameter has no effect.
> + */
> +static unsigned int __ro_after_init preferred_states_mask;
> +integer_param("preferred-cstates", preferred_states_mask);
> +
>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
>  static unsigned int lapic_timer_reliable_states = (1 << 1);
> @@ -96,6 +108,7 @@ struct idle_cpu {
>  	unsigned long auto_demotion_disable_flags;
>  	bool byt_auto_demotion_disable_flag;
>  	bool disable_promotion_to_c1e;
> +	bool enable_promotion_to_c1e;

I'm confused by those fields, shouldn't we just have:
promotion_to_c1e = true | false?

As one field is the negation of the other:
enable_promotion_to_c1e = !disable_promotion_to_c1e

I know this is code from Linux, but would like to understand why two
fields are needed.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] mwait-idle: add core C6 optimization for SPR
  2022-04-26 10:06 ` [PATCH 4/4] mwait-idle: add core C6 optimization for SPR Jan Beulich
@ 2022-04-27 12:56   ` Roger Pau Monné
  0 siblings, 0 replies; 16+ messages in thread
From: Roger Pau Monné @ 2022-04-27 12:56 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Apr 26, 2022 at 12:06:14PM +0200, Jan Beulich wrote:
> From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> 
> Add a Sapphire Rapids Xeon C6 optimization, similar to what we have for Sky Lake
> Xeon: if package C6 is disabled, adjust C6 exit latency and target residency to
> match core C6 values, instead of using the default package C6 values.
> 
> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 3a9cf77b60dc
> 
> Make sure a contradictory "preferred-cstates" wouldn't cause bypassing
> of the added logic.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-27 12:45   ` Roger Pau Monné
@ 2022-04-27 13:41     ` Jan Beulich
  2022-04-27 15:06       ` Roger Pau Monné
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2022-04-27 13:41 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 27.04.2022 14:45, Roger Pau Monné wrote:
> On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
>> From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
>>
>> On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually
>> exclusive - only one of them can be enabled. By default, 'intel_idle' driver
>> enables C1 and disables C1E. However, some users prefer to use C1E instead of
>> C1, because it saves more energy.
>>
>> This patch adds a new module parameter ('preferred_cstates') for enabling C1E
>> and disabling C1. Here is the idea behind it.
>>
>> 1. This option has effect only for "mutually exclusive" C-states like C1 and
>>    C1E on SPR.
>> 2. It does not have any effect on independent C-states, which do not require
>>    other C-states to be disabled (most states on most platforms as of today).
>> 3. For mutually exclusive C-states, the 'intel_idle' driver always has a
>>    reasonable default, such as enabling C1 on SPR by default. On other
>>    platforms, the default may be different.
>> 4. Users can override the default using the 'preferred_cstates' parameter.
>> 5. The parameter accepts the preferred C-states bit-mask, similarly to the
>>    existing 'states_off' parameter.
>> 6. This parameter is not limited to C1/C1E, and leaves room for supporting
>>    other mutually exclusive C-states, if they come in the future.
>>
>> Today 'intel_idle' can only be compiled-in, which means that on SPR, in order
>> to disable C1 and enable C1E, users should boot with the following kernel
>> argument: intel_idle.preferred_cstates=4
>>
>> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
>> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>> Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git da0e58c038e6
>>
>> Enable C1E (if requested) not only on the BSP's socket / package.
> 
> Maybe we should also add a note here that the command line option for
> Xen is preferred-cstates instead of intel_idle.preferred_cstates?
> 
> I think this is a bad interface however, we should have a more generic
> option (ie: cstate-mode = 'performance | powersave') so that users
> don't have to fiddle with model specific C state masks.

Performance vs powersave doesn't cover it imo, especially if down
the road more states would appear which can be controlled this way.
I don't think there's a way around providing _some_ way to control
things one a per-state level. When porting this over, I too didn't
like this interface very much, but I had no good replacement idea.

>> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c
>> +++ unstable/xen/arch/x86/cpu/mwait-idle.c
>> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
>>  
>>  static unsigned int mwait_substates;
>>  
>> +/*
>> + * Some platforms come with mutually exclusive C-states, so that if one is
>> + * enabled, the other C-states must not be used. Example: C1 and C1E on
>> + * Sapphire Rapids platform. This parameter allows for selecting the
>> + * preferred C-states among the groups of mutually exclusive C-states - the
>> + * selected C-states will be registered, the other C-states from the mutually
>> + * exclusive group won't be registered. If the platform has no mutually
>> + * exclusive C-states, this parameter has no effect.
>> + */
>> +static unsigned int __ro_after_init preferred_states_mask;
>> +integer_param("preferred-cstates", preferred_states_mask);
>> +
>>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
>>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
>>  static unsigned int lapic_timer_reliable_states = (1 << 1);
>> @@ -96,6 +108,7 @@ struct idle_cpu {
>>  	unsigned long auto_demotion_disable_flags;
>>  	bool byt_auto_demotion_disable_flag;
>>  	bool disable_promotion_to_c1e;
>> +	bool enable_promotion_to_c1e;
> 
> I'm confused by those fields, shouldn't we just have:
> promotion_to_c1e = true | false?
> 
> As one field is the negation of the other:
> enable_promotion_to_c1e = !disable_promotion_to_c1e
> 
> I know this is code from Linux, but would like to understand why two
> fields are needed.

This really is a tristate; Linux is now changing their global variable
to an enum, but we don't have an equivalent of that global variable.

Jan



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-27 13:41     ` Jan Beulich
@ 2022-04-27 15:06       ` Roger Pau Monné
  2022-04-27 15:25         ` Jan Beulich
  0 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2022-04-27 15:06 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Wed, Apr 27, 2022 at 03:41:24PM +0200, Jan Beulich wrote:
> On 27.04.2022 14:45, Roger Pau Monné wrote:
> > On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
> >> From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> >>
> >> On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually
> >> exclusive - only one of them can be enabled. By default, 'intel_idle' driver
> >> enables C1 and disables C1E. However, some users prefer to use C1E instead of
> >> C1, because it saves more energy.
> >>
> >> This patch adds a new module parameter ('preferred_cstates') for enabling C1E
> >> and disabling C1. Here is the idea behind it.
> >>
> >> 1. This option has effect only for "mutually exclusive" C-states like C1 and
> >>    C1E on SPR.
> >> 2. It does not have any effect on independent C-states, which do not require
> >>    other C-states to be disabled (most states on most platforms as of today).
> >> 3. For mutually exclusive C-states, the 'intel_idle' driver always has a
> >>    reasonable default, such as enabling C1 on SPR by default. On other
> >>    platforms, the default may be different.
> >> 4. Users can override the default using the 'preferred_cstates' parameter.
> >> 5. The parameter accepts the preferred C-states bit-mask, similarly to the
> >>    existing 'states_off' parameter.
> >> 6. This parameter is not limited to C1/C1E, and leaves room for supporting
> >>    other mutually exclusive C-states, if they come in the future.
> >>
> >> Today 'intel_idle' can only be compiled-in, which means that on SPR, in order
> >> to disable C1 and enable C1E, users should boot with the following kernel
> >> argument: intel_idle.preferred_cstates=4
> >>
> >> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
> >> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >> Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git da0e58c038e6
> >>
> >> Enable C1E (if requested) not only on the BSP's socket / package.
> > 
> > Maybe we should also add a note here that the command line option for
> > Xen is preferred-cstates instead of intel_idle.preferred_cstates?
> > 
> > I think this is a bad interface however, we should have a more generic
> > option (ie: cstate-mode = 'performance | powersave') so that users
> > don't have to fiddle with model specific C state masks.
> 
> Performance vs powersave doesn't cover it imo, especially if down
> the road more states would appear which can be controlled this way.
> I don't think there's a way around providing _some_ way to control
> things one a per-state level. When porting this over, I too didn't
> like this interface very much, but I had no good replacement idea.

I think it's fine to have this more fine grained control of C states,
but it doesn't seem practical from a user (or distro) PoV.  But then I
also wonder how much of a difference this will make regarding power
consumption.

> >> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c
> >> +++ unstable/xen/arch/x86/cpu/mwait-idle.c
> >> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
> >>  
> >>  static unsigned int mwait_substates;
> >>  
> >> +/*
> >> + * Some platforms come with mutually exclusive C-states, so that if one is
> >> + * enabled, the other C-states must not be used. Example: C1 and C1E on
> >> + * Sapphire Rapids platform. This parameter allows for selecting the
> >> + * preferred C-states among the groups of mutually exclusive C-states - the
> >> + * selected C-states will be registered, the other C-states from the mutually
> >> + * exclusive group won't be registered. If the platform has no mutually
> >> + * exclusive C-states, this parameter has no effect.
> >> + */
> >> +static unsigned int __ro_after_init preferred_states_mask;
> >> +integer_param("preferred-cstates", preferred_states_mask);
> >> +
> >>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
> >>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
> >>  static unsigned int lapic_timer_reliable_states = (1 << 1);
> >> @@ -96,6 +108,7 @@ struct idle_cpu {
> >>  	unsigned long auto_demotion_disable_flags;
> >>  	bool byt_auto_demotion_disable_flag;
> >>  	bool disable_promotion_to_c1e;
> >> +	bool enable_promotion_to_c1e;
> > 
> > I'm confused by those fields, shouldn't we just have:
> > promotion_to_c1e = true | false?
> > 
> > As one field is the negation of the other:
> > enable_promotion_to_c1e = !disable_promotion_to_c1e
> > 
> > I know this is code from Linux, but would like to understand why two
> > fields are needed.
> 
> This really is a tristate; Linux is now changing their global variable
> to an enum, but we don't have an equivalent of that global variable.

So it would be: leave default, disable C1E promotion, enable C1E
promotion.

And Linux is leaving the {disable,enable}_promotion_to_c1e in
idle_cpu?

I guess there's not much we can do unless we want to diverge from
upstream.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-27 15:06       ` Roger Pau Monné
@ 2022-04-27 15:25         ` Jan Beulich
  2022-04-27 16:12           ` Roger Pau Monné
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2022-04-27 15:25 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 27.04.2022 17:06, Roger Pau Monné wrote:
> On Wed, Apr 27, 2022 at 03:41:24PM +0200, Jan Beulich wrote:
>> On 27.04.2022 14:45, Roger Pau Monné wrote:
>>> On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
>>>> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c
>>>> +++ unstable/xen/arch/x86/cpu/mwait-idle.c
>>>> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
>>>>  
>>>>  static unsigned int mwait_substates;
>>>>  
>>>> +/*
>>>> + * Some platforms come with mutually exclusive C-states, so that if one is
>>>> + * enabled, the other C-states must not be used. Example: C1 and C1E on
>>>> + * Sapphire Rapids platform. This parameter allows for selecting the
>>>> + * preferred C-states among the groups of mutually exclusive C-states - the
>>>> + * selected C-states will be registered, the other C-states from the mutually
>>>> + * exclusive group won't be registered. If the platform has no mutually
>>>> + * exclusive C-states, this parameter has no effect.
>>>> + */
>>>> +static unsigned int __ro_after_init preferred_states_mask;
>>>> +integer_param("preferred-cstates", preferred_states_mask);
>>>> +
>>>>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
>>>>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
>>>>  static unsigned int lapic_timer_reliable_states = (1 << 1);
>>>> @@ -96,6 +108,7 @@ struct idle_cpu {
>>>>  	unsigned long auto_demotion_disable_flags;
>>>>  	bool byt_auto_demotion_disable_flag;
>>>>  	bool disable_promotion_to_c1e;
>>>> +	bool enable_promotion_to_c1e;
>>>
>>> I'm confused by those fields, shouldn't we just have:
>>> promotion_to_c1e = true | false?
>>>
>>> As one field is the negation of the other:
>>> enable_promotion_to_c1e = !disable_promotion_to_c1e
>>>
>>> I know this is code from Linux, but would like to understand why two
>>> fields are needed.
>>
>> This really is a tristate; Linux is now changing their global variable
>> to an enum, but we don't have an equivalent of that global variable.
> 
> So it would be: leave default, disable C1E promotion, enable C1E
> promotion.
> 
> And Linux is leaving the {disable,enable}_promotion_to_c1e in
> idle_cpu?

Iirc they only have disable_promotion_to_c1e there (as a struct field)
and keep it, but they convert the similarly named file-scope variable
to a tristate.

> I guess there's not much we can do unless we want to diverge from
> upstream.

We've diverged some from Linux here already - as said, for example we
don't have their file-scope variable. I could convert our struct field
to an enum, but that would be larger code churn for (I think) little
gain.

Jan



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-27 15:25         ` Jan Beulich
@ 2022-04-27 16:12           ` Roger Pau Monné
  2022-04-28  6:37             ` Jan Beulich
  0 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2022-04-27 16:12 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Wed, Apr 27, 2022 at 05:25:35PM +0200, Jan Beulich wrote:
> On 27.04.2022 17:06, Roger Pau Monné wrote:
> > On Wed, Apr 27, 2022 at 03:41:24PM +0200, Jan Beulich wrote:
> >> On 27.04.2022 14:45, Roger Pau Monné wrote:
> >>> On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
> >>>> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c
> >>>> +++ unstable/xen/arch/x86/cpu/mwait-idle.c
> >>>> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
> >>>>  
> >>>>  static unsigned int mwait_substates;
> >>>>  
> >>>> +/*
> >>>> + * Some platforms come with mutually exclusive C-states, so that if one is
> >>>> + * enabled, the other C-states must not be used. Example: C1 and C1E on
> >>>> + * Sapphire Rapids platform. This parameter allows for selecting the
> >>>> + * preferred C-states among the groups of mutually exclusive C-states - the
> >>>> + * selected C-states will be registered, the other C-states from the mutually
> >>>> + * exclusive group won't be registered. If the platform has no mutually
> >>>> + * exclusive C-states, this parameter has no effect.
> >>>> + */
> >>>> +static unsigned int __ro_after_init preferred_states_mask;
> >>>> +integer_param("preferred-cstates", preferred_states_mask);
> >>>> +
> >>>>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
> >>>>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
> >>>>  static unsigned int lapic_timer_reliable_states = (1 << 1);
> >>>> @@ -96,6 +108,7 @@ struct idle_cpu {
> >>>>  	unsigned long auto_demotion_disable_flags;
> >>>>  	bool byt_auto_demotion_disable_flag;
> >>>>  	bool disable_promotion_to_c1e;
> >>>> +	bool enable_promotion_to_c1e;
> >>>
> >>> I'm confused by those fields, shouldn't we just have:
> >>> promotion_to_c1e = true | false?
> >>>
> >>> As one field is the negation of the other:
> >>> enable_promotion_to_c1e = !disable_promotion_to_c1e
> >>>
> >>> I know this is code from Linux, but would like to understand why two
> >>> fields are needed.
> >>
> >> This really is a tristate; Linux is now changing their global variable
> >> to an enum, but we don't have an equivalent of that global variable.
> > 
> > So it would be: leave default, disable C1E promotion, enable C1E
> > promotion.
> > 
> > And Linux is leaving the {disable,enable}_promotion_to_c1e in
> > idle_cpu?
> 
> Iirc they only have disable_promotion_to_c1e there (as a struct field)
> and keep it, but they convert the similarly named file-scope variable
> to a tristate.
> 
> > I guess there's not much we can do unless we want to diverge from
> > upstream.
> 
> We've diverged some from Linux here already - as said, for example we
> don't have their file-scope variable. I could convert our struct field
> to an enum, but that would be larger code churn for (I think) little
> gain.

Hm, OK, could gaining the file scope variable would make sense in order
to reduce divergences?  Or are the other roadblocks there?

I think this is ugly, but would make sense as long as it allows us to
keep closer to upstream.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-27 16:12           ` Roger Pau Monné
@ 2022-04-28  6:37             ` Jan Beulich
  2022-04-28  8:06               ` Roger Pau Monné
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2022-04-28  6:37 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 27.04.2022 18:12, Roger Pau Monné wrote:
> On Wed, Apr 27, 2022 at 05:25:35PM +0200, Jan Beulich wrote:
>> On 27.04.2022 17:06, Roger Pau Monné wrote:
>>> On Wed, Apr 27, 2022 at 03:41:24PM +0200, Jan Beulich wrote:
>>>> On 27.04.2022 14:45, Roger Pau Monné wrote:
>>>>> On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
>>>>>> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c
>>>>>> +++ unstable/xen/arch/x86/cpu/mwait-idle.c
>>>>>> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
>>>>>>  
>>>>>>  static unsigned int mwait_substates;
>>>>>>  
>>>>>> +/*
>>>>>> + * Some platforms come with mutually exclusive C-states, so that if one is
>>>>>> + * enabled, the other C-states must not be used. Example: C1 and C1E on
>>>>>> + * Sapphire Rapids platform. This parameter allows for selecting the
>>>>>> + * preferred C-states among the groups of mutually exclusive C-states - the
>>>>>> + * selected C-states will be registered, the other C-states from the mutually
>>>>>> + * exclusive group won't be registered. If the platform has no mutually
>>>>>> + * exclusive C-states, this parameter has no effect.
>>>>>> + */
>>>>>> +static unsigned int __ro_after_init preferred_states_mask;
>>>>>> +integer_param("preferred-cstates", preferred_states_mask);
>>>>>> +
>>>>>>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
>>>>>>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
>>>>>>  static unsigned int lapic_timer_reliable_states = (1 << 1);
>>>>>> @@ -96,6 +108,7 @@ struct idle_cpu {
>>>>>>  	unsigned long auto_demotion_disable_flags;
>>>>>>  	bool byt_auto_demotion_disable_flag;
>>>>>>  	bool disable_promotion_to_c1e;
>>>>>> +	bool enable_promotion_to_c1e;
>>>>>
>>>>> I'm confused by those fields, shouldn't we just have:
>>>>> promotion_to_c1e = true | false?
>>>>>
>>>>> As one field is the negation of the other:
>>>>> enable_promotion_to_c1e = !disable_promotion_to_c1e
>>>>>
>>>>> I know this is code from Linux, but would like to understand why two
>>>>> fields are needed.
>>>>
>>>> This really is a tristate; Linux is now changing their global variable
>>>> to an enum, but we don't have an equivalent of that global variable.
>>>
>>> So it would be: leave default, disable C1E promotion, enable C1E
>>> promotion.
>>>
>>> And Linux is leaving the {disable,enable}_promotion_to_c1e in
>>> idle_cpu?
>>
>> Iirc they only have disable_promotion_to_c1e there (as a struct field)
>> and keep it, but they convert the similarly named file-scope variable
>> to a tristate.
>>
>>> I guess there's not much we can do unless we want to diverge from
>>> upstream.
>>
>> We've diverged some from Linux here already - as said, for example we
>> don't have their file-scope variable. I could convert our struct field
>> to an enum, but that would be larger code churn for (I think) little
>> gain.
> 
> Hm, OK, could gaining the file scope variable would make sense in order
> to reduce divergences?  Or are the other roadblocks there?

I don't recall. It might have originated from a change I decided to not
port over, or I might have dropped it while porting. To be honest I'm
not keen on putting time into researching this, the more that I would
generally try to avoid static variables.

What I would be willing to put time in is making a more user friendly
command line option, but as said - I can't think of any good alternative
(except perhaps "preferred-cstates=c1e" or "cstates=preferred:c1e", with
internal translation of the strings into a bit mask, as long as (a) you
would think that's an improvement and (b) the further divergence from
Linux is not deemed a problem).

Jan

> I think this is ugly, but would make sense as long as it allows us to
> keep closer to upstream.
> 
> Thanks, Roger.
> 



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-28  6:37             ` Jan Beulich
@ 2022-04-28  8:06               ` Roger Pau Monné
  2022-04-28  8:10                 ` Jan Beulich
  0 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2022-04-28  8:06 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Thu, Apr 28, 2022 at 08:37:58AM +0200, Jan Beulich wrote:
> On 27.04.2022 18:12, Roger Pau Monné wrote:
> > On Wed, Apr 27, 2022 at 05:25:35PM +0200, Jan Beulich wrote:
> >> On 27.04.2022 17:06, Roger Pau Monné wrote:
> >>> On Wed, Apr 27, 2022 at 03:41:24PM +0200, Jan Beulich wrote:
> >>>> On 27.04.2022 14:45, Roger Pau Monné wrote:
> >>>>> On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
> >>>>>> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c
> >>>>>> +++ unstable/xen/arch/x86/cpu/mwait-idle.c
> >>>>>> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
> >>>>>>  
> >>>>>>  static unsigned int mwait_substates;
> >>>>>>  
> >>>>>> +/*
> >>>>>> + * Some platforms come with mutually exclusive C-states, so that if one is
> >>>>>> + * enabled, the other C-states must not be used. Example: C1 and C1E on
> >>>>>> + * Sapphire Rapids platform. This parameter allows for selecting the
> >>>>>> + * preferred C-states among the groups of mutually exclusive C-states - the
> >>>>>> + * selected C-states will be registered, the other C-states from the mutually
> >>>>>> + * exclusive group won't be registered. If the platform has no mutually
> >>>>>> + * exclusive C-states, this parameter has no effect.
> >>>>>> + */
> >>>>>> +static unsigned int __ro_after_init preferred_states_mask;
> >>>>>> +integer_param("preferred-cstates", preferred_states_mask);
> >>>>>> +
> >>>>>>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
> >>>>>>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
> >>>>>>  static unsigned int lapic_timer_reliable_states = (1 << 1);
> >>>>>> @@ -96,6 +108,7 @@ struct idle_cpu {
> >>>>>>  	unsigned long auto_demotion_disable_flags;
> >>>>>>  	bool byt_auto_demotion_disable_flag;
> >>>>>>  	bool disable_promotion_to_c1e;
> >>>>>> +	bool enable_promotion_to_c1e;
> >>>>>
> >>>>> I'm confused by those fields, shouldn't we just have:
> >>>>> promotion_to_c1e = true | false?
> >>>>>
> >>>>> As one field is the negation of the other:
> >>>>> enable_promotion_to_c1e = !disable_promotion_to_c1e
> >>>>>
> >>>>> I know this is code from Linux, but would like to understand why two
> >>>>> fields are needed.
> >>>>
> >>>> This really is a tristate; Linux is now changing their global variable
> >>>> to an enum, but we don't have an equivalent of that global variable.
> >>>
> >>> So it would be: leave default, disable C1E promotion, enable C1E
> >>> promotion.
> >>>
> >>> And Linux is leaving the {disable,enable}_promotion_to_c1e in
> >>> idle_cpu?
> >>
> >> Iirc they only have disable_promotion_to_c1e there (as a struct field)
> >> and keep it, but they convert the similarly named file-scope variable
> >> to a tristate.
> >>
> >>> I guess there's not much we can do unless we want to diverge from
> >>> upstream.
> >>
> >> We've diverged some from Linux here already - as said, for example we
> >> don't have their file-scope variable. I could convert our struct field
> >> to an enum, but that would be larger code churn for (I think) little
> >> gain.
> > 
> > Hm, OK, could gaining the file scope variable would make sense in order
> > to reduce divergences?  Or are the other roadblocks there?
> 
> I don't recall. It might have originated from a change I decided to not
> port over, or I might have dropped it while porting. To be honest I'm
> not keen on putting time into researching this, the more that I would
> generally try to avoid static variables.
> 
> What I would be willing to put time in is making a more user friendly
> command line option, but as said - I can't think of any good alternative
> (except perhaps "preferred-cstates=c1e" or "cstates=preferred:c1e", with
> internal translation of the strings into a bit mask, as long as (a) you
> would think that's an improvement and (b) the further divergence from
> Linux is not deemed a problem).

I think (b) won't be a problem as long as internally the user option
is translated into a bitmask.

Regarding (a) I do think it would be helpful to express this in a more
user friendly way, I'm not sure whether it would make sense to keep
Linux format also for compatibility reasons if users already have a
bitmask and want to use the same parameter for Xen and Linux, ie:

preferred-cstates = <string of c1e,c1,...> | <integer bitmask>

What I think we should fix is the naming of the two booleans:

bool disable_promotion_to_c1e;
bool enable_promotion_to_c1e;

I would rather translated this into an enum, as right now it's
confusing IMO.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument
  2022-04-28  8:06               ` Roger Pau Monné
@ 2022-04-28  8:10                 ` Jan Beulich
  0 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2022-04-28  8:10 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 28.04.2022 10:06, Roger Pau Monné wrote:
> On Thu, Apr 28, 2022 at 08:37:58AM +0200, Jan Beulich wrote:
>> On 27.04.2022 18:12, Roger Pau Monné wrote:
>>> On Wed, Apr 27, 2022 at 05:25:35PM +0200, Jan Beulich wrote:
>>>> On 27.04.2022 17:06, Roger Pau Monné wrote:
>>>>> On Wed, Apr 27, 2022 at 03:41:24PM +0200, Jan Beulich wrote:
>>>>>> On 27.04.2022 14:45, Roger Pau Monné wrote:
>>>>>>> On Tue, Apr 26, 2022 at 12:05:28PM +0200, Jan Beulich wrote:
>>>>>>>> --- unstable.orig/xen/arch/x86/cpu/mwait-idle.c
>>>>>>>> +++ unstable/xen/arch/x86/cpu/mwait-idle.c
>>>>>>>> @@ -82,6 +82,18 @@ boolean_param("mwait-idle", opt_mwait_id
>>>>>>>>  
>>>>>>>>  static unsigned int mwait_substates;
>>>>>>>>  
>>>>>>>> +/*
>>>>>>>> + * Some platforms come with mutually exclusive C-states, so that if one is
>>>>>>>> + * enabled, the other C-states must not be used. Example: C1 and C1E on
>>>>>>>> + * Sapphire Rapids platform. This parameter allows for selecting the
>>>>>>>> + * preferred C-states among the groups of mutually exclusive C-states - the
>>>>>>>> + * selected C-states will be registered, the other C-states from the mutually
>>>>>>>> + * exclusive group won't be registered. If the platform has no mutually
>>>>>>>> + * exclusive C-states, this parameter has no effect.
>>>>>>>> + */
>>>>>>>> +static unsigned int __ro_after_init preferred_states_mask;
>>>>>>>> +integer_param("preferred-cstates", preferred_states_mask);
>>>>>>>> +
>>>>>>>>  #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
>>>>>>>>  /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
>>>>>>>>  static unsigned int lapic_timer_reliable_states = (1 << 1);
>>>>>>>> @@ -96,6 +108,7 @@ struct idle_cpu {
>>>>>>>>  	unsigned long auto_demotion_disable_flags;
>>>>>>>>  	bool byt_auto_demotion_disable_flag;
>>>>>>>>  	bool disable_promotion_to_c1e;
>>>>>>>> +	bool enable_promotion_to_c1e;
>>>>>>>
>>>>>>> I'm confused by those fields, shouldn't we just have:
>>>>>>> promotion_to_c1e = true | false?
>>>>>>>
>>>>>>> As one field is the negation of the other:
>>>>>>> enable_promotion_to_c1e = !disable_promotion_to_c1e
>>>>>>>
>>>>>>> I know this is code from Linux, but would like to understand why two
>>>>>>> fields are needed.
>>>>>>
>>>>>> This really is a tristate; Linux is now changing their global variable
>>>>>> to an enum, but we don't have an equivalent of that global variable.
>>>>>
>>>>> So it would be: leave default, disable C1E promotion, enable C1E
>>>>> promotion.
>>>>>
>>>>> And Linux is leaving the {disable,enable}_promotion_to_c1e in
>>>>> idle_cpu?
>>>>
>>>> Iirc they only have disable_promotion_to_c1e there (as a struct field)
>>>> and keep it, but they convert the similarly named file-scope variable
>>>> to a tristate.
>>>>
>>>>> I guess there's not much we can do unless we want to diverge from
>>>>> upstream.
>>>>
>>>> We've diverged some from Linux here already - as said, for example we
>>>> don't have their file-scope variable. I could convert our struct field
>>>> to an enum, but that would be larger code churn for (I think) little
>>>> gain.
>>>
>>> Hm, OK, could gaining the file scope variable would make sense in order
>>> to reduce divergences?  Or are the other roadblocks there?
>>
>> I don't recall. It might have originated from a change I decided to not
>> port over, or I might have dropped it while porting. To be honest I'm
>> not keen on putting time into researching this, the more that I would
>> generally try to avoid static variables.
>>
>> What I would be willing to put time in is making a more user friendly
>> command line option, but as said - I can't think of any good alternative
>> (except perhaps "preferred-cstates=c1e" or "cstates=preferred:c1e", with
>> internal translation of the strings into a bit mask, as long as (a) you
>> would think that's an improvement and (b) the further divergence from
>> Linux is not deemed a problem).
> 
> I think (b) won't be a problem as long as internally the user option
> is translated into a bitmask.
> 
> Regarding (a) I do think it would be helpful to express this in a more
> user friendly way, I'm not sure whether it would make sense to keep
> Linux format also for compatibility reasons if users already have a
> bitmask and want to use the same parameter for Xen and Linux, ie:
> 
> preferred-cstates = <string of c1e,c1,...> | <integer bitmask>

Yes, I would have been planning to accept both (but probably reject
mixing of both).

> What I think we should fix is the naming of the two booleans:
> 
> bool disable_promotion_to_c1e;
> bool enable_promotion_to_c1e;
> 
> I would rather translated this into an enum, as right now it's
> confusing IMO.

Okay, I can certainly do that. The more that I did consider doing
so already, breaking ties simply based on the "less code churn"
argument.

Jan



^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-04-28  8:11 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-26 10:03 [PATCH 0/4] x86/mwait-idle: (mostly) SPR support Jan Beulich
2022-04-26 10:04 ` [PATCH 1/4] x86/mwait-idle: switch to asm/intel-family.h naming Jan Beulich
2022-04-27 11:21   ` Roger Pau Monné
2022-04-26 10:05 ` [PATCH 2/4] mwait-idle: add SPR support Jan Beulich
2022-04-27 11:39   ` Roger Pau Monné
2022-04-26 10:05 ` [PATCH 3/4] mwait-idle: add 'preferred_cstates' module argument Jan Beulich
2022-04-27 12:45   ` Roger Pau Monné
2022-04-27 13:41     ` Jan Beulich
2022-04-27 15:06       ` Roger Pau Monné
2022-04-27 15:25         ` Jan Beulich
2022-04-27 16:12           ` Roger Pau Monné
2022-04-28  6:37             ` Jan Beulich
2022-04-28  8:06               ` Roger Pau Monné
2022-04-28  8:10                 ` Jan Beulich
2022-04-26 10:06 ` [PATCH 4/4] mwait-idle: add core C6 optimization for SPR Jan Beulich
2022-04-27 12:56   ` Roger Pau Monné

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.