All of lore.kernel.org
 help / color / mirror / Atom feed
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
To: rjw@rjwysocki.net
Cc: linux-pm@vger.kernel.org,
	Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Subject: [PATCH] cpufreq: intel_pstate: Enforce _PPC limits
Date: Mon,  4 Apr 2016 16:22:30 -0700	[thread overview]
Message-ID: <1459812150-9111-1-git-send-email-srinivas.pandruvada@linux.intel.com> (raw)

Use ACPI _PPC notification to limit max P state driver will request.
ACPI _PPC change notification is sent by BIOS to limit max P state
in several cases:
- Reduce impact of platform thermal condition
- When Config TDP feature is used, a changed _PPC is sent to
follow TDP change
- Remote node managers in server want to control platform power
via baseboard management controller (BMC)

This change registers with ACPI processor performance lib so that
_PPC changes are notified to cpufreq core, which in turns will
result in call to .setpolicy() callback. Also the way _PSS
table identifies a turbo frequency is not compatible to max turbo
frequency in intel_pstate, so the very first entry in _PSS needs
to be adjusted.

When config TDP feature is on the turbo activation ratio can be less than
max physical non turbo P state. In this case _PPC is set to turbo
activation ratio + 1. In this case we don't need to treat this as the
reduced frequency in set_policy callback, as this is still in the turbo
range. So we set the policy->max to actual policy->cpuinfo.max_freq.
It avoid showing reduced P States max_perf_pct in intel P state sysfs,
when the _PPC is still in turbo range.

This feature can be turned on by using kernel parameters:
intel_pstate=acpi_ppc

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 Documentation/kernel-parameters.txt |   2 +
 drivers/cpufreq/Kconfig.x86         |   1 +
 drivers/cpufreq/intel_pstate.c      | 161 +++++++++++++++++++++++++++++++++++-
 3 files changed, 162 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ecc74fa..b7714bf 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1661,6 +1661,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 		hwp_only
 			Only load intel_pstate on systems which support
 			hardware P state control (HWP) if available.
+		acpi_ppc
+			Enforce ACPI _PPC performance limits.
 
 	intremap=	[X86-64, Intel-IOMMU]
 			on	enable Interrupt Remapping (default)
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index c59bdcb..adbd1de 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -5,6 +5,7 @@
 config X86_INTEL_PSTATE
        bool "Intel P state control"
        depends on X86
+       select ACPI_PROCESSOR if ACPI
        help
           This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 8b5a415..b10ea73 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -39,6 +39,10 @@
 #define ATOM_TURBO_RATIOS	0x66c
 #define ATOM_TURBO_VIDS		0x66d
 
+#if IS_ENABLED(CONFIG_ACPI)
+#include <acpi/processor.h>
+#endif
+
 #define FRAC_BITS 8
 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
 #define fp_toint(X) ((X) >> FRAC_BITS)
@@ -190,6 +194,10 @@ struct cpudata {
 	u64	prev_tsc;
 	u64	prev_cummulative_iowait;
 	struct sample sample;
+#if IS_ENABLED(CONFIG_ACPI)
+	struct acpi_processor_performance acpi_perf_data;
+	bool valid_pss_table;
+#endif
 };
 
 static struct cpudata **all_cpu_data;
@@ -257,7 +265,7 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
 static struct pstate_adjust_policy pid_params;
 static struct pstate_funcs pstate_funcs;
 static int hwp_active;
-
+static int acpi_ppc;
 
 /**
  * struct perf_limits - Store user and policy limits
@@ -331,6 +339,117 @@ static struct perf_limits *limits = &performance_limits;
 static struct perf_limits *limits = &powersave_limits;
 #endif
 
+#if IS_ENABLED(CONFIG_ACPI)
+/*
+ * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and
+ * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and
+ * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state
+ * ratio, out of it only high 8 bits are used. For example 0x1700 is setting
+ * target ratio 0x17. The _PSS control value stores in a format which can be
+ * directly written to PERF_CTL MSR. But in intel_pstate driver this shift
+ * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()).
+ * This function converts the _PSS control value to intel pstate driver format
+ * for comparison and assignment.
+ */
+static int convert_to_native_pstate_format(struct cpudata *cpu, int index)
+{
+	return cpu->acpi_perf_data.states[index].control >> 8;
+}
+
+static int intel_pstate_init_perf_limits(struct cpufreq_policy *policy)
+{
+	struct cpudata *cpu;
+	int turbo_pss_ctl;
+	int ret;
+	int i;
+
+	cpu = all_cpu_data[policy->cpu];
+
+	if (!cpu->acpi_perf_data.shared_cpu_map &&
+	    zalloc_cpumask_var_node(&cpu->acpi_perf_data.shared_cpu_map,
+				    GFP_KERNEL, cpu_to_node(policy->cpu))) {
+		return -ENOMEM;
+	}
+
+	ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
+						  policy->cpu);
+	if (ret)
+		return ret;
+
+	/*
+	 * Check if the control value in _PSS is for PERF_CTL MSR, which should
+	 * guarantee that the states returned by it map to the states in our
+	 * list directly.
+	 */
+	if (cpu->acpi_perf_data.control_register.space_id !=
+						ACPI_ADR_SPACE_FIXED_HARDWARE)
+		goto unreg_perf;
+
+	/*
+	 * If there is only one entry _PSS, simply ignore _PSS and continue as
+	 * usual without taking _PSS into account
+	 */
+	if (cpu->acpi_perf_data.state_count < 2)
+		goto unreg_perf;
+
+	pr_debug("intel_pstate: CPU%u - ACPI _PSS perf data\n", policy->cpu);
+	for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
+		pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
+			 (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
+			 (u32) cpu->acpi_perf_data.states[i].core_frequency,
+			 (u32) cpu->acpi_perf_data.states[i].power,
+			 (u32) cpu->acpi_perf_data.states[i].control);
+	}
+
+	/*
+	 * The _PSS table doesn't contain whole turbo frequency range.
+	 * This just contains +1 MHZ above the max non turbo frequency,
+	 * with control value corresponding to max turbo ratio. But
+	 * when cpufreq set policy is called, it will call with this
+	 * max frequency, which will cause a reduced performance as
+	 * this driver uses real max turbo frequency as the max
+	 * frequeny. So correct this frequency in _PSS table to
+	 * correct max turbo frequency based on the turbo ratio.
+	 * Also need to convert to MHz as _PSS freq is in MHz.
+	 */
+	turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0);
+	if (turbo_pss_ctl > cpu->pstate.max_pstate)
+		cpu->acpi_perf_data.states[0].core_frequency =
+					policy->cpuinfo.max_freq / 1000;
+	cpu->valid_pss_table = true;
+	pr_info("intel_pstate: _PPC limits will be enforced\n");
+
+	return 0;
+unreg_perf:
+	cpu->valid_pss_table = false;
+	acpi_processor_unregister_performance(policy->cpu);
+	return -EINVAL;
+}
+
+static int intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+{
+	struct cpudata *cpu;
+
+	cpu = all_cpu_data[policy->cpu];
+	if (!acpi_ppc || !cpu->valid_pss_table)
+		return 0;
+
+	acpi_processor_unregister_performance(policy->cpu);
+	return 0;
+}
+
+#else
+static int intel_pstate_init_perf_limits(struct cpufreq_policy *policy)
+{
+	return 0;
+}
+
+static int intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+{
+	return 0;
+}
+#endif
+
 static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
 			     int deadband, int integral) {
 	pid->setpoint = int_tofp(setpoint);
@@ -1297,6 +1416,30 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 
 	intel_pstate_clear_update_util_hook(policy->cpu);
 
+	if (acpi_ppc) {
+		struct cpudata *cpu;
+
+		/*
+		 * If the platform has config TDP feature, then to indicate
+		 * start of turbo range _PPC is set to one more than the turbo
+		 * activation ratio, which is cpu->pstate.max_pstate. Here the
+		 * updated frequency corresponding to _PPC is reflected in
+		 * policy->max. This  means that this _PPC setting still
+		 * allowing system to reach policy->cpuinfo.max_freq anyway as
+		 * this is turbo range.
+		 * In this case showing restricted limits in intel_pstate
+		 * sysfs or setting limits->max_perf to a lower value has
+		 * no meaning.
+		 */
+		cpu = all_cpu_data[0];
+		if (policy->max < policy->cpuinfo.max_freq &&
+		    policy->max > (cpu->pstate.max_pstate *
+					cpu->pstate.scaling)) {
+			pr_info("intel_pstate: _PPC > Max non Turbo P_state\n");
+			policy->max = policy->cpuinfo.max_freq;
+		}
+	}
+
 	if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
 		limits = &performance_limits;
 		if (policy->max >= policy->cpuinfo.max_freq) {
@@ -1392,18 +1535,30 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
 	policy->cpuinfo.max_freq =
 		cpu->pstate.turbo_pstate * cpu->pstate.scaling;
+	if (acpi_ppc)
+		intel_pstate_init_perf_limits(policy);
+	/*
+	 * If there is no acpi perf data or error, we ignore and use Intel P
+	 * state calculated limits, So this is not fatal error.
+	 */
 	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
 	cpumask_set_cpu(policy->cpu, policy->cpus);
 
 	return 0;
 }
 
+static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+{
+	return intel_pstate_exit_perf_limits(policy);
+}
+
 static struct cpufreq_driver intel_pstate_driver = {
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.verify		= intel_pstate_verify_policy,
 	.setpolicy	= intel_pstate_set_policy,
 	.get		= intel_pstate_get,
 	.init		= intel_pstate_cpu_init,
+	.exit		= intel_pstate_cpu_exit,
 	.stop_cpu	= intel_pstate_stop_cpu,
 	.name		= "intel_pstate",
 };
@@ -1448,7 +1603,6 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs)
 }
 
 #if IS_ENABLED(CONFIG_ACPI)
-#include <acpi/processor.h>
 
 static bool intel_pstate_no_acpi_pss(void)
 {
@@ -1654,6 +1808,9 @@ static int __init intel_pstate_setup(char *str)
 		force_load = 1;
 	if (!strcmp(str, "hwp_only"))
 		hwp_only = 1;
+	if (!strcmp(str, "acpi_ppc"))
+		acpi_ppc = 1;
+
 	return 0;
 }
 early_param("intel_pstate", intel_pstate_setup);
-- 
2.4.3


             reply	other threads:[~2016-04-04 23:22 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-04-04 23:22 Srinivas Pandruvada [this message]
2016-04-20  1:20 ` [PATCH] cpufreq: intel_pstate: Enforce _PPC limits Rafael J. Wysocki
2016-04-20 20:11   ` Srinivas Pandruvada
2016-04-20 14:02 ` Konstantin Khlebnikov
2016-04-20 17:28   ` Srinivas Pandruvada

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1459812150-9111-1-git-send-email-srinivas.pandruvada@linux.intel.com \
    --to=srinivas.pandruvada@linux.intel.com \
    --cc=linux-pm@vger.kernel.org \
    --cc=rjw@rjwysocki.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.