linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] platform/x86: Add support for Uncore frequency control
@ 2019-12-14  0:14 Srinivas Pandruvada
  2019-12-14  0:14 ` [PATCH 2/2] admin guide/pm: Admin guide for Intel Uncore Frequency limits Srinivas Pandruvada
  2019-12-18 16:51 ` [PATCH 1/2] platform/x86: Add support for Uncore frequency control Andy Shevchenko
  0 siblings, 2 replies; 5+ messages in thread
From: Srinivas Pandruvada @ 2019-12-14  0:14 UTC (permalink / raw)
  To: dvhart, andy, corbet
  Cc: linux-doc, platform-driver-x86, linux-kernel, lenb, rjw,
	Srinivas Pandruvada

Some server users set limits on the uncore frequency using MSR 620H, while
running latency sensitive workloads. Here uncore frequency controls
RING/LLC(last-level cache) clocks.

But MSR control is not always possible from the user space, so this driver
provides a sysfs interface to set max and min frequency limits. This MSR
620H is a die scoped in multi-die system or package scoped in non multi-die
systems.

When this driver is loaded, a new directory is created under
 /sys/devices/system/cpu.

For example on a two package Skylake server:
$cd /sys/devices/system/cpu/intel_uncore_frequency

$ls
package_00_die_00 package_01_die_00

$ls package_00_die_00
max_freq_khz  min_freq_khz  power_up_max_freq_khz
power_up_min_freq_khz

$grep . *
    max_freq_khz:2400000
    min_freq_khz:1200000
    power_up_max_freq_khz:2400000
    power_up_min_freq_khz:1200000

Here, power_up_max_freq_khz and power_up_min_freq_khz are read only
attributes to show power up values of max and min frequencies respectively.
Other attributes are read-write, so that users can modify.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 drivers/platform/x86/Kconfig                  |  11 +
 drivers/platform/x86/Makefile                 |   1 +
 drivers/platform/x86/intel-uncore-frequency.c | 434 ++++++++++++++++++
 3 files changed, 446 insertions(+)
 create mode 100644 drivers/platform/x86/intel-uncore-frequency.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 27d5b40fb717..6013c3b96cfd 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1337,6 +1337,17 @@ config PCENGINES_APU2
 	  To compile this driver as a module, choose M here: the module
 	  will be called pcengines-apuv2.
 
+config INTEL_UNCORE_FREQ_CONTROL
+	tristate "Intel Uncore frequency control driver"
+	depends on X86_64
+	help
+	  This driver allows control of uncore frequency limits on
+	  supported server platforms.
+	  Uncore frequency controls RING/LLC (last-level cache) clocks.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel-uncore-frequency.
+
 source "drivers/platform/x86/intel_speed_select_if/Kconfig"
 
 config SYSTEM76_ACPI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 42d85a00be4e..3747b1f07cf1 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -105,3 +105,4 @@ obj-$(CONFIG_INTEL_ATOMISP2_PM)	+= intel_atomisp2_pm.o
 obj-$(CONFIG_PCENGINES_APU2)	+= pcengines-apuv2.o
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/
 obj-$(CONFIG_SYSTEM76_ACPI)	+= system76_acpi.o
+obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL)	+= intel-uncore-frequency.o
diff --git a/drivers/platform/x86/intel-uncore-frequency.c b/drivers/platform/x86/intel-uncore-frequency.c
new file mode 100644
index 000000000000..82ee5a3107cf
--- /dev/null
+++ b/drivers/platform/x86/intel-uncore-frequency.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Uncore Frequency Setting
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Provide interface to set MSR 620 at a granularity of per die. On CPU online,
+ * one control CPU is identified per die to read/write limit. This control CPU
+ * is changed, if the CPU state is changed to offline. When the last CPU is
+ * offline in a die then remove the sysfs object for that die.
+ * The majority of actual code is related to sysfs create and read/write
+ * attributes.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define MSR_UNCORE_RATIO_LIMIT			0x620
+#define UNCORE_FREQ_KHZ_MULTIPLIER		100000
+
+/**
+ * struct uncore_data -	Encapsulate all uncore data
+ * @stored_uncore_data:	Last user changed MSR 620 value, which will be restored
+ *			on system resume.
+ * @power_up_min_freq_khz: Sampled minimum uncore frequency at driver init
+ * @power_up_max_freq_khz: Sampled maximum uncore frequency at driver init
+ * @control_cpu:	Designated CPU for a die to read/write
+ * @valid:		Mark the data valid/invalid
+ *
+ * This structure is used to encapsulate all data related to uncore sysfs
+ * settings for a die/package.
+ */
+struct uncore_data {
+	struct kobject kobj;
+	u64 stored_uncore_data;
+	u32 power_up_min_freq_khz;
+	u32 power_up_max_freq_khz;
+	int control_cpu;
+	bool valid;
+};
+
+#define to_uncore_data(a) container_of(a, struct uncore_data, kobj)
+
+/* Max instances for uncore data, one for each die */
+static int uncore_max_entries __read_mostly;
+/* Storage for uncore data for all instances */
+static struct uncore_data *uncore_instances;
+/* Root of the all uncore sysfs kobjs */
+struct kobject uncore_root_kobj;
+/* Stores the CPU mask of the target CPUs to use during uncore read/write */
+static cpumask_t uncore_cpu_mask;
+/* CPU online callback register instance */
+static enum cpuhp_state uncore_hp_state __read_mostly;
+/* Mutex to control all mutual exclusions */
+static DEFINE_MUTEX(uncore_lock);
+
+struct uncore_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct kobject *kobj,
+			struct attribute *attr, char *buf);
+	ssize_t (*store)(struct kobject *kobj,
+			 struct attribute *attr, const char *c, ssize_t count);
+};
+
+#define define_one_uncore_ro(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define define_one_uncore_rw(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+#define show_uncore_data(member_name)					\
+	static ssize_t show_##member_name(struct kobject *kobj,         \
+					  struct attribute *attr,	\
+					  char *buf)			\
+	{                                                               \
+		struct uncore_data *data = to_uncore_data(kobj);	\
+		return scnprintf(buf, PAGE_SIZE, "%u\n",		\
+				 data->member_name);			\
+	}								\
+	define_one_uncore_ro(member_name)
+
+show_uncore_data(power_up_min_freq_khz);
+show_uncore_data(power_up_max_freq_khz);
+
+/* Common function to read MSR 0x620 and read min/max */
+static int uncore_read_ratio(struct uncore_data *data, unsigned int *min,
+			     unsigned int *max)
+{
+	u64 cap;
+	int ret;
+
+	ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
+	if (ret)
+		return ret;
+
+	*max = (cap & 0x7F) * UNCORE_FREQ_KHZ_MULTIPLIER;
+	*min = ((cap & GENMASK(14, 8)) >> 8) * UNCORE_FREQ_KHZ_MULTIPLIER;
+
+	return 0;
+}
+
+/* Common function to set min/max ratios to be used by sysfs callbacks */
+static int uncore_write_ratio(struct uncore_data *data, unsigned int input,
+			      int set_max)
+{
+	int ret;
+	u64 cap;
+
+	mutex_lock(&uncore_lock);
+
+	input /= UNCORE_FREQ_KHZ_MULTIPLIER;
+	if (!input || input > 0x7F) {
+		ret = -EINVAL;
+		goto finish_write;
+	}
+
+	rdmsrl(MSR_UNCORE_RATIO_LIMIT, cap);
+	if (set_max) {
+		cap &= ~0x7F;
+		cap |= input;
+	} else  {
+		cap &= ~GENMASK(14, 8);
+		cap |= (input << 8);
+	}
+
+	ret = wrmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, cap);
+	if (ret)
+		goto finish_write;
+
+	data->stored_uncore_data = cap;
+
+finish_write:
+	mutex_unlock(&uncore_lock);
+
+	return ret;
+}
+
+static ssize_t store_min_max_freq_khz(struct kobject *kobj,
+				      struct attribute *attr,
+				      const char *buf, ssize_t count,
+				      int min_max)
+{
+	struct uncore_data *data = to_uncore_data(kobj);
+	unsigned int input;
+
+	if (kstrtouint(buf, 10, &input))
+		return -EINVAL;
+
+	uncore_write_ratio(data, input, min_max);
+
+	return count;
+}
+
+static ssize_t show_min_max_freq_khz(struct kobject *kobj,
+				     struct attribute *attr,
+				     char *buf, int min_max)
+{
+	struct uncore_data *data = to_uncore_data(kobj);
+	unsigned int min, max;
+	int ret;
+
+	mutex_lock(&uncore_lock);
+	ret = uncore_read_ratio(data, &min, &max);
+	mutex_unlock(&uncore_lock);
+	if (ret)
+		return ret;
+
+	if (min_max)
+		return sprintf(buf, "%u\n", max);
+
+	return sprintf(buf, "%u\n", min);
+}
+
+#define store_uncore_min_max(name, min_max)				\
+	static ssize_t store_##name(struct kobject *kobj,		\
+				    struct attribute *attr,		\
+				    const char *buf, ssize_t count)	\
+	{                                                               \
+									\
+		return store_min_max_freq_khz(kobj, attr, buf, count,	\
+					      min_max);			\
+	}
+
+#define show_uncore_min_max(name, min_max)				\
+	static ssize_t show_##name(struct kobject *kobj,		\
+				   struct attribute *attr, char *buf)	\
+	{                                                               \
+									\
+		return show_min_max_freq_khz(kobj, attr, buf, min_max); \
+	}
+
+store_uncore_min_max(min_freq_khz, 0);
+store_uncore_min_max(max_freq_khz, 1);
+
+show_uncore_min_max(min_freq_khz, 0);
+show_uncore_min_max(max_freq_khz, 1);
+
+define_one_uncore_rw(min_freq_khz);
+define_one_uncore_rw(max_freq_khz);
+
+static struct attribute *uncore_attrs[] = {
+	&power_up_min_freq_khz.attr,
+	&power_up_max_freq_khz.attr,
+	&max_freq_khz.attr,
+	&min_freq_khz.attr,
+	NULL
+};
+
+static struct kobj_type uncore_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_attrs = uncore_attrs,
+};
+
+static struct kobj_type uncore_root_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+};
+
+/* Caller provides protection */
+static struct uncore_data *uncore_get_instance(unsigned int cpu)
+{
+	int id = topology_logical_die_id(cpu);
+
+	if (id >= 0 && id < uncore_max_entries)
+		return &uncore_instances[id];
+
+	return NULL;
+}
+
+static void uncore_add_die_entry(int cpu)
+{
+	struct uncore_data *data;
+
+	mutex_lock(&uncore_lock);
+	data = uncore_get_instance(cpu);
+	if (!data) {
+		mutex_unlock(&uncore_lock);
+		return;
+	}
+
+	if (data->valid) {
+		/* control cpu changed */
+		data->control_cpu = cpu;
+	} else {
+		char str[64];
+		int ret;
+
+		memset(data, 0, sizeof(*data));
+		sprintf(str, "package_%02d_die_%02d",
+			topology_physical_package_id(cpu),
+			topology_die_id(cpu));
+
+		uncore_read_ratio(data, &data->power_up_min_freq_khz,
+				  &data->power_up_max_freq_khz);
+
+		ret = kobject_init_and_add(&data->kobj, &uncore_ktype,
+					   &uncore_root_kobj, str);
+		if (!ret) {
+			data->control_cpu = cpu;
+			data->valid = true;
+		}
+	}
+	mutex_unlock(&uncore_lock);
+}
+
+/* Last CPU in this die is offline, so remove sysfs entries */
+static void uncore_remove_die_entry(int cpu)
+{
+	struct uncore_data *data;
+
+	mutex_lock(&uncore_lock);
+	data = uncore_get_instance(cpu);
+	if (data) {
+		kobject_put(&data->kobj);
+		data->control_cpu = -1;
+		data->valid = false;
+	}
+	mutex_unlock(&uncore_lock);
+}
+
+static int uncore_event_cpu_online(unsigned int cpu)
+{
+	int target;
+
+	/* Check if there is an online cpu in the package for uncore MSR */
+	target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu));
+	if (target < nr_cpu_ids)
+		return 0;
+
+	/* Use this CPU on this die as a control CPU */
+	cpumask_set_cpu(cpu, &uncore_cpu_mask);
+	uncore_add_die_entry(cpu);
+
+	return 0;
+}
+
+static int uncore_event_cpu_offline(unsigned int cpu)
+{
+	int target;
+
+	/* Check if existing cpu is used for uncore MSRs */
+	if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+		return 0;
+
+	/* Find a new cpu to set uncore MSR */
+	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+
+	if (target < nr_cpu_ids) {
+		cpumask_set_cpu(target, &uncore_cpu_mask);
+		uncore_add_die_entry(target);
+	} else {
+		uncore_remove_die_entry(cpu);
+	}
+
+	return 0;
+}
+
+static int uncore_pm_notify(struct notifier_block *nb, unsigned long mode,
+			    void *_unused)
+{
+	int cpu;
+
+	switch (mode) {
+	case PM_POST_HIBERNATION:
+	case PM_POST_RESTORE:
+	case PM_POST_SUSPEND:
+		for_each_cpu(cpu, &uncore_cpu_mask) {
+			struct uncore_data *data;
+			int ret;
+
+			data = uncore_get_instance(cpu);
+			if (!data || !data->valid || !data->stored_uncore_data)
+				continue;
+
+			ret = wrmsrl_on_cpu(cpu, MSR_UNCORE_RATIO_LIMIT,
+					    data->stored_uncore_data);
+			if (ret)
+				return ret;
+		}
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+static struct notifier_block uncore_pm_nb = {
+	.notifier_call = uncore_pm_notify,
+};
+
+#define ICPU(model)     { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
+
+static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
+	ICPU(INTEL_FAM6_BROADWELL_G),
+	ICPU(INTEL_FAM6_BROADWELL_X),
+	ICPU(INTEL_FAM6_BROADWELL_D),
+	ICPU(INTEL_FAM6_SKYLAKE_X),
+	ICPU(INTEL_FAM6_ICELAKE_X),
+	ICPU(INTEL_FAM6_ICELAKE_D),
+	{}
+};
+
+static int __init intel_uncore_init(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	id = x86_match_cpu(intel_uncore_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	uncore_max_entries = topology_max_packages() *
+					topology_max_die_per_package();
+	uncore_instances = kcalloc(uncore_max_entries,
+				   sizeof(*uncore_instances), GFP_KERNEL);
+	if (!uncore_instances)
+		return -ENOMEM;
+
+	ret = kobject_init_and_add(&uncore_root_kobj, &uncore_root_ktype,
+				   &cpu_subsys.dev_root->kobj,
+				   "intel_uncore_frequency");
+	if (ret)
+		goto err_free;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				"platform/x86/uncore-freq:online",
+				uncore_event_cpu_online,
+				uncore_event_cpu_offline);
+	if (ret < 0)
+		goto err_rem_kobj;
+
+	uncore_hp_state = ret;
+
+	ret = register_pm_notifier(&uncore_pm_nb);
+	if (ret)
+		goto err_rem_state;
+
+	return 0;
+
+err_rem_state:
+	cpuhp_remove_state(uncore_hp_state);
+err_rem_kobj:
+	kobject_put(&uncore_root_kobj);
+err_free:
+	kfree(uncore_instances);
+
+	return ret;
+}
+module_init(intel_uncore_init)
+
+static void __exit intel_uncore_exit(void)
+{
+	int i;
+
+	unregister_pm_notifier(&uncore_pm_nb);
+	cpuhp_remove_state(uncore_hp_state);
+	for (i = 0; i < uncore_max_entries; ++i) {
+		if (uncore_instances[i].valid)
+			kobject_put(&uncore_instances[i].kobj);
+	}
+	kobject_put(&uncore_root_kobj);
+	kfree(uncore_instances);
+}
+module_exit(intel_uncore_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel Uncore Frequency Limits Driver");
-- 
2.17.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] admin guide/pm: Admin guide for Intel Uncore Frequency limits
  2019-12-14  0:14 [PATCH 1/2] platform/x86: Add support for Uncore frequency control Srinivas Pandruvada
@ 2019-12-14  0:14 ` Srinivas Pandruvada
  2019-12-18 16:50   ` Andy Shevchenko
  2019-12-18 16:51 ` [PATCH 1/2] platform/x86: Add support for Uncore frequency control Andy Shevchenko
  1 sibling, 1 reply; 5+ messages in thread
From: Srinivas Pandruvada @ 2019-12-14  0:14 UTC (permalink / raw)
  To: dvhart, andy, corbet
  Cc: linux-doc, platform-driver-x86, linux-kernel, lenb, rjw,
	Srinivas Pandruvada

Added documentation for the attributes to control uncore frequency
selection.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 Documentation/admin-guide/pm/intel_uncore.rst | 23 +++++++++++++++++++
 .../admin-guide/pm/working-state.rst          |  1 +
 2 files changed, 24 insertions(+)
 create mode 100644 Documentation/admin-guide/pm/intel_uncore.rst

diff --git a/Documentation/admin-guide/pm/intel_uncore.rst b/Documentation/admin-guide/pm/intel_uncore.rst
new file mode 100644
index 000000000000..0986d9ab59d6
--- /dev/null
+++ b/Documentation/admin-guide/pm/intel_uncore.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================================
+Intel® Uncore Frequency Selection
+=========================================================
+
+The uncore frequency in the Intel(R) hardware is selected based on internal heuristics, which uses the current selected performance state and various system power constraints. In majority of the cases this selection is the most optimal, so there is no need for placing external constraints from the Operating System.
+
+But there are some customers who wants less jitters from dynamic uncore frequency selection. For them, power saving is much lower priority than consistent performance. Currently these customers uses MSR 0x620, to place hard limits on the maximum and the minimum uncore frequency. They can now use Linux sysfs to place these limits and also have additional capability to place hard limits under power constraint scenario.
+
+The Uncore frequency section attributes are present under "/sys/devices/system/cpu/intel_uncore_frequency".
+The scope of these attributes is per die in multi-die systems or package wide in non multi-die systems. There is a unique folder for each die or package. For example:
+"package_00_die_00" for package 0 and die 0.
+
+Attributes:
+
+power_up_max_freq_khz (READ ONLY): This is the power up value of the maximum uncore frequency in KHz. This is sampled during the driver initialization time. This is not the absolute maximum uncore frequency as there is no capability for the Operating System to read that. This can be used as a reference to roll back settings once user changed this limit.
+
+power_up_min_freq_khz (READ ONLY): This is the power up value of the minimum uncore frequency in KHz. This is not the absolute minimum uncore frequency. This can be used as a reference to roll back settings once user changed this limit.
+
+max_freq_khz (READ, WRITE): This presents current maximum uncore frequency. User can modify this attribute to change to a new maximum uncore frequency in KHz.
+
+min_freq_khz (READ, WRITE): This presents current minimum uncore frequency. User can modify this attribute to change to a new minimum uncore frequency in KHz.
diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst
index fc298eb1234b..15094cf0a234 100644
--- a/Documentation/admin-guide/pm/working-state.rst
+++ b/Documentation/admin-guide/pm/working-state.rst
@@ -11,3 +11,4 @@ Working-State Power Management
    cpufreq
    intel_pstate
    intel_epb
+   intel_uncore
-- 
2.17.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] admin guide/pm: Admin guide for Intel Uncore Frequency limits
  2019-12-14  0:14 ` [PATCH 2/2] admin guide/pm: Admin guide for Intel Uncore Frequency limits Srinivas Pandruvada
@ 2019-12-18 16:50   ` Andy Shevchenko
  0 siblings, 0 replies; 5+ messages in thread
From: Andy Shevchenko @ 2019-12-18 16:50 UTC (permalink / raw)
  To: Srinivas Pandruvada
  Cc: Darren Hart, Andy Shevchenko, Jonathan Corbet,
	Linux Documentation List, Platform Driver,
	Linux Kernel Mailing List, Len Brown, Rafael J. Wysocki

On Sat, Dec 14, 2019 at 2:15 AM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> Added documentation for the attributes to control uncore frequency
> selection.
>

I need either Ack for this one, or take mine for patch 1.

> Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> ---
>  Documentation/admin-guide/pm/intel_uncore.rst | 23 +++++++++++++++++++
>  .../admin-guide/pm/working-state.rst          |  1 +
>  2 files changed, 24 insertions(+)
>  create mode 100644 Documentation/admin-guide/pm/intel_uncore.rst
>
> diff --git a/Documentation/admin-guide/pm/intel_uncore.rst b/Documentation/admin-guide/pm/intel_uncore.rst
> new file mode 100644
> index 000000000000..0986d9ab59d6
> --- /dev/null
> +++ b/Documentation/admin-guide/pm/intel_uncore.rst
> @@ -0,0 +1,23 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=========================================================
> +Intel® Uncore Frequency Selection
> +=========================================================
> +
> +The uncore frequency in the Intel(R) hardware is selected based on internal heuristics, which uses the current selected performance state and various system power constraints. In majority of the cases this selection is the most optimal, so there is no need for placing external constraints from the Operating System.
> +
> +But there are some customers who wants less jitters from dynamic uncore frequency selection. For them, power saving is much lower priority than consistent performance. Currently these customers uses MSR 0x620, to place hard limits on the maximum and the minimum uncore frequency. They can now use Linux sysfs to place these limits and also have additional capability to place hard limits under power constraint scenario.
> +
> +The Uncore frequency section attributes are present under "/sys/devices/system/cpu/intel_uncore_frequency".
> +The scope of these attributes is per die in multi-die systems or package wide in non multi-die systems. There is a unique folder for each die or package. For example:
> +"package_00_die_00" for package 0 and die 0.
> +
> +Attributes:
> +
> +power_up_max_freq_khz (READ ONLY): This is the power up value of the maximum uncore frequency in KHz. This is sampled during the driver initialization time. This is not the absolute maximum uncore frequency as there is no capability for the Operating System to read that. This can be used as a reference to roll back settings once user changed this limit.
> +
> +power_up_min_freq_khz (READ ONLY): This is the power up value of the minimum uncore frequency in KHz. This is not the absolute minimum uncore frequency. This can be used as a reference to roll back settings once user changed this limit.
> +
> +max_freq_khz (READ, WRITE): This presents current maximum uncore frequency. User can modify this attribute to change to a new maximum uncore frequency in KHz.
> +
> +min_freq_khz (READ, WRITE): This presents current minimum uncore frequency. User can modify this attribute to change to a new minimum uncore frequency in KHz.
> diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst
> index fc298eb1234b..15094cf0a234 100644
> --- a/Documentation/admin-guide/pm/working-state.rst
> +++ b/Documentation/admin-guide/pm/working-state.rst
> @@ -11,3 +11,4 @@ Working-State Power Management
>     cpufreq
>     intel_pstate
>     intel_epb
> +   intel_uncore
> --
> 2.17.2
>


-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] platform/x86: Add support for Uncore frequency control
  2019-12-14  0:14 [PATCH 1/2] platform/x86: Add support for Uncore frequency control Srinivas Pandruvada
  2019-12-14  0:14 ` [PATCH 2/2] admin guide/pm: Admin guide for Intel Uncore Frequency limits Srinivas Pandruvada
@ 2019-12-18 16:51 ` Andy Shevchenko
  2019-12-23 15:50   ` Srinivas Pandruvada
  1 sibling, 1 reply; 5+ messages in thread
From: Andy Shevchenko @ 2019-12-18 16:51 UTC (permalink / raw)
  To: Srinivas Pandruvada
  Cc: Darren Hart, Andy Shevchenko, Jonathan Corbet,
	Linux Documentation List, Platform Driver,
	Linux Kernel Mailing List, Len Brown, Rafael J. Wysocki

On Sat, Dec 14, 2019 at 2:15 AM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> Some server users set limits on the uncore frequency using MSR 620H, while
> running latency sensitive workloads. Here uncore frequency controls
> RING/LLC(last-level cache) clocks.
>
> But MSR control is not always possible from the user space, so this driver
> provides a sysfs interface to set max and min frequency limits. This MSR
> 620H is a die scoped in multi-die system or package scoped in non multi-die
> systems.
>
> When this driver is loaded, a new directory is created under
>  /sys/devices/system/cpu.
>
> For example on a two package Skylake server:
> $cd /sys/devices/system/cpu/intel_uncore_frequency
>
> $ls
> package_00_die_00 package_01_die_00
>
> $ls package_00_die_00
> max_freq_khz  min_freq_khz  power_up_max_freq_khz
> power_up_min_freq_khz
>
> $grep . *
>     max_freq_khz:2400000
>     min_freq_khz:1200000
>     power_up_max_freq_khz:2400000
>     power_up_min_freq_khz:1200000
>
> Here, power_up_max_freq_khz and power_up_min_freq_khz are read only
> attributes to show power up values of max and min frequencies respectively.
> Other attributes are read-write, so that users can modify.
>

Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
in case it goes thru different (doc?) tree.

> Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> ---
>  drivers/platform/x86/Kconfig                  |  11 +
>  drivers/platform/x86/Makefile                 |   1 +
>  drivers/platform/x86/intel-uncore-frequency.c | 434 ++++++++++++++++++
>  3 files changed, 446 insertions(+)
>  create mode 100644 drivers/platform/x86/intel-uncore-frequency.c
>
> diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
> index 27d5b40fb717..6013c3b96cfd 100644
> --- a/drivers/platform/x86/Kconfig
> +++ b/drivers/platform/x86/Kconfig
> @@ -1337,6 +1337,17 @@ config PCENGINES_APU2
>           To compile this driver as a module, choose M here: the module
>           will be called pcengines-apuv2.
>
> +config INTEL_UNCORE_FREQ_CONTROL
> +       tristate "Intel Uncore frequency control driver"
> +       depends on X86_64
> +       help
> +         This driver allows control of uncore frequency limits on
> +         supported server platforms.
> +         Uncore frequency controls RING/LLC (last-level cache) clocks.
> +
> +         To compile this driver as a module, choose M here: the module
> +         will be called intel-uncore-frequency.
> +
>  source "drivers/platform/x86/intel_speed_select_if/Kconfig"
>
>  config SYSTEM76_ACPI
> diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
> index 42d85a00be4e..3747b1f07cf1 100644
> --- a/drivers/platform/x86/Makefile
> +++ b/drivers/platform/x86/Makefile
> @@ -105,3 +105,4 @@ obj-$(CONFIG_INTEL_ATOMISP2_PM)     += intel_atomisp2_pm.o
>  obj-$(CONFIG_PCENGINES_APU2)   += pcengines-apuv2.o
>  obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/
>  obj-$(CONFIG_SYSTEM76_ACPI)    += system76_acpi.o
> +obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL)        += intel-uncore-frequency.o
> diff --git a/drivers/platform/x86/intel-uncore-frequency.c b/drivers/platform/x86/intel-uncore-frequency.c
> new file mode 100644
> index 000000000000..82ee5a3107cf
> --- /dev/null
> +++ b/drivers/platform/x86/intel-uncore-frequency.c
> @@ -0,0 +1,434 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Intel Uncore Frequency Setting
> + * Copyright (c) 2019, Intel Corporation.
> + * All rights reserved.
> + *
> + * Provide interface to set MSR 620 at a granularity of per die. On CPU online,
> + * one control CPU is identified per die to read/write limit. This control CPU
> + * is changed, if the CPU state is changed to offline. When the last CPU is
> + * offline in a die then remove the sysfs object for that die.
> + * The majority of actual code is related to sysfs create and read/write
> + * attributes.
> + *
> + * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> + */
> +
> +#include <linux/cpu.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/suspend.h>
> +#include <asm/cpu_device_id.h>
> +#include <asm/intel-family.h>
> +
> +#define MSR_UNCORE_RATIO_LIMIT                 0x620
> +#define UNCORE_FREQ_KHZ_MULTIPLIER             100000
> +
> +/**
> + * struct uncore_data -        Encapsulate all uncore data
> + * @stored_uncore_data:        Last user changed MSR 620 value, which will be restored
> + *                     on system resume.
> + * @power_up_min_freq_khz: Sampled minimum uncore frequency at driver init
> + * @power_up_max_freq_khz: Sampled maximum uncore frequency at driver init
> + * @control_cpu:       Designated CPU for a die to read/write
> + * @valid:             Mark the data valid/invalid
> + *
> + * This structure is used to encapsulate all data related to uncore sysfs
> + * settings for a die/package.
> + */
> +struct uncore_data {
> +       struct kobject kobj;
> +       u64 stored_uncore_data;
> +       u32 power_up_min_freq_khz;
> +       u32 power_up_max_freq_khz;
> +       int control_cpu;
> +       bool valid;
> +};
> +
> +#define to_uncore_data(a) container_of(a, struct uncore_data, kobj)
> +
> +/* Max instances for uncore data, one for each die */
> +static int uncore_max_entries __read_mostly;
> +/* Storage for uncore data for all instances */
> +static struct uncore_data *uncore_instances;
> +/* Root of the all uncore sysfs kobjs */
> +struct kobject uncore_root_kobj;
> +/* Stores the CPU mask of the target CPUs to use during uncore read/write */
> +static cpumask_t uncore_cpu_mask;
> +/* CPU online callback register instance */
> +static enum cpuhp_state uncore_hp_state __read_mostly;
> +/* Mutex to control all mutual exclusions */
> +static DEFINE_MUTEX(uncore_lock);
> +
> +struct uncore_attr {
> +       struct attribute attr;
> +       ssize_t (*show)(struct kobject *kobj,
> +                       struct attribute *attr, char *buf);
> +       ssize_t (*store)(struct kobject *kobj,
> +                        struct attribute *attr, const char *c, ssize_t count);
> +};
> +
> +#define define_one_uncore_ro(_name) \
> +static struct uncore_attr _name = \
> +__ATTR(_name, 0444, show_##_name, NULL)
> +
> +#define define_one_uncore_rw(_name) \
> +static struct uncore_attr _name = \
> +__ATTR(_name, 0644, show_##_name, store_##_name)
> +
> +#define show_uncore_data(member_name)                                  \
> +       static ssize_t show_##member_name(struct kobject *kobj,         \
> +                                         struct attribute *attr,       \
> +                                         char *buf)                    \
> +       {                                                               \
> +               struct uncore_data *data = to_uncore_data(kobj);        \
> +               return scnprintf(buf, PAGE_SIZE, "%u\n",                \
> +                                data->member_name);                    \
> +       }                                                               \
> +       define_one_uncore_ro(member_name)
> +
> +show_uncore_data(power_up_min_freq_khz);
> +show_uncore_data(power_up_max_freq_khz);
> +
> +/* Common function to read MSR 0x620 and read min/max */
> +static int uncore_read_ratio(struct uncore_data *data, unsigned int *min,
> +                            unsigned int *max)
> +{
> +       u64 cap;
> +       int ret;
> +
> +       ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
> +       if (ret)
> +               return ret;
> +
> +       *max = (cap & 0x7F) * UNCORE_FREQ_KHZ_MULTIPLIER;
> +       *min = ((cap & GENMASK(14, 8)) >> 8) * UNCORE_FREQ_KHZ_MULTIPLIER;
> +
> +       return 0;
> +}
> +
> +/* Common function to set min/max ratios to be used by sysfs callbacks */
> +static int uncore_write_ratio(struct uncore_data *data, unsigned int input,
> +                             int set_max)
> +{
> +       int ret;
> +       u64 cap;
> +
> +       mutex_lock(&uncore_lock);
> +
> +       input /= UNCORE_FREQ_KHZ_MULTIPLIER;
> +       if (!input || input > 0x7F) {
> +               ret = -EINVAL;
> +               goto finish_write;
> +       }
> +
> +       rdmsrl(MSR_UNCORE_RATIO_LIMIT, cap);
> +       if (set_max) {
> +               cap &= ~0x7F;
> +               cap |= input;
> +       } else  {
> +               cap &= ~GENMASK(14, 8);
> +               cap |= (input << 8);
> +       }
> +
> +       ret = wrmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, cap);
> +       if (ret)
> +               goto finish_write;
> +
> +       data->stored_uncore_data = cap;
> +
> +finish_write:
> +       mutex_unlock(&uncore_lock);
> +
> +       return ret;
> +}
> +
> +static ssize_t store_min_max_freq_khz(struct kobject *kobj,
> +                                     struct attribute *attr,
> +                                     const char *buf, ssize_t count,
> +                                     int min_max)
> +{
> +       struct uncore_data *data = to_uncore_data(kobj);
> +       unsigned int input;
> +
> +       if (kstrtouint(buf, 10, &input))
> +               return -EINVAL;
> +
> +       uncore_write_ratio(data, input, min_max);
> +
> +       return count;
> +}
> +
> +static ssize_t show_min_max_freq_khz(struct kobject *kobj,
> +                                    struct attribute *attr,
> +                                    char *buf, int min_max)
> +{
> +       struct uncore_data *data = to_uncore_data(kobj);
> +       unsigned int min, max;
> +       int ret;
> +
> +       mutex_lock(&uncore_lock);
> +       ret = uncore_read_ratio(data, &min, &max);
> +       mutex_unlock(&uncore_lock);
> +       if (ret)
> +               return ret;
> +
> +       if (min_max)
> +               return sprintf(buf, "%u\n", max);
> +
> +       return sprintf(buf, "%u\n", min);
> +}
> +
> +#define store_uncore_min_max(name, min_max)                            \
> +       static ssize_t store_##name(struct kobject *kobj,               \
> +                                   struct attribute *attr,             \
> +                                   const char *buf, ssize_t count)     \
> +       {                                                               \
> +                                                                       \
> +               return store_min_max_freq_khz(kobj, attr, buf, count,   \
> +                                             min_max);                 \
> +       }
> +
> +#define show_uncore_min_max(name, min_max)                             \
> +       static ssize_t show_##name(struct kobject *kobj,                \
> +                                  struct attribute *attr, char *buf)   \
> +       {                                                               \
> +                                                                       \
> +               return show_min_max_freq_khz(kobj, attr, buf, min_max); \
> +       }
> +
> +store_uncore_min_max(min_freq_khz, 0);
> +store_uncore_min_max(max_freq_khz, 1);
> +
> +show_uncore_min_max(min_freq_khz, 0);
> +show_uncore_min_max(max_freq_khz, 1);
> +
> +define_one_uncore_rw(min_freq_khz);
> +define_one_uncore_rw(max_freq_khz);
> +
> +static struct attribute *uncore_attrs[] = {
> +       &power_up_min_freq_khz.attr,
> +       &power_up_max_freq_khz.attr,
> +       &max_freq_khz.attr,
> +       &min_freq_khz.attr,
> +       NULL
> +};
> +
> +static struct kobj_type uncore_ktype = {
> +       .sysfs_ops = &kobj_sysfs_ops,
> +       .default_attrs = uncore_attrs,
> +};
> +
> +static struct kobj_type uncore_root_ktype = {
> +       .sysfs_ops = &kobj_sysfs_ops,
> +};
> +
> +/* Caller provides protection */
> +static struct uncore_data *uncore_get_instance(unsigned int cpu)
> +{
> +       int id = topology_logical_die_id(cpu);
> +
> +       if (id >= 0 && id < uncore_max_entries)
> +               return &uncore_instances[id];
> +
> +       return NULL;
> +}
> +
> +static void uncore_add_die_entry(int cpu)
> +{
> +       struct uncore_data *data;
> +
> +       mutex_lock(&uncore_lock);
> +       data = uncore_get_instance(cpu);
> +       if (!data) {
> +               mutex_unlock(&uncore_lock);
> +               return;
> +       }
> +
> +       if (data->valid) {
> +               /* control cpu changed */
> +               data->control_cpu = cpu;
> +       } else {
> +               char str[64];
> +               int ret;
> +
> +               memset(data, 0, sizeof(*data));
> +               sprintf(str, "package_%02d_die_%02d",
> +                       topology_physical_package_id(cpu),
> +                       topology_die_id(cpu));
> +
> +               uncore_read_ratio(data, &data->power_up_min_freq_khz,
> +                                 &data->power_up_max_freq_khz);
> +
> +               ret = kobject_init_and_add(&data->kobj, &uncore_ktype,
> +                                          &uncore_root_kobj, str);
> +               if (!ret) {
> +                       data->control_cpu = cpu;
> +                       data->valid = true;
> +               }
> +       }
> +       mutex_unlock(&uncore_lock);
> +}
> +
> +/* Last CPU in this die is offline, so remove sysfs entries */
> +static void uncore_remove_die_entry(int cpu)
> +{
> +       struct uncore_data *data;
> +
> +       mutex_lock(&uncore_lock);
> +       data = uncore_get_instance(cpu);
> +       if (data) {
> +               kobject_put(&data->kobj);
> +               data->control_cpu = -1;
> +               data->valid = false;
> +       }
> +       mutex_unlock(&uncore_lock);
> +}
> +
> +static int uncore_event_cpu_online(unsigned int cpu)
> +{
> +       int target;
> +
> +       /* Check if there is an online cpu in the package for uncore MSR */
> +       target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu));
> +       if (target < nr_cpu_ids)
> +               return 0;
> +
> +       /* Use this CPU on this die as a control CPU */
> +       cpumask_set_cpu(cpu, &uncore_cpu_mask);
> +       uncore_add_die_entry(cpu);
> +
> +       return 0;
> +}
> +
> +static int uncore_event_cpu_offline(unsigned int cpu)
> +{
> +       int target;
> +
> +       /* Check if existing cpu is used for uncore MSRs */
> +       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
> +               return 0;
> +
> +       /* Find a new cpu to set uncore MSR */
> +       target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
> +
> +       if (target < nr_cpu_ids) {
> +               cpumask_set_cpu(target, &uncore_cpu_mask);
> +               uncore_add_die_entry(target);
> +       } else {
> +               uncore_remove_die_entry(cpu);
> +       }
> +
> +       return 0;
> +}
> +
> +static int uncore_pm_notify(struct notifier_block *nb, unsigned long mode,
> +                           void *_unused)
> +{
> +       int cpu;
> +
> +       switch (mode) {
> +       case PM_POST_HIBERNATION:
> +       case PM_POST_RESTORE:
> +       case PM_POST_SUSPEND:
> +               for_each_cpu(cpu, &uncore_cpu_mask) {
> +                       struct uncore_data *data;
> +                       int ret;
> +
> +                       data = uncore_get_instance(cpu);
> +                       if (!data || !data->valid || !data->stored_uncore_data)
> +                               continue;
> +
> +                       ret = wrmsrl_on_cpu(cpu, MSR_UNCORE_RATIO_LIMIT,
> +                                           data->stored_uncore_data);
> +                       if (ret)
> +                               return ret;
> +               }
> +               break;
> +       default:
> +               break;
> +       }
> +       return 0;
> +}
> +
> +static struct notifier_block uncore_pm_nb = {
> +       .notifier_call = uncore_pm_notify,
> +};
> +
> +#define ICPU(model)     { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
> +
> +static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
> +       ICPU(INTEL_FAM6_BROADWELL_G),
> +       ICPU(INTEL_FAM6_BROADWELL_X),
> +       ICPU(INTEL_FAM6_BROADWELL_D),
> +       ICPU(INTEL_FAM6_SKYLAKE_X),
> +       ICPU(INTEL_FAM6_ICELAKE_X),
> +       ICPU(INTEL_FAM6_ICELAKE_D),
> +       {}
> +};
> +
> +static int __init intel_uncore_init(void)
> +{
> +       const struct x86_cpu_id *id;
> +       int ret;
> +
> +       id = x86_match_cpu(intel_uncore_cpu_ids);
> +       if (!id)
> +               return -ENODEV;
> +
> +       uncore_max_entries = topology_max_packages() *
> +                                       topology_max_die_per_package();
> +       uncore_instances = kcalloc(uncore_max_entries,
> +                                  sizeof(*uncore_instances), GFP_KERNEL);
> +       if (!uncore_instances)
> +               return -ENOMEM;
> +
> +       ret = kobject_init_and_add(&uncore_root_kobj, &uncore_root_ktype,
> +                                  &cpu_subsys.dev_root->kobj,
> +                                  "intel_uncore_frequency");
> +       if (ret)
> +               goto err_free;
> +
> +       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> +                               "platform/x86/uncore-freq:online",
> +                               uncore_event_cpu_online,
> +                               uncore_event_cpu_offline);
> +       if (ret < 0)
> +               goto err_rem_kobj;
> +
> +       uncore_hp_state = ret;
> +
> +       ret = register_pm_notifier(&uncore_pm_nb);
> +       if (ret)
> +               goto err_rem_state;
> +
> +       return 0;
> +
> +err_rem_state:
> +       cpuhp_remove_state(uncore_hp_state);
> +err_rem_kobj:
> +       kobject_put(&uncore_root_kobj);
> +err_free:
> +       kfree(uncore_instances);
> +
> +       return ret;
> +}
> +module_init(intel_uncore_init)
> +
> +static void __exit intel_uncore_exit(void)
> +{
> +       int i;
> +
> +       unregister_pm_notifier(&uncore_pm_nb);
> +       cpuhp_remove_state(uncore_hp_state);
> +       for (i = 0; i < uncore_max_entries; ++i) {
> +               if (uncore_instances[i].valid)
> +                       kobject_put(&uncore_instances[i].kobj);
> +       }
> +       kobject_put(&uncore_root_kobj);
> +       kfree(uncore_instances);
> +}
> +module_exit(intel_uncore_exit)
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("Intel Uncore Frequency Limits Driver");
> --
> 2.17.2
>


-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] platform/x86: Add support for Uncore frequency control
  2019-12-18 16:51 ` [PATCH 1/2] platform/x86: Add support for Uncore frequency control Andy Shevchenko
@ 2019-12-23 15:50   ` Srinivas Pandruvada
  0 siblings, 0 replies; 5+ messages in thread
From: Srinivas Pandruvada @ 2019-12-23 15:50 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Darren Hart, Andy Shevchenko, Jonathan Corbet,
	Linux Documentation List, Platform Driver,
	Linux Kernel Mailing List, Len Brown, Rafael J. Wysocki

On Wed, 2019-12-18 at 18:51 +0200, Andy Shevchenko wrote:
> On Sat, Dec 14, 2019 at 2:15 AM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> > 
> > Some server users set limits on the uncore frequency using MSR
> > 620H, while
> > running latency sensitive workloads. Here uncore frequency controls
> > RING/LLC(last-level cache) clocks.
> > 
> > But MSR control is not always possible from the user space, so this
> > driver
> > provides a sysfs interface to set max and min frequency limits.
> > This MSR
> > 620H is a die scoped in multi-die system or package scoped in non
> > multi-die
> > systems.
> > 
> > When this driver is loaded, a new directory is created under
> >  /sys/devices/system/cpu.
> > 
> > For example on a two package Skylake server:
> > $cd /sys/devices/system/cpu/intel_uncore_frequency
> > 
> > $ls
> > package_00_die_00 package_01_die_00
> > 
> > $ls package_00_die_00
> > max_freq_khz  min_freq_khz  power_up_max_freq_khz
> > power_up_min_freq_khz
> > 
> > $grep . *
> >     max_freq_khz:2400000
> >     min_freq_khz:1200000
> >     power_up_max_freq_khz:2400000
> >     power_up_min_freq_khz:1200000
> > 
> > Here, power_up_max_freq_khz and power_up_min_freq_khz are read only
> > attributes to show power up values of max and min frequencies
> > respectively.
> > Other attributes are read-write, so that users can modify.
> > 
> 
> Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
> in case it goes thru different (doc?) tree.
I can submit separate patches for the documentation, so that these
patches can be submitted via two different trees.

Thanks,
Srinivas


> 
> > Signed-off-by: Srinivas Pandruvada <
> > srinivas.pandruvada@linux.intel.com>
> > ---
> >  drivers/platform/x86/Kconfig                  |  11 +
> >  drivers/platform/x86/Makefile                 |   1 +
> >  drivers/platform/x86/intel-uncore-frequency.c | 434
> > ++++++++++++++++++
> >  3 files changed, 446 insertions(+)
> >  create mode 100644 drivers/platform/x86/intel-uncore-frequency.c
> > 
> > diff --git a/drivers/platform/x86/Kconfig
> > b/drivers/platform/x86/Kconfig
> > index 27d5b40fb717..6013c3b96cfd 100644
> > --- a/drivers/platform/x86/Kconfig
> > +++ b/drivers/platform/x86/Kconfig
> > @@ -1337,6 +1337,17 @@ config PCENGINES_APU2
> >           To compile this driver as a module, choose M here: the
> > module
> >           will be called pcengines-apuv2.
> > 
> > +config INTEL_UNCORE_FREQ_CONTROL
> > +       tristate "Intel Uncore frequency control driver"
> > +       depends on X86_64
> > +       help
> > +         This driver allows control of uncore frequency limits on
> > +         supported server platforms.
> > +         Uncore frequency controls RING/LLC (last-level cache)
> > clocks.
> > +
> > +         To compile this driver as a module, choose M here: the
> > module
> > +         will be called intel-uncore-frequency.
> > +
> >  source "drivers/platform/x86/intel_speed_select_if/Kconfig"
> > 
> >  config SYSTEM76_ACPI
> > diff --git a/drivers/platform/x86/Makefile
> > b/drivers/platform/x86/Makefile
> > index 42d85a00be4e..3747b1f07cf1 100644
> > --- a/drivers/platform/x86/Makefile
> > +++ b/drivers/platform/x86/Makefile
> > @@ -105,3 +105,4 @@ obj-$(CONFIG_INTEL_ATOMISP2_PM)     +=
> > intel_atomisp2_pm.o
> >  obj-$(CONFIG_PCENGINES_APU2)   += pcengines-apuv2.o
> >  obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) +=
> > intel_speed_select_if/
> >  obj-$(CONFIG_SYSTEM76_ACPI)    += system76_acpi.o
> > +obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL)        += intel-uncore-
> > frequency.o
> > diff --git a/drivers/platform/x86/intel-uncore-frequency.c
> > b/drivers/platform/x86/intel-uncore-frequency.c
> > new file mode 100644
> > index 000000000000..82ee5a3107cf
> > --- /dev/null
> > +++ b/drivers/platform/x86/intel-uncore-frequency.c
> > @@ -0,0 +1,434 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Intel Uncore Frequency Setting
> > + * Copyright (c) 2019, Intel Corporation.
> > + * All rights reserved.
> > + *
> > + * Provide interface to set MSR 620 at a granularity of per die.
> > On CPU online,
> > + * one control CPU is identified per die to read/write limit. This
> > control CPU
> > + * is changed, if the CPU state is changed to offline. When the
> > last CPU is
> > + * offline in a die then remove the sysfs object for that die.
> > + * The majority of actual code is related to sysfs create and
> > read/write
> > + * attributes.
> > + *
> > + * Author: Srinivas Pandruvada <
> > srinivas.pandruvada@linux.intel.com>
> > + */
> > +
> > +#include <linux/cpu.h>
> > +#include <linux/module.h>
> > +#include <linux/slab.h>
> > +#include <linux/suspend.h>
> > +#include <asm/cpu_device_id.h>
> > +#include <asm/intel-family.h>
> > +
> > +#define MSR_UNCORE_RATIO_LIMIT                 0x620
> > +#define UNCORE_FREQ_KHZ_MULTIPLIER             100000
> > +
> > +/**
> > + * struct uncore_data -        Encapsulate all uncore data
> > + * @stored_uncore_data:        Last user changed MSR 620 value,
> > which will be restored
> > + *                     on system resume.
> > + * @power_up_min_freq_khz: Sampled minimum uncore frequency at
> > driver init
> > + * @power_up_max_freq_khz: Sampled maximum uncore frequency at
> > driver init
> > + * @control_cpu:       Designated CPU for a die to read/write
> > + * @valid:             Mark the data valid/invalid
> > + *
> > + * This structure is used to encapsulate all data related to
> > uncore sysfs
> > + * settings for a die/package.
> > + */
> > +struct uncore_data {
> > +       struct kobject kobj;
> > +       u64 stored_uncore_data;
> > +       u32 power_up_min_freq_khz;
> > +       u32 power_up_max_freq_khz;
> > +       int control_cpu;
> > +       bool valid;
> > +};
> > +
> > +#define to_uncore_data(a) container_of(a, struct uncore_data,
> > kobj)
> > +
> > +/* Max instances for uncore data, one for each die */
> > +static int uncore_max_entries __read_mostly;
> > +/* Storage for uncore data for all instances */
> > +static struct uncore_data *uncore_instances;
> > +/* Root of the all uncore sysfs kobjs */
> > +struct kobject uncore_root_kobj;
> > +/* Stores the CPU mask of the target CPUs to use during uncore
> > read/write */
> > +static cpumask_t uncore_cpu_mask;
> > +/* CPU online callback register instance */
> > +static enum cpuhp_state uncore_hp_state __read_mostly;
> > +/* Mutex to control all mutual exclusions */
> > +static DEFINE_MUTEX(uncore_lock);
> > +
> > +struct uncore_attr {
> > +       struct attribute attr;
> > +       ssize_t (*show)(struct kobject *kobj,
> > +                       struct attribute *attr, char *buf);
> > +       ssize_t (*store)(struct kobject *kobj,
> > +                        struct attribute *attr, const char *c,
> > ssize_t count);
> > +};
> > +
> > +#define define_one_uncore_ro(_name) \
> > +static struct uncore_attr _name = \
> > +__ATTR(_name, 0444, show_##_name, NULL)
> > +
> > +#define define_one_uncore_rw(_name) \
> > +static struct uncore_attr _name = \
> > +__ATTR(_name, 0644, show_##_name, store_##_name)
> > +
> > +#define
> > show_uncore_data(member_name)                                  \
> > +       static ssize_t show_##member_name(struct kobject
> > *kobj,         \
> > +                                         struct attribute
> > *attr,       \
> > +                                         char
> > *buf)                    \
> > +       {                                                          
> >      \
> > +               struct uncore_data *data =
> > to_uncore_data(kobj);        \
> > +               return scnprintf(buf, PAGE_SIZE,
> > "%u\n",                \
> > +                                data-
> > >member_name);                    \
> > +       }                                                          
> >      \
> > +       define_one_uncore_ro(member_name)
> > +
> > +show_uncore_data(power_up_min_freq_khz);
> > +show_uncore_data(power_up_max_freq_khz);
> > +
> > +/* Common function to read MSR 0x620 and read min/max */
> > +static int uncore_read_ratio(struct uncore_data *data, unsigned
> > int *min,
> > +                            unsigned int *max)
> > +{
> > +       u64 cap;
> > +       int ret;
> > +
> > +       ret = rdmsrl_on_cpu(data->control_cpu,
> > MSR_UNCORE_RATIO_LIMIT, &cap);
> > +       if (ret)
> > +               return ret;
> > +
> > +       *max = (cap & 0x7F) * UNCORE_FREQ_KHZ_MULTIPLIER;
> > +       *min = ((cap & GENMASK(14, 8)) >> 8) *
> > UNCORE_FREQ_KHZ_MULTIPLIER;
> > +
> > +       return 0;
> > +}
> > +
> > +/* Common function to set min/max ratios to be used by sysfs
> > callbacks */
> > +static int uncore_write_ratio(struct uncore_data *data, unsigned
> > int input,
> > +                             int set_max)
> > +{
> > +       int ret;
> > +       u64 cap;
> > +
> > +       mutex_lock(&uncore_lock);
> > +
> > +       input /= UNCORE_FREQ_KHZ_MULTIPLIER;
> > +       if (!input || input > 0x7F) {
> > +               ret = -EINVAL;
> > +               goto finish_write;
> > +       }
> > +
> > +       rdmsrl(MSR_UNCORE_RATIO_LIMIT, cap);
> > +       if (set_max) {
> > +               cap &= ~0x7F;
> > +               cap |= input;
> > +       } else  {
> > +               cap &= ~GENMASK(14, 8);
> > +               cap |= (input << 8);
> > +       }
> > +
> > +       ret = wrmsrl_on_cpu(data->control_cpu,
> > MSR_UNCORE_RATIO_LIMIT, cap);
> > +       if (ret)
> > +               goto finish_write;
> > +
> > +       data->stored_uncore_data = cap;
> > +
> > +finish_write:
> > +       mutex_unlock(&uncore_lock);
> > +
> > +       return ret;
> > +}
> > +
> > +static ssize_t store_min_max_freq_khz(struct kobject *kobj,
> > +                                     struct attribute *attr,
> > +                                     const char *buf, ssize_t
> > count,
> > +                                     int min_max)
> > +{
> > +       struct uncore_data *data = to_uncore_data(kobj);
> > +       unsigned int input;
> > +
> > +       if (kstrtouint(buf, 10, &input))
> > +               return -EINVAL;
> > +
> > +       uncore_write_ratio(data, input, min_max);
> > +
> > +       return count;
> > +}
> > +
> > +static ssize_t show_min_max_freq_khz(struct kobject *kobj,
> > +                                    struct attribute *attr,
> > +                                    char *buf, int min_max)
> > +{
> > +       struct uncore_data *data = to_uncore_data(kobj);
> > +       unsigned int min, max;
> > +       int ret;
> > +
> > +       mutex_lock(&uncore_lock);
> > +       ret = uncore_read_ratio(data, &min, &max);
> > +       mutex_unlock(&uncore_lock);
> > +       if (ret)
> > +               return ret;
> > +
> > +       if (min_max)
> > +               return sprintf(buf, "%u\n", max);
> > +
> > +       return sprintf(buf, "%u\n", min);
> > +}
> > +
> > +#define store_uncore_min_max(name,
> > min_max)                            \
> > +       static ssize_t store_##name(struct kobject
> > *kobj,               \
> > +                                   struct attribute
> > *attr,             \
> > +                                   const char *buf, ssize_t
> > count)     \
> > +       {                                                          
> >      \
> > +                                                                  
> >      \
> > +               return store_min_max_freq_khz(kobj, attr, buf,
> > count,   \
> > +                                             min_max);            
> >      \
> > +       }
> > +
> > +#define show_uncore_min_max(name,
> > min_max)                             \
> > +       static ssize_t show_##name(struct kobject
> > *kobj,                \
> > +                                  struct attribute *attr, char
> > *buf)   \
> > +       {                                                          
> >      \
> > +                                                                  
> >      \
> > +               return show_min_max_freq_khz(kobj, attr, buf,
> > min_max); \
> > +       }
> > +
> > +store_uncore_min_max(min_freq_khz, 0);
> > +store_uncore_min_max(max_freq_khz, 1);
> > +
> > +show_uncore_min_max(min_freq_khz, 0);
> > +show_uncore_min_max(max_freq_khz, 1);
> > +
> > +define_one_uncore_rw(min_freq_khz);
> > +define_one_uncore_rw(max_freq_khz);
> > +
> > +static struct attribute *uncore_attrs[] = {
> > +       &power_up_min_freq_khz.attr,
> > +       &power_up_max_freq_khz.attr,
> > +       &max_freq_khz.attr,
> > +       &min_freq_khz.attr,
> > +       NULL
> > +};
> > +
> > +static struct kobj_type uncore_ktype = {
> > +       .sysfs_ops = &kobj_sysfs_ops,
> > +       .default_attrs = uncore_attrs,
> > +};
> > +
> > +static struct kobj_type uncore_root_ktype = {
> > +       .sysfs_ops = &kobj_sysfs_ops,
> > +};
> > +
> > +/* Caller provides protection */
> > +static struct uncore_data *uncore_get_instance(unsigned int cpu)
> > +{
> > +       int id = topology_logical_die_id(cpu);
> > +
> > +       if (id >= 0 && id < uncore_max_entries)
> > +               return &uncore_instances[id];
> > +
> > +       return NULL;
> > +}
> > +
> > +static void uncore_add_die_entry(int cpu)
> > +{
> > +       struct uncore_data *data;
> > +
> > +       mutex_lock(&uncore_lock);
> > +       data = uncore_get_instance(cpu);
> > +       if (!data) {
> > +               mutex_unlock(&uncore_lock);
> > +               return;
> > +       }
> > +
> > +       if (data->valid) {
> > +               /* control cpu changed */
> > +               data->control_cpu = cpu;
> > +       } else {
> > +               char str[64];
> > +               int ret;
> > +
> > +               memset(data, 0, sizeof(*data));
> > +               sprintf(str, "package_%02d_die_%02d",
> > +                       topology_physical_package_id(cpu),
> > +                       topology_die_id(cpu));
> > +
> > +               uncore_read_ratio(data, &data-
> > >power_up_min_freq_khz,
> > +                                 &data->power_up_max_freq_khz);
> > +
> > +               ret = kobject_init_and_add(&data->kobj,
> > &uncore_ktype,
> > +                                          &uncore_root_kobj, str);
> > +               if (!ret) {
> > +                       data->control_cpu = cpu;
> > +                       data->valid = true;
> > +               }
> > +       }
> > +       mutex_unlock(&uncore_lock);
> > +}
> > +
> > +/* Last CPU in this die is offline, so remove sysfs entries */
> > +static void uncore_remove_die_entry(int cpu)
> > +{
> > +       struct uncore_data *data;
> > +
> > +       mutex_lock(&uncore_lock);
> > +       data = uncore_get_instance(cpu);
> > +       if (data) {
> > +               kobject_put(&data->kobj);
> > +               data->control_cpu = -1;
> > +               data->valid = false;
> > +       }
> > +       mutex_unlock(&uncore_lock);
> > +}
> > +
> > +static int uncore_event_cpu_online(unsigned int cpu)
> > +{
> > +       int target;
> > +
> > +       /* Check if there is an online cpu in the package for
> > uncore MSR */
> > +       target = cpumask_any_and(&uncore_cpu_mask,
> > topology_die_cpumask(cpu));
> > +       if (target < nr_cpu_ids)
> > +               return 0;
> > +
> > +       /* Use this CPU on this die as a control CPU */
> > +       cpumask_set_cpu(cpu, &uncore_cpu_mask);
> > +       uncore_add_die_entry(cpu);
> > +
> > +       return 0;
> > +}
> > +
> > +static int uncore_event_cpu_offline(unsigned int cpu)
> > +{
> > +       int target;
> > +
> > +       /* Check if existing cpu is used for uncore MSRs */
> > +       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
> > +               return 0;
> > +
> > +       /* Find a new cpu to set uncore MSR */
> > +       target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
> > +
> > +       if (target < nr_cpu_ids) {
> > +               cpumask_set_cpu(target, &uncore_cpu_mask);
> > +               uncore_add_die_entry(target);
> > +       } else {
> > +               uncore_remove_die_entry(cpu);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static int uncore_pm_notify(struct notifier_block *nb, unsigned
> > long mode,
> > +                           void *_unused)
> > +{
> > +       int cpu;
> > +
> > +       switch (mode) {
> > +       case PM_POST_HIBERNATION:
> > +       case PM_POST_RESTORE:
> > +       case PM_POST_SUSPEND:
> > +               for_each_cpu(cpu, &uncore_cpu_mask) {
> > +                       struct uncore_data *data;
> > +                       int ret;
> > +
> > +                       data = uncore_get_instance(cpu);
> > +                       if (!data || !data->valid || !data-
> > >stored_uncore_data)
> > +                               continue;
> > +
> > +                       ret = wrmsrl_on_cpu(cpu,
> > MSR_UNCORE_RATIO_LIMIT,
> > +                                           data-
> > >stored_uncore_data);
> > +                       if (ret)
> > +                               return ret;
> > +               }
> > +               break;
> > +       default:
> > +               break;
> > +       }
> > +       return 0;
> > +}
> > +
> > +static struct notifier_block uncore_pm_nb = {
> > +       .notifier_call = uncore_pm_notify,
> > +};
> > +
> > +#define ICPU(model)     { X86_VENDOR_INTEL, 6, model,
> > X86_FEATURE_ANY, }
> > +
> > +static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
> > +       ICPU(INTEL_FAM6_BROADWELL_G),
> > +       ICPU(INTEL_FAM6_BROADWELL_X),
> > +       ICPU(INTEL_FAM6_BROADWELL_D),
> > +       ICPU(INTEL_FAM6_SKYLAKE_X),
> > +       ICPU(INTEL_FAM6_ICELAKE_X),
> > +       ICPU(INTEL_FAM6_ICELAKE_D),
> > +       {}
> > +};
> > +
> > +static int __init intel_uncore_init(void)
> > +{
> > +       const struct x86_cpu_id *id;
> > +       int ret;
> > +
> > +       id = x86_match_cpu(intel_uncore_cpu_ids);
> > +       if (!id)
> > +               return -ENODEV;
> > +
> > +       uncore_max_entries = topology_max_packages() *
> > +                                       topology_max_die_per_packag
> > e();
> > +       uncore_instances = kcalloc(uncore_max_entries,
> > +                                  sizeof(*uncore_instances),
> > GFP_KERNEL);
> > +       if (!uncore_instances)
> > +               return -ENOMEM;
> > +
> > +       ret = kobject_init_and_add(&uncore_root_kobj,
> > &uncore_root_ktype,
> > +                                  &cpu_subsys.dev_root->kobj,
> > +                                  "intel_uncore_frequency");
> > +       if (ret)
> > +               goto err_free;
> > +
> > +       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> > +                               "platform/x86/uncore-freq:online",
> > +                               uncore_event_cpu_online,
> > +                               uncore_event_cpu_offline);
> > +       if (ret < 0)
> > +               goto err_rem_kobj;
> > +
> > +       uncore_hp_state = ret;
> > +
> > +       ret = register_pm_notifier(&uncore_pm_nb);
> > +       if (ret)
> > +               goto err_rem_state;
> > +
> > +       return 0;
> > +
> > +err_rem_state:
> > +       cpuhp_remove_state(uncore_hp_state);
> > +err_rem_kobj:
> > +       kobject_put(&uncore_root_kobj);
> > +err_free:
> > +       kfree(uncore_instances);
> > +
> > +       return ret;
> > +}
> > +module_init(intel_uncore_init)
> > +
> > +static void __exit intel_uncore_exit(void)
> > +{
> > +       int i;
> > +
> > +       unregister_pm_notifier(&uncore_pm_nb);
> > +       cpuhp_remove_state(uncore_hp_state);
> > +       for (i = 0; i < uncore_max_entries; ++i) {
> > +               if (uncore_instances[i].valid)
> > +                       kobject_put(&uncore_instances[i].kobj);
> > +       }
> > +       kobject_put(&uncore_root_kobj);
> > +       kfree(uncore_instances);
> > +}
> > +module_exit(intel_uncore_exit)
> > +
> > +MODULE_LICENSE("GPL v2");
> > +MODULE_DESCRIPTION("Intel Uncore Frequency Limits Driver");
> > --
> > 2.17.2
> > 
> 
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-12-23 15:50 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-14  0:14 [PATCH 1/2] platform/x86: Add support for Uncore frequency control Srinivas Pandruvada
2019-12-14  0:14 ` [PATCH 2/2] admin guide/pm: Admin guide for Intel Uncore Frequency limits Srinivas Pandruvada
2019-12-18 16:50   ` Andy Shevchenko
2019-12-18 16:51 ` [PATCH 1/2] platform/x86: Add support for Uncore frequency control Andy Shevchenko
2019-12-23 15:50   ` Srinivas Pandruvada

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).