All of lore.kernel.org
 help / color / mirror / Atom feed
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
To: yu.chen.surf@gmail.com, andy@infradead.org, dvhart@infradead.org,
	andy@kernel.org, lenb@kernel.org
Cc: linux-kernel@vger.kernel.org,
	platform-driver-x86@vger.kernel.org,
	Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Subject: [PATCH v3 1/2] platform/x86: Add support for Uncore frequency control
Date: Mon, 13 Jan 2020 10:00:14 -0800	[thread overview]
Message-ID: <20200113180015.503314-2-srinivas.pandruvada@linux.intel.com> (raw)
In-Reply-To: <20200113180015.503314-1-srinivas.pandruvada@linux.intel.com>

Some server users set limits on the uncore frequency using MSR 620H, while
running latency sensitive workloads. Here uncore frequency controls
RING/LLC(last-level cache) clocks.

But MSR control is not always possible from the user space, so this driver
provides a sysfs interface to set max and min frequency limits. This MSR
620H is a die scoped in multi-die system or package scoped in non multi-die
systems.

When this driver is loaded, a new directory is created under
 /sys/devices/system/cpu.

For example on a two package Skylake server:
$cd /sys/devices/system/cpu/intel_uncore_frequency

$ls
package_00_die_00 package_01_die_00

$ls package_00_die_00
max_freq_khz  min_freq_khz  initial_max_freq_khz
initial_min_freq_khz

$grep . *
    max_freq_khz:2400000
    min_freq_khz:1200000
    initial_max_freq_khz:2400000
    initial_min_freq_khz:1200000

Here, initial_max_freq_khz and initial_min_freq_khz are read only
attributes to show power up or initial values of max and min frequencies
respectively. Other attributes are read-write, so that users can modify.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 drivers/platform/x86/Kconfig                  |  11 +
 drivers/platform/x86/Makefile                 |   1 +
 drivers/platform/x86/intel-uncore-frequency.c | 437 ++++++++++++++++++
 3 files changed, 449 insertions(+)
 create mode 100644 drivers/platform/x86/intel-uncore-frequency.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 27d5b40fb717..6013c3b96cfd 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1337,6 +1337,17 @@ config PCENGINES_APU2
 	  To compile this driver as a module, choose M here: the module
 	  will be called pcengines-apuv2.
 
+config INTEL_UNCORE_FREQ_CONTROL
+	tristate "Intel Uncore frequency control driver"
+	depends on X86_64
+	help
+	  This driver allows control of uncore frequency limits on
+	  supported server platforms.
+	  Uncore frequency controls RING/LLC (last-level cache) clocks.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel-uncore-frequency.
+
 source "drivers/platform/x86/intel_speed_select_if/Kconfig"
 
 config SYSTEM76_ACPI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 42d85a00be4e..3747b1f07cf1 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -105,3 +105,4 @@ obj-$(CONFIG_INTEL_ATOMISP2_PM)	+= intel_atomisp2_pm.o
 obj-$(CONFIG_PCENGINES_APU2)	+= pcengines-apuv2.o
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/
 obj-$(CONFIG_SYSTEM76_ACPI)	+= system76_acpi.o
+obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL)	+= intel-uncore-frequency.o
diff --git a/drivers/platform/x86/intel-uncore-frequency.c b/drivers/platform/x86/intel-uncore-frequency.c
new file mode 100644
index 000000000000..2b1a0734c3f8
--- /dev/null
+++ b/drivers/platform/x86/intel-uncore-frequency.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Uncore Frequency Setting
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Provide interface to set MSR 620 at a granularity of per die. On CPU online,
+ * one control CPU is identified per die to read/write limit. This control CPU
+ * is changed, if the CPU state is changed to offline. When the last CPU is
+ * offline in a die then remove the sysfs object for that die.
+ * The majority of actual code is related to sysfs create and read/write
+ * attributes.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define MSR_UNCORE_RATIO_LIMIT			0x620
+#define UNCORE_FREQ_KHZ_MULTIPLIER		100000
+
+/**
+ * struct uncore_data -	Encapsulate all uncore data
+ * @stored_uncore_data:	Last user changed MSR 620 value, which will be restored
+ *			on system resume.
+ * @initial_min_freq_khz: Sampled minimum uncore frequency at driver init
+ * @initial_max_freq_khz: Sampled maximum uncore frequency at driver init
+ * @control_cpu:	Designated CPU for a die to read/write
+ * @valid:		Mark the data valid/invalid
+ *
+ * This structure is used to encapsulate all data related to uncore sysfs
+ * settings for a die/package.
+ */
+struct uncore_data {
+	struct kobject kobj;
+	u64 stored_uncore_data;
+	u32 initial_min_freq_khz;
+	u32 initial_max_freq_khz;
+	int control_cpu;
+	bool valid;
+};
+
+#define to_uncore_data(a) container_of(a, struct uncore_data, kobj)
+
+/* Max instances for uncore data, one for each die */
+static int uncore_max_entries __read_mostly;
+/* Storage for uncore data for all instances */
+static struct uncore_data *uncore_instances;
+/* Root of the all uncore sysfs kobjs */
+struct kobject uncore_root_kobj;
+/* Stores the CPU mask of the target CPUs to use during uncore read/write */
+static cpumask_t uncore_cpu_mask;
+/* CPU online callback register instance */
+static enum cpuhp_state uncore_hp_state __read_mostly;
+/* Mutex to control all mutual exclusions */
+static DEFINE_MUTEX(uncore_lock);
+
+struct uncore_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct kobject *kobj,
+			struct attribute *attr, char *buf);
+	ssize_t (*store)(struct kobject *kobj,
+			 struct attribute *attr, const char *c, ssize_t count);
+};
+
+#define define_one_uncore_ro(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define define_one_uncore_rw(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+#define show_uncore_data(member_name)					\
+	static ssize_t show_##member_name(struct kobject *kobj,         \
+					  struct attribute *attr,	\
+					  char *buf)			\
+	{                                                               \
+		struct uncore_data *data = to_uncore_data(kobj);	\
+		return scnprintf(buf, PAGE_SIZE, "%u\n",		\
+				 data->member_name);			\
+	}								\
+	define_one_uncore_ro(member_name)
+
+show_uncore_data(initial_min_freq_khz);
+show_uncore_data(initial_max_freq_khz);
+
+/* Common function to read MSR 0x620 and read min/max */
+static int uncore_read_ratio(struct uncore_data *data, unsigned int *min,
+			     unsigned int *max)
+{
+	u64 cap;
+	int ret;
+
+	ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
+	if (ret)
+		return ret;
+
+	*max = (cap & 0x7F) * UNCORE_FREQ_KHZ_MULTIPLIER;
+	*min = ((cap & GENMASK(14, 8)) >> 8) * UNCORE_FREQ_KHZ_MULTIPLIER;
+
+	return 0;
+}
+
+/* Common function to set min/max ratios to be used by sysfs callbacks */
+static int uncore_write_ratio(struct uncore_data *data, unsigned int input,
+			      int set_max)
+{
+	int ret;
+	u64 cap;
+
+	mutex_lock(&uncore_lock);
+
+	input /= UNCORE_FREQ_KHZ_MULTIPLIER;
+	if (!input || input > 0x7F) {
+		ret = -EINVAL;
+		goto finish_write;
+	}
+
+	ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
+	if (ret)
+		goto finish_write;
+
+	if (set_max) {
+		cap &= ~0x7F;
+		cap |= input;
+	} else  {
+		cap &= ~GENMASK(14, 8);
+		cap |= (input << 8);
+	}
+
+	ret = wrmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, cap);
+	if (ret)
+		goto finish_write;
+
+	data->stored_uncore_data = cap;
+
+finish_write:
+	mutex_unlock(&uncore_lock);
+
+	return ret;
+}
+
+static ssize_t store_min_max_freq_khz(struct kobject *kobj,
+				      struct attribute *attr,
+				      const char *buf, ssize_t count,
+				      int min_max)
+{
+	struct uncore_data *data = to_uncore_data(kobj);
+	unsigned int input;
+
+	if (kstrtouint(buf, 10, &input))
+		return -EINVAL;
+
+	uncore_write_ratio(data, input, min_max);
+
+	return count;
+}
+
+static ssize_t show_min_max_freq_khz(struct kobject *kobj,
+				     struct attribute *attr,
+				     char *buf, int min_max)
+{
+	struct uncore_data *data = to_uncore_data(kobj);
+	unsigned int min, max;
+	int ret;
+
+	mutex_lock(&uncore_lock);
+	ret = uncore_read_ratio(data, &min, &max);
+	mutex_unlock(&uncore_lock);
+	if (ret)
+		return ret;
+
+	if (min_max)
+		return sprintf(buf, "%u\n", max);
+
+	return sprintf(buf, "%u\n", min);
+}
+
+#define store_uncore_min_max(name, min_max)				\
+	static ssize_t store_##name(struct kobject *kobj,		\
+				    struct attribute *attr,		\
+				    const char *buf, ssize_t count)	\
+	{                                                               \
+									\
+		return store_min_max_freq_khz(kobj, attr, buf, count,	\
+					      min_max);			\
+	}
+
+#define show_uncore_min_max(name, min_max)				\
+	static ssize_t show_##name(struct kobject *kobj,		\
+				   struct attribute *attr, char *buf)	\
+	{                                                               \
+									\
+		return show_min_max_freq_khz(kobj, attr, buf, min_max); \
+	}
+
+store_uncore_min_max(min_freq_khz, 0);
+store_uncore_min_max(max_freq_khz, 1);
+
+show_uncore_min_max(min_freq_khz, 0);
+show_uncore_min_max(max_freq_khz, 1);
+
+define_one_uncore_rw(min_freq_khz);
+define_one_uncore_rw(max_freq_khz);
+
+static struct attribute *uncore_attrs[] = {
+	&initial_min_freq_khz.attr,
+	&initial_max_freq_khz.attr,
+	&max_freq_khz.attr,
+	&min_freq_khz.attr,
+	NULL
+};
+
+static struct kobj_type uncore_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_attrs = uncore_attrs,
+};
+
+static struct kobj_type uncore_root_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+};
+
+/* Caller provides protection */
+static struct uncore_data *uncore_get_instance(unsigned int cpu)
+{
+	int id = topology_logical_die_id(cpu);
+
+	if (id >= 0 && id < uncore_max_entries)
+		return &uncore_instances[id];
+
+	return NULL;
+}
+
+static void uncore_add_die_entry(int cpu)
+{
+	struct uncore_data *data;
+
+	mutex_lock(&uncore_lock);
+	data = uncore_get_instance(cpu);
+	if (!data) {
+		mutex_unlock(&uncore_lock);
+		return;
+	}
+
+	if (data->valid) {
+		/* control cpu changed */
+		data->control_cpu = cpu;
+	} else {
+		char str[64];
+		int ret;
+
+		memset(data, 0, sizeof(*data));
+		sprintf(str, "package_%02d_die_%02d",
+			topology_physical_package_id(cpu),
+			topology_die_id(cpu));
+
+		uncore_read_ratio(data, &data->initial_min_freq_khz,
+				  &data->initial_max_freq_khz);
+
+		ret = kobject_init_and_add(&data->kobj, &uncore_ktype,
+					   &uncore_root_kobj, str);
+		if (!ret) {
+			data->control_cpu = cpu;
+			data->valid = true;
+		}
+	}
+	mutex_unlock(&uncore_lock);
+}
+
+/* Last CPU in this die is offline, so remove sysfs entries */
+static void uncore_remove_die_entry(int cpu)
+{
+	struct uncore_data *data;
+
+	mutex_lock(&uncore_lock);
+	data = uncore_get_instance(cpu);
+	if (data) {
+		kobject_put(&data->kobj);
+		data->control_cpu = -1;
+		data->valid = false;
+	}
+	mutex_unlock(&uncore_lock);
+}
+
+static int uncore_event_cpu_online(unsigned int cpu)
+{
+	int target;
+
+	/* Check if there is an online cpu in the package for uncore MSR */
+	target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu));
+	if (target < nr_cpu_ids)
+		return 0;
+
+	/* Use this CPU on this die as a control CPU */
+	cpumask_set_cpu(cpu, &uncore_cpu_mask);
+	uncore_add_die_entry(cpu);
+
+	return 0;
+}
+
+static int uncore_event_cpu_offline(unsigned int cpu)
+{
+	int target;
+
+	/* Check if existing cpu is used for uncore MSRs */
+	if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+		return 0;
+
+	/* Find a new cpu to set uncore MSR */
+	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+
+	if (target < nr_cpu_ids) {
+		cpumask_set_cpu(target, &uncore_cpu_mask);
+		uncore_add_die_entry(target);
+	} else {
+		uncore_remove_die_entry(cpu);
+	}
+
+	return 0;
+}
+
+static int uncore_pm_notify(struct notifier_block *nb, unsigned long mode,
+			    void *_unused)
+{
+	int cpu;
+
+	switch (mode) {
+	case PM_POST_HIBERNATION:
+	case PM_POST_RESTORE:
+	case PM_POST_SUSPEND:
+		for_each_cpu(cpu, &uncore_cpu_mask) {
+			struct uncore_data *data;
+			int ret;
+
+			data = uncore_get_instance(cpu);
+			if (!data || !data->valid || !data->stored_uncore_data)
+				continue;
+
+			ret = wrmsrl_on_cpu(cpu, MSR_UNCORE_RATIO_LIMIT,
+					    data->stored_uncore_data);
+			if (ret)
+				return ret;
+		}
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+static struct notifier_block uncore_pm_nb = {
+	.notifier_call = uncore_pm_notify,
+};
+
+#define ICPU(model)     { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
+
+static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
+	ICPU(INTEL_FAM6_BROADWELL_G),
+	ICPU(INTEL_FAM6_BROADWELL_X),
+	ICPU(INTEL_FAM6_BROADWELL_D),
+	ICPU(INTEL_FAM6_SKYLAKE_X),
+	ICPU(INTEL_FAM6_ICELAKE_X),
+	ICPU(INTEL_FAM6_ICELAKE_D),
+	{}
+};
+
+static int __init intel_uncore_init(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	id = x86_match_cpu(intel_uncore_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	uncore_max_entries = topology_max_packages() *
+					topology_max_die_per_package();
+	uncore_instances = kcalloc(uncore_max_entries,
+				   sizeof(*uncore_instances), GFP_KERNEL);
+	if (!uncore_instances)
+		return -ENOMEM;
+
+	ret = kobject_init_and_add(&uncore_root_kobj, &uncore_root_ktype,
+				   &cpu_subsys.dev_root->kobj,
+				   "intel_uncore_frequency");
+	if (ret)
+		goto err_free;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				"platform/x86/uncore-freq:online",
+				uncore_event_cpu_online,
+				uncore_event_cpu_offline);
+	if (ret < 0)
+		goto err_rem_kobj;
+
+	uncore_hp_state = ret;
+
+	ret = register_pm_notifier(&uncore_pm_nb);
+	if (ret)
+		goto err_rem_state;
+
+	return 0;
+
+err_rem_state:
+	cpuhp_remove_state(uncore_hp_state);
+err_rem_kobj:
+	kobject_put(&uncore_root_kobj);
+err_free:
+	kfree(uncore_instances);
+
+	return ret;
+}
+module_init(intel_uncore_init)
+
+static void __exit intel_uncore_exit(void)
+{
+	int i;
+
+	unregister_pm_notifier(&uncore_pm_nb);
+	cpuhp_remove_state(uncore_hp_state);
+	for (i = 0; i < uncore_max_entries; ++i) {
+		if (uncore_instances[i].valid)
+			kobject_put(&uncore_instances[i].kobj);
+	}
+	kobject_put(&uncore_root_kobj);
+	kfree(uncore_instances);
+}
+module_exit(intel_uncore_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel Uncore Frequency Limits Driver");
-- 
2.24.1


  reply	other threads:[~2020-01-13 18:00 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-13 18:00 [PATCH v3 0/2] Intel Uncore frequency control Srinivas Pandruvada
2020-01-13 18:00 ` Srinivas Pandruvada [this message]
2020-01-13 18:00 ` [PATCH v3 2/2] MAINTAINERS: Update for the intel uncore " Srinivas Pandruvada

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200113180015.503314-2-srinivas.pandruvada@linux.intel.com \
    --to=srinivas.pandruvada@linux.intel.com \
    --cc=andy@infradead.org \
    --cc=andy@kernel.org \
    --cc=dvhart@infradead.org \
    --cc=lenb@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=platform-driver-x86@vger.kernel.org \
    --cc=yu.chen.surf@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.