Linux-Hwmon Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters
@ 2020-05-15 18:52 Naveen Krishna Chatradhi
  2020-05-15 18:52 ` [PATCH 2/3 v6] hwmon: (amd_energy) Add documentation Naveen Krishna Chatradhi
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Naveen Krishna Chatradhi @ 2020-05-15 18:52 UTC (permalink / raw)
  To: linux-hwmon; +Cc: naveenkrishna.ch, Naveen Krishna Chatradhi, Guenter Roeck

This patch adds hwmon based amd_energy driver support for
family 17h processors from AMD.

The driver provides following interface to the userspace
1. Reports the per core consumption
	* file: "energy%d_input", label: "Ecore%03d"
2. Reports per socket energy consumption
	* file: "energy%d_input", label: "Esocket%d"
3. To, increase the wrap around time of the socket energy
   counters, a 64bit accumultor is implemented.
4. Reports scaled energy value in Joules.

Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
Changes in v6:
1. Implement core accumulation
2. Remove enumurating sibling threads
3. Fix bug in accumulation
4. Few other minor comments addressed

 drivers/hwmon/Kconfig      |  10 +
 drivers/hwmon/Makefile     |   1 +
 drivers/hwmon/amd_energy.c | 413 +++++++++++++++++++++++++++++++++++++
 3 files changed, 424 insertions(+)
 create mode 100644 drivers/hwmon/amd_energy.c

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 4c62f900bf7e..e165e10c49ef 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -324,6 +324,16 @@ config SENSORS_FAM15H_POWER
 	  This driver can also be built as a module. If so, the module
 	  will be called fam15h_power.
 
+config SENSORS_AMD_ENERGY
+	tristate "AMD RAPL MSR based Energy driver"
+	depends on X86
+	help
+	  If you say yes here you get support for core and package energy
+	  sensors, based on RAPL MSR for AMD family 17h and above CPUs.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called as amd_energy.
+
 config SENSORS_APPLESMC
 	tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
 	depends on INPUT && X86
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index b0b9c8e57176..318f89dc7133 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_SENSORS_ADT7411)	+= adt7411.o
 obj-$(CONFIG_SENSORS_ADT7462)	+= adt7462.o
 obj-$(CONFIG_SENSORS_ADT7470)	+= adt7470.o
 obj-$(CONFIG_SENSORS_ADT7475)	+= adt7475.o
+obj-$(CONFIG_SENSORS_AMD_ENERGY) += amd_energy.o
 obj-$(CONFIG_SENSORS_APPLESMC)	+= applesmc.o
 obj-$(CONFIG_SENSORS_ARM_SCMI)	+= scmi-hwmon.o
 obj-$(CONFIG_SENSORS_ARM_SCPI)	+= scpi-hwmon.o
diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c
new file mode 100644
index 000000000000..f2cd5e2e1339
--- /dev/null
+++ b/drivers/hwmon/amd_energy.c
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc.
+ */
+#include <asm/cpu_device_id.h>
+
+#include <linux/bits.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/hwmon.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/processor.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+
+#define DRVNAME			"amd_energy"
+
+#define ENERGY_PWR_UNIT_MSR	0xC0010299
+#define ENERGY_CORE_MSR		0xC001029A
+#define ENERGY_PKG_MSR		0xC001029B
+
+#define AMD_ENERGY_UNIT_MASK	0x01F00
+#define AMD_ENERGY_MASK		0xFFFFFFFF
+
+struct amd_energy_data {
+	struct hwmon_channel_info energy_info;
+	const struct hwmon_channel_info *info[2];
+	struct hwmon_chip_info chip;
+	/* Lock around the accumulator */
+	struct mutex lock;
+	/* Energy Status Units */
+	u64 energy_units;
+	/* An accumulator for each socket */
+	u64 *sock_energy_ctrs;
+	u64 *core_energy_ctrs;
+	u64 *sock_prev_value;
+	u64 *core_prev_value;
+};
+
+static struct task_struct *wrap_accumulate;
+static int nr_cpus, nr_socks;
+static char buf[10];
+
+static int amd_energy_read_labels(struct device *dev,
+				  enum hwmon_sensor_types type,
+				  u32 attr, int channel,
+				  const char **str)
+{
+	if (channel >= nr_cpus)
+		scnprintf(buf, 10, "Esocket%u", channel - nr_cpus);
+	else
+		scnprintf(buf, 10, "Ecore%03u", channel);
+
+	*str = buf;
+
+	return 0;
+}
+
+static int get_energy_units(struct amd_energy_data *data)
+{
+	u64 rapl_units;
+	int ret;
+
+	ret = rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units);
+	if (ret)
+		return ret;
+
+	data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8;
+	return 0;
+}
+
+static void amd_accumulate_delta(struct amd_energy_data *data,
+				 u64 input, int cu, bool is_core)
+{
+	input &= AMD_ENERGY_MASK;
+
+	mutex_lock(&data->lock);
+	if (!is_core) {
+		if (input >= data->sock_prev_value[cu])
+			data->sock_energy_ctrs[cu] +=
+				input - data->sock_prev_value[cu];
+		else
+			data->sock_energy_ctrs[cu] += UINT_MAX -
+				data->sock_prev_value[cu] + input;
+
+		data->sock_prev_value[cu] = input;
+	} else {
+		if (input >= data->core_prev_value[cu])
+			data->core_energy_ctrs[cu] +=
+				input - data->core_prev_value[cu];
+		else
+			data->core_energy_ctrs[cu] += UINT_MAX -
+				data->core_prev_value[cu] + input;
+
+		data->core_prev_value[cu] = input;
+	}
+	mutex_unlock(&data->lock);
+}
+
+static int read_accumulate(struct amd_energy_data *data)
+{
+	int ret, cu;
+	u64 input = 0, core_input = 0;
+	static int core_id;
+
+	for (cu = 0; cu < nr_socks; cu++) {
+		int cpu;
+
+		cpu = cpumask_first_and(cpu_online_mask,
+					cpumask_of_node(cu));
+
+		ret = rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
+		if (ret)
+			return ret;
+
+		amd_accumulate_delta(data, input, cu, false);
+	}
+
+	if (core_id >= nr_cpus)
+		core_id = 0;
+
+	if (!cpu_online(core_id)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ret = rdmsrl_safe_on_cpu(core_id, ENERGY_CORE_MSR, &core_input);
+	if (ret)
+		goto out;
+
+	amd_accumulate_delta(data, core_input, core_id, true);
+
+out:
+	core_id++;
+	return ret;
+}
+
+static void amd_add_delta(struct amd_energy_data *data,
+			  u64 *input, int ch, bool is_core)
+{
+	mutex_lock(&data->lock);
+	if (!is_core) {
+		if (*input >= data->sock_prev_value[ch])
+			*input += data->sock_energy_ctrs[ch] -
+				  data->sock_prev_value[ch];
+		else
+			*input += UINT_MAX - data->sock_prev_value[ch] +
+				  data->sock_energy_ctrs[ch];
+	} else {
+		if (*input >= data->core_prev_value[ch])
+			*input += data->core_energy_ctrs[ch] -
+				 data->core_prev_value[ch];
+		else
+			*input += UINT_MAX - data->core_prev_value[ch] +
+				 data->core_energy_ctrs[ch];
+	}
+	mutex_unlock(&data->lock);
+}
+
+static int amd_energy_read(struct device *dev,
+			   enum hwmon_sensor_types type,
+			   u32 attr, int channel, long *val)
+{
+	struct amd_energy_data *data = dev_get_drvdata(dev);
+	int ret, cpu;
+	u64 input;
+	bool is_core;
+
+	if (channel >= nr_cpus) {
+		is_core = false;
+		channel = channel - nr_cpus;
+		cpu = cpumask_first_and(cpu_online_mask,
+					cpumask_of_node
+					(channel));
+		ret = rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
+		if (ret)
+			return -EAGAIN;
+	} else {
+		is_core = true;
+		cpu = channel;
+		if (!cpu_online(cpu))
+			return -ENODEV;
+
+		ret = rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
+		if (ret)
+			return -EAGAIN;
+	}
+
+	input &= AMD_ENERGY_MASK;
+	amd_add_delta(data, &input, channel, is_core);
+
+	if (data->energy_units == 0 && get_energy_units(data))
+		return -EAGAIN;
+
+	/* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */
+	*val = (long)div64_ul(input * 1000000UL,
+				      BIT(data->energy_units));
+
+	return 0;
+}
+
+static umode_t amd_energy_is_visible(const void *_data,
+				     enum hwmon_sensor_types type,
+				     u32 attr, int channel)
+{
+	return 0444;
+}
+
+static int energy_accumulator(void *p)
+{
+	struct amd_energy_data *data = (struct amd_energy_data *)p;
+
+	while (!kthread_should_stop()) {
+		/*
+		 * Ignoring the conditions such as
+		 * cpu being offline or rdmsr failure
+		 */
+		read_accumulate(data);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		/*
+		 * On a 240W system, with default resolution the
+		 * Socket Energy status register may wrap around in
+		 * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins)
+		 *
+		 * let us accumulate for every 100secs
+		 */
+		schedule_timeout(msecs_to_jiffies(100000));
+	}
+	return 0;
+}
+
+static const struct hwmon_ops amd_energy_ops = {
+	.is_visible = amd_energy_is_visible,
+	.read = amd_energy_read,
+	.read_string = amd_energy_read_labels,
+};
+
+static int amd_create_sensor(struct device *dev,
+			     struct amd_energy_data *data,
+			     u8 type, u32 config)
+{
+	int i, num_siblings;
+	u32 *s_config;
+	struct hwmon_channel_info *info = &data->energy_info;
+
+	/* Identify the number of siblings per core */
+	num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+
+	nr_socks = num_possible_nodes();
+
+	/*
+	 * Energy counter register is accessed at core level.
+	 * Hence, filterout the siblings.
+	 */
+	nr_cpus = num_present_cpus()/num_siblings;
+
+	s_config = devm_kcalloc(dev, nr_cpus + nr_socks,
+				sizeof(u32), GFP_KERNEL);
+	if (!s_config)
+		return -ENOMEM;
+
+	data->sock_energy_ctrs = devm_kcalloc(dev, nr_socks,
+				sizeof(u64), GFP_KERNEL);
+	if (!data->sock_energy_ctrs)
+		return -ENOMEM;
+
+	data->core_energy_ctrs = devm_kcalloc(dev, nr_cpus,
+				sizeof(u64), GFP_KERNEL);
+	if (!data->core_energy_ctrs)
+		return -ENOMEM;
+
+	data->sock_prev_value = devm_kcalloc(dev, nr_socks,
+				sizeof(u64), GFP_KERNEL);
+	if (!data->sock_prev_value)
+		return -ENOMEM;
+
+	data->core_prev_value = devm_kcalloc(dev, nr_cpus,
+				sizeof(u64), GFP_KERNEL);
+	if (!data->core_prev_value)
+		return -ENOMEM;
+
+	info->type = type;
+	info->config = s_config;
+
+	for (i = 0; i < nr_cpus + nr_socks; i++)
+		s_config[i] = config;
+
+	return 0;
+}
+
+static int amd_energy_probe(struct platform_device *pdev)
+{
+	struct device *hwmon_dev;
+	struct amd_energy_data *data;
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	data = devm_kzalloc(dev,
+			    sizeof(struct amd_energy_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->chip.ops = &amd_energy_ops;
+	data->chip.info = data->info;
+
+	/* Populate per-core energy reporting */
+	data->info[0] = &data->energy_info;
+	amd_create_sensor(dev, data,  hwmon_energy,
+			  HWMON_E_INPUT | HWMON_E_LABEL);
+
+	mutex_init(&data->lock);
+
+	ret = get_energy_units(data);
+	if (ret)
+		return ret;
+
+	hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME,
+							 data,
+							 &data->chip,
+							 NULL);
+	if (IS_ERR(hwmon_dev))
+		return PTR_ERR(hwmon_dev);
+
+	wrap_accumulate = kthread_run(energy_accumulator, data,
+				      "%s", dev_name(hwmon_dev));
+	if (IS_ERR(wrap_accumulate))
+		return PTR_ERR(wrap_accumulate);
+
+	return PTR_ERR_OR_ZERO(wrap_accumulate);
+}
+
+static int amd_energy_remove(struct platform_device *pdev)
+{
+	if (wrap_accumulate)
+		kthread_stop(wrap_accumulate);
+
+	return 0;
+}
+
+static const struct platform_device_id amd_energy_ids[] = {
+	{ .name = DRVNAME, },
+	{}
+};
+MODULE_DEVICE_TABLE(platform, amd_energy_ids);
+
+static struct platform_driver amd_energy_driver = {
+	.probe = amd_energy_probe,
+	.remove	= amd_energy_remove,
+	.id_table = amd_energy_ids,
+	.driver = {
+		.name = DRVNAME,
+	},
+};
+
+static struct platform_device *amd_energy_platdev;
+
+static const struct x86_cpu_id cpu_ids[] __initconst = {
+	X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, cpu_ids);
+
+static int __init amd_energy_init(void)
+{
+	int ret;
+
+	if (!x86_match_cpu(cpu_ids))
+		return -ENODEV;
+
+	ret = platform_driver_register(&amd_energy_driver);
+	if (ret)
+		return ret;
+
+	amd_energy_platdev = platform_device_alloc(DRVNAME, 0);
+	if (!amd_energy_platdev)
+		return -ENOMEM;
+
+	ret = platform_device_add(amd_energy_platdev);
+	if (ret) {
+		platform_device_put(amd_energy_platdev);
+		platform_driver_unregister(&amd_energy_driver);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit amd_energy_exit(void)
+{
+	platform_device_unregister(amd_energy_platdev);
+	platform_driver_unregister(&amd_energy_driver);
+}
+
+module_init(amd_energy_init);
+module_exit(amd_energy_exit);
+
+MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface");
+MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>");
+MODULE_LICENSE("GPL");
-- 
2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 2/3 v6] hwmon: (amd_energy) Add documentation
  2020-05-15 18:52 [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Naveen Krishna Chatradhi
@ 2020-05-15 18:52 ` Naveen Krishna Chatradhi
  2020-05-15 18:52 ` [PATCH 3/3 v6] MAINTAINERS: add entry for AMD energy driver Naveen Krishna Chatradhi
  2020-05-15 19:15 ` [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Guenter Roeck
  2 siblings, 0 replies; 5+ messages in thread
From: Naveen Krishna Chatradhi @ 2020-05-15 18:52 UTC (permalink / raw)
  To: linux-hwmon; +Cc: naveenkrishna.ch, Naveen Krishna Chatradhi, Guenter Roeck

Document amd_energy driver with all chips supported by it.

Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
Changes in v6:
1. Update energy accumulation section

 Documentation/hwmon/amd_energy.rst | 103 +++++++++++++++++++++++++++++
 Documentation/hwmon/index.rst      |   1 +
 2 files changed, 104 insertions(+)
 create mode 100644 Documentation/hwmon/amd_energy.rst

diff --git a/Documentation/hwmon/amd_energy.rst b/Documentation/hwmon/amd_energy.rst
new file mode 100644
index 000000000000..4bf5997db22b
--- /dev/null
+++ b/Documentation/hwmon/amd_energy.rst
@@ -0,0 +1,103 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Kernel driver amd_energy
+==========================
+
+Supported chips:
+
+* AMD Family 17h Processors
+
+  Prefix: 'amd_energy'
+
+  Addresses used:  RAPL MSRs
+
+  Datasheets:
+
+  - Processor Programming Reference (PPR) for AMD Family 17h Model 01h, Revision B1 Processors
+
+	https://developer.amd.com/wp-content/resources/55570-B1_PUB.zip
+
+  - Preliminary Processor Programming Reference (PPR) for AMD Family 17h Model 31h, Revision B0 Processors
+
+	https://developer.amd.com/wp-content/resources/56176_ppr_Family_17h_Model_71h_B0_pub_Rev_3.06.zip
+
+Author: Naveen Krishna Chatradhi <nchatrad@amd.com>
+
+Description
+-----------
+
+The Energy driver exposes the energy counters that are
+reported via the Running Average Power Limit (RAPL)
+Model-specific Registers (MSRs) via the hardware monitor
+(HWMON) sysfs interface.
+
+1. Power, Energy and Time Units
+   MSR_RAPL_POWER_UNIT/ C001_0299:
+   shared with all cores in the socket
+
+2. Energy consumed by each Core
+   MSR_CORE_ENERGY_STATUS/ C001_029A:
+   32-bitRO, Accumulator, core-level power reporting
+
+3. Energy consumed by Socket
+   MSR_PACKAGE_ENERGY_STATUS/ C001_029B:
+   32-bitRO, Accumulator, socket-level power reporting,
+   shared with all cores in socket
+
+These registers are updated every 1ms and cleared on
+reset of the system.
+
+Energy Caluclation
+------------------
+
+Energy information (in Joules) is based on the multiplier,
+1/2^ESU; where ESU is an unsigned integer read from
+MSR_RAPL_POWER_UNIT register. Default value is 10000b,
+indicating energy status unit is 15.3 micro-Joules increment.
+
+Reported values are scaled as per the formula
+
+scaled value = ((1/2^ESU) * (Raw value) * 1000000UL) in uJoules
+
+Users calculate power for a given domain by calculating
+	dEnergy/dTime for that domain.
+
+Energy accumulation
+--------------------------
+
+Current, Socket energy status register is 32bit, assuming a 240W
+2P system, the register would wrap around in
+
+	2^32*15.3 e-6/240 * 2 = 547.60833024 secs to wrap(~9 mins)
+
+The Core energy register may wrap around after several days.
+
+To improve the wrap around time, a kernel thread is implemented
+to accumulate the socket energy counters and one core energy counter
+per run to a respective 64-bit counter. The kernel thread starts
+running during probe, wakes up every 100secs and stops running
+when driver is removed.
+
+A socket and core energy read would return the current register
+value added to the respective energy accumulator.
+
+Sysfs attributes
+----------------
+
+=============== ========  =====================================
+Attribute	Label	  Description
+===============	========  =====================================
+
+* For index N between [1] and [nr_cpus]
+
+===============	========  ======================================
+energy[N]_input EcoreX	  Core Energy   X = [0] to [nr_cpus - 1]
+			  Measured input core energy
+===============	========  ======================================
+
+* For N between [nr_cpus] and [nr_cpus + nr_socks]
+
+===============	========  ======================================
+energy[N]_input EsocketX  Socket Energy X = [0] to [nr_socks -1]
+			  Measured input socket energy
+=============== ========  ======================================
diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst
index 8ef62fd39787..fc4b89810e67 100644
--- a/Documentation/hwmon/index.rst
+++ b/Documentation/hwmon/index.rst
@@ -39,6 +39,7 @@ Hardware Monitoring Kernel Drivers
    adt7470
    adt7475
    amc6821
+   amd_energy
    asb100
    asc7621
    aspeed-pwm-tacho
-- 
2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 3/3 v6] MAINTAINERS: add entry for AMD energy driver
  2020-05-15 18:52 [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Naveen Krishna Chatradhi
  2020-05-15 18:52 ` [PATCH 2/3 v6] hwmon: (amd_energy) Add documentation Naveen Krishna Chatradhi
@ 2020-05-15 18:52 ` Naveen Krishna Chatradhi
  2020-05-15 19:15 ` [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Guenter Roeck
  2 siblings, 0 replies; 5+ messages in thread
From: Naveen Krishna Chatradhi @ 2020-05-15 18:52 UTC (permalink / raw)
  To: linux-hwmon; +Cc: naveenkrishna.ch, Naveen Krishna Chatradhi, Guenter Roeck

The kernel driver is part of HWMON subsystem.

Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
Change in v6:
None

 MAINTAINERS | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 091ec22c1a23..7e7393f9a0e5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -842,6 +842,13 @@ S:	Supported
 T:	git git://people.freedesktop.org/~agd5f/linux
 F:	drivers/gpu/drm/amd/display/
 
+AMD ENERGY DRIVER
+M:	Naveen Krishna Chatradhi <nchatrad@amd.com>
+L:	linux-hwmon@vger.kernel.org
+S:	Maintained
+F:	Documentation/hwmon/amd_energy.rst
+F:	drivers/hwmon/amd_energy.c
+
 AMD FAM15H PROCESSOR POWER MONITORING DRIVER
 M:	Huang Rui <ray.huang@amd.com>
 L:	linux-hwmon@vger.kernel.org
-- 
2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters
  2020-05-15 18:52 [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Naveen Krishna Chatradhi
  2020-05-15 18:52 ` [PATCH 2/3 v6] hwmon: (amd_energy) Add documentation Naveen Krishna Chatradhi
  2020-05-15 18:52 ` [PATCH 3/3 v6] MAINTAINERS: add entry for AMD energy driver Naveen Krishna Chatradhi
@ 2020-05-15 19:15 ` Guenter Roeck
  2020-05-18 19:35   ` Naveen Krishna Ch
  2 siblings, 1 reply; 5+ messages in thread
From: Guenter Roeck @ 2020-05-15 19:15 UTC (permalink / raw)
  To: Naveen Krishna Chatradhi; +Cc: linux-hwmon, naveenkrishna.ch

On Sat, May 16, 2020 at 12:22:32AM +0530, Naveen Krishna Chatradhi wrote:
> This patch adds hwmon based amd_energy driver support for
> family 17h processors from AMD.
> 
> The driver provides following interface to the userspace
> 1. Reports the per core consumption
> 	* file: "energy%d_input", label: "Ecore%03d"
> 2. Reports per socket energy consumption
> 	* file: "energy%d_input", label: "Esocket%d"
> 3. To, increase the wrap around time of the socket energy
>    counters, a 64bit accumultor is implemented.
> 4. Reports scaled energy value in Joules.
> 
> Cc: Guenter Roeck <linux@roeck-us.net>
> Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
> ---
> Changes in v6:
> 1. Implement core accumulation
> 2. Remove enumurating sibling threads
> 3. Fix bug in accumulation
> 4. Few other minor comments addressed
> 
>  drivers/hwmon/Kconfig      |  10 +
>  drivers/hwmon/Makefile     |   1 +
>  drivers/hwmon/amd_energy.c | 413 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 424 insertions(+)
>  create mode 100644 drivers/hwmon/amd_energy.c
> 
> diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
> index 4c62f900bf7e..e165e10c49ef 100644
> --- a/drivers/hwmon/Kconfig
> +++ b/drivers/hwmon/Kconfig
> @@ -324,6 +324,16 @@ config SENSORS_FAM15H_POWER
>  	  This driver can also be built as a module. If so, the module
>  	  will be called fam15h_power.
>  
> +config SENSORS_AMD_ENERGY
> +	tristate "AMD RAPL MSR based Energy driver"
> +	depends on X86
> +	help
> +	  If you say yes here you get support for core and package energy
> +	  sensors, based on RAPL MSR for AMD family 17h and above CPUs.
> +
> +	  This driver can also be built as a module. If so, the module
> +	  will be called as amd_energy.
> +
>  config SENSORS_APPLESMC
>  	tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
>  	depends on INPUT && X86
> diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
> index b0b9c8e57176..318f89dc7133 100644
> --- a/drivers/hwmon/Makefile
> +++ b/drivers/hwmon/Makefile
> @@ -45,6 +45,7 @@ obj-$(CONFIG_SENSORS_ADT7411)	+= adt7411.o
>  obj-$(CONFIG_SENSORS_ADT7462)	+= adt7462.o
>  obj-$(CONFIG_SENSORS_ADT7470)	+= adt7470.o
>  obj-$(CONFIG_SENSORS_ADT7475)	+= adt7475.o
> +obj-$(CONFIG_SENSORS_AMD_ENERGY) += amd_energy.o
>  obj-$(CONFIG_SENSORS_APPLESMC)	+= applesmc.o
>  obj-$(CONFIG_SENSORS_ARM_SCMI)	+= scmi-hwmon.o
>  obj-$(CONFIG_SENSORS_ARM_SCPI)	+= scpi-hwmon.o
> diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c
> new file mode 100644
> index 000000000000..f2cd5e2e1339
> --- /dev/null
> +++ b/drivers/hwmon/amd_energy.c
> @@ -0,0 +1,413 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +/*
> + * Copyright (C) 2020 Advanced Micro Devices, Inc.
> + */
> +#include <asm/cpu_device_id.h>
> +
> +#include <linux/bits.h>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/delay.h>
> +#include <linux/device.h>
> +#include <linux/hwmon.h>
> +#include <linux/kernel.h>
> +#include <linux/kthread.h>
> +#include <linux/list.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/processor.h>
> +#include <linux/platform_device.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/topology.h>
> +#include <linux/types.h>
> +
> +#define DRVNAME			"amd_energy"
> +
> +#define ENERGY_PWR_UNIT_MSR	0xC0010299
> +#define ENERGY_CORE_MSR		0xC001029A
> +#define ENERGY_PKG_MSR		0xC001029B
> +
> +#define AMD_ENERGY_UNIT_MASK	0x01F00
> +#define AMD_ENERGY_MASK		0xFFFFFFFF
> +
> +struct amd_energy_data {
> +	struct hwmon_channel_info energy_info;
> +	const struct hwmon_channel_info *info[2];
> +	struct hwmon_chip_info chip;
> +	/* Lock around the accumulator */
> +	struct mutex lock;
> +	/* Energy Status Units */
> +	u64 energy_units;
> +	/* An accumulator for each socket */
> +	u64 *sock_energy_ctrs;
> +	u64 *core_energy_ctrs;
> +	u64 *sock_prev_value;
> +	u64 *core_prev_value;
> +};
> +
> +static struct task_struct *wrap_accumulate;
> +static int nr_cpus, nr_socks;

I don't think any of the above need to be static variables
and could be added to struct amd_energy_data.

> +static char buf[10];
> +
> +static int amd_energy_read_labels(struct device *dev,
> +				  enum hwmon_sensor_types type,
> +				  u32 attr, int channel,
> +				  const char **str)
> +{
> +	if (channel >= nr_cpus)
> +		scnprintf(buf, 10, "Esocket%u", channel - nr_cpus);
> +	else
> +		scnprintf(buf, 10, "Ecore%03u", channel);
> +

That won't work. It assumes that the calling function is adding protection
against overwriting buf. If you run multiple threads in parallel which all
access different labels, you'll get more or less random data returned.
Worse, a thread may be busy writing into buf while another thread is
reading from it, enabling responses such as, say, "Esocke05". The only
solution I can imagine is a data structure referenced from truct
md_energy_data which points to a set of pre-allocated strings.

> +	*str = buf;
> +
> +	return 0;
> +}
> +
> +static int get_energy_units(struct amd_energy_data *data)
> +{
> +	u64 rapl_units;
> +	int ret;
> +
> +	ret = rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units);
> +	if (ret)
> +		return ret;
> +
> +	data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8;
> +	return 0;
> +}
> +
> +static void amd_accumulate_delta(struct amd_energy_data *data,
> +				 u64 input, int cu, bool is_core)
> +{
> +	input &= AMD_ENERGY_MASK;
> +
> +	mutex_lock(&data->lock);
> +	if (!is_core) {
> +		if (input >= data->sock_prev_value[cu])
> +			data->sock_energy_ctrs[cu] +=
> +				input - data->sock_prev_value[cu];
> +		else
> +			data->sock_energy_ctrs[cu] += UINT_MAX -
> +				data->sock_prev_value[cu] + input;
> +
> +		data->sock_prev_value[cu] = input;
> +	} else {
> +		if (input >= data->core_prev_value[cu])
> +			data->core_energy_ctrs[cu] +=
> +				input - data->core_prev_value[cu];
> +		else
> +			data->core_energy_ctrs[cu] += UINT_MAX -
> +				data->core_prev_value[cu] + input;
> +
> +		data->core_prev_value[cu] = input;
> +	}
> +	mutex_unlock(&data->lock);
> +}
> +
> +static int read_accumulate(struct amd_energy_data *data)
> +{
> +	int ret, cu;
> +	u64 input = 0, core_input = 0;
> +	static int core_id;

Any chance you can store core_id in struct amd_energy_data ?

> +
> +	for (cu = 0; cu < nr_socks; cu++) {
> +		int cpu;
> +
> +		cpu = cpumask_first_and(cpu_online_mask,
> +					cpumask_of_node(cu));
> +
> +		ret = rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> +		if (ret)
> +			return ret;

Returning an error from this function is pretty pointless.

> +
> +		amd_accumulate_delta(data, input, cu, false);
> +	}
> +
> +	if (core_id >= nr_cpus)
> +		core_id = 0;
> +
> +	if (!cpu_online(core_id)) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	ret = rdmsrl_safe_on_cpu(core_id, ENERGY_CORE_MSR, &core_input);
> +	if (ret)
> +		goto out;
> +
> +	amd_accumulate_delta(data, core_input, core_id, true);
> +
> +out:
> +	core_id++;
> +	return ret;
> +}
> +
> +static void amd_add_delta(struct amd_energy_data *data,
> +			  u64 *input, int ch, bool is_core)
> +{
> +	mutex_lock(&data->lock);
> +	if (!is_core) {
> +		if (*input >= data->sock_prev_value[ch])
> +			*input += data->sock_energy_ctrs[ch] -
> +				  data->sock_prev_value[ch];
> +		else
> +			*input += UINT_MAX - data->sock_prev_value[ch] +
> +				  data->sock_energy_ctrs[ch];
> +	} else {
> +		if (*input >= data->core_prev_value[ch])
> +			*input += data->core_energy_ctrs[ch] -
> +				 data->core_prev_value[ch];
> +		else
> +			*input += UINT_MAX - data->core_prev_value[ch] +
> +				 data->core_energy_ctrs[ch];
> +	}
> +	mutex_unlock(&data->lock);
> +}
> +
> +static int amd_energy_read(struct device *dev,
> +			   enum hwmon_sensor_types type,
> +			   u32 attr, int channel, long *val)
> +{
> +	struct amd_energy_data *data = dev_get_drvdata(dev);
> +	int ret, cpu;
> +	u64 input;
> +	bool is_core;
> +
> +	if (channel >= nr_cpus) {
> +		is_core = false;
> +		channel = channel - nr_cpus;
> +		cpu = cpumask_first_and(cpu_online_mask,
> +					cpumask_of_node
> +					(channel));
> +		ret = rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> +		if (ret)
> +			return -EAGAIN;

From my experience, and looking into other code, such errors tend to be
permanent. I see two issues with this code: One, static analyzers will
likely complain that error codes are overwritten, and hard errors will
result in endless retry loops.

> +	} else {
> +		is_core = true;
> +		cpu = channel;
> +		if (!cpu_online(cpu))
> +			return -ENODEV;
> +
> +		ret = rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
> +		if (ret)
> +			return -EAGAIN;
> +	}
> +
> +	input &= AMD_ENERGY_MASK;
> +	amd_add_delta(data, &input, channel, is_core);
> +
I am a bit concerned that input is read first and passed to amd_add_delta()
unlocked. That means process 1 can read it, followed by process 2. If
process 2 gets the mutex in amd_add_delta() first, it will add a larger
value, followed by the older value from process 1. It would be much safer
to protect the entire read-update code with the mutex.

> +	if (data->energy_units == 0 && get_energy_units(data))
> +		return -EAGAIN;
> +
> +	/* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */
> +	*val = (long)div64_ul(input * 1000000UL,
> +				      BIT(data->energy_units));

Alignment seems to be off here. Please run checkpatch --strict.
Also, is the typecast really needed ?

> +
> +	return 0;
> +}
> +
> +static umode_t amd_energy_is_visible(const void *_data,
> +				     enum hwmon_sensor_types type,
> +				     u32 attr, int channel)
> +{
> +	return 0444;
> +}
> +
> +static int energy_accumulator(void *p)
> +{
> +	struct amd_energy_data *data = (struct amd_energy_data *)p;
> +
> +	while (!kthread_should_stop()) {
> +		/*
> +		 * Ignoring the conditions such as
> +		 * cpu being offline or rdmsr failure
> +		 */
> +		read_accumulate(data);
> +
> +		set_current_state(TASK_INTERRUPTIBLE);
> +		if (kthread_should_stop())
> +			break;
> +
> +		/*
> +		 * On a 240W system, with default resolution the
> +		 * Socket Energy status register may wrap around in
> +		 * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins)
> +		 *
> +		 * let us accumulate for every 100secs
> +		 */
> +		schedule_timeout(msecs_to_jiffies(100000));
> +	}
> +	return 0;
> +}
> +
> +static const struct hwmon_ops amd_energy_ops = {
> +	.is_visible = amd_energy_is_visible,
> +	.read = amd_energy_read,
> +	.read_string = amd_energy_read_labels,
> +};
> +
> +static int amd_create_sensor(struct device *dev,
> +			     struct amd_energy_data *data,
> +			     u8 type, u32 config)
> +{
> +	int i, num_siblings;
> +	u32 *s_config;
> +	struct hwmon_channel_info *info = &data->energy_info;
> +
> +	/* Identify the number of siblings per core */
> +	num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
> +
> +	nr_socks = num_possible_nodes();
> +
> +	/*
> +	 * Energy counter register is accessed at core level.
> +	 * Hence, filterout the siblings.
> +	 */
> +	nr_cpus = num_present_cpus()/num_siblings;
> +
> +	s_config = devm_kcalloc(dev, nr_cpus + nr_socks,
> +				sizeof(u32), GFP_KERNEL);
> +	if (!s_config)
> +		return -ENOMEM;
> +
> +	data->sock_energy_ctrs = devm_kcalloc(dev, nr_socks,
> +				sizeof(u64), GFP_KERNEL);
> +	if (!data->sock_energy_ctrs)
> +		return -ENOMEM;
> +
> +	data->core_energy_ctrs = devm_kcalloc(dev, nr_cpus,
> +				sizeof(u64), GFP_KERNEL);
> +	if (!data->core_energy_ctrs)
> +		return -ENOMEM;
> +
> +	data->sock_prev_value = devm_kcalloc(dev, nr_socks,
> +				sizeof(u64), GFP_KERNEL);
> +	if (!data->sock_prev_value)
> +		return -ENOMEM;
> +
> +	data->core_prev_value = devm_kcalloc(dev, nr_cpus,
> +				sizeof(u64), GFP_KERNEL);
> +	if (!data->core_prev_value)
> +		return -ENOMEM;
> +
> +	info->type = type;
> +	info->config = s_config;
> +
> +	for (i = 0; i < nr_cpus + nr_socks; i++)
> +		s_config[i] = config;
> +
> +	return 0;
> +}
> +
> +static int amd_energy_probe(struct platform_device *pdev)
> +{
> +	struct device *hwmon_dev;
> +	struct amd_energy_data *data;
> +	struct device *dev = &pdev->dev;
> +	int ret;
> +
> +	data = devm_kzalloc(dev,
> +			    sizeof(struct amd_energy_data), GFP_KERNEL);
> +	if (!data)
> +		return -ENOMEM;
> +
> +	data->chip.ops = &amd_energy_ops;
> +	data->chip.info = data->info;
> +
> +	/* Populate per-core energy reporting */
> +	data->info[0] = &data->energy_info;
> +	amd_create_sensor(dev, data,  hwmon_energy,
> +			  HWMON_E_INPUT | HWMON_E_LABEL);
> +
> +	mutex_init(&data->lock);
> +
> +	ret = get_energy_units(data);
> +	if (ret)
> +		return ret;
> +
> +	hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME,
> +							 data,
> +							 &data->chip,
> +							 NULL);
> +	if (IS_ERR(hwmon_dev))
> +		return PTR_ERR(hwmon_dev);
> +
> +	wrap_accumulate = kthread_run(energy_accumulator, data,
> +				      "%s", dev_name(hwmon_dev));
> +	if (IS_ERR(wrap_accumulate))
> +		return PTR_ERR(wrap_accumulate);
> +
> +	return PTR_ERR_OR_ZERO(wrap_accumulate);
> +}
> +
> +static int amd_energy_remove(struct platform_device *pdev)
> +{
> +	if (wrap_accumulate)
> +		kthread_stop(wrap_accumulate);
> +
> +	return 0;
> +}
> +
> +static const struct platform_device_id amd_energy_ids[] = {
> +	{ .name = DRVNAME, },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(platform, amd_energy_ids);
> +
> +static struct platform_driver amd_energy_driver = {
> +	.probe = amd_energy_probe,
> +	.remove	= amd_energy_remove,
> +	.id_table = amd_energy_ids,
> +	.driver = {
> +		.name = DRVNAME,
> +	},
> +};
> +
> +static struct platform_device *amd_energy_platdev;
> +
> +static const struct x86_cpu_id cpu_ids[] __initconst = {
> +	X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
> +	{}
> +};
> +MODULE_DEVICE_TABLE(x86cpu, cpu_ids);
> +
> +static int __init amd_energy_init(void)
> +{
> +	int ret;
> +
> +	if (!x86_match_cpu(cpu_ids))
> +		return -ENODEV;
> +
> +	ret = platform_driver_register(&amd_energy_driver);
> +	if (ret)
> +		return ret;
> +
> +	amd_energy_platdev = platform_device_alloc(DRVNAME, 0);
> +	if (!amd_energy_platdev)
> +		return -ENOMEM;
> +
> +	ret = platform_device_add(amd_energy_platdev);
> +	if (ret) {
> +		platform_device_put(amd_energy_platdev);
> +		platform_driver_unregister(&amd_energy_driver);
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static void __exit amd_energy_exit(void)
> +{
> +	platform_device_unregister(amd_energy_platdev);
> +	platform_driver_unregister(&amd_energy_driver);
> +}
> +
> +module_init(amd_energy_init);
> +module_exit(amd_energy_exit);
> +
> +MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface");
> +MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>");
> +MODULE_LICENSE("GPL");
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters
  2020-05-15 19:15 ` [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Guenter Roeck
@ 2020-05-18 19:35   ` Naveen Krishna Ch
  0 siblings, 0 replies; 5+ messages in thread
From: Naveen Krishna Ch @ 2020-05-18 19:35 UTC (permalink / raw)
  To: Guenter Roeck; +Cc: Naveen Krishna Chatradhi, linux-hwmon

Hi Guenter

Thank you for the comments, i will address them and resubmit.

On Sat, 16 May 2020 at 00:45, Guenter Roeck <linux@roeck-us.net> wrote:
>
> On Sat, May 16, 2020 at 12:22:32AM +0530, Naveen Krishna Chatradhi wrote:
> > This patch adds hwmon based amd_energy driver support for
> > family 17h processors from AMD.
> >
> > The driver provides following interface to the userspace
> > 1. Reports the per core consumption
> >       * file: "energy%d_input", label: "Ecore%03d"
> > 2. Reports per socket energy consumption
> >       * file: "energy%d_input", label: "Esocket%d"
> > 3. To, increase the wrap around time of the socket energy
> >    counters, a 64bit accumultor is implemented.
> > 4. Reports scaled energy value in Joules.
> >
> > Cc: Guenter Roeck <linux@roeck-us.net>
> > Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
> > ---
> > Changes in v6:
> > 1. Implement core accumulation
> > 2. Remove enumurating sibling threads
> > 3. Fix bug in accumulation
> > 4. Few other minor comments addressed
> >
> >  drivers/hwmon/Kconfig      |  10 +
> >  drivers/hwmon/Makefile     |   1 +
> >  drivers/hwmon/amd_energy.c | 413 +++++++++++++++++++++++++++++++++++++
> >  3 files changed, 424 insertions(+)
> >  create mode 100644 drivers/hwmon/amd_energy.c
> >
> > diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
> > index 4c62f900bf7e..e165e10c49ef 100644
> > --- a/drivers/hwmon/Kconfig
> > +++ b/drivers/hwmon/Kconfig
> > @@ -324,6 +324,16 @@ config SENSORS_FAM15H_POWER
> >         This driver can also be built as a module. If so, the module
> >         will be called fam15h_power.
> >
> > +config SENSORS_AMD_ENERGY
> > +     tristate "AMD RAPL MSR based Energy driver"
> > +     depends on X86
> > +     help
> > +       If you say yes here you get support for core and package energy
> > +       sensors, based on RAPL MSR for AMD family 17h and above CPUs.
> > +
> > +       This driver can also be built as a module. If so, the module
> > +       will be called as amd_energy.
> > +
> >  config SENSORS_APPLESMC
> >       tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
> >       depends on INPUT && X86
> > diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
> > index b0b9c8e57176..318f89dc7133 100644
> > --- a/drivers/hwmon/Makefile
> > +++ b/drivers/hwmon/Makefile
> > @@ -45,6 +45,7 @@ obj-$(CONFIG_SENSORS_ADT7411)       += adt7411.o
> >  obj-$(CONFIG_SENSORS_ADT7462)        += adt7462.o
> >  obj-$(CONFIG_SENSORS_ADT7470)        += adt7470.o
> >  obj-$(CONFIG_SENSORS_ADT7475)        += adt7475.o
> > +obj-$(CONFIG_SENSORS_AMD_ENERGY) += amd_energy.o
> >  obj-$(CONFIG_SENSORS_APPLESMC)       += applesmc.o
> >  obj-$(CONFIG_SENSORS_ARM_SCMI)       += scmi-hwmon.o
> >  obj-$(CONFIG_SENSORS_ARM_SCPI)       += scpi-hwmon.o
> > diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c
> > new file mode 100644
> > index 000000000000..f2cd5e2e1339
> > --- /dev/null
> > +++ b/drivers/hwmon/amd_energy.c
> > @@ -0,0 +1,413 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +
> > +/*
> > + * Copyright (C) 2020 Advanced Micro Devices, Inc.
> > + */
> > +#include <asm/cpu_device_id.h>
> > +
> > +#include <linux/bits.h>
> > +#include <linux/cpu.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/delay.h>
> > +#include <linux/device.h>
> > +#include <linux/hwmon.h>
> > +#include <linux/kernel.h>
> > +#include <linux/kthread.h>
> > +#include <linux/list.h>
> > +#include <linux/module.h>
> > +#include <linux/mutex.h>
> > +#include <linux/processor.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/sched.h>
> > +#include <linux/slab.h>
> > +#include <linux/topology.h>
> > +#include <linux/types.h>
> > +
> > +#define DRVNAME                      "amd_energy"
> > +
> > +#define ENERGY_PWR_UNIT_MSR  0xC0010299
> > +#define ENERGY_CORE_MSR              0xC001029A
> > +#define ENERGY_PKG_MSR               0xC001029B
> > +
> > +#define AMD_ENERGY_UNIT_MASK 0x01F00
> > +#define AMD_ENERGY_MASK              0xFFFFFFFF
> > +
> > +struct amd_energy_data {
> > +     struct hwmon_channel_info energy_info;
> > +     const struct hwmon_channel_info *info[2];
> > +     struct hwmon_chip_info chip;
> > +     /* Lock around the accumulator */
> > +     struct mutex lock;
> > +     /* Energy Status Units */
> > +     u64 energy_units;
> > +     /* An accumulator for each socket */
> > +     u64 *sock_energy_ctrs;
> > +     u64 *core_energy_ctrs;
> > +     u64 *sock_prev_value;
> > +     u64 *core_prev_value;
> > +};
> > +
> > +static struct task_struct *wrap_accumulate;
> > +static int nr_cpus, nr_socks;
>
> I don't think any of the above need to be static variables
> and could be added to struct amd_energy_data.
Sure

>
> > +static char buf[10];
> > +
> > +static int amd_energy_read_labels(struct device *dev,
> > +                               enum hwmon_sensor_types type,
> > +                               u32 attr, int channel,
> > +                               const char **str)
> > +{
> > +     if (channel >= nr_cpus)
> > +             scnprintf(buf, 10, "Esocket%u", channel - nr_cpus);
> > +     else
> > +             scnprintf(buf, 10, "Ecore%03u", channel);
> > +
>
> That won't work. It assumes that the calling function is adding protection
> against overwriting buf. If you run multiple threads in parallel which all
> access different labels, you'll get more or less random data returned.
> Worse, a thread may be busy writing into buf while another thread is
> reading from it, enabling responses such as, say, "Esocke05". The only
> solution I can imagine is a data structure referenced from truct
> md_energy_data which points to a set of pre-allocated strings.
Yes, i should have addressed this before sending.

>
> > +     *str = buf;
> > +
> > +     return 0;
> > +}
> > +
> > +static int get_energy_units(struct amd_energy_data *data)
> > +{
> > +     u64 rapl_units;
> > +     int ret;
> > +
> > +     ret = rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units);
> > +     if (ret)
> > +             return ret;
> > +
> > +     data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8;
> > +     return 0;
> > +}
> > +
> > +static void amd_accumulate_delta(struct amd_energy_data *data,
> > +                              u64 input, int cu, bool is_core)
> > +{
> > +     input &= AMD_ENERGY_MASK;
> > +
> > +     mutex_lock(&data->lock);
> > +     if (!is_core) {
> > +             if (input >= data->sock_prev_value[cu])
> > +                     data->sock_energy_ctrs[cu] +=
> > +                             input - data->sock_prev_value[cu];
> > +             else
> > +                     data->sock_energy_ctrs[cu] += UINT_MAX -
> > +                             data->sock_prev_value[cu] + input;
> > +
> > +             data->sock_prev_value[cu] = input;
> > +     } else {
> > +             if (input >= data->core_prev_value[cu])
> > +                     data->core_energy_ctrs[cu] +=
> > +                             input - data->core_prev_value[cu];
> > +             else
> > +                     data->core_energy_ctrs[cu] += UINT_MAX -
> > +                             data->core_prev_value[cu] + input;
> > +
> > +             data->core_prev_value[cu] = input;
> > +     }
> > +     mutex_unlock(&data->lock);
> > +}
> > +
> > +static int read_accumulate(struct amd_energy_data *data)
> > +{
> > +     int ret, cu;
> > +     u64 input = 0, core_input = 0;
> > +     static int core_id;
>
> Any chance you can store core_id in struct amd_energy_data ?
Sure. Will do.
>
> > +
> > +     for (cu = 0; cu < nr_socks; cu++) {
> > +             int cpu;
> > +
> > +             cpu = cpumask_first_and(cpu_online_mask,
> > +                                     cpumask_of_node(cu));
> > +
> > +             ret = rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> > +             if (ret)
> > +                     return ret;
>
> Returning an error from this function is pretty pointless.
Got it.
>
> > +
> > +             amd_accumulate_delta(data, input, cu, false);
> > +     }
> > +
> > +     if (core_id >= nr_cpus)
> > +             core_id = 0;
> > +
> > +     if (!cpu_online(core_id)) {
> > +             ret = -ENODEV;
> > +             goto out;
> > +     }
> > +
> > +     ret = rdmsrl_safe_on_cpu(core_id, ENERGY_CORE_MSR, &core_input);
> > +     if (ret)
> > +             goto out;
> > +
> > +     amd_accumulate_delta(data, core_input, core_id, true);
> > +
> > +out:
> > +     core_id++;
> > +     return ret;
> > +}
> > +
> > +static void amd_add_delta(struct amd_energy_data *data,
> > +                       u64 *input, int ch, bool is_core)
> > +{
> > +     mutex_lock(&data->lock);
> > +     if (!is_core) {
> > +             if (*input >= data->sock_prev_value[ch])
> > +                     *input += data->sock_energy_ctrs[ch] -
> > +                               data->sock_prev_value[ch];
> > +             else
> > +                     *input += UINT_MAX - data->sock_prev_value[ch] +
> > +                               data->sock_energy_ctrs[ch];
> > +     } else {
> > +             if (*input >= data->core_prev_value[ch])
> > +                     *input += data->core_energy_ctrs[ch] -
> > +                              data->core_prev_value[ch];
> > +             else
> > +                     *input += UINT_MAX - data->core_prev_value[ch] +
> > +                              data->core_energy_ctrs[ch];
> > +     }
> > +     mutex_unlock(&data->lock);
> > +}
> > +
> > +static int amd_energy_read(struct device *dev,
> > +                        enum hwmon_sensor_types type,
> > +                        u32 attr, int channel, long *val)
> > +{
> > +     struct amd_energy_data *data = dev_get_drvdata(dev);
> > +     int ret, cpu;
> > +     u64 input;
> > +     bool is_core;
> > +
> > +     if (channel >= nr_cpus) {
> > +             is_core = false;
> > +             channel = channel - nr_cpus;
> > +             cpu = cpumask_first_and(cpu_online_mask,
> > +                                     cpumask_of_node
> > +                                     (channel));
> > +             ret = rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> > +             if (ret)
> > +                     return -EAGAIN;
>
> From my experience, and looking into other code, such errors tend to be
> permanent. I see two issues with this code: One, static analyzers will
> likely complain that error codes are overwritten, and hard errors will
> result in endless retry loops.
Will remove these EAGAINs

>
> > +     } else {
> > +             is_core = true;
> > +             cpu = channel;
> > +             if (!cpu_online(cpu))
> > +                     return -ENODEV;
> > +
> > +             ret = rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
> > +             if (ret)
> > +                     return -EAGAIN;
> > +     }
> > +
> > +     input &= AMD_ENERGY_MASK;
> > +     amd_add_delta(data, &input, channel, is_core);
> > +
> I am a bit concerned that input is read first and passed to amd_add_delta()
> unlocked. That means process 1 can read it, followed by process 2. If
> process 2 gets the mutex in amd_add_delta() first, it will add a larger
> value, followed by the older value from process 1. It would be much safer
> to protect the entire read-update code with the mutex.
Got it, will address.

>
> > +     if (data->energy_units == 0 && get_energy_units(data))
> > +             return -EAGAIN;
> > +
> > +     /* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */
> > +     *val = (long)div64_ul(input * 1000000UL,
> > +                                   BIT(data->energy_units));
>
> Alignment seems to be off here. Please run checkpatch --strict.
> Also, is the typecast really needed ?
>
> > +
> > +     return 0;
> > +}
> > +
> > +static umode_t amd_energy_is_visible(const void *_data,
> > +                                  enum hwmon_sensor_types type,
> > +                                  u32 attr, int channel)
> > +{
> > +     return 0444;
> > +}
> > +
> > +static int energy_accumulator(void *p)
> > +{
> > +     struct amd_energy_data *data = (struct amd_energy_data *)p;
> > +
> > +     while (!kthread_should_stop()) {
> > +             /*
> > +              * Ignoring the conditions such as
> > +              * cpu being offline or rdmsr failure
> > +              */
> > +             read_accumulate(data);
> > +
> > +             set_current_state(TASK_INTERRUPTIBLE);
> > +             if (kthread_should_stop())
> > +                     break;
> > +
> > +             /*
> > +              * On a 240W system, with default resolution the
> > +              * Socket Energy status register may wrap around in
> > +              * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins)
> > +              *
> > +              * let us accumulate for every 100secs
> > +              */
> > +             schedule_timeout(msecs_to_jiffies(100000));
> > +     }
> > +     return 0;
> > +}
> > +
> > +static const struct hwmon_ops amd_energy_ops = {
> > +     .is_visible = amd_energy_is_visible,
> > +     .read = amd_energy_read,
> > +     .read_string = amd_energy_read_labels,
> > +};
> > +
> > +static int amd_create_sensor(struct device *dev,
> > +                          struct amd_energy_data *data,
> > +                          u8 type, u32 config)
> > +{
> > +     int i, num_siblings;
> > +     u32 *s_config;
> > +     struct hwmon_channel_info *info = &data->energy_info;
> > +
> > +     /* Identify the number of siblings per core */
> > +     num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
> > +
> > +     nr_socks = num_possible_nodes();
> > +
> > +     /*
> > +      * Energy counter register is accessed at core level.
> > +      * Hence, filterout the siblings.
> > +      */
> > +     nr_cpus = num_present_cpus()/num_siblings;
> > +
> > +     s_config = devm_kcalloc(dev, nr_cpus + nr_socks,
> > +                             sizeof(u32), GFP_KERNEL);
> > +     if (!s_config)
> > +             return -ENOMEM;
> > +
> > +     data->sock_energy_ctrs = devm_kcalloc(dev, nr_socks,
> > +                             sizeof(u64), GFP_KERNEL);
> > +     if (!data->sock_energy_ctrs)
> > +             return -ENOMEM;
> > +
> > +     data->core_energy_ctrs = devm_kcalloc(dev, nr_cpus,
> > +                             sizeof(u64), GFP_KERNEL);
> > +     if (!data->core_energy_ctrs)
> > +             return -ENOMEM;
> > +
> > +     data->sock_prev_value = devm_kcalloc(dev, nr_socks,
> > +                             sizeof(u64), GFP_KERNEL);
> > +     if (!data->sock_prev_value)
> > +             return -ENOMEM;
> > +
> > +     data->core_prev_value = devm_kcalloc(dev, nr_cpus,
> > +                             sizeof(u64), GFP_KERNEL);
> > +     if (!data->core_prev_value)
> > +             return -ENOMEM;
> > +
> > +     info->type = type;
> > +     info->config = s_config;
> > +
> > +     for (i = 0; i < nr_cpus + nr_socks; i++)
> > +             s_config[i] = config;
> > +
> > +     return 0;
> > +}
> > +
> > +static int amd_energy_probe(struct platform_device *pdev)
> > +{
> > +     struct device *hwmon_dev;
> > +     struct amd_energy_data *data;
> > +     struct device *dev = &pdev->dev;
> > +     int ret;
> > +
> > +     data = devm_kzalloc(dev,
> > +                         sizeof(struct amd_energy_data), GFP_KERNEL);
> > +     if (!data)
> > +             return -ENOMEM;
> > +
> > +     data->chip.ops = &amd_energy_ops;
> > +     data->chip.info = data->info;
> > +
> > +     /* Populate per-core energy reporting */
> > +     data->info[0] = &data->energy_info;
> > +     amd_create_sensor(dev, data,  hwmon_energy,
> > +                       HWMON_E_INPUT | HWMON_E_LABEL);
> > +
> > +     mutex_init(&data->lock);
> > +
> > +     ret = get_energy_units(data);
> > +     if (ret)
> > +             return ret;
> > +
> > +     hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME,
> > +                                                      data,
> > +                                                      &data->chip,
> > +                                                      NULL);
> > +     if (IS_ERR(hwmon_dev))
> > +             return PTR_ERR(hwmon_dev);
> > +
> > +     wrap_accumulate = kthread_run(energy_accumulator, data,
> > +                                   "%s", dev_name(hwmon_dev));
> > +     if (IS_ERR(wrap_accumulate))
> > +             return PTR_ERR(wrap_accumulate);
> > +
> > +     return PTR_ERR_OR_ZERO(wrap_accumulate);
> > +}
> > +
> > +static int amd_energy_remove(struct platform_device *pdev)
> > +{
> > +     if (wrap_accumulate)
> > +             kthread_stop(wrap_accumulate);
> > +
> > +     return 0;
> > +}
> > +
> > +static const struct platform_device_id amd_energy_ids[] = {
> > +     { .name = DRVNAME, },
> > +     {}
> > +};
> > +MODULE_DEVICE_TABLE(platform, amd_energy_ids);
> > +
> > +static struct platform_driver amd_energy_driver = {
> > +     .probe = amd_energy_probe,
> > +     .remove = amd_energy_remove,
> > +     .id_table = amd_energy_ids,
> > +     .driver = {
> > +             .name = DRVNAME,
> > +     },
> > +};
> > +
> > +static struct platform_device *amd_energy_platdev;
> > +
> > +static const struct x86_cpu_id cpu_ids[] __initconst = {
> > +     X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
> > +     {}
> > +};
> > +MODULE_DEVICE_TABLE(x86cpu, cpu_ids);
> > +
> > +static int __init amd_energy_init(void)
> > +{
> > +     int ret;
> > +
> > +     if (!x86_match_cpu(cpu_ids))
> > +             return -ENODEV;
> > +
> > +     ret = platform_driver_register(&amd_energy_driver);
> > +     if (ret)
> > +             return ret;
> > +
> > +     amd_energy_platdev = platform_device_alloc(DRVNAME, 0);
> > +     if (!amd_energy_platdev)
> > +             return -ENOMEM;
> > +
> > +     ret = platform_device_add(amd_energy_platdev);
> > +     if (ret) {
> > +             platform_device_put(amd_energy_platdev);
> > +             platform_driver_unregister(&amd_energy_driver);
> > +             return ret;
> > +     }
> > +
> > +     return ret;
> > +}
> > +
> > +static void __exit amd_energy_exit(void)
> > +{
> > +     platform_device_unregister(amd_energy_platdev);
> > +     platform_driver_unregister(&amd_energy_driver);
> > +}
> > +
> > +module_init(amd_energy_init);
> > +module_exit(amd_energy_exit);
> > +
> > +MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface");
> > +MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>");
> > +MODULE_LICENSE("GPL");
> > --
> > 2.17.1
> >



-- 
Shine bright,
(: Nav :)

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-15 18:52 [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Naveen Krishna Chatradhi
2020-05-15 18:52 ` [PATCH 2/3 v6] hwmon: (amd_energy) Add documentation Naveen Krishna Chatradhi
2020-05-15 18:52 ` [PATCH 3/3 v6] MAINTAINERS: add entry for AMD energy driver Naveen Krishna Chatradhi
2020-05-15 19:15 ` [PATCH 1/3 v6] hwmon: Add amd_energy driver to report energy counters Guenter Roeck
2020-05-18 19:35   ` Naveen Krishna Ch

Linux-Hwmon Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-hwmon/0 linux-hwmon/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-hwmon linux-hwmon/ https://lore.kernel.org/linux-hwmon \
		linux-hwmon@vger.kernel.org
	public-inbox-index linux-hwmon

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-hwmon


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git