All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/3] introduce intel_rapl driver
@ 2011-05-26  8:34 Zhang Rui
  2011-05-26  9:43 ` Peter Zijlstra
                   ` (5 more replies)
  0 siblings, 6 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-26  8:34 UTC (permalink / raw)
  To: LKML, linux-pm; +Cc: a.p.zijlstra, mingo, acme, ming.m.lin, Brown, Len


Introduce Intel RAPL driver.

RAPL (running average power limit) is a new feature which provides mechanisms
to enforce power consumption limit, on some new processors.

RAPL provides MSRs reporting the total amount of energy consumed
by the package/core/uncore/dram.
Further more, by using RAPL, OS can set a power bugdet in a certain time window,
and let Hardware to throttle the processor P/T-state to meet this enery limitation.

Currently, we don't have the plan to support the RAPL power control,
but we do want to export the package/core/uncore/dram power consumption
information via perf tool first.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/platform/x86/Kconfig      |    8 
 drivers/platform/x86/Makefile     |    1 
 drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
 include/linux/perf_event.h        |    4 
 4 files changed, 381 insertions(+)

Index: linux-2.6/drivers/platform/x86/Kconfig
===================================================================
--- linux-2.6.orig/drivers/platform/x86/Kconfig
+++ linux-2.6/drivers/platform/x86/Kconfig
@@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
 	  To compile this driver as a module, choose M here: the module
 	  will be called samsung-laptop.
 
+config INTEL_RAPL
+	tristate "Intel RAPL Support"
+	depends on X86
+	default y
+	---help---
+	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
+	  power consumption limit.
+
 endif # X86_PLATFORM_DEVICES
Index: linux-2.6/drivers/platform/x86/Makefile
===================================================================
--- linux-2.6.orig/drivers/platform/x86/Makefile
+++ linux-2.6/drivers/platform/x86/Makefile
@@ -42,3 +42,4 @@ obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
 obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
 obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
 obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
+obj-$(CONFIG_INTEL_RAPL)	+= intel_rapl.o
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -107,6 +107,10 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
 	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
 	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
+	PERF_COUNT_SW_PKG_ENERGY		= 9,
+	PERF_COUNT_SW_CORE_ENERGY		= 10,
+	PERF_COUNT_SW_UNCORE_ENERGY		= 11,
+	PERF_COUNT_SW_DRAM_ENERGY		= 12,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
Index: linux-2.6/drivers/platform/x86/intel_rapl.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/platform/x86/intel_rapl.c
@@ -0,0 +1,368 @@
+/*
+ *  Intel RAPL interface driver
+ *
+ *  Copyright (C) 2010-2011 Zhang Rui <rui.zhang@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <asm/processor.h>
+#include <linux/perf_event.h>
+
+MODULE_AUTHOR("Zhang Rui");
+MODULE_DESCRIPTION("Intel RAPL interface Driver");
+MODULE_LICENSE("GPL");
+
+#define PREFIX "Intel: RAPL: "
+
+#define MSR_RAPL_POWER_UNIT		0x606
+
+/*
+ * Platform specific RAPL Domains.
+ * Note that PP1 RAPL Domain is supported on 062A only
+ * And DRAM RAPL Domain is supported on 062D only
+ */
+/* Package RAPL Domain */
+#define MSR_PKG_RAPL_POWER_LIMIT	0x610
+#define MSR_PKG_ENERGY_STATUS		0x611
+#define MSR_PKG_PERF_STATUS		0x613
+#define MSR_PKG_POWER_INFO		0x614
+
+/* PP0 RAPL Domain */
+#define MSR_PP0_POWER_LIMIT		0x638
+#define MSR_PP0_ENERGY_STATUS		0x639
+#define MSR_PP0_POLICY			0x63A
+#define MSR_PP0_PERF_STATUS		0x63B
+
+/* PP1 RAPL Domain, may reflect to uncore devices */
+#define MSR_PP1_POWER_LIMIT		0x640
+#define MSR_PP1_ENERGY_STATUS		0x641
+#define MSR_PP1_POLICY			0x642
+
+/* DRAM RAPL Domain */
+#define MSR_DRAM_POWER_LIMIT		0x618
+#define MSR_DRAM_ENERGY_STATUS		0x619
+#define MSR_DRAM_PERF_STATUS		0x61B
+#define MSR_DRAM_POWER_INFO		0x61C
+
+/* RAPL UNIT BITMASK */
+#define POWER_UNIT_OFFSET	0
+#define POWER_UNIT_MASK		0x0F
+
+#define ENERGY_UNIT_OFFSET	0x08
+#define ENERGY_UNIT_MASK	0x1F00
+
+#define TIME_UNIT_OFFSET	0x10
+#define TIME_UNIT_MASK		0xF000
+
+static int rapl_pmu_pkg_event_init(struct perf_event *event);
+static int rapl_pmu_core_event_init(struct perf_event *event);
+static int rapl_pmu_uncore_event_init(struct perf_event *event);
+static int rapl_pmu_dram_event_init(struct perf_event *event);
+static void rapl_event_start(struct perf_event *event, int flags);
+static void rapl_event_stop(struct perf_event *event, int flags);
+static int rapl_event_add(struct perf_event *event, int flags);
+static void rapl_event_del(struct perf_event *event, int flags);
+static void rapl_event_read(struct perf_event *event);
+
+enum rapl_domain_id {
+	RAPL_DOMAIN_PKG,
+	RAPL_DOMAIN_PP0,
+	RAPL_DOMAIN_PP1,
+	RAPL_DOMAIN_DRAM,
+	RAPL_DOMAIN_MAX
+};
+
+struct rapl_domain_msr {
+	int	limit;
+	int	status;
+};
+
+struct rapl_domain {
+	enum rapl_domain_id domain_id;
+	struct rapl_domain_msr msrs;
+	struct pmu pmu;
+	enum perf_sw_ids event_id;
+	int valid;
+};
+
+#define to_rapl_domain(p) container_of(p, struct rapl_domain, pmu);
+
+static struct rapl_domain rapl_domains[] = {
+	[RAPL_DOMAIN_PKG] = {
+		.domain_id = RAPL_DOMAIN_PKG,
+		.msrs	= {
+			.limit	= MSR_PKG_RAPL_POWER_LIMIT,
+			.status	= MSR_PKG_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_pkg_energy_meter",
+			.event_init	= rapl_pmu_pkg_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_PKG_ENERGY,
+		.valid	= 1,
+	},
+	[RAPL_DOMAIN_PP0] = {
+		.domain_id = RAPL_DOMAIN_PP0,
+		.msrs	= {
+			.limit	= MSR_PP0_POWER_LIMIT,
+			.status	= MSR_PP0_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_core_energy_meter",
+			.event_init	= rapl_pmu_core_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_CORE_ENERGY,
+		.valid	= 1,
+	},
+	[RAPL_DOMAIN_PP1] = {
+		.domain_id = RAPL_DOMAIN_PP1,
+		.msrs	= {
+			.limit	= MSR_PP1_POWER_LIMIT,
+			.status	= MSR_PP1_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_uncore_energy_meter",
+			.event_init	= rapl_pmu_uncore_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_UNCORE_ENERGY,
+	},
+	[RAPL_DOMAIN_DRAM] = {
+		.domain_id = RAPL_DOMAIN_DRAM,
+		.msrs	= {
+			.limit	= MSR_DRAM_POWER_LIMIT,
+			.status	= MSR_DRAM_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_dram_energy_meter",
+			.event_init	= rapl_pmu_dram_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_DRAM_ENERGY,
+	},
+};
+
+static unsigned int power_unit_divisor;
+static unsigned int energy_unit_divisor;
+static unsigned int time_unit_divisor;
+
+enum unit_type {
+	POWER_UNIT,
+	ENERGY_UNIT,
+	TIME_UNIT
+};
+static u64 rapl_unit_xlate(enum unit_type type, u64 value, int action)
+{
+	u64 divisor;
+
+	switch (type) {
+	case POWER_UNIT:
+		divisor = power_unit_divisor;
+		break;
+	case ENERGY_UNIT:
+		divisor = energy_unit_divisor;
+		break;
+	case TIME_UNIT:
+		divisor = time_unit_divisor;
+		break;
+	default:
+		return 0;
+	};
+
+	if (action)
+		return value * divisor; /* value is from users */
+	else
+		return div64_u64(value, divisor); /* value is from MSR */
+}
+
+/* show the energy status, in Jelous */
+static int rapl_read_energy(struct rapl_domain *domain)
+{
+	u64 value;
+	u32 msr = domain->msrs.status;
+
+	rdmsrl(msr, value);
+	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
+}
+
+static void rapl_event_update(struct perf_event *event)
+{
+	s64 prev;
+	u64 now;
+	struct rapl_domain *domain = to_rapl_domain(event->pmu);
+
+	now = rapl_read_energy(domain);
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
+}
+
+static void rapl_event_start(struct perf_event *event, int flags)
+{
+	struct rapl_domain *domain = to_rapl_domain(event->pmu);
+
+	local64_set(&event->hw.prev_count, rapl_read_energy(domain));
+	perf_swevent_start_hrtimer(event);
+}
+
+static void rapl_event_stop(struct perf_event *event, int flags)
+{
+	perf_swevent_cancel_hrtimer(event);
+	rapl_event_update(event);
+}
+
+static int rapl_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		rapl_event_start(event, flags);
+	return 0;
+}
+static void rapl_event_del(struct perf_event *event, int flags)
+{
+	rapl_event_stop(event, flags);
+}
+
+static void rapl_event_read(struct perf_event *event)
+{
+	rapl_event_update(event);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event,
+			       enum rapl_domain_id id)
+{
+	struct rapl_domain *domain = &(rapl_domains[id]);
+
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != domain->event_id)
+		return -ENOENT;
+
+	/* Do periodecal update every second */
+	event->attr.freq = 1;
+	event->attr.sample_period = 1;
+
+	perf_swevent_init_hrtimer(event);
+
+	return 0;
+}
+
+static int rapl_pmu_pkg_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_PKG);
+}
+
+static int rapl_pmu_core_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_PP0);
+}
+
+static int rapl_pmu_uncore_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_PP1);
+}
+
+static int rapl_pmu_dram_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_DRAM);
+}
+
+static int rapl_check_unit(void)
+{
+	u64 output;
+	u32 value;
+
+	rdmsrl(MSR_RAPL_POWER_UNIT, output);
+
+	/* energy unit: 1/enery_unit_divisor Joules */
+	value = (output & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+	energy_unit_divisor = 1 << value;
+
+	/* power unit: 1/power_unit_divisor Watts */
+	value = (output & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+	power_unit_divisor = 1 << value;
+
+	/* time unit: 1/time_unit_divisor Seconds */
+	value =(output & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+	time_unit_divisor = 1 << value;
+
+	return 0;
+}
+
+static int __init intel_rapl_init(void)
+{
+	enum rapl_domain_id id;
+
+	/*
+	 * RAPL features are only supported on processors have a CPUID
+	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
+	 */
+	if (boot_cpu_data.x86 != 0x06)
+		return -ENODEV;
+
+	if (boot_cpu_data.x86_model == 0x2A)
+		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
+	else if (boot_cpu_data.x86_model == 0x2D)
+		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
+	else
+		return -ENODEV;
+
+	if (rapl_check_unit())
+		return -ENODEV;
+
+	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
+		if (rapl_domains[id].valid)
+			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
+	return 0;
+}
+
+static void __exit intel_rapl_exit(void)
+{
+	enum rapl_domain_id id;
+
+	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
+		if (rapl_domains[id].valid)
+			perf_pmu_unregister(&(rapl_domains[id].pmu));
+}
+
+module_init(intel_rapl_init);
+module_exit(intel_rapl_exit);



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  8:34 [PATCH 2/3] introduce intel_rapl driver Zhang Rui
  2011-05-26  9:43 ` Peter Zijlstra
@ 2011-05-26  9:43 ` Peter Zijlstra
  2011-05-26 10:21   ` Peter Zijlstra
                     ` (5 more replies)
  2011-05-26 15:48 ` Randy Dunlap
                   ` (3 subsequent siblings)
  5 siblings, 6 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-26  9:43 UTC (permalink / raw)
  To: Zhang Rui
  Cc: LKML, linux-pm, mingo, acme, ming.m.lin, Brown, Len, Matt Fleming

On Thu, 2011-05-26 at 16:34 +0800, Zhang Rui wrote:
> Introduce Intel RAPL driver.
> 
> RAPL (running average power limit) is a new feature which provides mechanisms
> to enforce power consumption limit, on some new processors.
> 
> RAPL provides MSRs reporting the total amount of energy consumed
> by the package/core/uncore/dram.
> Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> 
> Currently, we don't have the plan to support the RAPL power control,
> but we do want to export the package/core/uncore/dram power consumption
> information via perf tool first.

Do note that perf is not the right API for those control bits. If you
never plan to expose those, that's fine. If you do, you'll likely need a
parallel API (your own device) for accessing that. Please consider if
using separate APIs for reading/writing this resource is what you want
and mention these considerations in your future changelog.

> Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> ---
>  drivers/platform/x86/Kconfig      |    8 
>  drivers/platform/x86/Makefile     |    1 
>  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
>  include/linux/perf_event.h        |    4 
>  4 files changed, 381 insertions(+)
> 
> Index: linux-2.6/drivers/platform/x86/Kconfig
> ===================================================================
> --- linux-2.6.orig/drivers/platform/x86/Kconfig
> +++ linux-2.6/drivers/platform/x86/Kconfig
> @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called samsung-laptop.
>  
> +config INTEL_RAPL
> +	tristate "Intel RAPL Support"
> +	depends on X86

Also very much depends on perf being there.

> +	default y
> +	---help---
> +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
> +	  power consumption limit.

The enforce part seems dubious, perf is purely about observing state it
doesn't enforce anything. Also this help text could do with expanding in
general.

>  endif # X86_PLATFORM_DEVICES
> Index: linux-2.6/drivers/platform/x86/Makefile
> ===================================================================
> --- linux-2.6.orig/drivers/platform/x86/Makefile
> +++ linux-2.6/drivers/platform/x86/Makefile
> @@ -42,3 +42,4 @@ obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
>  obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
>  obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
>  obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
> +obj-$(CONFIG_INTEL_RAPL)	+= intel_rapl.o
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -107,6 +107,10 @@ enum perf_sw_ids {
>  	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
>  	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
>  	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
> +	PERF_COUNT_SW_PKG_ENERGY		= 9,
> +	PERF_COUNT_SW_CORE_ENERGY		= 10,
> +	PERF_COUNT_SW_UNCORE_ENERGY		= 11,
> +	PERF_COUNT_SW_DRAM_ENERGY		= 12,

Not going to happen, RAPL registers its own pmu (wrongly, see below),
with that it (should) get its own perf_event_attr::type and thus should
have its own ::config space, you do not get to pollute the
PERF_TYPE_SOFTWARE config space.

Currently there isn't a way to expose the events in sysfs, but we do
want that, its mostly a matter of getting all involved parties to agree
on a format and implementing it.

>  	PERF_COUNT_SW_MAX,			/* non-ABI */
>  };
> Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/drivers/platform/x86/intel_rapl.c

> +#define MSR_RAPL_POWER_UNIT		0x606
> +
> +/*
> + * Platform specific RAPL Domains.
> + * Note that PP1 RAPL Domain is supported on 062A only
> + * And DRAM RAPL Domain is supported on 062D only
> + */

0x62[AD] is useless, please use proper names.

> +/* Package RAPL Domain */
> +#define MSR_PKG_RAPL_POWER_LIMIT	0x610
> +#define MSR_PKG_ENERGY_STATUS		0x611
> +#define MSR_PKG_PERF_STATUS		0x613
> +#define MSR_PKG_POWER_INFO		0x614
> +
> +/* PP0 RAPL Domain */
> +#define MSR_PP0_POWER_LIMIT		0x638
> +#define MSR_PP0_ENERGY_STATUS		0x639
> +#define MSR_PP0_POLICY			0x63A
> +#define MSR_PP0_PERF_STATUS		0x63B
> +
> +/* PP1 RAPL Domain, may reflect to uncore devices */
> +#define MSR_PP1_POWER_LIMIT		0x640
> +#define MSR_PP1_ENERGY_STATUS		0x641
> +#define MSR_PP1_POLICY			0x642
> +
> +/* DRAM RAPL Domain */
> +#define MSR_DRAM_POWER_LIMIT		0x618
> +#define MSR_DRAM_ENERGY_STATUS		0x619
> +#define MSR_DRAM_PERF_STATUS		0x61B
> +#define MSR_DRAM_POWER_INFO		0x61C
> +
> +/* RAPL UNIT BITMASK */
> +#define POWER_UNIT_OFFSET	0
> +#define POWER_UNIT_MASK		0x0F
> +
> +#define ENERGY_UNIT_OFFSET	0x08
> +#define ENERGY_UNIT_MASK	0x1F00
> +
> +#define TIME_UNIT_OFFSET	0x10
> +#define TIME_UNIT_MASK		0xF000

Are you sure? (x & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET == 0.
You either want a mask of 0xF0000, or an offset of 0x0c.

> +static int rapl_pmu_pkg_event_init(struct perf_event *event);
> +static int rapl_pmu_core_event_init(struct perf_event *event);
> +static int rapl_pmu_uncore_event_init(struct perf_event *event);
> +static int rapl_pmu_dram_event_init(struct perf_event *event);
> +static void rapl_event_start(struct perf_event *event, int flags);
> +static void rapl_event_stop(struct perf_event *event, int flags);
> +static int rapl_event_add(struct perf_event *event, int flags);
> +static void rapl_event_del(struct perf_event *event, int flags);
> +static void rapl_event_read(struct perf_event *event);
> +
> +enum rapl_domain_id {
> +	RAPL_DOMAIN_PKG,
> +	RAPL_DOMAIN_PP0,
> +	RAPL_DOMAIN_PP1,
> +	RAPL_DOMAIN_DRAM,
> +	RAPL_DOMAIN_MAX
> +};
> +
> +struct rapl_domain_msr {
> +	int	limit;
> +	int	status;
> +};
> +
> +struct rapl_domain {
> +	enum rapl_domain_id domain_id;
> +	struct rapl_domain_msr msrs;
> +	struct pmu pmu;
> +	enum perf_sw_ids event_id;
> +	int valid;
> +};

You could use the rapl_domain_id as your ::config space.


> +static unsigned int power_unit_divisor;
> +static unsigned int energy_unit_divisor;
> +static unsigned int time_unit_divisor;
> +
> +enum unit_type {
> +	POWER_UNIT,
> +	ENERGY_UNIT,
> +	TIME_UNIT
> +};
> +static u64 rapl_unit_xlate(enum unit_type type, u64 value, int action)
> +{
> +	u64 divisor;
> +
> +	switch (type) {
> +	case POWER_UNIT:
> +		divisor = power_unit_divisor;
> +		break;
> +	case ENERGY_UNIT:
> +		divisor = energy_unit_divisor;
> +		break;
> +	case TIME_UNIT:
> +		divisor = time_unit_divisor;
> +		break;
> +	default:
> +		return 0;
> +	};
> +
> +	if (action)
> +		return value * divisor; /* value is from users */
> +	else
> +		return div64_u64(value, divisor); /* value is from MSR */
> +}

Please see the comment down by rapl_check_unit(), this is just too wrong
to live.

> +/* show the energy status, in Jelous */
> +static int rapl_read_energy(struct rapl_domain *domain)
> +{
> +	u64 value;
> +	u32 msr = domain->msrs.status;
> +
> +	rdmsrl(msr, value);
> +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> +}
> +
> +static void rapl_event_update(struct perf_event *event)
> +{
> +	s64 prev;
> +	u64 now;
> +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> +
> +	now = rapl_read_energy(domain);

So I had to get the Intel SDM because your driver lacks all useful
information, and I learned that the RAPL status MSRs contain 32 bits.

So you get those 32 bits, divide them by some number,

> +	prev = local64_xchg(&event->hw.prev_count, now);
> +	local64_add(now - prev, &event->count);

And then expect that to work?

I don't think so..

> +}
> +
> +static void rapl_event_start(struct perf_event *event, int flags)
> +{
> +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> +
> +	local64_set(&event->hw.prev_count, rapl_read_energy(domain));
> +	perf_swevent_start_hrtimer(event);
> +}
> +
> +static void rapl_event_stop(struct perf_event *event, int flags)
> +{
> +	perf_swevent_cancel_hrtimer(event);
> +	rapl_event_update(event);
> +}

> +static int rapl_pmu_event_init(struct perf_event *event,
> +			       enum rapl_domain_id id)
> +{
> +	struct rapl_domain *domain = &(rapl_domains[id]);
> +
> +	if (event->attr.type != PERF_TYPE_SOFTWARE)
> +		return -ENOENT;
> +
> +	if (event->attr.config != domain->event_id)
> +		return -ENOENT;
> +
> +	/* Do periodecal update every second */
> +	event->attr.freq = 1;
> +	event->attr.sample_period = 1;
> +
> +	perf_swevent_init_hrtimer(event);
> +
> +	return 0;
> +}

That's just wrong.. the reason you're wanting to have this timer is to
avoid the RAPL MSRs from overflowing and you loosing offsets, right?

But the above is actually forcing the event to create samples on a
totally unrelated time base.

RAPL should fail to create a sampling event since it doesn't have the
capability to trigger overflow interrupts based on its events.

If you want a timer, add one, but don't do this.

If you expect you actually want to sample, use this event as part of a
group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
was working on patches to make perf-record capable of this.

> +static int rapl_check_unit(void)

Shouldn't that be called: rapl_init_unit()? You're not actually
verifying anything, you're setting-up state.

> +{
> +	u64 output;
> +	u32 value;
> +
> +	rdmsrl(MSR_RAPL_POWER_UNIT, output);
> +
> +	/* energy unit: 1/enery_unit_divisor Joules */
> +	value = (output & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
> +	energy_unit_divisor = 1 << value;
> +
> +	/* power unit: 1/power_unit_divisor Watts */
> +	value = (output & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
> +	power_unit_divisor = 1 << value;
> +
> +	/* time unit: 1/time_unit_divisor Seconds */
> +	value =(output & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
> +	time_unit_divisor = 1 << value;

So you're saying these factors are powers-of-two, please look at
rapl_unit_xlate and try again.

+
> +	return 0;
> +}
> +
> +static int __init intel_rapl_init(void)
> +{
> +	enum rapl_domain_id id;
> +
> +	/*
> +	 * RAPL features are only supported on processors have a CPUID
> +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> +	 */
> +	if (boot_cpu_data.x86 != 0x06)
> +		return -ENODEV;
> +
> +	if (boot_cpu_data.x86_model == 0x2A)
> +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> +	else if (boot_cpu_data.x86_model == 0x2D)
> +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> +	else
> +		return -ENODEV;

Names please, again 06_2[AD] is useless we could have surmised that by
reading the code, nobody knows which part that is.

  a += 4; /* increment by 4 */

quality comments here.

> +	if (rapl_check_unit())
> +		return -ENODEV;
> +
> +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> +		if (rapl_domains[id].valid)
> +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);

Uhm, hell no!, you get to use type = -1.

> +	return 0;
> +}
> +

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  8:34 [PATCH 2/3] introduce intel_rapl driver Zhang Rui
@ 2011-05-26  9:43 ` Peter Zijlstra
  2011-05-26  9:43 ` Peter Zijlstra
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-26  9:43 UTC (permalink / raw)
  To: Zhang Rui; +Cc: Matt Fleming, ming.m.lin, LKML, acme, linux-pm, mingo

On Thu, 2011-05-26 at 16:34 +0800, Zhang Rui wrote:
> Introduce Intel RAPL driver.
> 
> RAPL (running average power limit) is a new feature which provides mechanisms
> to enforce power consumption limit, on some new processors.
> 
> RAPL provides MSRs reporting the total amount of energy consumed
> by the package/core/uncore/dram.
> Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> 
> Currently, we don't have the plan to support the RAPL power control,
> but we do want to export the package/core/uncore/dram power consumption
> information via perf tool first.

Do note that perf is not the right API for those control bits. If you
never plan to expose those, that's fine. If you do, you'll likely need a
parallel API (your own device) for accessing that. Please consider if
using separate APIs for reading/writing this resource is what you want
and mention these considerations in your future changelog.

> Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> ---
>  drivers/platform/x86/Kconfig      |    8 
>  drivers/platform/x86/Makefile     |    1 
>  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
>  include/linux/perf_event.h        |    4 
>  4 files changed, 381 insertions(+)
> 
> Index: linux-2.6/drivers/platform/x86/Kconfig
> ===================================================================
> --- linux-2.6.orig/drivers/platform/x86/Kconfig
> +++ linux-2.6/drivers/platform/x86/Kconfig
> @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called samsung-laptop.
>  
> +config INTEL_RAPL
> +	tristate "Intel RAPL Support"
> +	depends on X86

Also very much depends on perf being there.

> +	default y
> +	---help---
> +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
> +	  power consumption limit.

The enforce part seems dubious, perf is purely about observing state it
doesn't enforce anything. Also this help text could do with expanding in
general.

>  endif # X86_PLATFORM_DEVICES
> Index: linux-2.6/drivers/platform/x86/Makefile
> ===================================================================
> --- linux-2.6.orig/drivers/platform/x86/Makefile
> +++ linux-2.6/drivers/platform/x86/Makefile
> @@ -42,3 +42,4 @@ obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
>  obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
>  obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
>  obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
> +obj-$(CONFIG_INTEL_RAPL)	+= intel_rapl.o
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -107,6 +107,10 @@ enum perf_sw_ids {
>  	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
>  	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
>  	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
> +	PERF_COUNT_SW_PKG_ENERGY		= 9,
> +	PERF_COUNT_SW_CORE_ENERGY		= 10,
> +	PERF_COUNT_SW_UNCORE_ENERGY		= 11,
> +	PERF_COUNT_SW_DRAM_ENERGY		= 12,

Not going to happen, RAPL registers its own pmu (wrongly, see below),
with that it (should) get its own perf_event_attr::type and thus should
have its own ::config space, you do not get to pollute the
PERF_TYPE_SOFTWARE config space.

Currently there isn't a way to expose the events in sysfs, but we do
want that, its mostly a matter of getting all involved parties to agree
on a format and implementing it.

>  	PERF_COUNT_SW_MAX,			/* non-ABI */
>  };
> Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/drivers/platform/x86/intel_rapl.c

> +#define MSR_RAPL_POWER_UNIT		0x606
> +
> +/*
> + * Platform specific RAPL Domains.
> + * Note that PP1 RAPL Domain is supported on 062A only
> + * And DRAM RAPL Domain is supported on 062D only
> + */

0x62[AD] is useless, please use proper names.

> +/* Package RAPL Domain */
> +#define MSR_PKG_RAPL_POWER_LIMIT	0x610
> +#define MSR_PKG_ENERGY_STATUS		0x611
> +#define MSR_PKG_PERF_STATUS		0x613
> +#define MSR_PKG_POWER_INFO		0x614
> +
> +/* PP0 RAPL Domain */
> +#define MSR_PP0_POWER_LIMIT		0x638
> +#define MSR_PP0_ENERGY_STATUS		0x639
> +#define MSR_PP0_POLICY			0x63A
> +#define MSR_PP0_PERF_STATUS		0x63B
> +
> +/* PP1 RAPL Domain, may reflect to uncore devices */
> +#define MSR_PP1_POWER_LIMIT		0x640
> +#define MSR_PP1_ENERGY_STATUS		0x641
> +#define MSR_PP1_POLICY			0x642
> +
> +/* DRAM RAPL Domain */
> +#define MSR_DRAM_POWER_LIMIT		0x618
> +#define MSR_DRAM_ENERGY_STATUS		0x619
> +#define MSR_DRAM_PERF_STATUS		0x61B
> +#define MSR_DRAM_POWER_INFO		0x61C
> +
> +/* RAPL UNIT BITMASK */
> +#define POWER_UNIT_OFFSET	0
> +#define POWER_UNIT_MASK		0x0F
> +
> +#define ENERGY_UNIT_OFFSET	0x08
> +#define ENERGY_UNIT_MASK	0x1F00
> +
> +#define TIME_UNIT_OFFSET	0x10
> +#define TIME_UNIT_MASK		0xF000

Are you sure? (x & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET == 0.
You either want a mask of 0xF0000, or an offset of 0x0c.

> +static int rapl_pmu_pkg_event_init(struct perf_event *event);
> +static int rapl_pmu_core_event_init(struct perf_event *event);
> +static int rapl_pmu_uncore_event_init(struct perf_event *event);
> +static int rapl_pmu_dram_event_init(struct perf_event *event);
> +static void rapl_event_start(struct perf_event *event, int flags);
> +static void rapl_event_stop(struct perf_event *event, int flags);
> +static int rapl_event_add(struct perf_event *event, int flags);
> +static void rapl_event_del(struct perf_event *event, int flags);
> +static void rapl_event_read(struct perf_event *event);
> +
> +enum rapl_domain_id {
> +	RAPL_DOMAIN_PKG,
> +	RAPL_DOMAIN_PP0,
> +	RAPL_DOMAIN_PP1,
> +	RAPL_DOMAIN_DRAM,
> +	RAPL_DOMAIN_MAX
> +};
> +
> +struct rapl_domain_msr {
> +	int	limit;
> +	int	status;
> +};
> +
> +struct rapl_domain {
> +	enum rapl_domain_id domain_id;
> +	struct rapl_domain_msr msrs;
> +	struct pmu pmu;
> +	enum perf_sw_ids event_id;
> +	int valid;
> +};

You could use the rapl_domain_id as your ::config space.


> +static unsigned int power_unit_divisor;
> +static unsigned int energy_unit_divisor;
> +static unsigned int time_unit_divisor;
> +
> +enum unit_type {
> +	POWER_UNIT,
> +	ENERGY_UNIT,
> +	TIME_UNIT
> +};
> +static u64 rapl_unit_xlate(enum unit_type type, u64 value, int action)
> +{
> +	u64 divisor;
> +
> +	switch (type) {
> +	case POWER_UNIT:
> +		divisor = power_unit_divisor;
> +		break;
> +	case ENERGY_UNIT:
> +		divisor = energy_unit_divisor;
> +		break;
> +	case TIME_UNIT:
> +		divisor = time_unit_divisor;
> +		break;
> +	default:
> +		return 0;
> +	};
> +
> +	if (action)
> +		return value * divisor; /* value is from users */
> +	else
> +		return div64_u64(value, divisor); /* value is from MSR */
> +}

Please see the comment down by rapl_check_unit(), this is just too wrong
to live.

> +/* show the energy status, in Jelous */
> +static int rapl_read_energy(struct rapl_domain *domain)
> +{
> +	u64 value;
> +	u32 msr = domain->msrs.status;
> +
> +	rdmsrl(msr, value);
> +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> +}
> +
> +static void rapl_event_update(struct perf_event *event)
> +{
> +	s64 prev;
> +	u64 now;
> +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> +
> +	now = rapl_read_energy(domain);

So I had to get the Intel SDM because your driver lacks all useful
information, and I learned that the RAPL status MSRs contain 32 bits.

So you get those 32 bits, divide them by some number,

> +	prev = local64_xchg(&event->hw.prev_count, now);
> +	local64_add(now - prev, &event->count);

And then expect that to work?

I don't think so..

> +}
> +
> +static void rapl_event_start(struct perf_event *event, int flags)
> +{
> +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> +
> +	local64_set(&event->hw.prev_count, rapl_read_energy(domain));
> +	perf_swevent_start_hrtimer(event);
> +}
> +
> +static void rapl_event_stop(struct perf_event *event, int flags)
> +{
> +	perf_swevent_cancel_hrtimer(event);
> +	rapl_event_update(event);
> +}

> +static int rapl_pmu_event_init(struct perf_event *event,
> +			       enum rapl_domain_id id)
> +{
> +	struct rapl_domain *domain = &(rapl_domains[id]);
> +
> +	if (event->attr.type != PERF_TYPE_SOFTWARE)
> +		return -ENOENT;
> +
> +	if (event->attr.config != domain->event_id)
> +		return -ENOENT;
> +
> +	/* Do periodecal update every second */
> +	event->attr.freq = 1;
> +	event->attr.sample_period = 1;
> +
> +	perf_swevent_init_hrtimer(event);
> +
> +	return 0;
> +}

That's just wrong.. the reason you're wanting to have this timer is to
avoid the RAPL MSRs from overflowing and you loosing offsets, right?

But the above is actually forcing the event to create samples on a
totally unrelated time base.

RAPL should fail to create a sampling event since it doesn't have the
capability to trigger overflow interrupts based on its events.

If you want a timer, add one, but don't do this.

If you expect you actually want to sample, use this event as part of a
group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
was working on patches to make perf-record capable of this.

> +static int rapl_check_unit(void)

Shouldn't that be called: rapl_init_unit()? You're not actually
verifying anything, you're setting-up state.

> +{
> +	u64 output;
> +	u32 value;
> +
> +	rdmsrl(MSR_RAPL_POWER_UNIT, output);
> +
> +	/* energy unit: 1/enery_unit_divisor Joules */
> +	value = (output & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
> +	energy_unit_divisor = 1 << value;
> +
> +	/* power unit: 1/power_unit_divisor Watts */
> +	value = (output & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
> +	power_unit_divisor = 1 << value;
> +
> +	/* time unit: 1/time_unit_divisor Seconds */
> +	value =(output & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
> +	time_unit_divisor = 1 << value;

So you're saying these factors are powers-of-two, please look at
rapl_unit_xlate and try again.

+
> +	return 0;
> +}
> +
> +static int __init intel_rapl_init(void)
> +{
> +	enum rapl_domain_id id;
> +
> +	/*
> +	 * RAPL features are only supported on processors have a CPUID
> +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> +	 */
> +	if (boot_cpu_data.x86 != 0x06)
> +		return -ENODEV;
> +
> +	if (boot_cpu_data.x86_model == 0x2A)
> +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> +	else if (boot_cpu_data.x86_model == 0x2D)
> +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> +	else
> +		return -ENODEV;

Names please, again 06_2[AD] is useless we could have surmised that by
reading the code, nobody knows which part that is.

  a += 4; /* increment by 4 */

quality comments here.

> +	if (rapl_check_unit())
> +		return -ENODEV;
> +
> +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> +		if (rapl_domains[id].valid)
> +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);

Uhm, hell no!, you get to use type = -1.

> +	return 0;
> +}
> +

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  9:43 ` Peter Zijlstra
  2011-05-26 10:21   ` Peter Zijlstra
@ 2011-05-26 10:21   ` Peter Zijlstra
  2011-05-26 10:55   ` Matt Fleming
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-26 10:21 UTC (permalink / raw)
  To: Zhang Rui
  Cc: LKML, linux-pm, mingo, acme, ming.m.lin, Brown, Len, Matt Fleming

On Thu, 2011-05-26 at 11:43 +0200, Peter Zijlstra wrote:
> > +     for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> > +             if (rapl_domains[id].valid)
> > +                     perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> 
> Uhm, hell no!, you get to use type = -1. 

Also, you don't need a struct pmu per domain.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  9:43 ` Peter Zijlstra
@ 2011-05-26 10:21   ` Peter Zijlstra
  2011-05-26 10:21   ` Peter Zijlstra
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-26 10:21 UTC (permalink / raw)
  To: Zhang Rui; +Cc: Matt Fleming, ming.m.lin, LKML, acme, linux-pm, mingo

On Thu, 2011-05-26 at 11:43 +0200, Peter Zijlstra wrote:
> > +     for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> > +             if (rapl_domains[id].valid)
> > +                     perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> 
> Uhm, hell no!, you get to use type = -1. 

Also, you don't need a struct pmu per domain.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  9:43 ` Peter Zijlstra
  2011-05-26 10:21   ` Peter Zijlstra
  2011-05-26 10:21   ` Peter Zijlstra
@ 2011-05-26 10:55   ` Matt Fleming
  2011-06-02  8:04       ` Matt Fleming
  2011-05-26 10:55   ` Matt Fleming
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 26+ messages in thread
From: Matt Fleming @ 2011-05-26 10:55 UTC (permalink / raw)
  To: Peter Zijlstra, Zhang Rui
  Cc: LKML, linux-pm, mingo, acme, ming.m.lin, Brown, Len

On Thu, May 26, 2011 at 11:43:23AM +0200, Peter Zijlstra wrote:
>
> That's just wrong.. the reason you're wanting to have this timer is to
> avoid the RAPL MSRs from overflowing and you loosing offsets, right?
> 
> But the above is actually forcing the event to create samples on a
> totally unrelated time base.
> 
> RAPL should fail to create a sampling event since it doesn't have the
> capability to trigger overflow interrupts based on its events.
> 
> If you want a timer, add one, but don't do this.
> 
> If you expect you actually want to sample, use this event as part of a
> group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
> was working on patches to make perf-record capable of this.

Yep, I have some unfinished patches around here somewhere...

*rummage*

OK, they're in this repository on the perf/group-events branch,

    git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/sh-2.6.git

Obviously since I last touched them in November of last year they're
more than likely not going to apply cleanly to tip, and perhaps more
importantly, I don't think I ever submitted them to LKML for review.

Rui, I'll try to look at your patch series over the weekend and get my
sampling patches into a state where they can be submitted for review
and used by the intel_rapl driver. Does that sound OK?

Or if you want to take a more proactive approach, you're welcome to
finish and submit them ;-)

--
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  9:43 ` Peter Zijlstra
                     ` (2 preceding siblings ...)
  2011-05-26 10:55   ` Matt Fleming
@ 2011-05-26 10:55   ` Matt Fleming
  2011-05-27  8:26   ` Zhang Rui
  2011-05-27  8:26   ` Zhang Rui
  5 siblings, 0 replies; 26+ messages in thread
From: Matt Fleming @ 2011-05-26 10:55 UTC (permalink / raw)
  To: Peter Zijlstra, Zhang Rui; +Cc: ming.m.lin, LKML, acme, linux-pm, mingo

On Thu, May 26, 2011 at 11:43:23AM +0200, Peter Zijlstra wrote:
>
> That's just wrong.. the reason you're wanting to have this timer is to
> avoid the RAPL MSRs from overflowing and you loosing offsets, right?
> 
> But the above is actually forcing the event to create samples on a
> totally unrelated time base.
> 
> RAPL should fail to create a sampling event since it doesn't have the
> capability to trigger overflow interrupts based on its events.
> 
> If you want a timer, add one, but don't do this.
> 
> If you expect you actually want to sample, use this event as part of a
> group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
> was working on patches to make perf-record capable of this.

Yep, I have some unfinished patches around here somewhere...

*rummage*

OK, they're in this repository on the perf/group-events branch,

    git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/sh-2.6.git

Obviously since I last touched them in November of last year they're
more than likely not going to apply cleanly to tip, and perhaps more
importantly, I don't think I ever submitted them to LKML for review.

Rui, I'll try to look at your patch series over the weekend and get my
sampling patches into a state where they can be submitted for review
and used by the intel_rapl driver. Does that sound OK?

Or if you want to take a more proactive approach, you're welcome to
finish and submit them ;-)

--
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  8:34 [PATCH 2/3] introduce intel_rapl driver Zhang Rui
  2011-05-26  9:43 ` Peter Zijlstra
  2011-05-26  9:43 ` Peter Zijlstra
@ 2011-05-26 15:48 ` Randy Dunlap
  2011-05-30  2:40   ` Zhang Rui
  2011-05-30  2:40   ` Zhang Rui
  2011-05-26 15:48 ` Randy Dunlap
                   ` (2 subsequent siblings)
  5 siblings, 2 replies; 26+ messages in thread
From: Randy Dunlap @ 2011-05-26 15:48 UTC (permalink / raw)
  To: Zhang Rui
  Cc: LKML, linux-pm, a.p.zijlstra, mingo, acme, ming.m.lin, Brown, Len

On Thu, 26 May 2011 16:34:17 +0800 Zhang Rui wrote:

> 
> Introduce Intel RAPL driver.
> 
> RAPL (running average power limit) is a new feature which provides mechanisms
> to enforce power consumption limit, on some new processors.
> 
> RAPL provides MSRs reporting the total amount of energy consumed
> by the package/core/uncore/dram.
> Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> 
> Currently, we don't have the plan to support the RAPL power control,
> but we do want to export the package/core/uncore/dram power consumption
> information via perf tool first.

Hi,

What's an uncore?

> Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> ---
>  drivers/platform/x86/Kconfig      |    8 
>  drivers/platform/x86/Makefile     |    1 
>  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
>  include/linux/perf_event.h        |    4 
>  4 files changed, 381 insertions(+)
> 
> Index: linux-2.6/drivers/platform/x86/Kconfig
> ===================================================================
> --- linux-2.6.orig/drivers/platform/x86/Kconfig
> +++ linux-2.6/drivers/platform/x86/Kconfig
> @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called samsung-laptop.
>  
> +config INTEL_RAPL
> +	tristate "Intel RAPL Support"
> +	depends on X86
> +	default y
> +	---help---
> +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce

	  RAPL (Running Average Power Limit) provides mechanisms to enforce

> +	  power consumption limit.
> +
>  endif # X86_PLATFORM_DEVICES

> Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/drivers/platform/x86/intel_rapl.c
> @@ -0,0 +1,368 @@

[snip]

> +/* show the energy status, in Jelous */

Is that Joules?  or what?

> +static int rapl_read_energy(struct rapl_domain *domain)
> +{
> +	u64 value;
> +	u32 msr = domain->msrs.status;
> +
> +	rdmsrl(msr, value);
> +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> +}

[snip]

> +static int __init intel_rapl_init(void)
> +{
> +	enum rapl_domain_id id;
> +
> +	/*
> +	 * RAPL features are only supported on processors have a CPUID
> +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> +	 */
> +	if (boot_cpu_data.x86 != 0x06)
> +		return -ENODEV;
> +
> +	if (boot_cpu_data.x86_model == 0x2A)
> +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> +	else if (boot_cpu_data.x86_model == 0x2D)
> +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> +	else
> +		return -ENODEV;
> +
> +	if (rapl_check_unit())
> +		return -ENODEV;
> +
> +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)

space after "for"

> +		if (rapl_domains[id].valid)
> +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> +	return 0;
> +}
> +
> +static void __exit intel_rapl_exit(void)
> +{
> +	enum rapl_domain_id id;
> +
> +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)

ditto

> +		if (rapl_domains[id].valid)
> +			perf_pmu_unregister(&(rapl_domains[id].pmu));
> +}


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  8:34 [PATCH 2/3] introduce intel_rapl driver Zhang Rui
                   ` (2 preceding siblings ...)
  2011-05-26 15:48 ` Randy Dunlap
@ 2011-05-26 15:48 ` Randy Dunlap
  2011-05-28 10:17 ` Greg KH
  2011-05-28 10:17 ` Greg KH
  5 siblings, 0 replies; 26+ messages in thread
From: Randy Dunlap @ 2011-05-26 15:48 UTC (permalink / raw)
  To: Zhang Rui; +Cc: a.p.zijlstra, ming.m.lin, LKML, acme, linux-pm, mingo

On Thu, 26 May 2011 16:34:17 +0800 Zhang Rui wrote:

> 
> Introduce Intel RAPL driver.
> 
> RAPL (running average power limit) is a new feature which provides mechanisms
> to enforce power consumption limit, on some new processors.
> 
> RAPL provides MSRs reporting the total amount of energy consumed
> by the package/core/uncore/dram.
> Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> 
> Currently, we don't have the plan to support the RAPL power control,
> but we do want to export the package/core/uncore/dram power consumption
> information via perf tool first.

Hi,

What's an uncore?

> Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> ---
>  drivers/platform/x86/Kconfig      |    8 
>  drivers/platform/x86/Makefile     |    1 
>  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
>  include/linux/perf_event.h        |    4 
>  4 files changed, 381 insertions(+)
> 
> Index: linux-2.6/drivers/platform/x86/Kconfig
> ===================================================================
> --- linux-2.6.orig/drivers/platform/x86/Kconfig
> +++ linux-2.6/drivers/platform/x86/Kconfig
> @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called samsung-laptop.
>  
> +config INTEL_RAPL
> +	tristate "Intel RAPL Support"
> +	depends on X86
> +	default y
> +	---help---
> +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce

	  RAPL (Running Average Power Limit) provides mechanisms to enforce

> +	  power consumption limit.
> +
>  endif # X86_PLATFORM_DEVICES

> Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/drivers/platform/x86/intel_rapl.c
> @@ -0,0 +1,368 @@

[snip]

> +/* show the energy status, in Jelous */

Is that Joules?  or what?

> +static int rapl_read_energy(struct rapl_domain *domain)
> +{
> +	u64 value;
> +	u32 msr = domain->msrs.status;
> +
> +	rdmsrl(msr, value);
> +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> +}

[snip]

> +static int __init intel_rapl_init(void)
> +{
> +	enum rapl_domain_id id;
> +
> +	/*
> +	 * RAPL features are only supported on processors have a CPUID
> +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> +	 */
> +	if (boot_cpu_data.x86 != 0x06)
> +		return -ENODEV;
> +
> +	if (boot_cpu_data.x86_model == 0x2A)
> +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> +	else if (boot_cpu_data.x86_model == 0x2D)
> +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> +	else
> +		return -ENODEV;
> +
> +	if (rapl_check_unit())
> +		return -ENODEV;
> +
> +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)

space after "for"

> +		if (rapl_domains[id].valid)
> +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> +	return 0;
> +}
> +
> +static void __exit intel_rapl_exit(void)
> +{
> +	enum rapl_domain_id id;
> +
> +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)

ditto

> +		if (rapl_domains[id].valid)
> +			perf_pmu_unregister(&(rapl_domains[id].pmu));
> +}


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  9:43 ` Peter Zijlstra
                     ` (4 preceding siblings ...)
  2011-05-27  8:26   ` Zhang Rui
@ 2011-05-27  8:26   ` Zhang Rui
  2011-05-27 19:56     ` Peter Zijlstra
                       ` (3 more replies)
  5 siblings, 4 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-27  8:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, linux-pm, mingo, acme, Lin, Ming M, Brown, Len, Matt Fleming

Hi, Peter,

On Thu, 2011-05-26 at 17:43 +0800, Peter Zijlstra wrote: 
> On Thu, 2011-05-26 at 16:34 +0800, Zhang Rui wrote:
> > Introduce Intel RAPL driver.
> > 
> > RAPL (running average power limit) is a new feature which provides mechanisms
> > to enforce power consumption limit, on some new processors.
> > 
> > RAPL provides MSRs reporting the total amount of energy consumed
> > by the package/core/uncore/dram.
> > Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> > and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> > 
> > Currently, we don't have the plan to support the RAPL power control,
> > but we do want to export the package/core/uncore/dram power consumption
> > information via perf tool first.
> 
> Do note that perf is not the right API for those control bits. If you
> never plan to expose those, that's fine. If you do, you'll likely need a
> parallel API (your own device) for accessing that.

Agree.
I was thinking of registering RAPL as a platform device and set the
power limit via sysfs nodes.

> Please consider if
> using separate APIs for reading/writing this resource is what you want
> and mention these considerations in your future changelog.
> 
okay. I'll do that.

> > Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> > ---
> >  drivers/platform/x86/Kconfig      |    8 
> >  drivers/platform/x86/Makefile     |    1 
> >  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
> >  include/linux/perf_event.h        |    4 
> >  4 files changed, 381 insertions(+)
> > 
> > Index: linux-2.6/drivers/platform/x86/Kconfig
> > ===================================================================
> > --- linux-2.6.orig/drivers/platform/x86/Kconfig
> > +++ linux-2.6/drivers/platform/x86/Kconfig
> > @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
> >  	  To compile this driver as a module, choose M here: the module
> >  	  will be called samsung-laptop.
> >  
> > +config INTEL_RAPL
> > +	tristate "Intel RAPL Support"
> > +	depends on X86
> 
> Also very much depends on perf being there.
> 
Agree.

> > +	default y
> > +	---help---
> > +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
> > +	  power consumption limit.
> 
> The enforce part seems dubious, perf is purely about observing state it
> doesn't enforce anything. Also this help text could do with expanding in
> general.
> 
This help text is just a description of RAPL interface.
But you're right, I should be more specific about the CURRENT intel_rapl
driver status.

> >  endif # X86_PLATFORM_DEVICES
> > Index: linux-2.6/drivers/platform/x86/Makefile
> > ===================================================================
> > --- linux-2.6.orig/drivers/platform/x86/Makefile
> > +++ linux-2.6/drivers/platform/x86/Makefile
> > @@ -42,3 +42,4 @@ obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
> >  obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
> >  obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
> >  obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
> > +obj-$(CONFIG_INTEL_RAPL)	+= intel_rapl.o
> > Index: linux-2.6/include/linux/perf_event.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/perf_event.h
> > +++ linux-2.6/include/linux/perf_event.h
> > @@ -107,6 +107,10 @@ enum perf_sw_ids {
> >  	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
> >  	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
> >  	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
> > +	PERF_COUNT_SW_PKG_ENERGY		= 9,
> > +	PERF_COUNT_SW_CORE_ENERGY		= 10,
> > +	PERF_COUNT_SW_UNCORE_ENERGY		= 11,
> > +	PERF_COUNT_SW_DRAM_ENERGY		= 12,
> 
> Not going to happen, RAPL registers its own pmu (wrongly, see below),
> with that it (should) get its own perf_event_attr::type and thus should
> have its own ::config space, you do not get to pollute the
> PERF_TYPE_SOFTWARE config space.

> Currently there isn't a way to expose the events in sysfs, but we do
> want that, its mostly a matter of getting all involved parties to agree
> on a format and implementing it.
> 
I talked with Lin Ming just now, and he said that it should work in this
way:
First, only one pmu for RAPL interfaces, with four different kinds of
events, pkg/core/uncore/dram,
and the sysfs I/F is:
/sys/bus/event_source/devices/rapl/---|---type
                                      |---pkg
                                      |---core
                                      |---uncore
                                      |---dram

to use it, users can issue something like:
perf stat -P rapl -e pkg/core/uncore/dram foo
so that event->attr.type equals rapl_pmu.type and event->attr.config
equals one of the rapl_domain_id.

This sounds good. I can rewrite the code to work in this way, but it
doesn't work for now, until both sysfs I/F and perf tool being ready,
right?

> >  	PERF_COUNT_SW_MAX,			/* non-ABI */
> >  };
> > Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> > ===================================================================
> > --- /dev/null
> > +++ linux-2.6/drivers/platform/x86/intel_rapl.c
> 
> > +#define MSR_RAPL_POWER_UNIT		0x606
> > +
> > +/*
> > + * Platform specific RAPL Domains.
> > + * Note that PP1 RAPL Domain is supported on 062A only
> > + * And DRAM RAPL Domain is supported on 062D only
> > + */
> 
> 0x62[AD] is useless, please use proper names.

> > +/* Package RAPL Domain */
> > +#define MSR_PKG_RAPL_POWER_LIMIT	0x610
> > +#define MSR_PKG_ENERGY_STATUS		0x611
> > +#define MSR_PKG_PERF_STATUS		0x613
> > +#define MSR_PKG_POWER_INFO		0x614
> > +
> > +/* PP0 RAPL Domain */
> > +#define MSR_PP0_POWER_LIMIT		0x638
> > +#define MSR_PP0_ENERGY_STATUS		0x639
> > +#define MSR_PP0_POLICY			0x63A
> > +#define MSR_PP0_PERF_STATUS		0x63B
> > +
> > +/* PP1 RAPL Domain, may reflect to uncore devices */
> > +#define MSR_PP1_POWER_LIMIT		0x640
> > +#define MSR_PP1_ENERGY_STATUS		0x641
> > +#define MSR_PP1_POLICY			0x642
> > +
> > +/* DRAM RAPL Domain */
> > +#define MSR_DRAM_POWER_LIMIT		0x618
> > +#define MSR_DRAM_ENERGY_STATUS		0x619
> > +#define MSR_DRAM_PERF_STATUS		0x61B
> > +#define MSR_DRAM_POWER_INFO		0x61C
> > +
> > +/* RAPL UNIT BITMASK */
> > +#define POWER_UNIT_OFFSET	0
> > +#define POWER_UNIT_MASK		0x0F
> > +
> > +#define ENERGY_UNIT_OFFSET	0x08
> > +#define ENERGY_UNIT_MASK	0x1F00
> > +
> > +#define TIME_UNIT_OFFSET	0x10
> > +#define TIME_UNIT_MASK		0xF000
> 
> Are you sure? (x & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET == 0.
> You either want a mask of 0xF0000, or an offset of 0x0c.
> 
oops. It's 0xF0000. sorry about that.

> > +static int rapl_pmu_pkg_event_init(struct perf_event *event);
> > +static int rapl_pmu_core_event_init(struct perf_event *event);
> > +static int rapl_pmu_uncore_event_init(struct perf_event *event);
> > +static int rapl_pmu_dram_event_init(struct perf_event *event);
> > +static void rapl_event_start(struct perf_event *event, int flags);
> > +static void rapl_event_stop(struct perf_event *event, int flags);
> > +static int rapl_event_add(struct perf_event *event, int flags);
> > +static void rapl_event_del(struct perf_event *event, int flags);
> > +static void rapl_event_read(struct perf_event *event);
> > +
> > +enum rapl_domain_id {
> > +	RAPL_DOMAIN_PKG,
> > +	RAPL_DOMAIN_PP0,
> > +	RAPL_DOMAIN_PP1,
> > +	RAPL_DOMAIN_DRAM,
> > +	RAPL_DOMAIN_MAX
> > +};
> > +
> > +struct rapl_domain_msr {
> > +	int	limit;
> > +	int	status;
> > +};
> > +
> > +struct rapl_domain {
> > +	enum rapl_domain_id domain_id;
> > +	struct rapl_domain_msr msrs;
> > +	struct pmu pmu;
> > +	enum perf_sw_ids event_id;
> > +	int valid;
> > +};
> 
> You could use the rapl_domain_id as your ::config space.
> 
> 
> > +static unsigned int power_unit_divisor;
> > +static unsigned int energy_unit_divisor;
> > +static unsigned int time_unit_divisor;
> > +
> > +enum unit_type {
> > +	POWER_UNIT,
> > +	ENERGY_UNIT,
> > +	TIME_UNIT
> > +};
> > +static u64 rapl_unit_xlate(enum unit_type type, u64 value, int action)
> > +{
> > +	u64 divisor;
> > +
> > +	switch (type) {
> > +	case POWER_UNIT:
> > +		divisor = power_unit_divisor;
> > +		break;
> > +	case ENERGY_UNIT:
> > +		divisor = energy_unit_divisor;
> > +		break;
> > +	case TIME_UNIT:
> > +		divisor = time_unit_divisor;
> > +		break;
> > +	default:
> > +		return 0;
> > +	};
> > +
> > +	if (action)
> > +		return value * divisor; /* value is from users */
> > +	else
> > +		return div64_u64(value, divisor); /* value is from MSR */
> > +}
> 
> Please see the comment down by rapl_check_unit(), this is just too wrong
> to live.
> 
> > +/* show the energy status, in Jelous */
> > +static int rapl_read_energy(struct rapl_domain *domain)
> > +{
> > +	u64 value;
> > +	u32 msr = domain->msrs.status;
> > +
> > +	rdmsrl(msr, value);
> > +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> > +}
> > +
> > +static void rapl_event_update(struct perf_event *event)
> > +{
> > +	s64 prev;
> > +	u64 now;
> > +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> > +
> > +	now = rapl_read_energy(domain);
> 
> So I had to get the Intel SDM because your driver lacks all useful
> information, and I learned that the RAPL status MSRs contain 32 bits.
> 
> So you get those 32 bits, divide them by some number,
> 
> > +	prev = local64_xchg(&event->hw.prev_count, now);
> > +	local64_add(now - prev, &event->count);
> 
> And then expect that to work?
> 
rapl_read_energy first reads energy status from MSR and then invokes
rapl_unit_xlate to translate it into Joules.
For example, on the laptop I tested, the energy unit bits is 0x10, which
means that the energy unit is 1/65536 Joule.
So I need to divide the value read from MSR by 65536 to calculate how
many Joules of energy are cost. 

But this reveals a problem. If the task is scheduled out with energy
consumption less than 1 Joule, we failed to record it.

IMO, a new callback should be introduced so that I can save the MSR
value first and translate it to Joule when the task exits. Or just do
the translation in user space.

what do you think?

> I don't think so..
> 
> > +}
> > +
> > +static void rapl_event_start(struct perf_event *event, int flags)
> > +{
> > +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> > +
> > +	local64_set(&event->hw.prev_count, rapl_read_energy(domain));
> > +	perf_swevent_start_hrtimer(event);
> > +}
> > +
> > +static void rapl_event_stop(struct perf_event *event, int flags)
> > +{
> > +	perf_swevent_cancel_hrtimer(event);
> > +	rapl_event_update(event);
> > +}
> 
> > +static int rapl_pmu_event_init(struct perf_event *event,
> > +			       enum rapl_domain_id id)
> > +{
> > +	struct rapl_domain *domain = &(rapl_domains[id]);
> > +
> > +	if (event->attr.type != PERF_TYPE_SOFTWARE)
> > +		return -ENOENT;
> > +
> > +	if (event->attr.config != domain->event_id)
> > +		return -ENOENT;
> > +
> > +	/* Do periodecal update every second */
> > +	event->attr.freq = 1;
> > +	event->attr.sample_period = 1;
> > +
> > +	perf_swevent_init_hrtimer(event);
> > +
> > +	return 0;
> > +}
> 
> That's just wrong.. the reason you're wanting to have this timer is to
> avoid the RAPL MSRs from overflowing and you loosing offsets, right?
> 
> But the above is actually forcing the event to create samples on a
> totally unrelated time base.
> 
> RAPL should fail to create a sampling event since it doesn't have the
> capability to trigger overflow interrupts based on its events.
> 
> If you want a timer, add one, but don't do this.
> 
> If you expect you actually want to sample, use this event as part of a
> group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
> was working on patches to make perf-record capable of this.
> 
perf stat doesn't support -g parameter.

BTW, as I need a per task hrtimer, can I make use of the
hw_perf_event.hrtimer in intel_rapl driver, without touching the perf
hrtimer interfaces?

> > +static int rapl_check_unit(void)
> 
> Shouldn't that be called: rapl_init_unit()? You're not actually
> verifying anything, you're setting-up state.
> 
Agree.

thanks,
rui
> > +{
> > +	u64 output;
> > +	u32 value;
> > +
> > +	rdmsrl(MSR_RAPL_POWER_UNIT, output);
> > +
> > +	/* energy unit: 1/enery_unit_divisor Joules */
> > +	value = (output & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
> > +	energy_unit_divisor = 1 << value;
> > +
> > +	/* power unit: 1/power_unit_divisor Watts */
> > +	value = (output & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
> > +	power_unit_divisor = 1 << value;
> > +
> > +	/* time unit: 1/time_unit_divisor Seconds */
> > +	value =(output & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
> > +	time_unit_divisor = 1 << value;
> 
> So you're saying these factors are powers-of-two, please look at
> rapl_unit_xlate and try again.
> 
> +
> > +	return 0;
> > +}
> > +
> > +static int __init intel_rapl_init(void)
> > +{
> > +	enum rapl_domain_id id;
> > +
> > +	/*
> > +	 * RAPL features are only supported on processors have a CPUID
> > +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> > +	 */
> > +	if (boot_cpu_data.x86 != 0x06)
> > +		return -ENODEV;
> > +
> > +	if (boot_cpu_data.x86_model == 0x2A)
> > +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> > +	else if (boot_cpu_data.x86_model == 0x2D)
> > +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> > +	else
> > +		return -ENODEV;
> 
> Names please, again 06_2[AD] is useless we could have surmised that by
> reading the code, nobody knows which part that is.
> 
>   a += 4; /* increment by 4 */
> 
> quality comments here.
> 
> > +	if (rapl_check_unit())
> > +		return -ENODEV;
> > +
> > +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> > +		if (rapl_domains[id].valid)
> > +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> 
> Uhm, hell no!, you get to use type = -1.
> 
> > +	return 0;
> > +}
> > +





^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  9:43 ` Peter Zijlstra
                     ` (3 preceding siblings ...)
  2011-05-26 10:55   ` Matt Fleming
@ 2011-05-27  8:26   ` Zhang Rui
  2011-05-27  8:26   ` Zhang Rui
  5 siblings, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-27  8:26 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Matt Fleming, Lin, Ming M, LKML, acme, linux-pm, mingo

Hi, Peter,

On Thu, 2011-05-26 at 17:43 +0800, Peter Zijlstra wrote: 
> On Thu, 2011-05-26 at 16:34 +0800, Zhang Rui wrote:
> > Introduce Intel RAPL driver.
> > 
> > RAPL (running average power limit) is a new feature which provides mechanisms
> > to enforce power consumption limit, on some new processors.
> > 
> > RAPL provides MSRs reporting the total amount of energy consumed
> > by the package/core/uncore/dram.
> > Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> > and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> > 
> > Currently, we don't have the plan to support the RAPL power control,
> > but we do want to export the package/core/uncore/dram power consumption
> > information via perf tool first.
> 
> Do note that perf is not the right API for those control bits. If you
> never plan to expose those, that's fine. If you do, you'll likely need a
> parallel API (your own device) for accessing that.

Agree.
I was thinking of registering RAPL as a platform device and set the
power limit via sysfs nodes.

> Please consider if
> using separate APIs for reading/writing this resource is what you want
> and mention these considerations in your future changelog.
> 
okay. I'll do that.

> > Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> > ---
> >  drivers/platform/x86/Kconfig      |    8 
> >  drivers/platform/x86/Makefile     |    1 
> >  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
> >  include/linux/perf_event.h        |    4 
> >  4 files changed, 381 insertions(+)
> > 
> > Index: linux-2.6/drivers/platform/x86/Kconfig
> > ===================================================================
> > --- linux-2.6.orig/drivers/platform/x86/Kconfig
> > +++ linux-2.6/drivers/platform/x86/Kconfig
> > @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
> >  	  To compile this driver as a module, choose M here: the module
> >  	  will be called samsung-laptop.
> >  
> > +config INTEL_RAPL
> > +	tristate "Intel RAPL Support"
> > +	depends on X86
> 
> Also very much depends on perf being there.
> 
Agree.

> > +	default y
> > +	---help---
> > +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
> > +	  power consumption limit.
> 
> The enforce part seems dubious, perf is purely about observing state it
> doesn't enforce anything. Also this help text could do with expanding in
> general.
> 
This help text is just a description of RAPL interface.
But you're right, I should be more specific about the CURRENT intel_rapl
driver status.

> >  endif # X86_PLATFORM_DEVICES
> > Index: linux-2.6/drivers/platform/x86/Makefile
> > ===================================================================
> > --- linux-2.6.orig/drivers/platform/x86/Makefile
> > +++ linux-2.6/drivers/platform/x86/Makefile
> > @@ -42,3 +42,4 @@ obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
> >  obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
> >  obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
> >  obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
> > +obj-$(CONFIG_INTEL_RAPL)	+= intel_rapl.o
> > Index: linux-2.6/include/linux/perf_event.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/perf_event.h
> > +++ linux-2.6/include/linux/perf_event.h
> > @@ -107,6 +107,10 @@ enum perf_sw_ids {
> >  	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
> >  	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
> >  	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
> > +	PERF_COUNT_SW_PKG_ENERGY		= 9,
> > +	PERF_COUNT_SW_CORE_ENERGY		= 10,
> > +	PERF_COUNT_SW_UNCORE_ENERGY		= 11,
> > +	PERF_COUNT_SW_DRAM_ENERGY		= 12,
> 
> Not going to happen, RAPL registers its own pmu (wrongly, see below),
> with that it (should) get its own perf_event_attr::type and thus should
> have its own ::config space, you do not get to pollute the
> PERF_TYPE_SOFTWARE config space.

> Currently there isn't a way to expose the events in sysfs, but we do
> want that, its mostly a matter of getting all involved parties to agree
> on a format and implementing it.
> 
I talked with Lin Ming just now, and he said that it should work in this
way:
First, only one pmu for RAPL interfaces, with four different kinds of
events, pkg/core/uncore/dram,
and the sysfs I/F is:
/sys/bus/event_source/devices/rapl/---|---type
                                      |---pkg
                                      |---core
                                      |---uncore
                                      |---dram

to use it, users can issue something like:
perf stat -P rapl -e pkg/core/uncore/dram foo
so that event->attr.type equals rapl_pmu.type and event->attr.config
equals one of the rapl_domain_id.

This sounds good. I can rewrite the code to work in this way, but it
doesn't work for now, until both sysfs I/F and perf tool being ready,
right?

> >  	PERF_COUNT_SW_MAX,			/* non-ABI */
> >  };
> > Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> > ===================================================================
> > --- /dev/null
> > +++ linux-2.6/drivers/platform/x86/intel_rapl.c
> 
> > +#define MSR_RAPL_POWER_UNIT		0x606
> > +
> > +/*
> > + * Platform specific RAPL Domains.
> > + * Note that PP1 RAPL Domain is supported on 062A only
> > + * And DRAM RAPL Domain is supported on 062D only
> > + */
> 
> 0x62[AD] is useless, please use proper names.

> > +/* Package RAPL Domain */
> > +#define MSR_PKG_RAPL_POWER_LIMIT	0x610
> > +#define MSR_PKG_ENERGY_STATUS		0x611
> > +#define MSR_PKG_PERF_STATUS		0x613
> > +#define MSR_PKG_POWER_INFO		0x614
> > +
> > +/* PP0 RAPL Domain */
> > +#define MSR_PP0_POWER_LIMIT		0x638
> > +#define MSR_PP0_ENERGY_STATUS		0x639
> > +#define MSR_PP0_POLICY			0x63A
> > +#define MSR_PP0_PERF_STATUS		0x63B
> > +
> > +/* PP1 RAPL Domain, may reflect to uncore devices */
> > +#define MSR_PP1_POWER_LIMIT		0x640
> > +#define MSR_PP1_ENERGY_STATUS		0x641
> > +#define MSR_PP1_POLICY			0x642
> > +
> > +/* DRAM RAPL Domain */
> > +#define MSR_DRAM_POWER_LIMIT		0x618
> > +#define MSR_DRAM_ENERGY_STATUS		0x619
> > +#define MSR_DRAM_PERF_STATUS		0x61B
> > +#define MSR_DRAM_POWER_INFO		0x61C
> > +
> > +/* RAPL UNIT BITMASK */
> > +#define POWER_UNIT_OFFSET	0
> > +#define POWER_UNIT_MASK		0x0F
> > +
> > +#define ENERGY_UNIT_OFFSET	0x08
> > +#define ENERGY_UNIT_MASK	0x1F00
> > +
> > +#define TIME_UNIT_OFFSET	0x10
> > +#define TIME_UNIT_MASK		0xF000
> 
> Are you sure? (x & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET == 0.
> You either want a mask of 0xF0000, or an offset of 0x0c.
> 
oops. It's 0xF0000. sorry about that.

> > +static int rapl_pmu_pkg_event_init(struct perf_event *event);
> > +static int rapl_pmu_core_event_init(struct perf_event *event);
> > +static int rapl_pmu_uncore_event_init(struct perf_event *event);
> > +static int rapl_pmu_dram_event_init(struct perf_event *event);
> > +static void rapl_event_start(struct perf_event *event, int flags);
> > +static void rapl_event_stop(struct perf_event *event, int flags);
> > +static int rapl_event_add(struct perf_event *event, int flags);
> > +static void rapl_event_del(struct perf_event *event, int flags);
> > +static void rapl_event_read(struct perf_event *event);
> > +
> > +enum rapl_domain_id {
> > +	RAPL_DOMAIN_PKG,
> > +	RAPL_DOMAIN_PP0,
> > +	RAPL_DOMAIN_PP1,
> > +	RAPL_DOMAIN_DRAM,
> > +	RAPL_DOMAIN_MAX
> > +};
> > +
> > +struct rapl_domain_msr {
> > +	int	limit;
> > +	int	status;
> > +};
> > +
> > +struct rapl_domain {
> > +	enum rapl_domain_id domain_id;
> > +	struct rapl_domain_msr msrs;
> > +	struct pmu pmu;
> > +	enum perf_sw_ids event_id;
> > +	int valid;
> > +};
> 
> You could use the rapl_domain_id as your ::config space.
> 
> 
> > +static unsigned int power_unit_divisor;
> > +static unsigned int energy_unit_divisor;
> > +static unsigned int time_unit_divisor;
> > +
> > +enum unit_type {
> > +	POWER_UNIT,
> > +	ENERGY_UNIT,
> > +	TIME_UNIT
> > +};
> > +static u64 rapl_unit_xlate(enum unit_type type, u64 value, int action)
> > +{
> > +	u64 divisor;
> > +
> > +	switch (type) {
> > +	case POWER_UNIT:
> > +		divisor = power_unit_divisor;
> > +		break;
> > +	case ENERGY_UNIT:
> > +		divisor = energy_unit_divisor;
> > +		break;
> > +	case TIME_UNIT:
> > +		divisor = time_unit_divisor;
> > +		break;
> > +	default:
> > +		return 0;
> > +	};
> > +
> > +	if (action)
> > +		return value * divisor; /* value is from users */
> > +	else
> > +		return div64_u64(value, divisor); /* value is from MSR */
> > +}
> 
> Please see the comment down by rapl_check_unit(), this is just too wrong
> to live.
> 
> > +/* show the energy status, in Jelous */
> > +static int rapl_read_energy(struct rapl_domain *domain)
> > +{
> > +	u64 value;
> > +	u32 msr = domain->msrs.status;
> > +
> > +	rdmsrl(msr, value);
> > +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> > +}
> > +
> > +static void rapl_event_update(struct perf_event *event)
> > +{
> > +	s64 prev;
> > +	u64 now;
> > +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> > +
> > +	now = rapl_read_energy(domain);
> 
> So I had to get the Intel SDM because your driver lacks all useful
> information, and I learned that the RAPL status MSRs contain 32 bits.
> 
> So you get those 32 bits, divide them by some number,
> 
> > +	prev = local64_xchg(&event->hw.prev_count, now);
> > +	local64_add(now - prev, &event->count);
> 
> And then expect that to work?
> 
rapl_read_energy first reads energy status from MSR and then invokes
rapl_unit_xlate to translate it into Joules.
For example, on the laptop I tested, the energy unit bits is 0x10, which
means that the energy unit is 1/65536 Joule.
So I need to divide the value read from MSR by 65536 to calculate how
many Joules of energy are cost. 

But this reveals a problem. If the task is scheduled out with energy
consumption less than 1 Joule, we failed to record it.

IMO, a new callback should be introduced so that I can save the MSR
value first and translate it to Joule when the task exits. Or just do
the translation in user space.

what do you think?

> I don't think so..
> 
> > +}
> > +
> > +static void rapl_event_start(struct perf_event *event, int flags)
> > +{
> > +	struct rapl_domain *domain = to_rapl_domain(event->pmu);
> > +
> > +	local64_set(&event->hw.prev_count, rapl_read_energy(domain));
> > +	perf_swevent_start_hrtimer(event);
> > +}
> > +
> > +static void rapl_event_stop(struct perf_event *event, int flags)
> > +{
> > +	perf_swevent_cancel_hrtimer(event);
> > +	rapl_event_update(event);
> > +}
> 
> > +static int rapl_pmu_event_init(struct perf_event *event,
> > +			       enum rapl_domain_id id)
> > +{
> > +	struct rapl_domain *domain = &(rapl_domains[id]);
> > +
> > +	if (event->attr.type != PERF_TYPE_SOFTWARE)
> > +		return -ENOENT;
> > +
> > +	if (event->attr.config != domain->event_id)
> > +		return -ENOENT;
> > +
> > +	/* Do periodecal update every second */
> > +	event->attr.freq = 1;
> > +	event->attr.sample_period = 1;
> > +
> > +	perf_swevent_init_hrtimer(event);
> > +
> > +	return 0;
> > +}
> 
> That's just wrong.. the reason you're wanting to have this timer is to
> avoid the RAPL MSRs from overflowing and you loosing offsets, right?
> 
> But the above is actually forcing the event to create samples on a
> totally unrelated time base.
> 
> RAPL should fail to create a sampling event since it doesn't have the
> capability to trigger overflow interrupts based on its events.
> 
> If you want a timer, add one, but don't do this.
> 
> If you expect you actually want to sample, use this event as part of a
> group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
> was working on patches to make perf-record capable of this.
> 
perf stat doesn't support -g parameter.

BTW, as I need a per task hrtimer, can I make use of the
hw_perf_event.hrtimer in intel_rapl driver, without touching the perf
hrtimer interfaces?

> > +static int rapl_check_unit(void)
> 
> Shouldn't that be called: rapl_init_unit()? You're not actually
> verifying anything, you're setting-up state.
> 
Agree.

thanks,
rui
> > +{
> > +	u64 output;
> > +	u32 value;
> > +
> > +	rdmsrl(MSR_RAPL_POWER_UNIT, output);
> > +
> > +	/* energy unit: 1/enery_unit_divisor Joules */
> > +	value = (output & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
> > +	energy_unit_divisor = 1 << value;
> > +
> > +	/* power unit: 1/power_unit_divisor Watts */
> > +	value = (output & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
> > +	power_unit_divisor = 1 << value;
> > +
> > +	/* time unit: 1/time_unit_divisor Seconds */
> > +	value =(output & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
> > +	time_unit_divisor = 1 << value;
> 
> So you're saying these factors are powers-of-two, please look at
> rapl_unit_xlate and try again.
> 
> +
> > +	return 0;
> > +}
> > +
> > +static int __init intel_rapl_init(void)
> > +{
> > +	enum rapl_domain_id id;
> > +
> > +	/*
> > +	 * RAPL features are only supported on processors have a CPUID
> > +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> > +	 */
> > +	if (boot_cpu_data.x86 != 0x06)
> > +		return -ENODEV;
> > +
> > +	if (boot_cpu_data.x86_model == 0x2A)
> > +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> > +	else if (boot_cpu_data.x86_model == 0x2D)
> > +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> > +	else
> > +		return -ENODEV;
> 
> Names please, again 06_2[AD] is useless we could have surmised that by
> reading the code, nobody knows which part that is.
> 
>   a += 4; /* increment by 4 */
> 
> quality comments here.
> 
> > +	if (rapl_check_unit())
> > +		return -ENODEV;
> > +
> > +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> > +		if (rapl_domains[id].valid)
> > +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> 
> Uhm, hell no!, you get to use type = -1.
> 
> > +	return 0;
> > +}
> > +

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-27  8:26   ` Zhang Rui
@ 2011-05-27 19:56     ` Peter Zijlstra
  2011-05-27 19:56     ` Peter Zijlstra
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-27 19:56 UTC (permalink / raw)
  To: Zhang Rui
  Cc: LKML, linux-pm, mingo, acme, Lin, Ming M, Brown, Len, Matt Fleming

On Fri, 2011-05-27 at 16:26 +0800, Zhang Rui wrote:
> > > +static void rapl_event_update(struct perf_event *event)
> > > +{
> > > +   s64 prev;
> > > +   u64 now;
> > > +   struct rapl_domain *domain = to_rapl_domain(event->pmu);
> > > +
> > > +   now = rapl_read_energy(domain);
> > 
> > So I had to get the Intel SDM because your driver lacks all useful
> > information, and I learned that the RAPL status MSRs contain 32 bits.
> > 
> > So you get those 32 bits, divide them by some number,
> > 
> > > +   prev = local64_xchg(&event->hw.prev_count, now);
> > > +   local64_add(now - prev, &event->count);
> > 
> > And then expect that to work?
> > 
> rapl_read_energy first reads energy status from MSR and then invokes
> rapl_unit_xlate to translate it into Joules.
> For example, on the laptop I tested, the energy unit bits is 0x10, which
> means that the energy unit is 1/65536 Joule.
> So I need to divide the value read from MSR by 65536 to calculate how
> many Joules of energy are cost. 
> 
> But this reveals a problem. If the task is scheduled out with energy
> consumption less than 1 Joule, we failed to record it.
> 
> IMO, a new callback should be introduced so that I can save the MSR
> value first and translate it to Joule when the task exits. Or just do
> the translation in user space.
> 
> what do you think? 

That's not the problem I meant, but lets start with that, you can solve
that differently, just keep a fraction somewhere in hw_perf_event.

Anyway, the problem you missed is what happens when those 32 bits roll
over, at that point you get now < prev and the value added to
event->count is a _HUGE_ 64 bit number.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-27  8:26   ` Zhang Rui
  2011-05-27 19:56     ` Peter Zijlstra
@ 2011-05-27 19:56     ` Peter Zijlstra
  2011-05-27 19:56     ` Peter Zijlstra
  2011-05-27 19:56     ` Peter Zijlstra
  3 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-27 19:56 UTC (permalink / raw)
  To: Zhang Rui; +Cc: Matt Fleming, Lin, Ming M, LKML, acme, linux-pm, mingo

On Fri, 2011-05-27 at 16:26 +0800, Zhang Rui wrote:
> > > +static void rapl_event_update(struct perf_event *event)
> > > +{
> > > +   s64 prev;
> > > +   u64 now;
> > > +   struct rapl_domain *domain = to_rapl_domain(event->pmu);
> > > +
> > > +   now = rapl_read_energy(domain);
> > 
> > So I had to get the Intel SDM because your driver lacks all useful
> > information, and I learned that the RAPL status MSRs contain 32 bits.
> > 
> > So you get those 32 bits, divide them by some number,
> > 
> > > +   prev = local64_xchg(&event->hw.prev_count, now);
> > > +   local64_add(now - prev, &event->count);
> > 
> > And then expect that to work?
> > 
> rapl_read_energy first reads energy status from MSR and then invokes
> rapl_unit_xlate to translate it into Joules.
> For example, on the laptop I tested, the energy unit bits is 0x10, which
> means that the energy unit is 1/65536 Joule.
> So I need to divide the value read from MSR by 65536 to calculate how
> many Joules of energy are cost. 
> 
> But this reveals a problem. If the task is scheduled out with energy
> consumption less than 1 Joule, we failed to record it.
> 
> IMO, a new callback should be introduced so that I can save the MSR
> value first and translate it to Joule when the task exits. Or just do
> the translation in user space.
> 
> what do you think? 

That's not the problem I meant, but lets start with that, you can solve
that differently, just keep a fraction somewhere in hw_perf_event.

Anyway, the problem you missed is what happens when those 32 bits roll
over, at that point you get now < prev and the value added to
event->count is a _HUGE_ 64 bit number.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-27  8:26   ` Zhang Rui
  2011-05-27 19:56     ` Peter Zijlstra
  2011-05-27 19:56     ` Peter Zijlstra
@ 2011-05-27 19:56     ` Peter Zijlstra
  2011-05-30  3:11       ` Zhang Rui
  2011-05-30  3:11       ` Zhang Rui
  2011-05-27 19:56     ` Peter Zijlstra
  3 siblings, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-27 19:56 UTC (permalink / raw)
  To: Zhang Rui
  Cc: LKML, linux-pm, mingo, acme, Lin, Ming M, Brown, Len,
	Matt Fleming, Corey Ashford, Stephane Eranian

On Fri, 2011-05-27 at 16:26 +0800, Zhang Rui wrote:
> 
> > Currently there isn't a way to expose the events in sysfs, but we do
> > want that, its mostly a matter of getting all involved parties to agree
> > on a format and implementing it.
> > 
> I talked with Lin Ming just now, and he said that it should work in this
> way:
> First, only one pmu for RAPL interfaces, with four different kinds of
> events, pkg/core/uncore/dram,
> and the sysfs I/F is:
> /sys/bus/event_source/devices/rapl/---|---type
>                                       |---pkg
>                                       |---core
>                                       |---uncore
>                                       |---dram

Actually something like:

 /sys/bus/.../rapl/ -- | -- type
                       | -- events -- | -- pkg
                                      | -- core
                                      | ...

was one of the latest proposals, but then someone (can't remember who)
offered the opinion that having sub-groups of event might also be
wanted.

Furthermore a 'format' file was proposed which ought to contain a
description of how to compose a ::config value, but we never got around
to discussing a valid/useful syntax that could express all existing
cases (let alone be future proof).

> to use it, users can issue something like:
> perf stat -P rapl -e pkg/core/uncore/dram foo
> so that event->attr.type equals rapl_pmu.type and event->attr.config
> equals one of the rapl_domain_id.

Right, something like that, or simply something like -e rapl:pkg, which
again reminds me that people were working on a full EBNF syntax for the
-e argument.

> This sounds good. I can rewrite the code to work in this way, but it
> doesn't work for now, until both sysfs I/F and perf tool being ready,
> right? 

Right, so the only thing missing is the event bits (and some userspace
bits to use it all). The hardest part of it is getting those definitions
sorted, writing the patches shouldn't be too hard.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-27  8:26   ` Zhang Rui
                       ` (2 preceding siblings ...)
  2011-05-27 19:56     ` Peter Zijlstra
@ 2011-05-27 19:56     ` Peter Zijlstra
  3 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2011-05-27 19:56 UTC (permalink / raw)
  To: Zhang Rui
  Cc: Matt Fleming, Lin, Ming M, Corey Ashford, LKML, Stephane Eranian,
	acme, linux-pm, mingo

On Fri, 2011-05-27 at 16:26 +0800, Zhang Rui wrote:
> 
> > Currently there isn't a way to expose the events in sysfs, but we do
> > want that, its mostly a matter of getting all involved parties to agree
> > on a format and implementing it.
> > 
> I talked with Lin Ming just now, and he said that it should work in this
> way:
> First, only one pmu for RAPL interfaces, with four different kinds of
> events, pkg/core/uncore/dram,
> and the sysfs I/F is:
> /sys/bus/event_source/devices/rapl/---|---type
>                                       |---pkg
>                                       |---core
>                                       |---uncore
>                                       |---dram

Actually something like:

 /sys/bus/.../rapl/ -- | -- type
                       | -- events -- | -- pkg
                                      | -- core
                                      | ...

was one of the latest proposals, but then someone (can't remember who)
offered the opinion that having sub-groups of event might also be
wanted.

Furthermore a 'format' file was proposed which ought to contain a
description of how to compose a ::config value, but we never got around
to discussing a valid/useful syntax that could express all existing
cases (let alone be future proof).

> to use it, users can issue something like:
> perf stat -P rapl -e pkg/core/uncore/dram foo
> so that event->attr.type equals rapl_pmu.type and event->attr.config
> equals one of the rapl_domain_id.

Right, something like that, or simply something like -e rapl:pkg, which
again reminds me that people were working on a full EBNF syntax for the
-e argument.

> This sounds good. I can rewrite the code to work in this way, but it
> doesn't work for now, until both sysfs I/F and perf tool being ready,
> right? 

Right, so the only thing missing is the event bits (and some userspace
bits to use it all). The hardest part of it is getting those definitions
sorted, writing the patches shouldn't be too hard.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  8:34 [PATCH 2/3] introduce intel_rapl driver Zhang Rui
                   ` (3 preceding siblings ...)
  2011-05-26 15:48 ` Randy Dunlap
@ 2011-05-28 10:17 ` Greg KH
  2011-05-30  7:04   ` Zhang Rui
  2011-05-30  7:04   ` Zhang Rui
  2011-05-28 10:17 ` Greg KH
  5 siblings, 2 replies; 26+ messages in thread
From: Greg KH @ 2011-05-28 10:17 UTC (permalink / raw)
  To: Zhang Rui
  Cc: LKML, linux-pm, a.p.zijlstra, mingo, acme, ming.m.lin, Brown, Len

On Thu, May 26, 2011 at 04:34:17PM +0800, Zhang Rui wrote:
> +/*
> + *  Intel RAPL interface driver
> + *
> + *  Copyright (C) 2010-2011 Zhang Rui <rui.zhang@intel.com>
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or (at
> + *  your option) any later version.

Are you really sure about "any later version"?  I didn't think that was
the default rule of Intel kernel code these days, have you verified this
exception is ok?

> + *
> + *  This program is distributed in the hope that it will be useful, but
> + *  WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + *  General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License along
> + *  with this program; if not, write to the Free Software Foundation, Inc.,
> + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.

These two paragraphs are not needed, and the last one is never needed
unless you want to track the office movements of the FSF for the next
50+ years and always update this file because of that.

greg k-h

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26  8:34 [PATCH 2/3] introduce intel_rapl driver Zhang Rui
                   ` (4 preceding siblings ...)
  2011-05-28 10:17 ` Greg KH
@ 2011-05-28 10:17 ` Greg KH
  5 siblings, 0 replies; 26+ messages in thread
From: Greg KH @ 2011-05-28 10:17 UTC (permalink / raw)
  To: Zhang Rui; +Cc: a.p.zijlstra, ming.m.lin, LKML, acme, linux-pm, mingo

On Thu, May 26, 2011 at 04:34:17PM +0800, Zhang Rui wrote:
> +/*
> + *  Intel RAPL interface driver
> + *
> + *  Copyright (C) 2010-2011 Zhang Rui <rui.zhang@intel.com>
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or (at
> + *  your option) any later version.

Are you really sure about "any later version"?  I didn't think that was
the default rule of Intel kernel code these days, have you verified this
exception is ok?

> + *
> + *  This program is distributed in the hope that it will be useful, but
> + *  WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + *  General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License along
> + *  with this program; if not, write to the Free Software Foundation, Inc.,
> + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.

These two paragraphs are not needed, and the last one is never needed
unless you want to track the office movements of the FSF for the next
50+ years and always update this file because of that.

greg k-h

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26 15:48 ` Randy Dunlap
  2011-05-30  2:40   ` Zhang Rui
@ 2011-05-30  2:40   ` Zhang Rui
  1 sibling, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-30  2:40 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: LKML, linux-pm, a.p.zijlstra, mingo, acme, Lin, Ming M, Brown, Len

Hi,

On Thu, 2011-05-26 at 23:48 +0800, Randy Dunlap wrote:
> On Thu, 26 May 2011 16:34:17 +0800 Zhang Rui wrote:
> 
> > 
> > Introduce Intel RAPL driver.
> > 
> > RAPL (running average power limit) is a new feature which provides mechanisms
> > to enforce power consumption limit, on some new processors.
> > 
> > RAPL provides MSRs reporting the total amount of energy consumed
> > by the package/core/uncore/dram.
> > Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> > and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> > 
> > Currently, we don't have the plan to support the RAPL power control,
> > but we do want to export the package/core/uncore/dram power consumption
> > information via perf tool first.
> 
> Hi,
> 
> What's an uncore?
> 
According to the Intel SDM, besides the Package and DRAM power domains,
RAPL defines another two power domains, AKA, PP0/PP1, PP0 refers to the
processor cores and PP1 refers to the power plane of a specific device
in the uncore.
Maybe "uncore" is kind of misleading, but using "PP0/PP1" doesn't mean
anything to users.

> > Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> > ---
> >  drivers/platform/x86/Kconfig      |    8 
> >  drivers/platform/x86/Makefile     |    1 
> >  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
> >  include/linux/perf_event.h        |    4 
> >  4 files changed, 381 insertions(+)
> > 
> > Index: linux-2.6/drivers/platform/x86/Kconfig
> > ===================================================================
> > --- linux-2.6.orig/drivers/platform/x86/Kconfig
> > +++ linux-2.6/drivers/platform/x86/Kconfig
> > @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
> >  	  To compile this driver as a module, choose M here: the module
> >  	  will be called samsung-laptop.
> >  
> > +config INTEL_RAPL
> > +	tristate "Intel RAPL Support"
> > +	depends on X86
> > +	default y
> > +	---help---
> > +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
> 
> 	  RAPL (Running Average Power Limit) provides mechanisms to enforce
> 
> > +	  power consumption limit.
> > +
> >  endif # X86_PLATFORM_DEVICES
> 
> > Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> > ===================================================================
> > --- /dev/null
> > +++ linux-2.6/drivers/platform/x86/intel_rapl.c
> > @@ -0,0 +1,368 @@
> 
> [snip]
> 
> > +/* show the energy status, in Jelous */
> 
> Is that Joules?  or what?
> 
Oh. right, it's Joules.

> > +static int rapl_read_energy(struct rapl_domain *domain)
> > +{
> > +	u64 value;
> > +	u32 msr = domain->msrs.status;
> > +
> > +	rdmsrl(msr, value);
> > +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> > +}
> 
> [snip]
> 
> > +static int __init intel_rapl_init(void)
> > +{
> > +	enum rapl_domain_id id;
> > +
> > +	/*
> > +	 * RAPL features are only supported on processors have a CPUID
> > +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> > +	 */
> > +	if (boot_cpu_data.x86 != 0x06)
> > +		return -ENODEV;
> > +
> > +	if (boot_cpu_data.x86_model == 0x2A)
> > +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> > +	else if (boot_cpu_data.x86_model == 0x2D)
> > +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> > +	else
> > +		return -ENODEV;
> > +
> > +	if (rapl_check_unit())
> > +		return -ENODEV;
> > +
> > +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> 
> space after "for"
> 
> > +		if (rapl_domains[id].valid)
> > +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> > +	return 0;
> > +}
> > +
> > +static void __exit intel_rapl_exit(void)
> > +{
> > +	enum rapl_domain_id id;
> > +
> > +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> 
> ditto
> 
will fix them in the next version.

thanks,
rui

> > +		if (rapl_domains[id].valid)
> > +			perf_pmu_unregister(&(rapl_domains[id].pmu));
> > +}
> 
> 
> ---
> ~Randy
> *** Remember to use Documentation/SubmitChecklist when testing your code ***



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26 15:48 ` Randy Dunlap
@ 2011-05-30  2:40   ` Zhang Rui
  2011-05-30  2:40   ` Zhang Rui
  1 sibling, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-30  2:40 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: a.p.zijlstra, Lin, Ming M, LKML, acme, linux-pm, mingo

Hi,

On Thu, 2011-05-26 at 23:48 +0800, Randy Dunlap wrote:
> On Thu, 26 May 2011 16:34:17 +0800 Zhang Rui wrote:
> 
> > 
> > Introduce Intel RAPL driver.
> > 
> > RAPL (running average power limit) is a new feature which provides mechanisms
> > to enforce power consumption limit, on some new processors.
> > 
> > RAPL provides MSRs reporting the total amount of energy consumed
> > by the package/core/uncore/dram.
> > Further more, by using RAPL, OS can set a power bugdet in a certain time window,
> > and let Hardware to throttle the processor P/T-state to meet this enery limitation.
> > 
> > Currently, we don't have the plan to support the RAPL power control,
> > but we do want to export the package/core/uncore/dram power consumption
> > information via perf tool first.
> 
> Hi,
> 
> What's an uncore?
> 
According to the Intel SDM, besides the Package and DRAM power domains,
RAPL defines another two power domains, AKA, PP0/PP1, PP0 refers to the
processor cores and PP1 refers to the power plane of a specific device
in the uncore.
Maybe "uncore" is kind of misleading, but using "PP0/PP1" doesn't mean
anything to users.

> > Signed-off-by: Zhang Rui <rui.zhang@intel.com>
> > ---
> >  drivers/platform/x86/Kconfig      |    8 
> >  drivers/platform/x86/Makefile     |    1 
> >  drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
> >  include/linux/perf_event.h        |    4 
> >  4 files changed, 381 insertions(+)
> > 
> > Index: linux-2.6/drivers/platform/x86/Kconfig
> > ===================================================================
> > --- linux-2.6.orig/drivers/platform/x86/Kconfig
> > +++ linux-2.6/drivers/platform/x86/Kconfig
> > @@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
> >  	  To compile this driver as a module, choose M here: the module
> >  	  will be called samsung-laptop.
> >  
> > +config INTEL_RAPL
> > +	tristate "Intel RAPL Support"
> > +	depends on X86
> > +	default y
> > +	---help---
> > +	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
> 
> 	  RAPL (Running Average Power Limit) provides mechanisms to enforce
> 
> > +	  power consumption limit.
> > +
> >  endif # X86_PLATFORM_DEVICES
> 
> > Index: linux-2.6/drivers/platform/x86/intel_rapl.c
> > ===================================================================
> > --- /dev/null
> > +++ linux-2.6/drivers/platform/x86/intel_rapl.c
> > @@ -0,0 +1,368 @@
> 
> [snip]
> 
> > +/* show the energy status, in Jelous */
> 
> Is that Joules?  or what?
> 
Oh. right, it's Joules.

> > +static int rapl_read_energy(struct rapl_domain *domain)
> > +{
> > +	u64 value;
> > +	u32 msr = domain->msrs.status;
> > +
> > +	rdmsrl(msr, value);
> > +	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
> > +}
> 
> [snip]
> 
> > +static int __init intel_rapl_init(void)
> > +{
> > +	enum rapl_domain_id id;
> > +
> > +	/*
> > +	 * RAPL features are only supported on processors have a CPUID
> > +	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
> > +	 */
> > +	if (boot_cpu_data.x86 != 0x06)
> > +		return -ENODEV;
> > +
> > +	if (boot_cpu_data.x86_model == 0x2A)
> > +		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
> > +	else if (boot_cpu_data.x86_model == 0x2D)
> > +		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
> > +	else
> > +		return -ENODEV;
> > +
> > +	if (rapl_check_unit())
> > +		return -ENODEV;
> > +
> > +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> 
> space after "for"
> 
> > +		if (rapl_domains[id].valid)
> > +			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
> > +	return 0;
> > +}
> > +
> > +static void __exit intel_rapl_exit(void)
> > +{
> > +	enum rapl_domain_id id;
> > +
> > +	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
> 
> ditto
> 
will fix them in the next version.

thanks,
rui

> > +		if (rapl_domains[id].valid)
> > +			perf_pmu_unregister(&(rapl_domains[id].pmu));
> > +}
> 
> 
> ---
> ~Randy
> *** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-27 19:56     ` Peter Zijlstra
@ 2011-05-30  3:11       ` Zhang Rui
  2011-05-30  3:11       ` Zhang Rui
  1 sibling, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-30  3:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, linux-pm, mingo, acme, Lin, Ming M, Brown, Len,
	Matt Fleming, Corey Ashford, Stephane Eranian

On Sat, 2011-05-28 at 03:56 +0800, Peter Zijlstra wrote:
> On Fri, 2011-05-27 at 16:26 +0800, Zhang Rui wrote:
> > 
> > > Currently there isn't a way to expose the events in sysfs, but we do
> > > want that, its mostly a matter of getting all involved parties to agree
> > > on a format and implementing it.
> > > 
> > I talked with Lin Ming just now, and he said that it should work in this
> > way:
> > First, only one pmu for RAPL interfaces, with four different kinds of
> > events, pkg/core/uncore/dram,
> > and the sysfs I/F is:
> > /sys/bus/event_source/devices/rapl/---|---type
> >                                       |---pkg
> >                                       |---core
> >                                       |---uncore
> >                                       |---dram
> 
> Actually something like:
> 
>  /sys/bus/.../rapl/ -- | -- type
>                        | -- events -- | -- pkg
>                                       | -- core
>                                       | ...
> 
> was one of the latest proposals, but then someone (can't remember who)
> offered the opinion that having sub-groups of event might also be
> wanted.
> 
> Furthermore a 'format' file was proposed which ought to contain a
> description of how to compose a ::config value, but we never got around
> to discussing a valid/useful syntax that could express all existing
> cases (let alone be future proof).
> 
> > to use it, users can issue something like:
> > perf stat -P rapl -e pkg/core/uncore/dram foo
> > so that event->attr.type equals rapl_pmu.type and event->attr.config
> > equals one of the rapl_domain_id.
> 
> Right, something like that, or simply something like -e rapl:pkg, which
> again reminds me that people were working on a full EBNF syntax for the
> -e argument.
> 
> > This sounds good. I can rewrite the code to work in this way, but it
> > doesn't work for now, until both sysfs I/F and perf tool being ready,
> > right? 
> 
> Right, so the only thing missing is the event bits (and some userspace
> bits to use it all). The hardest part of it is getting those definitions
> sorted, writing the patches shouldn't be too hard.
> 
Okay. So should I finish the kernel code first, which just checks the
event->attr.type/config value, no matter how they are passed to kernel?

thanks,
rui
> 
> 



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-27 19:56     ` Peter Zijlstra
  2011-05-30  3:11       ` Zhang Rui
@ 2011-05-30  3:11       ` Zhang Rui
  1 sibling, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-30  3:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Matt Fleming, Lin, Ming M, Corey Ashford, LKML, Stephane Eranian,
	acme, linux-pm, mingo

On Sat, 2011-05-28 at 03:56 +0800, Peter Zijlstra wrote:
> On Fri, 2011-05-27 at 16:26 +0800, Zhang Rui wrote:
> > 
> > > Currently there isn't a way to expose the events in sysfs, but we do
> > > want that, its mostly a matter of getting all involved parties to agree
> > > on a format and implementing it.
> > > 
> > I talked with Lin Ming just now, and he said that it should work in this
> > way:
> > First, only one pmu for RAPL interfaces, with four different kinds of
> > events, pkg/core/uncore/dram,
> > and the sysfs I/F is:
> > /sys/bus/event_source/devices/rapl/---|---type
> >                                       |---pkg
> >                                       |---core
> >                                       |---uncore
> >                                       |---dram
> 
> Actually something like:
> 
>  /sys/bus/.../rapl/ -- | -- type
>                        | -- events -- | -- pkg
>                                       | -- core
>                                       | ...
> 
> was one of the latest proposals, but then someone (can't remember who)
> offered the opinion that having sub-groups of event might also be
> wanted.
> 
> Furthermore a 'format' file was proposed which ought to contain a
> description of how to compose a ::config value, but we never got around
> to discussing a valid/useful syntax that could express all existing
> cases (let alone be future proof).
> 
> > to use it, users can issue something like:
> > perf stat -P rapl -e pkg/core/uncore/dram foo
> > so that event->attr.type equals rapl_pmu.type and event->attr.config
> > equals one of the rapl_domain_id.
> 
> Right, something like that, or simply something like -e rapl:pkg, which
> again reminds me that people were working on a full EBNF syntax for the
> -e argument.
> 
> > This sounds good. I can rewrite the code to work in this way, but it
> > doesn't work for now, until both sysfs I/F and perf tool being ready,
> > right? 
> 
> Right, so the only thing missing is the event bits (and some userspace
> bits to use it all). The hardest part of it is getting those definitions
> sorted, writing the patches shouldn't be too hard.
> 
Okay. So should I finish the kernel code first, which just checks the
event->attr.type/config value, no matter how they are passed to kernel?

thanks,
rui
> 
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-28 10:17 ` Greg KH
@ 2011-05-30  7:04   ` Zhang Rui
  2011-05-30  7:04   ` Zhang Rui
  1 sibling, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-30  7:04 UTC (permalink / raw)
  To: Greg KH
  Cc: LKML, linux-pm, a.p.zijlstra, mingo, acme, Lin, Ming M, Brown, Len

Hi, Greg,

On Sat, 2011-05-28 at 18:17 +0800, Greg KH wrote:
> On Thu, May 26, 2011 at 04:34:17PM +0800, Zhang Rui wrote:
> > +/*
> > + *  Intel RAPL interface driver
> > + *
> > + *  Copyright (C) 2010-2011 Zhang Rui <rui.zhang@intel.com>
> > + *
> > + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > + *
> > + *  This program is free software; you can redistribute it and/or modify
> > + *  it under the terms of the GNU General Public License as published by
> > + *  the Free Software Foundation; either version 2 of the License, or (at
> > + *  your option) any later version.
> 
> Are you really sure about "any later version"?  I didn't think that was
> the default rule of Intel kernel code these days, have you verified this
> exception is ok?
> 
Sorry, I overlooked this problem and just did the copy & paste from
another driver. It won't happen again.

thanks,
rui

> > + *
> > + *  This program is distributed in the hope that it will be useful, but
> > + *  WITHOUT ANY WARRANTY; without even the implied warranty of
> > + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + *  General Public License for more details.
> > + *
> > + *  You should have received a copy of the GNU General Public License along
> > + *  with this program; if not, write to the Free Software Foundation, Inc.,
> > + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
> 
> These two paragraphs are not needed, and the last one is never needed
> unless you want to track the office movements of the FSF for the next
> 50+ years and always update this file because of that.
> 
> greg k-h



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-28 10:17 ` Greg KH
  2011-05-30  7:04   ` Zhang Rui
@ 2011-05-30  7:04   ` Zhang Rui
  1 sibling, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-30  7:04 UTC (permalink / raw)
  To: Greg KH; +Cc: a.p.zijlstra, Lin, Ming M, LKML, acme, linux-pm, mingo

Hi, Greg,

On Sat, 2011-05-28 at 18:17 +0800, Greg KH wrote:
> On Thu, May 26, 2011 at 04:34:17PM +0800, Zhang Rui wrote:
> > +/*
> > + *  Intel RAPL interface driver
> > + *
> > + *  Copyright (C) 2010-2011 Zhang Rui <rui.zhang@intel.com>
> > + *
> > + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > + *
> > + *  This program is free software; you can redistribute it and/or modify
> > + *  it under the terms of the GNU General Public License as published by
> > + *  the Free Software Foundation; either version 2 of the License, or (at
> > + *  your option) any later version.
> 
> Are you really sure about "any later version"?  I didn't think that was
> the default rule of Intel kernel code these days, have you verified this
> exception is ok?
> 
Sorry, I overlooked this problem and just did the copy & paste from
another driver. It won't happen again.

thanks,
rui

> > + *
> > + *  This program is distributed in the hope that it will be useful, but
> > + *  WITHOUT ANY WARRANTY; without even the implied warranty of
> > + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + *  General Public License for more details.
> > + *
> > + *  You should have received a copy of the GNU General Public License along
> > + *  with this program; if not, write to the Free Software Foundation, Inc.,
> > + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
> 
> These two paragraphs are not needed, and the last one is never needed
> unless you want to track the office movements of the FSF for the next
> 50+ years and always update this file because of that.
> 
> greg k-h

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
  2011-05-26 10:55   ` Matt Fleming
@ 2011-06-02  8:04       ` Matt Fleming
  0 siblings, 0 replies; 26+ messages in thread
From: Matt Fleming @ 2011-06-02  8:04 UTC (permalink / raw)
  To: Zhang Rui
  Cc: Peter Zijlstra, LKML, linux-pm, mingo, acme, ming.m.lin, Brown, Len

On Thu, 26 May 2011 11:55:38 +0100
Matt Fleming <matt@console-pimps.org> wrote:

> On Thu, May 26, 2011 at 11:43:23AM +0200, Peter Zijlstra wrote:
>
> > If you expect you actually want to sample, use this event as part of a
> > group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
> > was working on patches to make perf-record capable of this.
> 
> Yep, I have some unfinished patches around here somewhere...
> 
> *rummage*
> 
> OK, they're in this repository on the perf/group-events branch,
> 
>     git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/sh-2.6.git
> 
> Obviously since I last touched them in November of last year they're
> more than likely not going to apply cleanly to tip, and perhaps more
> importantly, I don't think I ever submitted them to LKML for review.

OK, my previous patches didn't apply at all and I had to rewrite them.
This is what I came up with. It's not quite finished (it's missing
perf-report changes) but should be enough to get you started if indeed
you need this functionality. I've only touched the perf-record code but
it should be trivial to add support to builtin-stat.c.

-------->8--------

>From e19c94e9968489746ee8d8519ce4a6afbcb4d7cc Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@linux.intel.com>
Date: Tue, 31 May 2011 11:19:01 +0100
Subject: [PATCH] perf: Add support for group events

Allow events to be grouped so that they are scheduled together on the
performance hardware.

Signed-off-by: Matt Fleming <matt.fleming@linux.intel.com>
---
 tools/perf/builtin-record.c    |   17 ++++++++++-----
 tools/perf/util/evlist.c       |   23 +++++++++++++++++---
 tools/perf/util/evsel.c        |    3 ++
 tools/perf/util/evsel.h        |    2 +
 tools/perf/util/parse-events.c |   43 +++++++++++++++++++++++++++++++++++++++-
 5 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 8e2c857..4c9412b 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -163,10 +163,11 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 	struct perf_event_attr *attr = &evsel->attr;
 	int track = !evsel->idx; /* only the first counter needs these */
 
-	attr->inherit		= !no_inherit;
-	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-				  PERF_FORMAT_TOTAL_TIME_RUNNING |
-				  PERF_FORMAT_ID;
+	attr->inherit		= !no_inherit &&
+				  (!(attr->read_format & PERF_FORMAT_GROUP));
+	attr->read_format	|= PERF_FORMAT_TOTAL_TIME_ENABLED |
+				   PERF_FORMAT_TOTAL_TIME_RUNNING |
+				   PERF_FORMAT_ID;
 
 	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
@@ -176,9 +177,13 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 	/*
 	 * We default some events to a 1 default interval. But keep
 	 * it a weak assumption overridable by the user.
+	 *
+	 * We only allow the default to be overridden if the event is
+	 * not part of a group, or if the event is the leader of a group.
 	 */
-	if (!attr->sample_period || (user_freq != UINT_MAX &&
-				     user_interval != ULLONG_MAX)) {
+	if ((!evsel->group || (evsel->group == evsel)) &&
+	    (!attr->sample_period || (user_freq != UINT_MAX &&
+				      user_interval != ULLONG_MAX))) {
 		if (freq) {
 			attr->sample_type	|= PERF_SAMPLE_PERIOD;
 			attr->freq		= 1;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 50aa348..a7f0f8c 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -133,17 +133,31 @@ static int perf_evlist__id_add_fd(struct perf_evlist *evlist,
 {
 	u64 read_data[4] = { 0, };
 	int id_idx = 1; /* The first entry is the counter value */
+	size_t data_sz;
+	u64 *data;
+
+	data_sz = sizeof(u64) * 4;
+
+	if (evsel->group == evsel) {
+		data_sz += evsel->group_cnt * sizeof(u64) * 4;
+		data = malloc(data_sz);
+		if (!data)
+			return -1;
+	} else
+		data = read_data;
 
 	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
-	    read(fd, &read_data, sizeof(read_data)) == -1)
+	    read(fd, data, data_sz) == -1)
 		return -1;
 
 	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		++id_idx;
 	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		++id_idx;
+	if (evsel->group_cnt)
+		++id_idx;
 
-	perf_evlist__id_add(evlist, evsel, cpu, thread, read_data[id_idx]);
+	perf_evlist__id_add(evlist, evsel, cpu, thread, data[id_idx]);
 	return 0;
 }
 
@@ -466,9 +480,10 @@ u64 perf_evlist__sample_type(struct perf_evlist *evlist)
 	u64 type = 0;
 
 	list_for_each_entry(pos, &evlist->entries, node) {
+		u64 mask = PERF_SAMPLE_PERIOD | PERF_SAMPLE_READ;
 		if (!type)
-			type = pos->attr.sample_type;
-		else if (type != pos->attr.sample_type)
+			type = (pos->attr.sample_type & ~mask);
+		else if (type != (pos->attr.sample_type & ~mask))
 			die("non matching sample_type");
 	}
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index cca29ed..53960e1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -208,6 +208,9 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 			if (!evsel->cgrp)
 				pid = threads->map[thread];
 
+			if (evsel->group && evsel->group != evsel)
+				group_fd = FD(evsel->group, cpu, thread);
+
 			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
 								     pid,
 								     cpus->map[cpu],
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index f79bb2c..14be4b9 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -61,6 +61,8 @@ struct perf_evsel {
 		off_t		id_offset;
 	};
 	struct cgroup_sel	*cgrp;
+	struct perf_evsel	*group;
+	u64			group_cnt;
 };
 
 struct cpu_map;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 41982c3..60a3479 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -737,6 +737,9 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 	if (*str == ',')
 		return 0;
 
+	if (*str == '}')
+		return 0;
+
 	if (*str++ != ':')
 		return -1;
 
@@ -825,18 +828,44 @@ modifier:
 int parse_events(const struct option *opt, const char *str, int unset __used)
 {
 	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
+	struct perf_evsel *group_evsel = NULL;
 	struct perf_event_attr attr;
 	enum event_result ret;
 	const char *ostr;
+	u64 group_cnt = 0;
 
 	for (;;) {
 		ostr = str;
 		memset(&attr, 0, sizeof(attr));
+
+		if (*str == '{') {
+			/* Already parsing a group? */
+			if (group_evsel != NULL)
+				return -1;
+
+			/* Create an event group */
+			attr.config = PERF_COUNT_SW_TASK_CLOCK;
+			attr.type = PERF_TYPE_SOFTWARE;
+			attr.read_format = PERF_FORMAT_GROUP |
+				PERF_FORMAT_TOTAL_TIME_RUNNING |
+				PERF_FORMAT_TOTAL_TIME_ENABLED;
+			attr.sample_type = PERF_SAMPLE_READ;
+
+			group_evsel = perf_evsel__new(&attr, evlist->nr_entries);
+			if (group_evsel == NULL)
+				return -1;
+
+			group_evsel->group = group_evsel;
+			perf_evlist__add(evlist, group_evsel);
+			str++;
+		}
+
+		memset(&attr, 0, sizeof(attr));
 		ret = parse_event_symbols(opt, &str, &attr);
 		if (ret == EVT_FAILED)
 			return -1;
 
-		if (!(*str == 0 || *str == ',' || isspace(*str)))
+		if (!(*str == 0 || *str == ',' || isspace(*str) || *str == '}'))
 			return -1;
 
 		if (ret != EVT_HANDLED_ALL) {
@@ -844,12 +873,24 @@ int parse_events(const struct option *opt, const char *str, int unset __used)
 			evsel = perf_evsel__new(&attr, evlist->nr_entries);
 			if (evsel == NULL)
 				return -1;
+
+			if (group_evsel)
+				evsel->group = group_evsel;
+
 			perf_evlist__add(evlist, evsel);
 
 			evsel->name = calloc(str - ostr + 1, 1);
 			if (!evsel->name)
 				return -1;
 			strncpy(evsel->name, ostr, str - ostr);
+			group_cnt++;
+		}
+
+		if (*str == '}') {
+			group_evsel->group_cnt = group_cnt;
+			group_evsel = NULL;
+			group_cnt = 0;
+			str++;
 		}
 
 		if (*str == 0)
-- 
1.7.4.4


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] introduce intel_rapl driver
@ 2011-06-02  8:04       ` Matt Fleming
  0 siblings, 0 replies; 26+ messages in thread
From: Matt Fleming @ 2011-06-02  8:04 UTC (permalink / raw)
  To: Zhang Rui; +Cc: Peter Zijlstra, ming.m.lin, LKML, acme, linux-pm, mingo

On Thu, 26 May 2011 11:55:38 +0100
Matt Fleming <matt@console-pimps.org> wrote:

> On Thu, May 26, 2011 at 11:43:23AM +0200, Peter Zijlstra wrote:
>
> > If you expect you actually want to sample, use this event as part of a
> > group and add a sampling event in there and use PERF_FORMAT_GROUP, Matt
> > was working on patches to make perf-record capable of this.
> 
> Yep, I have some unfinished patches around here somewhere...
> 
> *rummage*
> 
> OK, they're in this repository on the perf/group-events branch,
> 
>     git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/sh-2.6.git
> 
> Obviously since I last touched them in November of last year they're
> more than likely not going to apply cleanly to tip, and perhaps more
> importantly, I don't think I ever submitted them to LKML for review.

OK, my previous patches didn't apply at all and I had to rewrite them.
This is what I came up with. It's not quite finished (it's missing
perf-report changes) but should be enough to get you started if indeed
you need this functionality. I've only touched the perf-record code but
it should be trivial to add support to builtin-stat.c.

-------->8--------

>From e19c94e9968489746ee8d8519ce4a6afbcb4d7cc Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@linux.intel.com>
Date: Tue, 31 May 2011 11:19:01 +0100
Subject: [PATCH] perf: Add support for group events

Allow events to be grouped so that they are scheduled together on the
performance hardware.

Signed-off-by: Matt Fleming <matt.fleming@linux.intel.com>
---
 tools/perf/builtin-record.c    |   17 ++++++++++-----
 tools/perf/util/evlist.c       |   23 +++++++++++++++++---
 tools/perf/util/evsel.c        |    3 ++
 tools/perf/util/evsel.h        |    2 +
 tools/perf/util/parse-events.c |   43 +++++++++++++++++++++++++++++++++++++++-
 5 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 8e2c857..4c9412b 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -163,10 +163,11 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 	struct perf_event_attr *attr = &evsel->attr;
 	int track = !evsel->idx; /* only the first counter needs these */
 
-	attr->inherit		= !no_inherit;
-	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-				  PERF_FORMAT_TOTAL_TIME_RUNNING |
-				  PERF_FORMAT_ID;
+	attr->inherit		= !no_inherit &&
+				  (!(attr->read_format & PERF_FORMAT_GROUP));
+	attr->read_format	|= PERF_FORMAT_TOTAL_TIME_ENABLED |
+				   PERF_FORMAT_TOTAL_TIME_RUNNING |
+				   PERF_FORMAT_ID;
 
 	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
@@ -176,9 +177,13 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 	/*
 	 * We default some events to a 1 default interval. But keep
 	 * it a weak assumption overridable by the user.
+	 *
+	 * We only allow the default to be overridden if the event is
+	 * not part of a group, or if the event is the leader of a group.
 	 */
-	if (!attr->sample_period || (user_freq != UINT_MAX &&
-				     user_interval != ULLONG_MAX)) {
+	if ((!evsel->group || (evsel->group == evsel)) &&
+	    (!attr->sample_period || (user_freq != UINT_MAX &&
+				      user_interval != ULLONG_MAX))) {
 		if (freq) {
 			attr->sample_type	|= PERF_SAMPLE_PERIOD;
 			attr->freq		= 1;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 50aa348..a7f0f8c 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -133,17 +133,31 @@ static int perf_evlist__id_add_fd(struct perf_evlist *evlist,
 {
 	u64 read_data[4] = { 0, };
 	int id_idx = 1; /* The first entry is the counter value */
+	size_t data_sz;
+	u64 *data;
+
+	data_sz = sizeof(u64) * 4;
+
+	if (evsel->group == evsel) {
+		data_sz += evsel->group_cnt * sizeof(u64) * 4;
+		data = malloc(data_sz);
+		if (!data)
+			return -1;
+	} else
+		data = read_data;
 
 	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
-	    read(fd, &read_data, sizeof(read_data)) == -1)
+	    read(fd, data, data_sz) == -1)
 		return -1;
 
 	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		++id_idx;
 	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		++id_idx;
+	if (evsel->group_cnt)
+		++id_idx;
 
-	perf_evlist__id_add(evlist, evsel, cpu, thread, read_data[id_idx]);
+	perf_evlist__id_add(evlist, evsel, cpu, thread, data[id_idx]);
 	return 0;
 }
 
@@ -466,9 +480,10 @@ u64 perf_evlist__sample_type(struct perf_evlist *evlist)
 	u64 type = 0;
 
 	list_for_each_entry(pos, &evlist->entries, node) {
+		u64 mask = PERF_SAMPLE_PERIOD | PERF_SAMPLE_READ;
 		if (!type)
-			type = pos->attr.sample_type;
-		else if (type != pos->attr.sample_type)
+			type = (pos->attr.sample_type & ~mask);
+		else if (type != (pos->attr.sample_type & ~mask))
 			die("non matching sample_type");
 	}
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index cca29ed..53960e1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -208,6 +208,9 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 			if (!evsel->cgrp)
 				pid = threads->map[thread];
 
+			if (evsel->group && evsel->group != evsel)
+				group_fd = FD(evsel->group, cpu, thread);
+
 			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
 								     pid,
 								     cpus->map[cpu],
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index f79bb2c..14be4b9 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -61,6 +61,8 @@ struct perf_evsel {
 		off_t		id_offset;
 	};
 	struct cgroup_sel	*cgrp;
+	struct perf_evsel	*group;
+	u64			group_cnt;
 };
 
 struct cpu_map;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 41982c3..60a3479 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -737,6 +737,9 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 	if (*str == ',')
 		return 0;
 
+	if (*str == '}')
+		return 0;
+
 	if (*str++ != ':')
 		return -1;
 
@@ -825,18 +828,44 @@ modifier:
 int parse_events(const struct option *opt, const char *str, int unset __used)
 {
 	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
+	struct perf_evsel *group_evsel = NULL;
 	struct perf_event_attr attr;
 	enum event_result ret;
 	const char *ostr;
+	u64 group_cnt = 0;
 
 	for (;;) {
 		ostr = str;
 		memset(&attr, 0, sizeof(attr));
+
+		if (*str == '{') {
+			/* Already parsing a group? */
+			if (group_evsel != NULL)
+				return -1;
+
+			/* Create an event group */
+			attr.config = PERF_COUNT_SW_TASK_CLOCK;
+			attr.type = PERF_TYPE_SOFTWARE;
+			attr.read_format = PERF_FORMAT_GROUP |
+				PERF_FORMAT_TOTAL_TIME_RUNNING |
+				PERF_FORMAT_TOTAL_TIME_ENABLED;
+			attr.sample_type = PERF_SAMPLE_READ;
+
+			group_evsel = perf_evsel__new(&attr, evlist->nr_entries);
+			if (group_evsel == NULL)
+				return -1;
+
+			group_evsel->group = group_evsel;
+			perf_evlist__add(evlist, group_evsel);
+			str++;
+		}
+
+		memset(&attr, 0, sizeof(attr));
 		ret = parse_event_symbols(opt, &str, &attr);
 		if (ret == EVT_FAILED)
 			return -1;
 
-		if (!(*str == 0 || *str == ',' || isspace(*str)))
+		if (!(*str == 0 || *str == ',' || isspace(*str) || *str == '}'))
 			return -1;
 
 		if (ret != EVT_HANDLED_ALL) {
@@ -844,12 +873,24 @@ int parse_events(const struct option *opt, const char *str, int unset __used)
 			evsel = perf_evsel__new(&attr, evlist->nr_entries);
 			if (evsel == NULL)
 				return -1;
+
+			if (group_evsel)
+				evsel->group = group_evsel;
+
 			perf_evlist__add(evlist, evsel);
 
 			evsel->name = calloc(str - ostr + 1, 1);
 			if (!evsel->name)
 				return -1;
 			strncpy(evsel->name, ostr, str - ostr);
+			group_cnt++;
+		}
+
+		if (*str == '}') {
+			group_evsel->group_cnt = group_cnt;
+			group_evsel = NULL;
+			group_cnt = 0;
+			str++;
 		}
 
 		if (*str == 0)
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/3] introduce intel_rapl driver
@ 2011-05-26  8:34 Zhang Rui
  0 siblings, 0 replies; 26+ messages in thread
From: Zhang Rui @ 2011-05-26  8:34 UTC (permalink / raw)
  To: LKML, linux-pm; +Cc: acme, mingo, a.p.zijlstra, ming.m.lin


Introduce Intel RAPL driver.

RAPL (running average power limit) is a new feature which provides mechanisms
to enforce power consumption limit, on some new processors.

RAPL provides MSRs reporting the total amount of energy consumed
by the package/core/uncore/dram.
Further more, by using RAPL, OS can set a power bugdet in a certain time window,
and let Hardware to throttle the processor P/T-state to meet this enery limitation.

Currently, we don't have the plan to support the RAPL power control,
but we do want to export the package/core/uncore/dram power consumption
information via perf tool first.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/platform/x86/Kconfig      |    8 
 drivers/platform/x86/Makefile     |    1 
 drivers/platform/x86/intel_rapl.c |  368 ++++++++++++++++++++++++++++++++++++++
 include/linux/perf_event.h        |    4 
 4 files changed, 381 insertions(+)

Index: linux-2.6/drivers/platform/x86/Kconfig
===================================================================
--- linux-2.6.orig/drivers/platform/x86/Kconfig
+++ linux-2.6/drivers/platform/x86/Kconfig
@@ -753,4 +753,12 @@ config SAMSUNG_LAPTOP
 	  To compile this driver as a module, choose M here: the module
 	  will be called samsung-laptop.
 
+config INTEL_RAPL
+	tristate "Intel RAPL Support"
+	depends on X86
+	default y
+	---help---
+	  RAPL, AKA, Running Average Power Limit provides mechanisms to enforce
+	  power consumption limit.
+
 endif # X86_PLATFORM_DEVICES
Index: linux-2.6/drivers/platform/x86/Makefile
===================================================================
--- linux-2.6.orig/drivers/platform/x86/Makefile
+++ linux-2.6/drivers/platform/x86/Makefile
@@ -42,3 +42,4 @@ obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
 obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
 obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
 obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
+obj-$(CONFIG_INTEL_RAPL)	+= intel_rapl.o
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -107,6 +107,10 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
 	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
 	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
+	PERF_COUNT_SW_PKG_ENERGY		= 9,
+	PERF_COUNT_SW_CORE_ENERGY		= 10,
+	PERF_COUNT_SW_UNCORE_ENERGY		= 11,
+	PERF_COUNT_SW_DRAM_ENERGY		= 12,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
Index: linux-2.6/drivers/platform/x86/intel_rapl.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/platform/x86/intel_rapl.c
@@ -0,0 +1,368 @@
+/*
+ *  Intel RAPL interface driver
+ *
+ *  Copyright (C) 2010-2011 Zhang Rui <rui.zhang@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <asm/processor.h>
+#include <linux/perf_event.h>
+
+MODULE_AUTHOR("Zhang Rui");
+MODULE_DESCRIPTION("Intel RAPL interface Driver");
+MODULE_LICENSE("GPL");
+
+#define PREFIX "Intel: RAPL: "
+
+#define MSR_RAPL_POWER_UNIT		0x606
+
+/*
+ * Platform specific RAPL Domains.
+ * Note that PP1 RAPL Domain is supported on 062A only
+ * And DRAM RAPL Domain is supported on 062D only
+ */
+/* Package RAPL Domain */
+#define MSR_PKG_RAPL_POWER_LIMIT	0x610
+#define MSR_PKG_ENERGY_STATUS		0x611
+#define MSR_PKG_PERF_STATUS		0x613
+#define MSR_PKG_POWER_INFO		0x614
+
+/* PP0 RAPL Domain */
+#define MSR_PP0_POWER_LIMIT		0x638
+#define MSR_PP0_ENERGY_STATUS		0x639
+#define MSR_PP0_POLICY			0x63A
+#define MSR_PP0_PERF_STATUS		0x63B
+
+/* PP1 RAPL Domain, may reflect to uncore devices */
+#define MSR_PP1_POWER_LIMIT		0x640
+#define MSR_PP1_ENERGY_STATUS		0x641
+#define MSR_PP1_POLICY			0x642
+
+/* DRAM RAPL Domain */
+#define MSR_DRAM_POWER_LIMIT		0x618
+#define MSR_DRAM_ENERGY_STATUS		0x619
+#define MSR_DRAM_PERF_STATUS		0x61B
+#define MSR_DRAM_POWER_INFO		0x61C
+
+/* RAPL UNIT BITMASK */
+#define POWER_UNIT_OFFSET	0
+#define POWER_UNIT_MASK		0x0F
+
+#define ENERGY_UNIT_OFFSET	0x08
+#define ENERGY_UNIT_MASK	0x1F00
+
+#define TIME_UNIT_OFFSET	0x10
+#define TIME_UNIT_MASK		0xF000
+
+static int rapl_pmu_pkg_event_init(struct perf_event *event);
+static int rapl_pmu_core_event_init(struct perf_event *event);
+static int rapl_pmu_uncore_event_init(struct perf_event *event);
+static int rapl_pmu_dram_event_init(struct perf_event *event);
+static void rapl_event_start(struct perf_event *event, int flags);
+static void rapl_event_stop(struct perf_event *event, int flags);
+static int rapl_event_add(struct perf_event *event, int flags);
+static void rapl_event_del(struct perf_event *event, int flags);
+static void rapl_event_read(struct perf_event *event);
+
+enum rapl_domain_id {
+	RAPL_DOMAIN_PKG,
+	RAPL_DOMAIN_PP0,
+	RAPL_DOMAIN_PP1,
+	RAPL_DOMAIN_DRAM,
+	RAPL_DOMAIN_MAX
+};
+
+struct rapl_domain_msr {
+	int	limit;
+	int	status;
+};
+
+struct rapl_domain {
+	enum rapl_domain_id domain_id;
+	struct rapl_domain_msr msrs;
+	struct pmu pmu;
+	enum perf_sw_ids event_id;
+	int valid;
+};
+
+#define to_rapl_domain(p) container_of(p, struct rapl_domain, pmu);
+
+static struct rapl_domain rapl_domains[] = {
+	[RAPL_DOMAIN_PKG] = {
+		.domain_id = RAPL_DOMAIN_PKG,
+		.msrs	= {
+			.limit	= MSR_PKG_RAPL_POWER_LIMIT,
+			.status	= MSR_PKG_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_pkg_energy_meter",
+			.event_init	= rapl_pmu_pkg_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_PKG_ENERGY,
+		.valid	= 1,
+	},
+	[RAPL_DOMAIN_PP0] = {
+		.domain_id = RAPL_DOMAIN_PP0,
+		.msrs	= {
+			.limit	= MSR_PP0_POWER_LIMIT,
+			.status	= MSR_PP0_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_core_energy_meter",
+			.event_init	= rapl_pmu_core_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_CORE_ENERGY,
+		.valid	= 1,
+	},
+	[RAPL_DOMAIN_PP1] = {
+		.domain_id = RAPL_DOMAIN_PP1,
+		.msrs	= {
+			.limit	= MSR_PP1_POWER_LIMIT,
+			.status	= MSR_PP1_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_uncore_energy_meter",
+			.event_init	= rapl_pmu_uncore_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_UNCORE_ENERGY,
+	},
+	[RAPL_DOMAIN_DRAM] = {
+		.domain_id = RAPL_DOMAIN_DRAM,
+		.msrs	= {
+			.limit	= MSR_DRAM_POWER_LIMIT,
+			.status	= MSR_DRAM_ENERGY_STATUS,
+		},
+		.pmu	= {
+			.name		= "rapl_dram_energy_meter",
+			.event_init	= rapl_pmu_dram_event_init,
+			.add		= rapl_event_add,
+			.del		= rapl_event_del,
+			.start		= rapl_event_start,
+			.stop		= rapl_event_stop,
+			.read		= rapl_event_read,
+		},
+		.event_id = PERF_COUNT_SW_DRAM_ENERGY,
+	},
+};
+
+static unsigned int power_unit_divisor;
+static unsigned int energy_unit_divisor;
+static unsigned int time_unit_divisor;
+
+enum unit_type {
+	POWER_UNIT,
+	ENERGY_UNIT,
+	TIME_UNIT
+};
+static u64 rapl_unit_xlate(enum unit_type type, u64 value, int action)
+{
+	u64 divisor;
+
+	switch (type) {
+	case POWER_UNIT:
+		divisor = power_unit_divisor;
+		break;
+	case ENERGY_UNIT:
+		divisor = energy_unit_divisor;
+		break;
+	case TIME_UNIT:
+		divisor = time_unit_divisor;
+		break;
+	default:
+		return 0;
+	};
+
+	if (action)
+		return value * divisor; /* value is from users */
+	else
+		return div64_u64(value, divisor); /* value is from MSR */
+}
+
+/* show the energy status, in Jelous */
+static int rapl_read_energy(struct rapl_domain *domain)
+{
+	u64 value;
+	u32 msr = domain->msrs.status;
+
+	rdmsrl(msr, value);
+	return rapl_unit_xlate(ENERGY_UNIT, value, 0);
+}
+
+static void rapl_event_update(struct perf_event *event)
+{
+	s64 prev;
+	u64 now;
+	struct rapl_domain *domain = to_rapl_domain(event->pmu);
+
+	now = rapl_read_energy(domain);
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
+}
+
+static void rapl_event_start(struct perf_event *event, int flags)
+{
+	struct rapl_domain *domain = to_rapl_domain(event->pmu);
+
+	local64_set(&event->hw.prev_count, rapl_read_energy(domain));
+	perf_swevent_start_hrtimer(event);
+}
+
+static void rapl_event_stop(struct perf_event *event, int flags)
+{
+	perf_swevent_cancel_hrtimer(event);
+	rapl_event_update(event);
+}
+
+static int rapl_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		rapl_event_start(event, flags);
+	return 0;
+}
+static void rapl_event_del(struct perf_event *event, int flags)
+{
+	rapl_event_stop(event, flags);
+}
+
+static void rapl_event_read(struct perf_event *event)
+{
+	rapl_event_update(event);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event,
+			       enum rapl_domain_id id)
+{
+	struct rapl_domain *domain = &(rapl_domains[id]);
+
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != domain->event_id)
+		return -ENOENT;
+
+	/* Do periodecal update every second */
+	event->attr.freq = 1;
+	event->attr.sample_period = 1;
+
+	perf_swevent_init_hrtimer(event);
+
+	return 0;
+}
+
+static int rapl_pmu_pkg_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_PKG);
+}
+
+static int rapl_pmu_core_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_PP0);
+}
+
+static int rapl_pmu_uncore_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_PP1);
+}
+
+static int rapl_pmu_dram_event_init(struct perf_event *event)
+{
+	return rapl_pmu_event_init(event, RAPL_DOMAIN_DRAM);
+}
+
+static int rapl_check_unit(void)
+{
+	u64 output;
+	u32 value;
+
+	rdmsrl(MSR_RAPL_POWER_UNIT, output);
+
+	/* energy unit: 1/enery_unit_divisor Joules */
+	value = (output & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+	energy_unit_divisor = 1 << value;
+
+	/* power unit: 1/power_unit_divisor Watts */
+	value = (output & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+	power_unit_divisor = 1 << value;
+
+	/* time unit: 1/time_unit_divisor Seconds */
+	value =(output & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+	time_unit_divisor = 1 << value;
+
+	return 0;
+}
+
+static int __init intel_rapl_init(void)
+{
+	enum rapl_domain_id id;
+
+	/*
+	 * RAPL features are only supported on processors have a CPUID
+	 * signature with DisplayFamily_DisplayModel of 06_2AH, 06_2DH
+	 */
+	if (boot_cpu_data.x86 != 0x06)
+		return -ENODEV;
+
+	if (boot_cpu_data.x86_model == 0x2A)
+		rapl_domains[RAPL_DOMAIN_PP1].valid = 1;
+	else if (boot_cpu_data.x86_model == 0x2D)
+		rapl_domains[RAPL_DOMAIN_DRAM].valid = 1;
+	else
+		return -ENODEV;
+
+	if (rapl_check_unit())
+		return -ENODEV;
+
+	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
+		if (rapl_domains[id].valid)
+			perf_pmu_register(&(rapl_domains[id].pmu), rapl_domains[id].pmu.name, PERF_TYPE_SOFTWARE);
+	return 0;
+}
+
+static void __exit intel_rapl_exit(void)
+{
+	enum rapl_domain_id id;
+
+	for(id = 0; id < RAPL_DOMAIN_MAX; id++)
+		if (rapl_domains[id].valid)
+			perf_pmu_unregister(&(rapl_domains[id].pmu));
+}
+
+module_init(intel_rapl_init);
+module_exit(intel_rapl_exit);

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2011-06-02  8:04 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-26  8:34 [PATCH 2/3] introduce intel_rapl driver Zhang Rui
2011-05-26  9:43 ` Peter Zijlstra
2011-05-26  9:43 ` Peter Zijlstra
2011-05-26 10:21   ` Peter Zijlstra
2011-05-26 10:21   ` Peter Zijlstra
2011-05-26 10:55   ` Matt Fleming
2011-06-02  8:04     ` Matt Fleming
2011-06-02  8:04       ` Matt Fleming
2011-05-26 10:55   ` Matt Fleming
2011-05-27  8:26   ` Zhang Rui
2011-05-27  8:26   ` Zhang Rui
2011-05-27 19:56     ` Peter Zijlstra
2011-05-27 19:56     ` Peter Zijlstra
2011-05-27 19:56     ` Peter Zijlstra
2011-05-30  3:11       ` Zhang Rui
2011-05-30  3:11       ` Zhang Rui
2011-05-27 19:56     ` Peter Zijlstra
2011-05-26 15:48 ` Randy Dunlap
2011-05-30  2:40   ` Zhang Rui
2011-05-30  2:40   ` Zhang Rui
2011-05-26 15:48 ` Randy Dunlap
2011-05-28 10:17 ` Greg KH
2011-05-30  7:04   ` Zhang Rui
2011-05-30  7:04   ` Zhang Rui
2011-05-28 10:17 ` Greg KH
2011-05-26  8:34 Zhang Rui

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.