[RFC][PATCH] perf: Implement read_group() PMU operation

* [RFC][PATCH] perf: Implement read_group() PMU operation
@ 2015-02-06  2:59 Sukadev Bhattiprolu
  2015-02-12 15:58 ` Peter Zijlstra
  2015-02-22 21:04 ` Cody P Schafer
  0 siblings, 2 replies; 5+ messages in thread
From: Sukadev Bhattiprolu @ 2015-02-06  2:59 UTC (permalink / raw)
  To: Peter Zijlstra, mingo, Michael Ellerman, Anton Blanchard,
	Stephane Eranian
  Cc: Jiri Olsa, Arnaldo Carvalho de Melo, linux-kernel

From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu Feb  5 20:56:20 EST 2015 -0300
Subject: [RFC][PATCH] perf: Implement read_group() PMU operation

This is a lightly tested, exploratory patch to allow PMUs to return
several counters at once. Appreciate any comments :-)

Unlike normal hardware PMCs, the 24x7 counters[1] in Power8 are stored
in memory and accessed via a hypervisor call (HCALL).  A major aspect
of the HCALL is that it allows retrieving _SEVERAL_ counters at once
(unlike regular PMCs, which are read one at a time).

This patch implements a ->read_group() PMU operation that tries to
take advantage of this ability to read several counters at once.  A
PMU that implements the ->read_group() operation would allow users
to retrieve several counters at once and get a more consistent
snapshot.

NOTE: 	This patch has a TODO in h_24x7_event_read_group() in that it
	still does multiple HCALLS. I think that can be optimized 
	independently, once the pmu->read_group() interface itself is
	finalized.

Appreciate comments on the ->read_group interface and on how best to manage
the interfaces between the core and PMU layers - e.g., is it OK for the
hv-24x7 PMU to walk the ->sibling_list?

[1] Some notes about 24x7 counters:

	Power8 supports 24x7 counters, which differ from traditional PMCs
	in several ways:

	- The 24x7 counters are always on and counting. Rather than
	  start/stop the PMCs, we read/report the _change_ in values
	  in the counters during the execution of the workload.

	- The 24x7 counters are not tied to a task context (they are
	  always on).

	- Rather than reading the event counts from registers, we make
	  a hypervisor call (HCALL) to retrieve counts. The HCALL allows
	  retrieving a large number of counters in a single call.

	- These counters don't generate interrupts when they overflow (so
	  sampling does not apply to these counters).
---

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1d36314..b69fbdf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -232,6 +232,13 @@ struct pmu {
 	void (*read)			(struct perf_event *event);
 
 	/*
+	 * Read a group of counters.
+	 */
+	int (*read_group)		(struct perf_event *event,
+						u64 *values,
+						int ncounters);
+
+	/*
 	 * Group events scheduling is treated as a transaction, add
 	 * group events as a whole and perform one schedulability test.
 	 * If the test fails, roll back the whole group
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 934687f..026a9d0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3549,10 +3549,43 @@ static int perf_event_read_group(struct perf_event *event,
 	struct perf_event *leader = event->group_leader, *sub;
 	int n = 0, size = 0, ret = -EFAULT;
 	struct perf_event_context *ctx = leader->ctx;
+	u64 *valuesp;
 	u64 values[5];
+	int use_group_read;
 	u64 count, enabled, running;
+	struct pmu *pmu = event->pmu;
+
+	/*
+	 * If PMU supports group read and group read is requested,
+	 * allocate memory before taking the mutex.
+	 */
+	use_group_read = 0;
+	if ((read_format & PERF_FORMAT_GROUP) && pmu->read_group) {
+		use_group_read++;
+	}
+
+	if (use_group_read) {
+		valuesp = kzalloc(leader->nr_siblings * sizeof(u64), GFP_KERNEL);
+		if (!valuesp)
+			return -ENOMEM;
+	}
 
 	mutex_lock(&ctx->mutex);
+
+	if (use_group_read) {
+		ret = pmu->read_group(leader, valuesp, leader->nr_siblings);
+		if (ret >= 0) {
+			size = ret * sizeof(u64);
+
+			ret = size;
+			if (copy_to_user(buf, valuesp, size))
+				ret = -EFAULT;
+		}
+
+		kfree(valuesp);
+		goto unlock;
+	}
+
 	count = perf_event_read_value(leader, &enabled, &running);
 
 	values[n++] = 1 + leader->nr_siblings;
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9445a82..cd48cf0 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -1071,12 +1071,33 @@ static int h_24x7_event_init(struct perf_event *event)
 	struct hv_perf_caps caps;
 	unsigned domain;
 	unsigned long hret;
+	u64 read_format, inv_flags;
 	u64 ct;
 
 	/* Not our event */
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;
 
+	/*
+	 * We don't support enabled/running times with PERF_FORMAT_GROUP.
+	 * The ->read_group() operation is intended to be used in continuous
+	 * monitoring mode, so these time values are not important at least
+	 * for now.
+	 *
+	 * Not sure if the PERF_FORMAT_ID is useful. Block it for now.
+	 */
+	read_format = event->attr.read_format;
+	inv_flags = PERF_FORMAT_TOTAL_TIME_ENABLED;
+	inv_flags |= PERF_FORMAT_TOTAL_TIME_RUNNING;
+	inv_flags |= PERF_FORMAT_ID;
+
+	if ((read_format & PERF_FORMAT_GROUP) && (read_format & inv_flags)) {
+		pr_devel("%s(): Invalid flags: rf 0x%llx, invf 0x%llx\n",
+				__func__, (unsigned long long)read_format,
+				(unsigned long long)inv_flags);
+		return -EINVAL;
+	}
+
 	/* Unused areas must be 0 */
 	if (event_get_reserved1(event) ||
 	    event_get_reserved2(event) ||
@@ -1181,6 +1202,50 @@ static int h_24x7_event_add(struct perf_event *event, int flags)
 	return 0;
 }
 
+static int h_24x7_event_read_group(struct perf_event *leader, u64 *values,
+				int ncounters)
+{
+	struct perf_event *sub;
+	int n = 0;	/* number of counts stored in values[] so far */
+	/* Store the counts of the group's active events in values[]. */
+	BUG_ON(!(leader->attr.read_format & PERF_FORMAT_GROUP));
+
+	/*
+	 * sys_perf_event_open() for now prevents inheritance with
+	 * PERF_FORMAT_GROUP. Ensure that hasn't changed.
+	 */
+	BUG_ON(!list_empty(&leader->child_list));
+	/* NOTE(review): leader + siblings may store nr_siblings + 1 values - confirm this bound. */
+	if (ncounters < leader->nr_siblings) {
+		pr_devel("%s(): Insufficient buffer : ns %d, nc %d\n",
+				__func__, leader->nr_siblings, ncounters);
+		return -EINVAL;
+	}
+
+	raw_spin_lock(&leader->ctx->lock);
+	/* The leader's count goes first, but only if the leader is active. */
+	if (leader->state == PERF_EVENT_STATE_ACTIVE) {
+		h_24x7_event_update(leader);
+		values[n++] = local64_read(&leader->count);
+	}
+
+	/*
+	 * TODO: For now, make one HCALL per event. We will soon retrieve
+	 * 	 several events with one HCALL.
+	 */
+	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+		if (sub->state != PERF_EVENT_STATE_ACTIVE)
+			continue;
+		/* Update the sibling, then append its count to values[]. */
+		h_24x7_event_update(sub);
+		values[n++] =  local64_read(&sub->count);
+	}
+
+	raw_spin_unlock(&leader->ctx->lock);
+	/* Number of counts stored; inactive events contribute nothing. */
+	return n;
+}
+
 static struct pmu h_24x7_pmu = {
 	.task_ctx_nr = perf_invalid_context,
 
@@ -1192,6 +1257,7 @@ static struct pmu h_24x7_pmu = {
 	.start       = h_24x7_event_start,
 	.stop        = h_24x7_event_stop,
 	.read        = h_24x7_event_update,
+	.read_group  = h_24x7_event_read_group,
 };
 
 static int hv_24x7_init(void)


^ permalink raw reply related	[flat|nested] 5+ messages in thread