* [PATCH 0/4] perf_counter: Group reads and other patches
@ 2009-08-13  9:47 Peter Zijlstra
  2009-08-13  9:47 ` [PATCH 1/4 -v2] perf: rework the whole read vs group stuff Peter Zijlstra
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Peter Zijlstra @ 2009-08-13  9:47 UTC
  To: Ingo Molnar, Paul Mackerras, stephane eranian
  Cc: Corey J Ashford, LKML, Peter Zijlstra

So Ingo really wanted to not break the current read() ABI, which is possible
with a little bit more code. Hence, here is a second version.

The test proglet below gives:

# gcc -o test test.c; ./test
EVNT: 0x400851 scale: nan ID: 646 CNT: 1006656 ID: 647 CNT: 1011020 ID: 648 CNT: 1011120 ID: 649 CNT: 1011079
EVNT: 0x40084b scale: 1.000000 ID: 646 CNT: 2002513 ID: 647 CNT: 2009368 ID: 648 CNT: 2009756 ID: 649 CNT: 2010162
EVNT: 0x40084b scale: 1.000000 ID: 646 CNT: 3002611 ID: 647 CNT: 3013444 ID: 648 CNT: 3014276 ID: 649 CNT: 3015129
EVNT: 0x400858 scale: 1.000000 ID: 646 CNT: 4002528 ID: 647 CNT: 4017221 ID: 648 CNT: 4018497 ID: 649 CNT: 4019802
EVNT: 0x40084b scale: 1.000000 ID: 646 CNT: 5002324 ID: 647 CNT: 5020652 ID: 648 CNT: 5022372 ID: 649 CNT: 5024119
EVNT: 0x40084c scale: 1.000000 ID: 646 CNT: 6002555 ID: 647 CNT: 6024466 ID: 648 CNT: 6026635 ID: 649 CNT: 6028829

and the regular perf stuff also still works:

# perf stat sleep 1

 Performance counter stats for 'sleep 1':

       4.164737  task-clock-msecs         #      0.004 CPUs
              1  context-switches         #      0.000 M/sec
              0  CPU-migrations           #      0.000 M/sec
            186  page-faults              #      0.045 M/sec
        4109598  cycles                   #    986.761 M/sec
        2573031  instructions             #      0.626 IPC
        1268929  cache-references         #    304.684 M/sec
          13059  cache-misses             #      3.136 M/sec


---
#include "perf.h"

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>

void work(void)
{
	int i;

	for (i = 0; i < 1000000; i++) {
		asm("nop");
		asm("nop");
		asm("nop");
		asm("nop");
		asm("nop");
		asm("nop");
		asm("nop");
	}
}

unsigned long page_size;
int fd = -1, fd1 = 0;
pid_t me;
void *output;

void handle_sigio(int sig)
{
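	/* the counter hit its refresh limit; re-arm it for one more overflow */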
	ioctl(fd, PERF_COUNTER_IOC_REFRESH, 1);
}

static unsigned long mmap_read_head(void)
{
	struct perf_counter_mmap_page *pc = output;
	long head;

	head = pc->data_head;
	rmb();

	return head;
}

static void *mmap_read_base(void)
{
	return output + page_size;
}

struct event {
	struct perf_event_header header;

	u64 ip;
	u64 nr;
	u64 time_enabled;
	u64 time_running;
	struct {
		u64 val;
		u64 id;
	} cnt[0];
};

int main(int argc, char **argv)
{
	struct perf_counter_attr attr;
	unsigned long offset = 0, head;
	int err, i;

	page_size = sysconf(_SC_PAGE_SIZE);
	me = getpid();

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 1000000;
	attr.sample_type = PERF_SAMPLE_IP |
			   PERF_SAMPLE_READ;
	attr.read_format = PERF_FORMAT_TOTAL_TIME_RUNNING |
			   PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_ID |
			   PERF_FORMAT_GROUP;
	attr.disabled = 1;
	attr.wakeup_events = 1;

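	/* fd is still -1 here, so this creates a new group; fd becomes the leader */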
	fd  = sys_perf_counter_open(&attr, me, -1, fd, 0);
	if (fd <= 0) {
		perror("FAIL fd: ");
		exit(-1);
	}

	attr.sample_period = 0;
	attr.disabled = 0;

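	/* attach three sibling counters to the group led by fd */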
	for (i = 0; i < 3; i++) {
		fd1 = sys_perf_counter_open(&attr, me, -1, fd, 0);
		if (fd1 <= 0) {
			perror("FAIL fd1: ");
			exit(-1);
		}
	}

	signal(SIGIO, handle_sigio);
	err = fcntl(fd, F_SETOWN, me);
	if (err == -1) {
		perror("FAIL fcntl: ");
		exit(-1);
	}

	err = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_ASYNC);
	if (err == -1) {
		perror("FAIL fcntl2: ");
		exit(-1);
	}

	output = mmap(NULL, page_size * 3, PROT_READ, MAP_SHARED, fd, 0);
	if (output == ((void *)-1)) {
		perror("FAIL mmap:");
		exit(-1);
	}

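	/* enable the leader and allow one overflow; the SIGIO handler re-arms it */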
	ioctl(fd, PERF_COUNTER_IOC_REFRESH, 1);

	work();

	ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);

	head = mmap_read_head();

	for (; offset < head; ) {
		struct event *evnt = mmap_read_base() + offset;

		offset += evnt->header.size;

		printf("EVNT: %p scale: %f ", (void *)evnt->ip,
				((double)evnt->time_running)/evnt->time_enabled
				);
		for (i = 0; i < evnt->nr; i++) {
			printf("ID: %Lu CNT: %Lu ", 
					evnt->cnt[i].id, evnt->cnt[i].val);
		}
		printf("\n");
	}

	return 0;
}



* [PATCH 1/4 -v2] perf: rework the whole read vs group stuff
  2009-08-13  9:47 [PATCH 0/4] perf_counter: Group reads and other patches Peter Zijlstra
@ 2009-08-13  9:47 ` Peter Zijlstra
  2009-08-13 11:00   ` [tip:perfcounters/urgent] perf: Rework/fix " tip-bot for Peter Zijlstra
  2009-08-21 14:47   ` [PATCH 1/4 -v2] perf: rework " stephane eranian
  2009-08-13  9:47 ` [PATCH 2/4] perf_counter: Fix an ipi-deadlock Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 12+ messages in thread
From: Peter Zijlstra @ 2009-08-13  9:47 UTC
  To: Ingo Molnar, Paul Mackerras, stephane eranian
  Cc: Corey J Ashford, LKML, Peter Zijlstra

[-- Attachment #1: perf-PERF_SAMPLE_READ.patch --]
[-- Type: text/plain, Size: 13528 bytes --]

Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce
PERF_FORMAT_GROUP to deal with group reads in a more generic way.

This allows you to get group reads out of read() as well.
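
For illustration (a sketch, not part of this patch): with read_format =
PERF_FORMAT_GROUP | PERF_FORMAT_ID plus both TOTAL_TIME flags, a single
read() on any group member could be decoded along these lines (buffer
sized here for up to 16 counters):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

typedef uint64_t u64;

static int read_group_counts(int group_fd)
{
	/* nr, time_enabled, time_running, then {value, id} per counter */
	u64 buf[3 + 16 * 2];
	u64 *p = buf;
	u64 nr, enabled, running, i;

	if (read(group_fd, buf, sizeof(buf)) < 0)
		return -1;

	nr      = *p++;
	enabled = *p++;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	running = *p++;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */

	for (i = 0; i < nr; i++) {
		u64 value = *p++;
		u64 id    = *p++;	/* PERF_FORMAT_ID */

		printf("id %llu: %llu (enabled %llu running %llu)\n",
		       (unsigned long long)id, (unsigned long long)value,
		       (unsigned long long)enabled,
		       (unsigned long long)running);
	}

	return 0;
}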

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/perf_counter.h |   47 +++++--
 kernel/perf_counter.c        |  274 +++++++++++++++++++++++++++++++------------
 2 files changed, 238 insertions(+), 83 deletions(-)

Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -115,7 +115,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_TID				= 1U << 1,
 	PERF_SAMPLE_TIME			= 1U << 2,
 	PERF_SAMPLE_ADDR			= 1U << 3,
-	PERF_SAMPLE_GROUP			= 1U << 4,
+	PERF_SAMPLE_READ			= 1U << 4,
 	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
 	PERF_SAMPLE_ID				= 1U << 6,
 	PERF_SAMPLE_CPU				= 1U << 7,
@@ -127,16 +127,32 @@ enum perf_counter_sample_format {
 };
 
 /*
- * Bits that can be set in attr.read_format to request that
- * reads on the counter should return the indicated quantities,
- * in increasing order of bit value, after the counter value.
+ * The format of the data returned by read() on a perf counter fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ * 	{ u64		value;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		id;           } && PERF_FORMAT_ID
+ * 	} && !PERF_FORMAT_GROUP
+ *
+ * 	{ u64		nr;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		value;
+ * 	    { u64	id;           } && PERF_FORMAT_ID
+ * 	  }		cntr[nr];
+ * 	} && PERF_FORMAT_GROUP
+ * };
  */
 enum perf_counter_read_format {
 	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
 	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
 	PERF_FORMAT_ID				= 1U << 2,
+	PERF_FORMAT_GROUP			= 1U << 3,
 
-	PERF_FORMAT_MAX = 1U << 3, 		/* non-ABI */
+	PERF_FORMAT_MAX = 1U << 4, 		/* non-ABI */
 };
 
 #define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
@@ -343,10 +359,8 @@ enum perf_event_type {
 	 * struct {
 	 * 	struct perf_event_header	header;
 	 * 	u32				pid, tid;
-	 * 	u64				value;
-	 * 	{ u64		time_enabled; 	} && PERF_FORMAT_ENABLED
-	 * 	{ u64		time_running; 	} && PERF_FORMAT_RUNNING
-	 * 	{ u64		parent_id;	} && PERF_FORMAT_ID
+	 *
+	 * 	struct read_format		values;
 	 * };
 	 */
 	PERF_EVENT_READ			= 8,
@@ -364,11 +378,22 @@ enum perf_event_type {
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
 	 *
-	 *	{ u64			nr;
-	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
+	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
 	 *
 	 *	{ u64			nr,
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *
+	 * 	#
+	 * 	# The RAW record below is opaque data wrt the ABI
+	 * 	#
+	 * 	# That is, the ABI doesn't make any promises wrt to
+	 * 	# the stability of its content, it may vary depending
+	 * 	# on event, hardware, kernel version and phase of
+	 * 	# the moon.
+	 * 	#
+	 * 	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 * 	#
+	 *
 	 *	{ u32			size;
 	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
 	 * };
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1692,7 +1692,32 @@ static int perf_release(struct inode *in
 	return 0;
 }
 
-static u64 perf_counter_read_tree(struct perf_counter *counter)
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += counter->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+
+	return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
 {
 	struct perf_counter *child;
 	u64 total = 0;
@@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct
 	return total;
 }
 
+static int perf_counter_read_entry(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	int n = 0, count = 0;
+	u64 values[2];
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	int n = 0, size = 0, err = -EFAULT;
+	u64 values[3];
+
+	values[n++] = 1 + leader->nr_siblings;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = leader->total_time_enabled +
+			atomic64_read(&leader->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = leader->total_time_running +
+			atomic64_read(&leader->child_total_time_running);
+	}
+
+	size = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, size))
+		return -EFAULT;
+
+	err = perf_counter_read_entry(leader, read_format, buf + size);
+	if (err < 0)
+		return err;
+
+	size += err;
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		err = perf_counter_read_entry(counter, read_format,
+				buf + size);
+		if (err < 0)
+			return err;
+
+		size += err;
+	}
+
+	return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+				 u64 read_format, char __user *buf)
+{
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	if (copy_to_user(buf, values, n * sizeof(u64)))
+		return -EFAULT;
+
+	return n * sizeof(u64);
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[4];
-	int n;
+	u64 read_format = counter->attr.read_format;
+	int ret;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counte
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	if (count < perf_counter_read_size(counter))
+		return -ENOSPC;
+
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
-	values[0] = perf_counter_read_tree(counter);
-	n = 1;
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
+	if (read_format & PERF_FORMAT_GROUP)
+		ret = perf_counter_read_group(counter, read_format, buf);
+	else
+		ret = perf_counter_read_one(counter, read_format, buf);
 	mutex_unlock(&counter->child_mutex);
 
-	if (count < n * sizeof(u64))
-		return -EINVAL;
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
+	return ret;
 }
 
 static ssize_t
@@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_
 	return task_pid_nr_ns(p, counter->ns);
 }
 
+static void perf_output_read_one(struct perf_output_handle *handle,
+				 struct perf_counter *counter)
+{
+	u64 read_format = counter->attr.read_format;
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = atomic64_read(&counter->count);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+			    struct perf_counter *counter)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	u64 read_format = counter->attr.read_format;
+	u64 values[5];
+	int n = 0;
+
+	values[n++] = 1 + leader->nr_siblings;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = leader->total_time_enabled;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = leader->total_time_running;
+
+	if (leader != counter)
+		leader->pmu->read(leader);
+
+	values[n++] = atomic64_read(&leader->count);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(leader);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		n = 0;
+
+		if (sub != counter)
+			sub->pmu->read(sub);
+
+		values[n++] = atomic64_read(&sub->count);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_counter_id(sub);
+
+		perf_output_copy(handle, values, n * sizeof(u64));
+	}
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+			     struct perf_counter *counter)
+{
+	if (counter->attr.read_format & PERF_FORMAT_GROUP)
+		perf_output_read_group(handle, counter);
+	else
+		perf_output_read_one(handle, counter);
+}
+
 void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
@@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_cou
 	struct {
 		u32 pid, tid;
 	} tid_entry;
-	struct {
-		u64 id;
-		u64 counter;
-	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
@@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_cou
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		header.size += sizeof(u64);
 
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		header.size += sizeof(u64) +
-			counter->nr_siblings * sizeof(group_entry);
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		header.size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		callchain = perf_callchain(data->regs);
@@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_cou
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		perf_output_put(&handle, data->period);
 
-	/*
-	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
-	 */
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		struct perf_counter *leader, *sub;
-		u64 nr = counter->nr_siblings;
-
-		perf_output_put(&handle, nr);
-
-		leader = counter->group_leader;
-		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-			if (sub != counter)
-				sub->pmu->read(sub);
-
-			group_entry.id = primary_counter_id(sub);
-			group_entry.counter = atomic64_read(&sub->count);
-
-			perf_output_put(&handle, group_entry);
-		}
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(&handle, counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		if (callchain)
@@ -2818,8 +2964,6 @@ struct perf_read_event {
 
 	u32				pid;
 	u32				tid;
-	u64				value;
-	u64				format[3];
 };
 
 static void
@@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_coun
 		.header = {
 			.type = PERF_EVENT_READ,
 			.misc = 0,
-			.size = sizeof(event) - sizeof(event.format),
+			.size = sizeof(event) + perf_counter_read_size(counter),
 		},
 		.pid = perf_counter_pid(counter, task),
 		.tid = perf_counter_tid(counter, task),
-		.value = atomic64_read(&counter->count),
 	};
-	int ret, i = 0;
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_enabled;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_running;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_ID) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = primary_counter_id(counter);
-	}
+	int ret;
 
 	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
 	if (ret)
 		return;
 
-	perf_output_copy(&handle, &event, event.header.size);
+	perf_output_put(&handle, event);
+	perf_output_read(&handle, counter);
+
 	perf_output_end(&handle);
 }
 
@@ -3929,9 +4059,9 @@ perf_counter_alloc(struct perf_counter_a
 	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
-	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+	 * we currently do not support PERF_FORMAT_GROUP on inherited counters
 	 */
-	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto done;
 
 	switch (attr->type) {

-- 



* [PATCH 2/4] perf_counter: Fix an ipi-deadlock
  2009-08-13  9:47 [PATCH 0/4] perf_counter: Group reads and other patches Peter Zijlstra
  2009-08-13  9:47 ` [PATCH 1/4 -v2] perf: rework the whole read vs group stuff Peter Zijlstra
@ 2009-08-13  9:47 ` Peter Zijlstra
  2009-08-13 11:00   ` [tip:perfcounters/urgent] " tip-bot for Peter Zijlstra
  2009-08-13  9:47 ` [PATCH 3/4] perf tools: Add some comments to the event definitions Peter Zijlstra
  2009-08-13  9:47 ` [PATCH 4/4] perf_counter: Fix swcounter context invariance Peter Zijlstra
  3 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2009-08-13  9:47 UTC
  To: Ingo Molnar, Paul Mackerras, stephane eranian
  Cc: Corey J Ashford, LKML, Peter Zijlstra

[-- Attachment #1: perf-fix-pending.patch --]
[-- Type: text/plain, Size: 1369 bytes --]

perf_pending_counter() is called from IRQ context and will call
perf_counter_disable(); however, perf_counter_disable() uses
smp_call_function_single(), which doesn't fancy being used with IRQs
disabled due to IPI deadlocks.

Fix this by making it use the local __perf_counter_disable() call and
teaching the counter_sched_out() code about pending disables as well.

This should cover the case where a counter migrates before the pending
queue gets processed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *c
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 	counter->tstamp_stopped = ctx->time;
 	counter->pmu->disable(counter);
 	counter->oncpu = -1;
@@ -2315,7 +2319,7 @@ static void perf_pending_counter(struct 
 
 	if (counter->pending_disable) {
 		counter->pending_disable = 0;
-		perf_counter_disable(counter);
+		__perf_counter_disable(counter);
 	}
 
 	if (counter->pending_wakeup) {

-- 



* [PATCH 3/4] perf tools: Add some comments to the event definitions
  2009-08-13  9:47 [PATCH 0/4] perf_counter: Group reads and other patches Peter Zijlstra
  2009-08-13  9:47 ` [PATCH 1/4 -v2] perf: rework the whole read vs group stuff Peter Zijlstra
  2009-08-13  9:47 ` [PATCH 2/4] perf_counter: Fix an ipi-deadlock Peter Zijlstra
@ 2009-08-13  9:47 ` Peter Zijlstra
  2009-08-13 11:01   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
  2009-08-15 10:07   ` tip-bot for Peter Zijlstra
  2009-08-13  9:47 ` [PATCH 4/4] perf_counter: Fix swcounter context invariance Peter Zijlstra
  3 siblings, 2 replies; 12+ messages in thread
From: Peter Zijlstra @ 2009-08-13  9:47 UTC
  To: Ingo Molnar, Paul Mackerras, stephane eranian
  Cc: Corey J Ashford, LKML, Peter Zijlstra

[-- Attachment #1: perf-tools-fix-read.patch --]
[-- Type: text/plain, Size: 814 bytes --]

Just to make it clear that these are _not_ generic event structures
but do rely on the counter configuration.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 tools/perf/util/event.h |    6 ++++++
 1 file changed, 6 insertions(+)

Index: linux-2.6/tools/perf/util/event.h
===================================================================
--- linux-2.6.orig/tools/perf/util/event.h
+++ linux-2.6/tools/perf/util/event.h
@@ -4,6 +4,9 @@
 #include "util.h"
 #include <linux/list.h>
 
+/*
+ * PERF_SAMPLE_IP | PERF_SAMPLE_TID | *
+ */
 struct ip_event {
 	struct perf_event_header header;
 	u64 ip;
@@ -38,6 +41,9 @@ struct lost_event {
 	u64 lost;
 };
 
+/*
+ * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID
+ */
 struct read_event {
 	struct perf_event_header header;
 	u32 pid,tid;

-- 



* [PATCH 4/4] perf_counter: Fix swcounter context invariance
  2009-08-13  9:47 [PATCH 0/4] perf_counter: Group reads and other patches Peter Zijlstra
                   ` (2 preceding siblings ...)
  2009-08-13  9:47 ` [PATCH 3/4] perf tools: Add some comments to the event definitions Peter Zijlstra
@ 2009-08-13  9:47 ` Peter Zijlstra
  3 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2009-08-13  9:47 UTC
  To: Ingo Molnar, Paul Mackerras, stephane eranian
  Cc: Corey J Ashford, LKML, Peter Zijlstra

[-- Attachment #1: perf-fix-swcounter-is-counting.patch --]
[-- Type: text/plain, Size: 2582 bytes --]

perf_swcounter_is_counting() uses a lock, which means we cannot use
swcounters from NMI context or while holding that particular lock;
this is unintended.

The patch below removes the lock. This opens up a race window, but no
worse than the one swcounters already experience due to the RCU
traversal of the context in perf_swcounter_ctx_event().

Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c |   44 ++++++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -3519,40 +3519,32 @@ static void perf_swcounter_add(struct pe
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
 {
-	struct perf_counter_context *ctx;
-	unsigned long flags;
-	int count;
-
+	/*
+	 * The counter is active, we're good!
+	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		return 1;
 
+	/*
+	 * The counter is off/error, not counting.
+	 */
 	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
 		return 0;
 
 	/*
-	 * If the counter is inactive, it could be just because
-	 * its task is scheduled out, or because it's in a group
-	 * which could not go on the PMU.  We want to count in
-	 * the first case but not the second.  If the context is
-	 * currently active then an inactive software counter must
-	 * be the second case.  If it's not currently active then
-	 * we need to know whether the counter was active when the
-	 * context was last active, which we can determine by
-	 * comparing counter->tstamp_stopped with ctx->time.
-	 *
-	 * We are within an RCU read-side critical section,
-	 * which protects the existence of *ctx.
+	 * The counter is inactive, if the context is active
+	 * we're part of a group that didn't make it on the 'pmu',
+	 * not counting.
 	 */
-	ctx = counter->ctx;
-	spin_lock_irqsave(&ctx->lock, flags);
-	count = 1;
-	/* Re-check state now we have the lock */
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-	    counter->ctx->is_active ||
-	    counter->tstamp_stopped < ctx->time)
-		count = 0;
-	spin_unlock_irqrestore(&ctx->lock, flags);
-	return count;
+	if (counter->ctx->is_active)
+		return 0;
+
+	/*
+	 * We're inactive and the context is too, this means the
+	 * task is scheduled out, we're counting events that happen
+	 * to us, like migration events.
+	 */
+	return 1;
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,

-- 



* [tip:perfcounters/urgent] perf: Rework/fix the whole read vs group stuff
  2009-08-13  9:47 ` [PATCH 1/4 -v2] perf: rework the whole read vs group stuff Peter Zijlstra
@ 2009-08-13 11:00   ` tip-bot for Peter Zijlstra
  2009-08-21 14:47   ` [PATCH 1/4 -v2] perf: rework " stephane eranian
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-13 11:00 UTC
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, eranian, cjashfor,
	a.p.zijlstra, tglx, mingo

Commit-ID:  3dab77fb1bf89664bb1c9544607159dcab6f7d57
Gitweb:     http://git.kernel.org/tip/3dab77fb1bf89664bb1c9544607159dcab6f7d57
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 13 Aug 2009 11:47:53 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 13 Aug 2009 12:58:04 +0200

perf: Rework/fix the whole read vs group stuff

Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce
PERF_FORMAT_GROUP to deal with group reads in a more generic
way.

This allows you to get group reads out of read() as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.117411814@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 include/linux/perf_counter.h |   47 ++++++--
 kernel/perf_counter.c        |  274 +++++++++++++++++++++++++++++++-----------
 2 files changed, 238 insertions(+), 83 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b36afe..b53f700 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -115,7 +115,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_TID				= 1U << 1,
 	PERF_SAMPLE_TIME			= 1U << 2,
 	PERF_SAMPLE_ADDR			= 1U << 3,
-	PERF_SAMPLE_GROUP			= 1U << 4,
+	PERF_SAMPLE_READ			= 1U << 4,
 	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
 	PERF_SAMPLE_ID				= 1U << 6,
 	PERF_SAMPLE_CPU				= 1U << 7,
@@ -127,16 +127,32 @@ enum perf_counter_sample_format {
 };
 
 /*
- * Bits that can be set in attr.read_format to request that
- * reads on the counter should return the indicated quantities,
- * in increasing order of bit value, after the counter value.
+ * The format of the data returned by read() on a perf counter fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ * 	{ u64		value;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		id;           } && PERF_FORMAT_ID
+ * 	} && !PERF_FORMAT_GROUP
+ *
+ * 	{ u64		nr;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		value;
+ * 	    { u64	id;           } && PERF_FORMAT_ID
+ * 	  }		cntr[nr];
+ * 	} && PERF_FORMAT_GROUP
+ * };
  */
 enum perf_counter_read_format {
 	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
 	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
 	PERF_FORMAT_ID				= 1U << 2,
+	PERF_FORMAT_GROUP			= 1U << 3,
 
-	PERF_FORMAT_MAX = 1U << 3, 		/* non-ABI */
+	PERF_FORMAT_MAX = 1U << 4, 		/* non-ABI */
 };
 
 #define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
@@ -343,10 +359,8 @@ enum perf_event_type {
 	 * struct {
 	 * 	struct perf_event_header	header;
 	 * 	u32				pid, tid;
-	 * 	u64				value;
-	 * 	{ u64		time_enabled; 	} && PERF_FORMAT_ENABLED
-	 * 	{ u64		time_running; 	} && PERF_FORMAT_RUNNING
-	 * 	{ u64		parent_id;	} && PERF_FORMAT_ID
+	 *
+	 * 	struct read_format		values;
 	 * };
 	 */
 	PERF_EVENT_READ			= 8,
@@ -364,11 +378,22 @@ enum perf_event_type {
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
 	 *
-	 *	{ u64			nr;
-	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
+	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
 	 *
 	 *	{ u64			nr,
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *
+	 * 	#
+	 * 	# The RAW record below is opaque data wrt the ABI
+	 * 	#
+	 * 	# That is, the ABI doesn't make any promises wrt to
+	 * 	# the stability of its content, it may vary depending
+	 * 	# on event, hardware, kernel version and phase of
+	 * 	# the moon.
+	 * 	#
+	 * 	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 * 	#
+	 *
 	 *	{ u32			size;
 	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
 	 * };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3dd4339..b8c6b97 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static u64 perf_counter_read_tree(struct perf_counter *counter)
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += counter->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+
+	return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
 {
 	struct perf_counter *child;
 	u64 total = 0;
@@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter)
 	return total;
 }
 
+static int perf_counter_read_entry(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	int n = 0, count = 0;
+	u64 values[2];
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	int n = 0, size = 0, err = -EFAULT;
+	u64 values[3];
+
+	values[n++] = 1 + leader->nr_siblings;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = leader->total_time_enabled +
+			atomic64_read(&leader->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = leader->total_time_running +
+			atomic64_read(&leader->child_total_time_running);
+	}
+
+	size = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, size))
+		return -EFAULT;
+
+	err = perf_counter_read_entry(leader, read_format, buf + size);
+	if (err < 0)
+		return err;
+
+	size += err;
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		err = perf_counter_read_entry(counter, read_format,
+				buf + size);
+		if (err < 0)
+			return err;
+
+		size += err;
+	}
+
+	return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+				 u64 read_format, char __user *buf)
+{
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	if (copy_to_user(buf, values, n * sizeof(u64)))
+		return -EFAULT;
+
+	return n * sizeof(u64);
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[4];
-	int n;
+	u64 read_format = counter->attr.read_format;
+	int ret;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	if (count < perf_counter_read_size(counter))
+		return -ENOSPC;
+
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
-	values[0] = perf_counter_read_tree(counter);
-	n = 1;
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
+	if (read_format & PERF_FORMAT_GROUP)
+		ret = perf_counter_read_group(counter, read_format, buf);
+	else
+		ret = perf_counter_read_one(counter, read_format, buf);
 	mutex_unlock(&counter->child_mutex);
 
-	if (count < n * sizeof(u64))
-		return -EINVAL;
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
+	return ret;
 }
 
 static ssize_t
@@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
+static void perf_output_read_one(struct perf_output_handle *handle,
+				 struct perf_counter *counter)
+{
+	u64 read_format = counter->attr.read_format;
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = atomic64_read(&counter->count);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+			    struct perf_counter *counter)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	u64 read_format = counter->attr.read_format;
+	u64 values[5];
+	int n = 0;
+
+	values[n++] = 1 + leader->nr_siblings;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = leader->total_time_enabled;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = leader->total_time_running;
+
+	if (leader != counter)
+		leader->pmu->read(leader);
+
+	values[n++] = atomic64_read(&leader->count);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(leader);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		n = 0;
+
+		if (sub != counter)
+			sub->pmu->read(sub);
+
+		values[n++] = atomic64_read(&sub->count);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_counter_id(sub);
+
+		perf_output_copy(handle, values, n * sizeof(u64));
+	}
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+			     struct perf_counter *counter)
+{
+	if (counter->attr.read_format & PERF_FORMAT_GROUP)
+		perf_output_read_group(handle, counter);
+	else
+		perf_output_read_one(handle, counter);
+}
+
 void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
@@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	struct {
 		u32 pid, tid;
 	} tid_entry;
-	struct {
-		u64 id;
-		u64 counter;
-	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
@@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		header.size += sizeof(u64);
 
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		header.size += sizeof(u64) +
-			counter->nr_siblings * sizeof(group_entry);
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		header.size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		callchain = perf_callchain(data->regs);
@@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		perf_output_put(&handle, data->period);
 
-	/*
-	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
-	 */
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		struct perf_counter *leader, *sub;
-		u64 nr = counter->nr_siblings;
-
-		perf_output_put(&handle, nr);
-
-		leader = counter->group_leader;
-		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-			if (sub != counter)
-				sub->pmu->read(sub);
-
-			group_entry.id = primary_counter_id(sub);
-			group_entry.counter = atomic64_read(&sub->count);
-
-			perf_output_put(&handle, group_entry);
-		}
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(&handle, counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		if (callchain)
@@ -2818,8 +2964,6 @@ struct perf_read_event {
 
 	u32				pid;
 	u32				tid;
-	u64				value;
-	u64				format[3];
 };
 
 static void
@@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter,
 		.header = {
 			.type = PERF_EVENT_READ,
 			.misc = 0,
-			.size = sizeof(event) - sizeof(event.format),
+			.size = sizeof(event) + perf_counter_read_size(counter),
 		},
 		.pid = perf_counter_pid(counter, task),
 		.tid = perf_counter_tid(counter, task),
-		.value = atomic64_read(&counter->count),
 	};
-	int ret, i = 0;
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_enabled;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_running;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_ID) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = primary_counter_id(counter);
-	}
+	int ret;
 
 	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
 	if (ret)
 		return;
 
-	perf_output_copy(&handle, &event, event.header.size);
+	perf_output_put(&handle, event);
+	perf_output_read(&handle, counter);
+
 	perf_output_end(&handle);
 }
 
@@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
-	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+	 * we currently do not support PERF_FORMAT_GROUP on inherited counters
 	 */
-	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto done;
 
 	switch (attr->type) {


* [tip:perfcounters/urgent] perf_counter: Fix an ipi-deadlock
  2009-08-13  9:47 ` [PATCH 2/4] perf_counter: Fix an ipi-deadlock Peter Zijlstra
@ 2009-08-13 11:00   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 12+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-13 11:00 UTC
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, eranian, cjashfor,
	a.p.zijlstra, tglx, mingo

Commit-ID:  970892a9031a5dc7217bd394fb9d89fa75a4a7bc
Gitweb:     http://git.kernel.org/tip/970892a9031a5dc7217bd394fb9d89fa75a4a7bc
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 13 Aug 2009 11:47:54 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 13 Aug 2009 12:58:05 +0200

perf_counter: Fix an ipi-deadlock

perf_pending_counter() is called from IRQ context and will call
perf_counter_disable(); however, perf_counter_disable() uses
smp_call_function_single(), which doesn't fancy being used with
IRQs disabled due to IPI deadlocks.

Fix this by making it use the local __perf_counter_disable()
call and teaching the counter_sched_out() code about pending
disables as well.

This should cover the case where a counter migrates before the
pending queue gets processed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.244097721@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 kernel/perf_counter.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b8c6b97..3f841be 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 	counter->tstamp_stopped = ctx->time;
 	counter->pmu->disable(counter);
 	counter->oncpu = -1;
@@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
 
 	if (counter->pending_disable) {
 		counter->pending_disable = 0;
-		perf_counter_disable(counter);
+		__perf_counter_disable(counter);
 	}
 
 	if (counter->pending_wakeup) {


* [tip:perfcounters/core] perf tools: Add some comments to the event definitions
  2009-08-13  9:47 ` [PATCH 3/4] perf tools: Add some comments to the event definitions Peter Zijlstra
@ 2009-08-13 11:01   ` tip-bot for Peter Zijlstra
  2009-08-15 10:07   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-13 11:01 UTC
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, eranian, cjashfor,
	a.p.zijlstra, tglx, mingo

Commit-ID:  2e80a32422078658c661e1e8b8973f9b8b7aa1a7
Gitweb:     http://git.kernel.org/tip/2e80a32422078658c661e1e8b8973f9b8b7aa1a7
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 13 Aug 2009 11:47:55 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 13 Aug 2009 12:59:01 +0200

perf tools: Add some comments to the event definitions

Just to make it clear that these are _not_ generic event
structures but do rely on the counter configuration.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.334194326@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 tools/perf/util/event.h |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index d26dc88..fa7c50b 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -4,6 +4,9 @@
 #include "util.h"
 #include <linux/list.h>
 
+/*
+ * PERF_SAMPLE_IP | PERF_SAMPLE_TID | *
+ */
 struct ip_event {
 	struct perf_event_header header;
 	u64 ip;
@@ -38,6 +41,9 @@ struct lost_event {
 	u64 lost;
 };
 
+/*
+ * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID
+ */
 struct read_event {
 	struct perf_event_header header;
 	u32 pid,tid;


* [tip:perfcounters/core] perf tools: Add some comments to the event definitions
  2009-08-13  9:47 ` [PATCH 3/4] perf tools: Add some comments to the event definitions Peter Zijlstra
  2009-08-13 11:01   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
@ 2009-08-15 10:07   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-15 10:07 UTC
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, eranian, cjashfor,
	a.p.zijlstra, tglx, mingo

Commit-ID:  18408ddc01136f505ae357c03f0d8e50b10e0db6
Gitweb:     http://git.kernel.org/tip/18408ddc01136f505ae357c03f0d8e50b10e0db6
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Thu, 13 Aug 2009 11:47:55 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Sat, 15 Aug 2009 12:00:09 +0200

perf tools: Add some comments to the event definitions

Just to make it clear that these are _not_ generic event
structures but do rely on the counter configuration.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.334194326@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 tools/perf/util/event.h |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index d26dc88..fa7c50b 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -4,6 +4,9 @@
 #include "util.h"
 #include <linux/list.h>
 
+/*
+ * PERF_SAMPLE_IP | PERF_SAMPLE_TID | *
+ */
 struct ip_event {
 	struct perf_event_header header;
 	u64 ip;
@@ -38,6 +41,9 @@ struct lost_event {
 	u64 lost;
 };
 
+/*
+ * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID
+ */
 struct read_event {
 	struct perf_event_header header;
 	u32 pid,tid;


* Re: [PATCH 1/4 -v2] perf: rework the whole read vs group stuff
  2009-08-13  9:47 ` [PATCH 1/4 -v2] perf: rework the whole read vs group stuff Peter Zijlstra
  2009-08-13 11:00   ` [tip:perfcounters/urgent] perf: Rework/fix " tip-bot for Peter Zijlstra
@ 2009-08-21 14:47   ` stephane eranian
  2009-08-21 15:19     ` Peter Zijlstra
  1 sibling, 1 reply; 12+ messages in thread
From: stephane eranian @ 2009-08-21 14:47 UTC
  To: Peter Zijlstra
  Cc: Ingo Molnar, Paul Mackerras, Corey J Ashford, LKML, perfmon2-devel

[-- Attachment #1: Type: text/plain; charset=UTF-8, Size: 21255 bytes --]

Hi,
Well, I tried that and it brings more complexity than needed, especially
with regards to extracting the ID for events when you're doing grouping.

To extract the ID, one has to read out a struct as follows:

 *	{ u64		nr;
 *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_RUNNING
 *	  { u64		value;
 *	    { u64	id;           } && PERF_FORMAT_ID
 *	  }		cntr[nr];
 *	} && PERF_FORMAT_GROUP

Supposedly, you should have to do this only once per group. Reading
this stuff using the group leader should yield the values of all the
other events. This is not what I have observed: all events report the
same ID as the leader. Not clear why.

As I suggested in a previous message, I don't think all of this is
necessary. If a tool was able to pass the ID to associate with an
event, then there would be no need for a read(). Furthermore, it would
make it easier to pick an ID which suits the tool's data structure.
For instance, if you create 4 events in a group, the IDs could be
0,1,2,3 and would most likely map to an index in the array used by the
tool to manage the perf_counter structures. That would also make it
easier in the critical path in the signal handler: no need for a
lookup table to map "random" IDs to IDs more relevant to the tool. The
ID does not need to be very wide; IDs are relevant only if one uses
group sampling, so the ID only needs to identify an event within a
group. One could use a reserved field in perf_counter_attr or add an
ioctl() to assign an ID.
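
Roughly, the one-time mapping described above would look like this
(a sketch, not from the thread; it assumes read_format is exactly
PERF_FORMAT_GROUP | PERF_FORMAT_ID and that entries come back in
creation order, leader first):

#include <stdint.h>
#include <unistd.h>

#define MAX_GROUP 4

typedef uint64_t u64;

static u64 kernel_id[MAX_GROUP];	/* kernel_id[i] is tool event i */

static int build_id_map(int leader_fd)
{
	u64 buf[1 + MAX_GROUP * 2];	/* nr, then {value, id} pairs */
	u64 nr, i;

	if (read(leader_fd, buf, sizeof(buf)) < 0)
		return -1;

	nr = buf[0];
	for (i = 0; i < nr && i < MAX_GROUP; i++)
		kernel_id[i] = buf[1 + 2 * i + 1];	/* the ID field */

	return 0;
}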

On Thu, Aug 13, 2009 at 11:47 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce
> PERF_FORMAT_GROUP to deal with group reads in a more generic way.
>
> This allows you to get group reads out of read() as well.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[rest of quoted patch trimmed]
n = 0;> +> +       values[n++] = atomic64_read(&counter->count);> +       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {> +               values[n++] = counter->total_time_enabled +> +                       atomic64_read(&counter->child_total_time_enabled);> +       }> +       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {> +               values[n++] = counter->total_time_running +> +                       atomic64_read(&counter->child_total_time_running);> +       }> +       if (read_format & PERF_FORMAT_ID)> +               values[n++] = primary_counter_id(counter);> +> +       perf_output_copy(handle, values, n * sizeof(u64));> +}> +> +/*> + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.> + */> +static void perf_output_read_group(struct perf_output_handle *handle,> +                           struct perf_counter *counter)> +{> +       struct perf_counter *leader = counter->group_leader, *sub;> +       u64 read_format = counter->attr.read_format;> +       u64 values[5];> +       int n = 0;> +> +       values[n++] = 1 + leader->nr_siblings;> +> +       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)> +               values[n++] = leader->total_time_enabled;> +> +       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)> +               values[n++] = leader->total_time_running;> +> +       if (leader != counter)> +               leader->pmu->read(leader);> +> +       values[n++] = atomic64_read(&leader->count);> +       if (read_format & PERF_FORMAT_ID)> +               values[n++] = primary_counter_id(leader);> +> +       perf_output_copy(handle, values, n * sizeof(u64));> +> +       list_for_each_entry(sub, &leader->sibling_list, list_entry) {> +               n = 0;> +> +               if (sub != counter)> +                       sub->pmu->read(sub);> +> +               values[n++] = atomic64_read(&sub->count);> +               if (read_format & PERF_FORMAT_ID)> +                       values[n++] = primary_counter_id(sub);> +> +               perf_output_copy(handle, values, n * sizeof(u64));> +       }> +}> +> +static void perf_output_read(struct perf_output_handle *handle,> +                            struct perf_counter *counter)> +{> +       if (counter->attr.read_format & PERF_FORMAT_GROUP)> +               perf_output_read_group(handle, counter);> +       else> +               perf_output_read_one(handle, counter);> +}> +>  void perf_counter_output(struct perf_counter *counter, int nmi,>                                struct perf_sample_data *data)>  {> @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_cou>        struct {>                u32 pid, tid;>        } tid_entry;> -       struct {> -               u64 id;> -               u64 counter;> -       } group_entry;>        struct perf_callchain_entry *callchain = NULL;>        int callchain_size = 0;>        u64 time;> @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_cou>        if (sample_type & PERF_SAMPLE_PERIOD)>                header.size += sizeof(u64);>> -       if (sample_type & PERF_SAMPLE_GROUP) {> -               header.size += sizeof(u64) +> -                       counter->nr_siblings * sizeof(group_entry);> -       }> +       if (sample_type & PERF_SAMPLE_READ)> +               header.size += perf_counter_read_size(counter);>>        if (sample_type & PERF_SAMPLE_CALLCHAIN) {>                callchain = perf_callchain(data->regs);> @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_cou>        if (sample_type & PERF_SAMPLE_PERIOD)>                
perf_output_put(&handle, data->period);>> -       /*> -        * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.> -        */> -       if (sample_type & PERF_SAMPLE_GROUP) {> -               struct perf_counter *leader, *sub;> -               u64 nr = counter->nr_siblings;> -> -               perf_output_put(&handle, nr);> -> -               leader = counter->group_leader;> -               list_for_each_entry(sub, &leader->sibling_list, list_entry) {> -                       if (sub != counter)> -                               sub->pmu->read(sub);> -> -                       group_entry.id = primary_counter_id(sub);> -                       group_entry.counter = atomic64_read(&sub->count);> -> -                       perf_output_put(&handle, group_entry);> -               }> -       }> +       if (sample_type & PERF_SAMPLE_READ)> +               perf_output_read(&handle, counter);>>        if (sample_type & PERF_SAMPLE_CALLCHAIN) {>                if (callchain)> @@ -2818,8 +2964,6 @@ struct perf_read_event {>>        u32                             pid;>        u32                             tid;> -       u64                             value;> -       u64                             format[3];>  };>>  static void> @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_coun>                .header = {>                        .type = PERF_EVENT_READ,>                        .misc = 0,> -                       .size = sizeof(event) - sizeof(event.format),> +                       .size = sizeof(event) + perf_counter_read_size(counter),>                },>                .pid = perf_counter_pid(counter, task),>                .tid = perf_counter_tid(counter, task),> -               .value = atomic64_read(&counter->count),>        };> -       int ret, i = 0;> -> -       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {> -               event.header.size += sizeof(u64);> -               event.format[i++] = counter->total_time_enabled;> -       }> -> -       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {> -               event.header.size += sizeof(u64);> -               event.format[i++] = counter->total_time_running;> -       }> -> -       if (counter->attr.read_format & PERF_FORMAT_ID) {> -               event.header.size += sizeof(u64);> -               event.format[i++] = primary_counter_id(counter);> -       }> +       int ret;>>        ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);>        if (ret)>                return;>> -       perf_output_copy(&handle, &event, event.header.size);> +       perf_output_put(&handle, event);> +       perf_output_read(&handle, counter);> +>        perf_output_end(&handle);>  }>> @@ -3929,9 +4059,9 @@ perf_counter_alloc(struct perf_counter_a>        atomic64_set(&hwc->period_left, hwc->sample_period);>>        /*> -        * we currently do not support PERF_SAMPLE_GROUP on inherited counters> +        * we currently do not support PERF_FORMAT_GROUP on inherited counters>         */> -       if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))> +       if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))>                goto done;>>        switch (attr->type) {>> -->>ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥
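To make the new layout concrete, here is a minimal userspace sketch of
decoding a PERF_FORMAT_GROUP read(), assuming read_format also set
PERF_FORMAT_TOTAL_TIME_ENABLED, PERF_FORMAT_TOTAL_TIME_RUNNING and
PERF_FORMAT_ID. The struct name and the MAX_COUNTERS bound are
illustrative, not part of the ABI:

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

#define MAX_COUNTERS 16		/* illustrative bound, not an ABI limit */

/* Mirrors the PERF_FORMAT_GROUP layout documented in the patch above. */
struct group_read {
	uint64_t nr;		/* number of counters in the group */
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	struct {
		uint64_t value;
		uint64_t id;	/* PERF_FORMAT_ID */
	} cntr[MAX_COUNTERS];
};

static int read_group(int group_fd)
{
	struct group_read gr;
	uint64_t i;

	/* One read() on the group leader returns the whole group. */
	if (read(group_fd, &gr, sizeof(gr)) < 0)
		return -1;

	for (i = 0; i < gr.nr && i < MAX_COUNTERS; i++)
		printf("ID: %llu CNT: %llu\n",
		       (unsigned long long)gr.cntr[i].id,
		       (unsigned long long)gr.cntr[i].value);

	return 0;
}

Note that with the reworked perf_read_hw() a buffer smaller than
perf_counter_read_size() computes now fails with -ENOSPC rather than
-EINVAL, so it pays to size the buffer for the whole group up front.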

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/4 -v2] perf: rework the whole read vs group stuff
  2009-08-21 14:47   ` [PATCH 1/4 -v2] perf: rework " stephane eranian
@ 2009-08-21 15:19     ` Peter Zijlstra
  2009-08-21 19:36       ` [tip:perfcounters/urgent] perf_counter: Fix typo in read() output generation tip-bot for Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2009-08-21 15:19 UTC (permalink / raw)
  To: eranian
  Cc: Ingo Molnar, Paul Mackerras, Corey J Ashford, LKML, perfmon2-devel

On Fri, 2009-08-21 at 16:47 +0200, stephane eranian wrote:
> Hi,
> 
> Well, I tried that, and it brings more complexity than needed,
> especially with regard to
> extracting the ID for events when you're doing grouping.
> 
> To extract the ID, one has to read out a struct as follows:
>  *      { u64           nr;
>  *        { u64         time_enabled; } && PERF_FORMAT_ENABLED
>  *        { u64         time_running; } && PERF_FORMAT_RUNNING
>  *        { u64         value;
>  *          { u64       id;           } && PERF_FORMAT_ID
>  *        }             cntr[nr];
>  *      } && PERF_FORMAT_GROUP
> 
> In principle, you should only have to do this once per group. Reading
> this stuff using the group leader should yield the values of all the other
> events. This is not what I have observed. All events report the same
> ID as the leader. Not clear why.

Hmm, that's a bug alright.

> As I suggested in a previous message, I don't think all of this is necessary.
> If a tool were able to pass the ID to associate with an event, then there
> would be no need for a read(). Furthermore, it would make it easier to pick
> an ID which suits the tool's data structure. For instance, if you create
> 4 events in a group, the ID could be 0,1,2,3 and would most likely map to
> an index in the array used by the tool to manage the perf_counter structures.
> That would also simplify the critical path in the signal handler: no
> need for a lookup table to map "random" IDs to IDs more relevant
> for the tool. The ID does not need to be very wide. IDs are relevant only if
> one uses group sampling. Therefore the ID needs to identify an event within
> a group. Could use a reserved field in perf_counter_attr or add an ioctl() to
> assign an ID.

IDs are also needed when you want to allow mixing of the output buffers
(in any sense). That is, perf writes all mmap() data into a single file,
which mixes the streams; the other example is the output redirection
patch I posted the other day, which does basically the same thing in-kernel.

A user-provided ID is still possible by extending perf_counter_attr. We
could add this later.

We still need a per-counter unique ID for the inherit case, and I was
worried that we might need to handle collisions, but I think these ID
spaces can be considered separate.
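To make the lookup-table cost concrete, a minimal sketch (the names
ids[], nr_ids and the MAX_COUNTERS bound are illustrative): the tool
harvests the kernel-assigned IDs once, via a PERF_FORMAT_GROUP |
PERF_FORMAT_ID read() on the group leader, and then maps them back to
tool-local indices in the sample path:

#include <stdint.h>

#define MAX_COUNTERS 16		/* illustrative bound */

/* Kernel-assigned IDs, harvested once from a group read() at setup. */
static uint64_t ids[MAX_COUNTERS];
static unsigned int nr_ids;

static void record_id(uint64_t id)
{
	if (nr_ids < MAX_COUNTERS)
		ids[nr_ids++] = id;
}

/* Sample/signal path: map a stream ID back to a tool-local index. */
static int id_to_index(uint64_t id)
{
	unsigned int i;

	for (i = 0; i < nr_ids; i++)
		if (ids[i] == id)
			return (int)i;

	return -1;	/* not one of our counters */
}

With a user-provided ID as suggested above, id_to_index() would
collapse to the identity map; with kernel-chosen IDs some such
per-sample lookup (or a hash) is needed.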

---
Subject: perf_counter: Fix typo in read() output generation

When you iterate a list, using the iterator is useful.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 36f65e2..f274e19 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1791,7 +1791,7 @@ static int perf_counter_read_group(struct perf_counter *counter,
 	size += err;
 
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		err = perf_counter_read_entry(counter, read_format,
+		err = perf_counter_read_entry(sub, read_format,
 				buf + size);
 		if (err < 0)
 			return err;



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [tip:perfcounters/urgent] perf_counter: Fix typo in read() output generation
  2009-08-21 15:19     ` Peter Zijlstra
@ 2009-08-21 19:36       ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 12+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-21 19:36 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, eranian, cjashfor,
	a.p.zijlstra, perfmon2-devel, tglx, mingo

Commit-ID:  4464fcaa9cbfc9c551956b48af203e2f775ca892
Gitweb:     http://git.kernel.org/tip/4464fcaa9cbfc9c551956b48af203e2f775ca892
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 21 Aug 2009 17:19:36 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 21 Aug 2009 18:00:35 +0200

perf_counter: Fix typo in read() output generation

When you iterate a list, using the iterator is useful.

Before:

   ID: 5
   ID: 5
   ID: 5
   ID: 5
   EVNT: 0x40088b scale: nan ID: 5 CNT: 1006252 ID: 6 CNT: 1011090 ID: 7 CNT: 1011196 ID: 8 CNT: 1011095
   EVNT: 0x40088c scale: 1.000000 ID: 5 CNT: 2003065 ID: 6 CNT: 2011671 ID: 7 CNT: 2012620 ID: 8 CNT: 2013479
   EVNT: 0x40088c scale: 1.000000 ID: 5 CNT: 3002390 ID: 6 CNT: 3015996 ID: 7 CNT: 3018019 ID: 8 CNT: 3020006
   EVNT: 0x40088b scale: 1.000000 ID: 5 CNT: 4002406 ID: 6 CNT: 4021120 ID: 7 CNT: 4024241 ID: 8 CNT: 4027059

After:

   ID: 1
   ID: 2
   ID: 3
   ID: 4
   EVNT: 0x400889 scale: nan ID: 1 CNT: 1005270 ID: 2 CNT: 1009833 ID: 3 CNT: 1010065 ID: 4 CNT: 1010088
   EVNT: 0x400898 scale: nan ID: 1 CNT: 2001531 ID: 2 CNT: 2022309 ID: 3 CNT: 2022470 ID: 4 CNT: 2022627
   EVNT: 0x400888 scale: 0.489467 ID: 1 CNT: 3001261 ID: 2 CNT: 3027088 ID: 3 CNT: 3027941 ID: 4 CNT: 3028762

Reported-by: stephane eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: perfmon2-devel <perfmon2-devel@lists.sourceforge.net>
LKML-Reference: <1250867976.7538.73.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 kernel/perf_counter.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 36f65e2..f274e19 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1791,7 +1791,7 @@ static int perf_counter_read_group(struct perf_counter *counter,
 	size += err;
 
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		err = perf_counter_read_entry(counter, read_format,
+		err = perf_counter_read_entry(sub, read_format,
 				buf + size);
 		if (err < 0)
 			return err;

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread

Thread overview: 12+ messages
2009-08-13  9:47 [PATCH 0/4] perf_counter: Group reads and other patches Peter Zijlstra
2009-08-13  9:47 ` [PATCH 1/4 -v2] perf: rework the whole read vs group stuff Peter Zijlstra
2009-08-13 11:00   ` [tip:perfcounters/urgent] perf: Rework/fix " tip-bot for Peter Zijlstra
2009-08-21 14:47   ` [PATCH 1/4 -v2] perf: rework " stephane eranian
2009-08-21 15:19     ` Peter Zijlstra
2009-08-21 19:36       ` [tip:perfcounters/urgent] perf_counter: Fix typo in read() output generation tip-bot for Peter Zijlstra
2009-08-13  9:47 ` [PATCH 2/4] perf_counter: Fix an ipi-deadlock Peter Zijlstra
2009-08-13 11:00   ` [tip:perfcounters/urgent] " tip-bot for Peter Zijlstra
2009-08-13  9:47 ` [PATCH 3/4] perf tools: Add some comments to the event definitions Peter Zijlstra
2009-08-13 11:01   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-08-15 10:07   ` tip-bot for Peter Zijlstra
2009-08-13  9:47 ` [PATCH 4/4] perf_counter: Fix swcounter context invariance Peter Zijlstra
