All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters
@ 2009-03-17  5:42 Paul Mackerras
  2009-03-17  7:32 ` Peter Zijlstra
  0 siblings, 1 reply; 6+ messages in thread
From: Paul Mackerras @ 2009-03-17  5:42 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra; +Cc: linux-kernel, Thomas Gleixner

Impact: new feature

At present, if the user specifies hw_event->record_type ==
PERF_RECORD_IRQ or PERF_RECORD_GROUP when creating a counter, reads from
the counter will return records from the interrupt event queue for the
counter.  This means that there is no way to find out the current value
of the counter.  Also, using the record_type is slightly problematic in
that what actually determines whether the counter generates interrupts
is whether hw_event->irq_period is non-zero or not.

This provides a way for users to get a second fd for an interrupting
counter, which has a different set of file operations, set up so that
reads on the second (or "clone") fd return the counter value rather than
reading the interrupt event queue.  The way to get the clone fd is like
this:

	clone_fd = sys_perf_counter_open(NULL, 0, 0, counter_fd, 0);

That is, the hw_event parameter is NULL and the original counter fd is
supplied in the group_fd parameter.

This also simplifies the counter read path a bit by setting up two
file_operations structs, one for interrupting counters and one for
simple (non-interrupting) counters, and uses irq_period rather than
record_type to determine which type of counter is being requested.
This will enable us to use a wider range of values in record_type in
future, allowing the user to specify what information they want recorded
on an interrupt.

Internally, we now potentially have multiple struct files pointing to
the one struct counter, which could lead to lifetime issues.  We avoid
any such issues by having the clone files keep a reference to the
original file.  The reference is dropped when the clone file is closed.
Thus the original file can never be released while there is any clone
file still open.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
This and the following patch are in the rfc branch of my
perfcounters.git repository at:

git://git.kernel.org/pub/scm/linux/kernel/git/paulus/perfcounters.git rfc

(Note that the rfc branch includes the master branch, which has the
"perfcounters: abstract wakeup flag setting in core to fix powerpc
build" commit in it.)

 kernel/perf_counter.c |  114 +++++++++++++++++++++++++++++++++++++------------
 1 files changed, 87 insertions(+), 27 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b39456a..7c62b93 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1172,11 +1172,24 @@ static int perf_release(struct inode *inode, struct file *file)
 }
 
 /*
+ * Called when the last reference to a clone file is gone.
+ */
+static int perf_clone_release(struct inode *inode, struct file *file)
+{
+	struct perf_counter *counter = file->private_data;
+
+	file->private_data = NULL;
+	fput(counter->filp);
+	return 0;
+}
+
+/*
  * Read the performance counter - simple non blocking version for now
  */
 static ssize_t
-perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+perf_read_value(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
+	struct perf_counter *counter = file->private_data;
 	u64 cntval;
 
 	if (count != sizeof(cntval))
@@ -1218,11 +1231,12 @@ perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
 }
 
 static ssize_t
-perf_read_irq_data(struct perf_counter	*counter,
-		   char __user		*buf,
-		   size_t		count,
-		   int			nonblocking)
+perf_read_irq_data(struct file	*file,
+		   char __user	*buf,
+		   size_t	count,
+		   loff_t	*ppos)
 {
+	struct perf_counter *counter = file->private_data;
 	struct perf_data *irqdata, *usrdata;
 	DECLARE_WAITQUEUE(wait, current);
 	ssize_t res, res2;
@@ -1233,7 +1247,7 @@ perf_read_irq_data(struct perf_counter	*counter,
 	if (usrdata->len + irqdata->len >= count)
 		goto read_pending;
 
-	if (nonblocking)
+	if (file->f_flags & O_NONBLOCK)
 		return -EAGAIN;
 
 	spin_lock_irq(&counter->waitq.lock);
@@ -1283,23 +1297,6 @@ out:
 	return res;
 }
 
-static ssize_t
-perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
-	struct perf_counter *counter = file->private_data;
-
-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		return perf_read_hw(counter, buf, count);
-
-	case PERF_RECORD_IRQ:
-	case PERF_RECORD_GROUP:
-		return perf_read_irq_data(counter, buf, count,
-					  file->f_flags & O_NONBLOCK);
-	}
-	return -EINVAL;
-}
-
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
@@ -1334,9 +1331,25 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static const struct file_operations perf_fops = {
+static const struct file_operations perf_intr_fops = {
 	.release		= perf_release,
-	.read			= perf_read,
+	.read			= perf_read_irq_data,
+	.poll			= perf_poll,
+	.unlocked_ioctl		= perf_ioctl,
+	.compat_ioctl		= perf_ioctl,
+};
+
+static const struct file_operations perf_value_fops = {
+	.release		= perf_release,
+	.read			= perf_read_value,
+	.poll			= perf_poll,
+	.unlocked_ioctl		= perf_ioctl,
+	.compat_ioctl		= perf_ioctl,
+};
+
+static const struct file_operations perf_clone_fops = {
+	.release		= perf_clone_release,
+	.read			= perf_read_value,
 	.poll			= perf_poll,
 	.unlocked_ioctl		= perf_ioctl,
 	.compat_ioctl		= perf_ioctl,
@@ -1888,6 +1901,38 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	return counter;
 }
 
+static long perf_counter_clone(int orig_fd)
+{
+	struct perf_counter *counter;
+	struct file *counter_file;
+	int ret;
+
+	/*
+	 * The cloned file holds a reference to the original
+	 * file; the corresponding fput to this fget is in
+	 * perf_clone_release().
+	 */
+	counter_file = fget(orig_fd);
+	if (!counter_file)
+		return -EBADF;
+
+	ret = -EINVAL;
+	if (counter_file->f_op != &perf_intr_fops &&
+	    counter_file->f_op != &perf_value_fops)
+		goto out_fput;
+
+	counter = counter_file->private_data;
+
+	ret = anon_inode_getfd("[perf_counter]", &perf_clone_fops, counter, 0);
+	if (ret < 0)
+		goto out_fput;
+	return ret;
+
+ out_fput:
+	fput(counter_file);
+	return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -1905,6 +1950,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	struct perf_counter_context *ctx;
 	struct file *counter_file = NULL;
 	struct file *group_file = NULL;
+	const struct file_operations *fops;
 	int fput_needed = 0;
 	int fput_needed2 = 0;
 	int ret;
@@ -1913,6 +1959,15 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (flags)
 		return -EINVAL;
 
+	/*
+	 * See if the user wants to clone an existing counter,
+	 * to get another fd referring to the same counter but with
+	 * file operations set to read the counter value rather than
+	 * irq events.
+	 */
+	if (hw_event_uptr == NULL)
+		return perf_counter_clone(group_fd);
+
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
 		return -EFAULT;
 
@@ -1932,7 +1987,8 @@ SYSCALL_DEFINE5(perf_counter_open,
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
 			goto err_put_context;
-		if (group_file->f_op != &perf_fops)
+		if (group_file->f_op != &perf_intr_fops &&
+		    group_file->f_op != &perf_value_fops)
 			goto err_put_context;
 
 		group_leader = group_file->private_data;
@@ -1961,7 +2017,11 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (!counter)
 		goto err_put_context;
 
-	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (counter->hw_event.irq_period)
+		fops = &perf_intr_fops;
+	else
+		fops = &perf_value_fops;
+	ret = anon_inode_getfd("[perf_counter]", fops, counter, 0);
 	if (ret < 0)
 		goto err_free_put_context;
 
-- 
1.5.5.rc3.7.gba13


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters
  2009-03-17  5:42 [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters Paul Mackerras
@ 2009-03-17  7:32 ` Peter Zijlstra
  2009-03-17  7:47   ` Paul Mackerras
  0 siblings, 1 reply; 6+ messages in thread
From: Peter Zijlstra @ 2009-03-17  7:32 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Ingo Molnar, linux-kernel, Thomas Gleixner

On Tue, 2009-03-17 at 16:42 +1100, Paul Mackerras wrote:
> Impact: new feature
> 
> At present, if the user specifies hw_event->record_type ==
> PERF_RECORD_IRQ or PERF_RECORD_GROUP when creating a counter, reads from
> the counter will return records from the interrupt event queue for the
> counter.  This means that there is no way to find out the current value
> of the counter.  Also, using the record_type is slightly problematic in
> that what actually determines whether the counter generates interrupts
> is whether hw_event->irq_period is non-zero or not.

I've never found that to be a problem, I've always read PERF_RECORD_IRQ
as PERF_RECORD_SINGLE or somesuch in that it will give a single counter
output, as opposed to PERF_RECORD_GROUP which will give a tuple.

> This provides a way for users to get a second fd for an interrupting
> counter, which has a different set of file operations, set up so that
> reads on the second (or "clone") fd return the counter value rather than
> reading the interrupt event queue.  The way to get the clone fd is like
> this:
> 
> 	clone_fd = sys_perf_counter_open(NULL, 0, 0, counter_fd, 0);

I'm not sure I understand why. It seems to me you're either interested
in sample data, that is {tid,ip,counter} like things, or you want raw
count values.

These two cases seem clearly distinct and provided for. Why are you
mixing them?

> This will enable us to use a wider range of values in record_type in
> future, allowing the user to specify what information they want recorded
> on an interrupt.

This seems unrelated, what will stop us now from adding record_type
values? PERF_RECORD_CALLCHAIN is one in particular I'd like to add soon.

> Internally, we now potentially have multiple struct files pointing to
> the one struct counter, which could lead to lifetime issues.  We avoid
> any such issues by having the clone files keep a reference to the
> original file.  The reference is dropped when the clone file is closed.
> Thus the original file can never be released while there is any clone
> file still open.

This is a bit bothersome, as we then have no unique identifier anymore.

Currently the group record type writes things like {hw_event->type,
counter} which is ambiguous since we really have a 65bit id space. So I
was thinking of making that {fd, counter} to at least have a unique
identifier in there.




^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters
  2009-03-17  7:32 ` Peter Zijlstra
@ 2009-03-17  7:47   ` Paul Mackerras
  2009-03-17  8:37     ` Peter Zijlstra
  0 siblings, 1 reply; 6+ messages in thread
From: Paul Mackerras @ 2009-03-17  7:47 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, Thomas Gleixner

Peter Zijlstra writes:

> I'm not sure I understand why. It seems to me you're either interested
> in sample data, that is {tid,ip,counter} like things, or you want raw
> count values.

It was specifically requested by people porting PAPI to PCL, and it
seems like a reasonable request.

> This seems unrelated, what will stop us now from adding record_type
> values? PERF_RECORD_CALLCHAIN is one in particular I'd like to add soon.

I think we should make record_type a bitset rather than a single
value, and define bits for recording the callchain as well as various
other interesting things.

> This is a bit bothersome, as we then have no unique identifier anymore.
> 
> Currently the group record type writes things like {hw_event->type,
> counter} which is ambiguous since we really have a 65bit id space. So I
> was thinking of making that {fd, counter} to at least have a unique
> identifier in there.

The fd is already not a unique identifier - think about dup().

The hw_event->type values are not really necessary, in fact, since the
group members will always be read out in the order that they were
added to the group.  The only time there might be any possibility of
confusion is if a new member is added after some samples have already
been taken (or a member is removed) - then we'll get new records being
added to the event queue being a different size from those already in
the queue.  But that's a somewhat different problem which isn't really
solved by having the type values in there.

Paul.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters
  2009-03-17  7:47   ` Paul Mackerras
@ 2009-03-17  8:37     ` Peter Zijlstra
  2009-03-17  9:17       ` Paul Mackerras
  0 siblings, 1 reply; 6+ messages in thread
From: Peter Zijlstra @ 2009-03-17  8:37 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Ingo Molnar, linux-kernel, Thomas Gleixner

On Tue, 2009-03-17 at 18:47 +1100, Paul Mackerras wrote:
> Peter Zijlstra writes:
> 
> > I'm not sure I understand why. It seems to me you're either interested
> > in sample data, that is {tid,ip,counter} like things, or you want raw
> > count values.
> 
> It was specifically requested by people porting PAPI to PCL, and it
> seems like a reasonable request.

OK, then why didn't the changelog say so :-)

Could you ask them why though, if they need it I won't object too much,
but I'd like to know the use case.

As to the method proposed, I think Ingo and I talked about 'abusing'
non-blocking reads for this purpose, would that work? Then if you need
two fds you could dup() and flip one to non-blocking.

The non-blocking read would either output whatever is already pending,
but in case there is no data, it would generate some on the spot.

> I think we should make record_type a bitset rather than a single
> value, and define bits for recording the callchain as well as various
> other interesting things.

Sounds sensible at first though, I'll see if I can poke a hole in that.

The only two things I've got on my todo list is that challchain and mmap
info, and those can indeed be done as a bitmask, tid information too.

> > Currently the group record type writes things like {hw_event->type,
> > counter} which is ambiguous since we really have a 65bit id space. So I
> > was thinking of making that {fd, counter} to at least have a unique
> > identifier in there.
> 
> The fd is already not a unique identifier - think about dup().

You're quite right indeed - risks of mailing before waking up I
suppose :-)

> The hw_event->type values are not really necessary, in fact, since the
> group members will always be read out in the order that they were
> added to the group.  The only time there might be any possibility of
> confusion is if a new member is added after some samples have already
> been taken (or a member is removed) - then we'll get new records being
> added to the event queue being a different size from those already in
> the queue.  But that's a somewhat different problem which isn't really
> solved by having the type values in there.

Ah, ok, so we can leave that out entirely. I was already planning to add
a sibling_count field, with that we could do something like:

{ record_type, sibling_count, n*{counter} }




^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters
  2009-03-17  8:37     ` Peter Zijlstra
@ 2009-03-17  9:17       ` Paul Mackerras
  2009-03-19 19:31         ` Corey Ashford
  0 siblings, 1 reply; 6+ messages in thread
From: Paul Mackerras @ 2009-03-17  9:17 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, Thomas Gleixner

Peter Zijlstra writes:

> > It was specifically requested by people porting PAPI to PCL, and it
> > seems like a reasonable request.
> 
> OK, then why didn't the changelog say so :-)

Fair point. :)

> Could you ask them why though, if they need it I won't object too much,
> but I'd like to know the use case.

PAPI has the concept of "overflowing" counters, apparently, which
generate a signal every N counts, about which they said: "One thing to
keep in mind, you should always be able to read a live counter,
regardless of whether or not it's set to overflow..."

I assume the PAPI interface lets you do everything with overflowing
counters that you can do with non-overflowing counters, and that's why
they want it, but I don't know much about PAPI myself.

> As to the method proposed, I think Ingo and I talked about 'abusing'
> non-blocking reads for this purpose, would that work? Then if you need
> two fds you could dup() and flip one to non-blocking.

The non-blocking flag is one of the "file status" flags, which are
shared between all fds pointing at the same struct file.  So if you
dup() and set one to non-blocking, the other one becomes non-blocking
too.  So that doesn't fly.

> The non-blocking read would either output whatever is already pending,
> but in case there is no data, it would generate some on the spot.

The difficulty then is how userspace does know what it ended up
getting?  It may not always be possible to distinguish based on the
value you get.

The other idea I had was to use the file position, and say that
positions greater than some threshold read the event queue, and less
than the threshold read the counter value.  That way you can read the
event queue with read() and the counter value with pread(..., 0).

The objection to that is that the threshold is a bit artificial, and
would need to be different between interrupting and counting
counters.  Also we may need to do strange things to file->f_pos like
initializing it to the (non-zero) threshold when opening an
interrupting counter.

Paul.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters
  2009-03-17  9:17       ` Paul Mackerras
@ 2009-03-19 19:31         ` Corey Ashford
  0 siblings, 0 replies; 6+ messages in thread
From: Corey Ashford @ 2009-03-19 19:31 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Thomas Gleixner

Paul Mackerras wrote:
> Peter Zijlstra writes:
> 
>>> It was specifically requested by people porting PAPI to PCL, and it
>>> seems like a reasonable request.
>> OK, then why didn't the changelog say so :-)
> 
> Fair point. :)
> 
>> Could you ask them why though, if they need it I won't object too much,
>> but I'd like to know the use case.
> 
> PAPI has the concept of "overflowing" counters, apparently, which
> generate a signal every N counts, about which they said: "One thing to
> keep in mind, you should always be able to read a live counter,
> regardless of whether or not it's set to overflow..."
> 
> I assume the PAPI interface lets you do everything with overflowing
> counters that you can do with non-overflowing counters, and that's why
> they want it, but I don't know much about PAPI myself.
> 
>> As to the method proposed, I think Ingo and I talked about 'abusing'
>> non-blocking reads for this purpose, would that work? Then if you need
>> two fds you could dup() and flip one to non-blocking.
> 
> The non-blocking flag is one of the "file status" flags, which are
> shared between all fds pointing at the same struct file.  So if you
> dup() and set one to non-blocking, the other one becomes non-blocking
> too.  So that doesn't fly.
> 
>> The non-blocking read would either output whatever is already pending,
>> but in case there is no data, it would generate some on the spot.
> 
> The difficulty then is how userspace does know what it ended up
> getting?  It may not always be possible to distinguish based on the
> value you get.
> 
> The other idea I had was to use the file position, and say that
> positions greater than some threshold read the event queue, and less
> than the threshold read the counter value.  That way you can read the
> event queue with read() and the counter value with pread(..., 0).
> 
> The objection to that is that the threshold is a bit artificial, and
> would need to be different between interrupting and counting
> counters.  Also we may need to do strange things to file->f_pos like
> initializing it to the (non-zero) threshold when opening an
> interrupting counter.
> 

This could be a reason for getting rid of the purely interrupting 
counter record type.  That way, you always read at the [artificial] 
offset to read the event queue for counters with a non-zero irq_period, 
and always at offset zero to read the current counter value.

It would work similarly well for the other idea of creating a cloned fd. 
  The fd returned from the initial open is always used for reading the 
current value, and the cloned one is for reading the event queue.

Regards,

- Corey

Corey Ashford
Software Engineer
IBM Linux Technology Center, Linux Toolchain
Beaverton, OR
503-578-3507
cjashfor@us.ibm.com


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2009-03-19 19:32 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-17  5:42 [PATCH/RFC 1/2] perfcounters: provide a way to read the current value of interrupting counters Paul Mackerras
2009-03-17  7:32 ` Peter Zijlstra
2009-03-17  7:47   ` Paul Mackerras
2009-03-17  8:37     ` Peter Zijlstra
2009-03-17  9:17       ` Paul Mackerras
2009-03-19 19:31         ` Corey Ashford

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.