* [PATCH 0/7] perf_counter: syscall ABI cleanup and mmap() interface
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra
Tested on x86_64, compile-tested on ppc64.
--
* [PATCH 1/7] perf_counter: remove the event config bitfields
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-kill-bitfield-union.patch --]
[-- Type: text/plain, Size: 7354 bytes --]
Since the bitfields turned into a bit of a mess, remove them and rely on
good old masks.
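For reference, the layout these masks describe can be exercised from plain C. A minimal sketch of composing and decoding a config word (the shift/mask values mirror the PERF_COUNTER_*_{SHIFT,MASK} constants added below; the helper functions themselves are illustrative, not part of the ABI):
#include <stdint.h>
#define RAW_SHIFT   63
#define TYPE_SHIFT  56
#define TYPE_MASK   (((1ULL << 7) - 1) << TYPE_SHIFT)
#define EVENT_MASK  ((1ULL << 56) - 1)
/* generic event: MSB clear, 7 bits of type, 56 bits of event id */
static inline uint64_t make_event(uint64_t type, uint64_t id)
{
        return (type << TYPE_SHIFT) | (id & EVENT_MASK);
}
/* raw event: MSB set, lower 63 bits are cpu specific config */
static inline uint64_t make_raw(uint64_t hw_config)
{
        return (1ULL << RAW_SHIFT) | hw_config;
}
static inline int is_raw(uint64_t config)
{
        return config >> RAW_SHIFT;
}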
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/powerpc/kernel/perf_counter.c | 6 +--
arch/x86/kernel/cpu/perf_counter.c | 8 ++--
include/linux/perf_counter.h | 74 +++++++++++++++++++++++++------------
kernel/perf_counter.c | 22 ++++++-----
4 files changed, 70 insertions(+), 40 deletions(-)
Index: linux-2.6/arch/powerpc/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/perf_counter.c
+++ linux-2.6/arch/powerpc/kernel/perf_counter.c
@@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter
return NULL;
if ((s64)counter->hw_event.irq_period < 0)
return NULL;
- if (!counter->hw_event.raw_type) {
- ev = counter->hw_event.event_id;
+ if (!perf_event_raw(&counter->hw_event)) {
+ ev = perf_event_id(&counter->hw_event);
if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
return NULL;
ev = ppmu->generic_events[ev];
} else {
- ev = counter->hw_event.raw_event_id;
+ ev = perf_event_config(&counter->hw_event);
}
counter->hw.config_base = ev;
counter->hw.idx = 0;
Index: linux-2.6/arch/x86/kernel/cpu/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_counter.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct
/*
* Raw event type provide the config in the event structure
*/
- if (hw_event->raw_type) {
- hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
+ if (perf_event_raw(hw_event)) {
+ hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
} else {
- if (hw_event->event_id >= pmc_ops->max_events)
+ if (perf_event_id(hw_event) >= pmc_ops->max_events)
return -EINVAL;
/*
* The generic map:
*/
- hwc->config |= pmc_ops->event_map(hw_event->event_id);
+ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
}
counter->wakeup_pending = 0;
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -82,32 +82,37 @@ enum perf_counter_record_type {
PERF_RECORD_GROUP = 2,
};
+#define __PERF_COUNTER_MASK(name) \
+ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
+ PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW_BITS 1
+#define PERF_COUNTER_RAW_SHIFT 63
+#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW)
+
+#define PERF_COUNTER_CONFIG_BITS 63
+#define PERF_COUNTER_CONFIG_SHIFT 0
+#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG)
+
+#define PERF_COUNTER_TYPE_BITS 7
+#define PERF_COUNTER_TYPE_SHIFT 56
+#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE)
+
+#define PERF_COUNTER_EVENT_BITS 56
+#define PERF_COUNTER_EVENT_SHIFT 0
+#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
+
/*
* Hardware event to monitor via a performance monitoring counter:
*/
struct perf_counter_hw_event {
- union {
-#ifndef __BIG_ENDIAN_BITFIELD
- struct {
- __u64 event_id : 56,
- type : 8;
- };
- struct {
- __u64 raw_event_id : 63,
- raw_type : 1;
- };
-#else
- struct {
- __u64 type : 8,
- event_id : 56;
- };
- struct {
- __u64 raw_type : 1,
- raw_event_id : 63;
- };
-#endif /* __BIT_ENDIAN_BITFIELD */
- __u64 event_config;
- };
+ /*
+ * The MSB of the config word signifies if the rest contains cpu
+ * specific (raw) counter configuration data, if unset, the next
+ * 7 bits are an event type and the rest of the bits are the event
+ * identifier.
+ */
+ __u64 config;
__u64 irq_period;
__u64 record_type;
@@ -157,6 +162,27 @@ struct perf_counter_hw_event {
struct task_struct;
+static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
+{
+ return hw_event->config & PERF_COUNTER_RAW_MASK;
+}
+
+static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
+{
+ return hw_event->config & PERF_COUNTER_CONFIG_MASK;
+}
+
+static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
+{
+ return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
+ PERF_COUNTER_TYPE_SHIFT;
+}
+
+static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
+{
+ return hw_event->config & PERF_COUNTER_EVENT_MASK;
+}
+
/**
* struct hw_perf_counter - performance counter hardware details:
*/
@@ -336,8 +362,8 @@ extern void perf_counter_output(struct p
*/
static inline int is_software_counter(struct perf_counter *counter)
{
- return !counter->hw_event.raw_type &&
- counter->hw_event.type != PERF_TYPE_HARDWARE;
+ return !perf_event_raw(&counter->hw_event) &&
+ perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
}
extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(st
list_for_each_entry(sub, &leader->sibling_list, list_entry) {
if (sub != counter)
sub->hw_ops->read(sub);
- perf_counter_store_irq(counter, sub->hw_event.event_config);
+ perf_counter_store_irq(counter, sub->hw_event.config);
perf_counter_store_irq(counter, atomic64_read(&sub->count));
}
}
@@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct p
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
return 0;
- if (counter->hw_event.raw_type)
+ if (perf_event_raw(&counter->hw_event))
return 0;
- if (counter->hw_event.type != type)
+ if (perf_event_type(&counter->hw_event) != type)
return 0;
- if (counter->hw_event.event_id != event)
+ if (perf_event_id(&counter->hw_event) != event)
return 0;
if (counter->hw_event.exclude_user && user_mode(regs))
@@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int);
static void tp_perf_counter_destroy(struct perf_counter *counter)
{
- ftrace_profile_disable(counter->hw_event.event_id);
+ ftrace_profile_disable(perf_event_id(&counter->hw_event));
}
static const struct hw_perf_counter_ops *
tp_perf_counter_init(struct perf_counter *counter)
{
- int event_id = counter->hw_event.event_id;
+ int event_id = perf_event_id(&counter->hw_event);
int ret;
ret = ftrace_profile_enable(event_id);
@@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter
* to be kernel events, and page faults are never hypervisor
* events.
*/
- switch (counter->hw_event.event_id) {
+ switch (perf_event_id(&counter->hw_event)) {
case PERF_COUNT_CPU_CLOCK:
hw_ops = &perf_ops_cpu_clock;
@@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_h
hw_ops = NULL;
- if (hw_event->raw_type)
+ if (perf_event_raw(hw_event)) {
hw_ops = hw_perf_counter_init(counter);
- else switch (hw_event->type) {
+ goto done;
+ }
+
+ switch (perf_event_type(hw_event)) {
case PERF_TYPE_HARDWARE:
hw_ops = hw_perf_counter_init(counter);
break;
@@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_h
kfree(counter);
return NULL;
}
+done:
counter->hw_ops = hw_ops;
return counter;
--
* [PATCH 2/7] perf_counter: avoid recursion
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: perf_counter-recursion.patch --]
[-- Type: text/plain, Size: 2065 bytes --]
Tracepoint events like lock_acquire and software counters like
pagefaults can recurse into the perf counter code again; avoid that by
keeping a recursion flag per execution context. Since the four contexts
strictly nest (a task can be interrupted by a softirq, a softirq by a
hardirq, a hardirq by an NMI, and none can interrupt itself), one flag
per context suffices.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/perf_counter.h | 7 +++++++
kernel/perf_counter.c | 26 ++++++++++++++++++++++++++
2 files changed, 33 insertions(+)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -328,6 +328,13 @@ struct perf_cpu_context {
int active_oncpu;
int max_pertask;
int exclusive;
+
+ /*
+ * Recursion avoidance:
+ *
+ * task, softirq, irq, nmi context
+ */
+ int recursion[4];
};
/*
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/rculist.h>
+#include <linux/hardirq.h>
#include <asm/irq_regs.h>
@@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(str
rcu_read_unlock();
}
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+ if (in_nmi())
+ return &cpuctx->recursion[3];
+
+ if (in_irq())
+ return &cpuctx->recursion[2];
+
+ if (in_softirq())
+ return &cpuctx->recursion[1];
+
+ return &cpuctx->recursion[0];
+}
+
static void __perf_swcounter_event(enum perf_event_types type, u32 event,
u64 nr, int nmi, struct pt_regs *regs)
{
struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ int *recursion = perf_swcounter_recursion_context(cpuctx);
+
+ if (*recursion)
+ goto out;
+
+ (*recursion)++;
+ barrier();
perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
if (cpuctx->task_ctx) {
@@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum
nr, nmi, regs);
}
+ barrier();
+ (*recursion)--;
+
+out:
put_cpu_var(perf_cpu_context);
}
--
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 3/7] perf_counter: add an mmap method to allow userspace to read hardware counters
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: paulus-perfcounters-add_an_mmap_method_to_allow_userspace_to_read_hardware_counters.patch --]
[-- Type: text/plain, Size: 6240 bytes --]
From: Paul Mackerras <paulus@samba.org>
Impact: new feature giving performance improvement
This adds the ability for userspace to do an mmap on a hardware counter
fd and get access to a read-only page that contains the information
needed to translate a hardware counter value to the full 64-bit
counter value that would be returned by a read on the fd. This is
useful on architectures that allow user programs to read the hardware
counters, such as PowerPC.
The mmap will only succeed if the counter is a hardware counter
monitoring the current process.
On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter
and translate it to the full 64-bit value in about 30ns using the
mmapped page, compared to about 830ns for the read syscall on the
counter, so this does give a significant performance improvement.
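Reading the page from userspace follows the usual seqlock pattern:
retry while the lock word is odd (an update is in progress) or changes
across the read. A minimal sketch, assuming a hypothetical arch-specific
helper read_hw_counter() that reads the hardware counter selected by
the index field (on PowerPC that would be an mfspr of the matching PMC):
#include <stdint.h>
struct perf_counter_mmap_page {
        uint32_t version;        /* as defined in the patch below */
        uint32_t compat_version;
        uint32_t lock;           /* seqlock, odd while being updated */
        uint32_t index;          /* which hardware counter to read */
        int64_t  offset;         /* add to the hardware counter value */
};
extern uint64_t read_hw_counter(uint32_t index);   /* assumed helper */
static uint64_t read_counter(volatile struct perf_counter_mmap_page *pg)
{
        uint32_t seq;
        uint64_t value;
        do {
                seq = pg->lock;
                __sync_synchronize();              /* read barrier */
                value = read_hw_counter(pg->index) + pg->offset;
                __sync_synchronize();
        } while ((seq & 1) || pg->lock != seq);    /* raced, retry */
        return value;
}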
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/powerpc/kernel/perf_counter.c | 6 ++
include/linux/perf_counter.h | 15 +++++++
kernel/perf_counter.c | 76 +++++++++++++++++++++++++++++++++++++
3 files changed, 97 insertions(+)
Index: linux-2.6/arch/powerpc/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/perf_counter.c
+++ linux-2.6/arch/powerpc/kernel/perf_counter.c
@@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable)
atomic64_set(&counter->hw.prev_count, val);
counter->hw.idx = hwc_index[i] + 1;
write_pmc(counter->hw.idx, val);
+ if (counter->user_page)
+ perf_counter_update_userpage(counter);
}
mb();
cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -572,6 +574,8 @@ static void power_perf_disable(struct pe
ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
write_pmc(counter->hw.idx, 0);
counter->hw.idx = 0;
+ if (counter->user_page)
+ perf_counter_update_userpage(counter);
break;
}
}
@@ -697,6 +701,8 @@ static void record_and_restart(struct pe
write_pmc(counter->hw.idx, val);
atomic64_set(&counter->hw.prev_count, val);
atomic64_set(&counter->hw.period_left, left);
+ if (counter->user_page)
+ perf_counter_update_userpage(counter);
/*
* Finally record data if requested.
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -126,6 +126,17 @@ struct perf_counter_hw_event {
#define PERF_COUNTER_IOC_ENABLE _IO('$', 0)
#define PERF_COUNTER_IOC_DISABLE _IO('$', 1)
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+ __u32 version; /* version number of this structure */
+ __u32 compat_version; /* lowest version this is compat with */
+ __u32 lock; /* seqlock for synchronization */
+ __u32 index; /* hardware counter identifier */
+ __s64 offset; /* add to hardware counter value */
+};
+
#ifdef __KERNEL__
/*
* Kernel-internal data types and definitions:
@@ -240,6 +251,9 @@ struct perf_counter {
int oncpu;
int cpu;
+ /* pointer to page shared with userspace via mmap */
+ unsigned long user_page;
+
/* read() / irq related data */
wait_queue_head_t waitq;
/* optional: for NMIs */
@@ -316,6 +330,7 @@ extern int perf_counter_task_enable(void
extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
struct perf_cpu_context *cpuctx,
struct perf_counter_context *ctx, int cpu);
+extern void perf_counter_update_userpage(struct perf_counter *counter);
extern void perf_counter_output(struct perf_counter *counter,
int nmi, struct pt_regs *regs);
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1176,6 +1176,7 @@ static int perf_release(struct inode *in
mutex_unlock(&counter->mutex);
mutex_unlock(&ctx->mutex);
+ free_page(counter->user_page);
free_counter(counter);
put_context(ctx);
@@ -1345,12 +1346,87 @@ static long perf_ioctl(struct file *file
return err;
}
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+ struct perf_counter_mmap_page *userpg;
+
+ if (!counter->user_page)
+ return;
+ userpg = (struct perf_counter_mmap_page *) counter->user_page;
+
+ ++userpg->lock;
+ smp_wmb();
+ userpg->index = counter->hw.idx;
+ userpg->offset = atomic64_read(&counter->count);
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ userpg->offset -= atomic64_read(&counter->hw.prev_count);
+ smp_wmb();
+ ++userpg->lock;
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ if (!counter->user_page)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = virt_to_page(counter->user_page);
+ get_page(vmf->page);
+ return 0;
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+ .fault = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = file->private_data;
+ unsigned long userpg;
+
+ if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+ return -EINVAL;
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ /*
+ * For now, restrict to the case of a hardware counter
+ * on the current task.
+ */
+ if (is_software_counter(counter) || counter->task != current)
+ return -EINVAL;
+
+ userpg = counter->user_page;
+ if (!userpg) {
+ userpg = get_zeroed_page(GFP_KERNEL);
+ mutex_lock(&counter->mutex);
+ if (counter->user_page) {
+ free_page(userpg);
+ userpg = counter->user_page;
+ } else {
+ counter->user_page = userpg;
+ }
+ mutex_unlock(&counter->mutex);
+ if (!userpg)
+ return -ENOMEM;
+ }
+
+ perf_counter_update_userpage(counter);
+
+ vma->vm_flags &= ~VM_MAYWRITE;
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_ops = &perf_mmap_vmops;
+ return 0;
+}
+
static const struct file_operations perf_fops = {
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
.unlocked_ioctl = perf_ioctl,
.compat_ioctl = perf_ioctl,
+ .mmap = perf_mmap,
};
/*
--
* [PATCH 4/7] mutex: add atomic_dec_and_mutex_lock()
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra, Eric Paris
[-- Attachment #1: eparis-mutex-add_atomic_dec_and_mutex_lock.patch --]
[-- Type: text/plain, Size: 1468 bytes --]
From: Eric Paris <eparis@redhat.com>
Much like atomic_dec_and_lock(), which takes and holds a spinlock when
the atomic drops to 0, this function takes and holds the mutex when the
decrement takes the atomic to 0.
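The typical use is refcounted teardown where the final put must run
under the lock. A hypothetical sketch ('struct obj', its fields and
teardown() are made-up names):
struct obj {
        atomic_t        refcnt;
        struct mutex    lock;
        /* ... */
};
static void put_obj(struct obj *obj)
{
        if (!atomic_dec_and_mutex_lock(&obj->refcnt, &obj->lock))
                return;         /* references remain, lock not taken */
        /* the refcount hit zero and obj->lock is now held */
        teardown(obj);
        mutex_unlock(&obj->lock);
        kfree(obj);
}
Patch 5 uses exactly this shape in perf_mmap_close() to tear down the
mmap data when the last mapping goes away.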
Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Paul Mackerras <paulus@samba.org>
---
include/linux/mutex.h | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
Index: linux-2.6/include/linux/mutex.h
===================================================================
--- linux-2.6.orig/include/linux/mutex.h
+++ linux-2.6/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killa
extern int mutex_trylock(struct mutex *lock);
extern void mutex_unlock(struct mutex *lock);
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+
#endif
--
* [PATCH 5/7] perf_counter: new output ABI - part 1
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: mmap-output.patch --]
[-- Type: text/plain, Size: 19270 bytes --]
Rework the output ABI: use sys_read() only for instant data and provide
mmap() output for all async overflow data.
The first mmap() determines the size of the output buffer. The mmap()
size must be PAGE_SIZE * (1 + pages), where pages must be a power of 2
or 0. Further mmap()s of the same fd must have the same size. Once all
maps are gone, you can again mmap() with a new size.
In case of 0 extra pages there is no data output and the first page
contains only meta data.
When there are data pages, a poll() event will be generated for each
full page of data. Furthermore, the output is circular. This means
that although 1 page is a valid configuration, it's useless, since
we'll start overwriting it the instant we report a full page.
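In userspace terms: map one control page plus a power-of-two number of
data pages. A minimal sketch of the setup ('fd' is assumed to be an
open counter descriptor; patch 7 below shows the matching ring reader
in kerneltop):
#include <err.h>
#include <sys/mman.h>
#include <unistd.h>
/* Sketch: map the control page plus a power-of-2 data ring. */
static void *map_counter(int fd, int nr_data_pages)
{
        size_t page_size = sysconf(_SC_PAGE_SIZE);
        size_t len = (1 + nr_data_pages) * page_size;  /* 1+2^n pages */
        void *base;
        base = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
        if (base == MAP_FAILED)
                err(1, "mmap");
        /* base             : struct perf_counter_mmap_page (meta data)
         * base + page_size : circular data buffer, consumed by chasing
         *                    the data_head field of the control page */
        return base;
}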
Future work will focus on the output format (which is unchanged for
now), where we'll likely want each entry denoted by a header that
includes a type and length.
Further future work will allow splicing the fd, which would also carry
the async overflow data -- splice() would be mutually exclusive with
mmap() of the data.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Paul Mackerras <paulus@samba.org>
---
arch/powerpc/kernel/perf_counter.c | 9
include/linux/perf_counter.h | 36 +-
kernel/perf_counter.c | 464 +++++++++++++++++++------------------
3 files changed, 263 insertions(+), 246 deletions(-)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -152,6 +152,8 @@ struct perf_counter_mmap_page {
__u32 lock; /* seqlock for synchronization */
__u32 index; /* hardware counter identifier */
__s64 offset; /* add to hardware counter value */
+
+ __u32 data_head; /* head in the data section */
};
#ifdef __KERNEL__
@@ -218,21 +220,6 @@ struct hw_perf_counter {
#endif
};
-/*
- * Hardcoded buffer length limit for now, for IRQ-fed events:
- */
-#define PERF_DATA_BUFLEN 2048
-
-/**
- * struct perf_data - performance counter IRQ data sampling ...
- */
-struct perf_data {
- int len;
- int rd_idx;
- int overrun;
- u8 data[PERF_DATA_BUFLEN];
-};
-
struct perf_counter;
/**
@@ -256,6 +243,14 @@ enum perf_counter_active_state {
struct file;
+struct perf_mmap_data {
+ struct rcu_head rcu_head;
+ int nr_pages;
+ atomic_t head;
+ struct perf_counter_mmap_page *user_page;
+ void *data_pages[0];
+};
+
/**
* struct perf_counter - performance counter kernel representation:
*/
@@ -289,16 +284,15 @@ struct perf_counter {
int oncpu;
int cpu;
- /* pointer to page shared with userspace via mmap */
- unsigned long user_page;
+ /* mmap bits */
+ struct mutex mmap_mutex;
+ atomic_t mmap_count;
+ struct perf_mmap_data *data;
- /* read() / irq related data */
+ /* poll related */
wait_queue_head_t waitq;
/* optional: for NMIs */
int wakeup_pending;
- struct perf_data *irqdata;
- struct perf_data *usrdata;
- struct perf_data data[2];
void (*destroy)(struct perf_counter *);
struct rcu_head rcu_head;
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -4,7 +4,8 @@
* Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
*
- * For licencing details see kernel-base/COPYING
+ *
+ * For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
@@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf
return atomic64_read(&counter->count);
}
-/*
- * Cross CPU call to switch performance data pointers
- */
-static void __perf_switch_irq_data(void *info)
-{
- struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- struct perf_counter *counter = info;
- struct perf_counter_context *ctx = counter->ctx;
- struct perf_data *oldirqdata = counter->irqdata;
-
- /*
- * If this is a task context, we need to check whether it is
- * the current task context of this cpu. If not it has been
- * scheduled out before the smp call arrived.
- */
- if (ctx->task) {
- if (cpuctx->task_ctx != ctx)
- return;
- spin_lock(&ctx->lock);
- }
-
- /* Change the pointer NMI safe */
- atomic_long_set((atomic_long_t *)&counter->irqdata,
- (unsigned long) counter->usrdata);
- counter->usrdata = oldirqdata;
-
- if (ctx->task)
- spin_unlock(&ctx->lock);
-}
-
-static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
-{
- struct perf_counter_context *ctx = counter->ctx;
- struct perf_data *oldirqdata = counter->irqdata;
- struct task_struct *task = ctx->task;
-
- if (!task) {
- smp_call_function_single(counter->cpu,
- __perf_switch_irq_data,
- counter, 1);
- return counter->usrdata;
- }
-
-retry:
- spin_lock_irq(&ctx->lock);
- if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
- counter->irqdata = counter->usrdata;
- counter->usrdata = oldirqdata;
- spin_unlock_irq(&ctx->lock);
- return oldirqdata;
- }
- spin_unlock_irq(&ctx->lock);
- task_oncpu_function_call(task, __perf_switch_irq_data, counter);
- /* Might have failed, because task was scheduled out */
- if (counter->irqdata == oldirqdata)
- goto retry;
-
- return counter->usrdata;
-}
-
static void put_context(struct perf_counter_context *ctx)
{
if (ctx->task)
@@ -1177,7 +1118,6 @@ static int perf_release(struct inode *in
mutex_unlock(&counter->mutex);
mutex_unlock(&ctx->mutex);
- free_page(counter->user_page);
free_counter(counter);
put_context(ctx);
@@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counte
{
u64 cntval;
- if (count != sizeof(cntval))
+ if (count < sizeof(cntval))
return -EINVAL;
/*
@@ -1211,121 +1151,20 @@ perf_read_hw(struct perf_counter *counte
}
static ssize_t
-perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
-{
- if (!usrdata->len)
- return 0;
-
- count = min(count, (size_t)usrdata->len);
- if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
- return -EFAULT;
-
- /* Adjust the counters */
- usrdata->len -= count;
- if (!usrdata->len)
- usrdata->rd_idx = 0;
- else
- usrdata->rd_idx += count;
-
- return count;
-}
-
-static ssize_t
-perf_read_irq_data(struct perf_counter *counter,
- char __user *buf,
- size_t count,
- int nonblocking)
-{
- struct perf_data *irqdata, *usrdata;
- DECLARE_WAITQUEUE(wait, current);
- ssize_t res, res2;
-
- irqdata = counter->irqdata;
- usrdata = counter->usrdata;
-
- if (usrdata->len + irqdata->len >= count)
- goto read_pending;
-
- if (nonblocking)
- return -EAGAIN;
-
- spin_lock_irq(&counter->waitq.lock);
- __add_wait_queue(&counter->waitq, &wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (usrdata->len + irqdata->len >= count)
- break;
-
- if (signal_pending(current))
- break;
-
- if (counter->state == PERF_COUNTER_STATE_ERROR)
- break;
-
- spin_unlock_irq(&counter->waitq.lock);
- schedule();
- spin_lock_irq(&counter->waitq.lock);
- }
- __remove_wait_queue(&counter->waitq, &wait);
- __set_current_state(TASK_RUNNING);
- spin_unlock_irq(&counter->waitq.lock);
-
- if (usrdata->len + irqdata->len < count &&
- counter->state != PERF_COUNTER_STATE_ERROR)
- return -ERESTARTSYS;
-read_pending:
- mutex_lock(&counter->mutex);
-
- /* Drain pending data first: */
- res = perf_copy_usrdata(usrdata, buf, count);
- if (res < 0 || res == count)
- goto out;
-
- /* Switch irq buffer: */
- usrdata = perf_switch_irq_data(counter);
- res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
- if (res2 < 0) {
- if (!res)
- res = -EFAULT;
- } else {
- res += res2;
- }
-out:
- mutex_unlock(&counter->mutex);
-
- return res;
-}
-
-static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct perf_counter *counter = file->private_data;
- switch (counter->hw_event.record_type) {
- case PERF_RECORD_SIMPLE:
- return perf_read_hw(counter, buf, count);
-
- case PERF_RECORD_IRQ:
- case PERF_RECORD_GROUP:
- return perf_read_irq_data(counter, buf, count,
- file->f_flags & O_NONBLOCK);
- }
- return -EINVAL;
+ return perf_read_hw(counter, buf, count);
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_counter *counter = file->private_data;
- unsigned int events = 0;
- unsigned long flags;
+ unsigned int events = POLLIN;
poll_wait(file, &counter->waitq, wait);
- spin_lock_irqsave(&counter->waitq.lock, flags);
- if (counter->usrdata->len || counter->irqdata->len)
- events |= POLLIN;
- spin_unlock_irqrestore(&counter->waitq.lock, flags);
-
return events;
}
@@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file
return err;
}
-void perf_counter_update_userpage(struct perf_counter *counter)
+static void __perf_counter_update_userpage(struct perf_counter *counter,
+ struct perf_mmap_data *data)
{
- struct perf_counter_mmap_page *userpg;
-
- if (!counter->user_page)
- return;
- userpg = (struct perf_counter_mmap_page *) counter->user_page;
+ struct perf_counter_mmap_page *userpg = data->user_page;
+ /*
+ * Disable preemption so as to not let the corresponding user-space
+ * spin too long if we get preempted.
+ */
+ preempt_disable();
++userpg->lock;
smp_wmb();
userpg->index = counter->hw.idx;
userpg->offset = atomic64_read(&counter->count);
if (counter->state == PERF_COUNTER_STATE_ACTIVE)
userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+ userpg->data_head = atomic_read(&data->head);
smp_wmb();
++userpg->lock;
+ preempt_enable();
+}
+
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+ struct perf_mmap_data *data;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (data)
+ __perf_counter_update_userpage(counter, data);
+ rcu_read_unlock();
}
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct perf_counter *counter = vma->vm_file->private_data;
+ struct perf_mmap_data *data;
+ int ret = VM_FAULT_SIGBUS;
- if (!counter->user_page)
- return VM_FAULT_SIGBUS;
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto unlock;
- vmf->page = virt_to_page(counter->user_page);
+ if (vmf->pgoff == 0) {
+ vmf->page = virt_to_page(data->user_page);
+ } else {
+ int nr = vmf->pgoff - 1;
+
+ if ((unsigned)nr > data->nr_pages)
+ goto unlock;
+
+ vmf->page = virt_to_page(data->data_pages[nr]);
+ }
get_page(vmf->page);
+ ret = 0;
+unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+ struct perf_mmap_data *data;
+ unsigned long size;
+ int i;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ size = sizeof(struct perf_mmap_data);
+ size += nr_pages * sizeof(void *);
+
+ data = kzalloc(size, GFP_KERNEL);
+ if (!data)
+ goto fail;
+
+ data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->user_page)
+ goto fail_user_page;
+
+ for (i = 0; i < nr_pages; i++) {
+ data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->data_pages[i])
+ goto fail_data_pages;
+ }
+
+ data->nr_pages = nr_pages;
+
+ rcu_assign_pointer(counter->data, data);
+
return 0;
+
+fail_data_pages:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)data->data_pages[i]);
+
+ free_page((unsigned long)data->user_page);
+
+fail_user_page:
+ kfree(data);
+
+fail:
+ return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+ struct perf_mmap_data *data = container_of(rcu_head,
+ struct perf_mmap_data, rcu_head);
+ int i;
+
+ free_page((unsigned long)data->user_page);
+ for (i = 0; i < data->nr_pages; i++)
+ free_page((unsigned long)data->data_pages[i]);
+ kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+ struct perf_mmap_data *data = counter->data;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ rcu_assign_pointer(counter->data, NULL);
+ call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ if (atomic_dec_and_mutex_lock(&counter->mmap_count,
+ &counter->mmap_mutex)) {
+ perf_mmap_data_free(counter);
+ mutex_unlock(&counter->mmap_mutex);
+ }
}
static struct vm_operations_struct perf_mmap_vmops = {
+ .open = perf_mmap_open,
+ .close = perf_mmap_close,
.fault = perf_mmap_fault,
};
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_counter *counter = file->private_data;
- unsigned long userpg;
+ unsigned long vma_size;
+ unsigned long nr_pages;
+ unsigned long locked, lock_limit;
+ int ret = 0;
if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
return -EINVAL;
- if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+ if (nr_pages == 0 || !is_power_of_2(nr_pages))
return -EINVAL;
- /*
- * For now, restrict to the case of a hardware counter
- * on the current task.
- */
- if (is_software_counter(counter) || counter->task != current)
+ if (vma_size != PAGE_SIZE * (1 + nr_pages))
return -EINVAL;
- userpg = counter->user_page;
- if (!userpg) {
- userpg = get_zeroed_page(GFP_KERNEL);
- mutex_lock(&counter->mutex);
- if (counter->user_page) {
- free_page(userpg);
- userpg = counter->user_page;
- } else {
- counter->user_page = userpg;
- }
- mutex_unlock(&counter->mutex);
- if (!userpg)
- return -ENOMEM;
- }
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ locked = vma_size >> PAGE_SHIFT;
+ locked += vma->vm_mm->locked_vm;
+
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
- perf_counter_update_userpage(counter);
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+ return -EPERM;
+
+ mutex_lock(&counter->mmap_mutex);
+ if (atomic_inc_not_zero(&counter->mmap_count))
+ goto out;
+
+ WARN_ON(counter->data);
+ ret = perf_mmap_data_alloc(counter, nr_pages);
+ if (!ret)
+ atomic_set(&counter->mmap_count, 1);
+out:
+ mutex_unlock(&counter->mmap_mutex);
vma->vm_flags &= ~VM_MAYWRITE;
vma->vm_flags |= VM_RESERVED;
vma->vm_ops = &perf_mmap_vmops;
- return 0;
+
+ return ret;
}
static const struct file_operations perf_fops = {
@@ -1434,30 +1402,94 @@ static const struct file_operations perf
* Output
*/
-static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+static int perf_output_write(struct perf_counter *counter, int nmi,
+ void *buf, ssize_t size)
{
- struct perf_data *irqdata = counter->irqdata;
+ struct perf_mmap_data *data;
+ unsigned int offset, head, nr;
+ unsigned int len;
+ int ret, wakeup;
- if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
- irqdata->overrun++;
- } else {
- u64 *p = (u64 *) &irqdata->data[irqdata->len];
+ rcu_read_lock();
+ ret = -ENOSPC;
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto out;
+
+ if (!data->nr_pages)
+ goto out;
+
+ ret = -EINVAL;
+ if (size > PAGE_SIZE)
+ goto out;
+
+ do {
+ offset = head = atomic_read(&data->head);
+ head += sizeof(u64);
+ } while (atomic_cmpxchg(&data->head, offset, head) != offset);
+
+ wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
- *p = data;
- irqdata->len += sizeof(u64);
+ nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
+ offset &= PAGE_SIZE - 1;
+
+ len = min_t(unsigned int, PAGE_SIZE - offset, size);
+ memcpy(data->data_pages[nr] + offset, buf, len);
+ size -= len;
+
+ if (size) {
+ nr = (nr + 1) & (data->nr_pages - 1);
+ memcpy(data->data_pages[nr], buf + len, size);
+ }
+
+ /*
+ * generate a poll() wakeup for every page boundary crossed
+ */
+ if (wakeup) {
+ __perf_counter_update_userpage(counter, data);
+ if (nmi) {
+ counter->wakeup_pending = 1;
+ set_perf_counter_pending();
+ } else
+ wake_up(&counter->waitq);
}
+ ret = 0;
+out:
+ rcu_read_unlock();
+
+ return ret;
}
-static void perf_counter_handle_group(struct perf_counter *counter)
+static void perf_output_simple(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs)
+{
+ u64 entry;
+
+ entry = instruction_pointer(regs);
+
+ perf_output_write(counter, nmi, &entry, sizeof(entry));
+}
+
+struct group_entry {
+ u64 event;
+ u64 counter;
+};
+
+static void perf_output_group(struct perf_counter *counter, int nmi)
{
struct perf_counter *leader, *sub;
leader = counter->group_leader;
list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ struct group_entry entry;
+
if (sub != counter)
sub->hw_ops->read(sub);
- perf_counter_store_irq(counter, sub->hw_event.config);
- perf_counter_store_irq(counter, atomic64_read(&sub->count));
+
+ entry.event = sub->hw_event.config;
+ entry.counter = atomic64_read(&sub->count);
+
+ perf_output_write(counter, nmi, &entry, sizeof(entry));
}
}
@@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_cou
return;
case PERF_RECORD_IRQ:
- perf_counter_store_irq(counter, instruction_pointer(regs));
+ perf_output_simple(counter, nmi, regs);
break;
case PERF_RECORD_GROUP:
- perf_counter_handle_group(counter);
+ perf_output_group(counter, nmi);
break;
}
-
- if (nmi) {
- counter->wakeup_pending = 1;
- set_perf_counter_pending();
- } else
- wake_up(&counter->waitq);
}
/*
@@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_h
INIT_LIST_HEAD(&counter->sibling_list);
init_waitqueue_head(&counter->waitq);
+ mutex_init(&counter->mmap_mutex);
+
INIT_LIST_HEAD(&counter->child_list);
- counter->irqdata = &counter->data[0];
- counter->usrdata = &counter->data[1];
counter->cpu = cpu;
counter->hw_event = *hw_event;
counter->wakeup_pending = 0;
Index: linux-2.6/arch/powerpc/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/perf_counter.c
+++ linux-2.6/arch/powerpc/kernel/perf_counter.c
@@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable)
atomic64_set(&counter->hw.prev_count, val);
counter->hw.idx = hwc_index[i] + 1;
write_pmc(counter->hw.idx, val);
- if (counter->user_page)
- perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(counter);
}
mb();
cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -574,8 +573,7 @@ static void power_perf_disable(struct pe
ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
write_pmc(counter->hw.idx, 0);
counter->hw.idx = 0;
- if (counter->user_page)
- perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(counter);
break;
}
}
@@ -702,8 +700,7 @@ static void record_and_restart(struct pe
write_pmc(counter->hw.idx, val);
atomic64_set(&counter->hw.prev_count, val);
atomic64_set(&counter->hw.period_left, left);
- if (counter->user_page)
- perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(counter);
/*
* Finally record data if requested.
--
* [PATCH 6/7] kerneltop: update to new syscall ABI
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra, Wu Fengguang
[-- Attachment #1: kerneltop-input-abi.patch --]
[-- Type: text/plain, Size: 11427 bytes --]
Update the kerneltop userspace tool to work with the latest syscall ABI.
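With the new ABI the separate type/raw/event_id fields collapse into
the single config word. A minimal sketch of what opening one counter
now looks like (a fragment in the style of the patched kerneltop;
EID() and the sys_perf_counter_open() wrapper as defined in the patch
below, with the parameter meaning kerneltop relies on):
struct perf_counter_hw_event hw_event;
int fd;
memset(&hw_event, 0, sizeof(hw_event));
hw_event.config      = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES);
hw_event.irq_period  = 100000;          /* sample period */
hw_event.record_type = PERF_RECORD_IRQ;
/* pid 0: the current task, cpu -1: any cpu, no group, no flags */
fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);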
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Wu Fengguang <fengguang.wu@intel.com>
---
kerneltop.c | 235 ++++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 157 insertions(+), 78 deletions(-)
Index: linux-2.6/kerneltop.c
===================================================================
--- linux-2.6.orig/kerneltop.c
+++ linux-2.6/kerneltop.c
@@ -87,20 +87,90 @@
#include <linux/unistd.h>
-#include "perfcounters.h"
+#include "include/linux/perf_counter.h"
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE 31
+#define PR_TASK_PERF_COUNTERS_ENABLE 32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock() \
+({ \
+ struct timespec ts; \
+ \
+ clock_gettime(CLOCK_MONOTONIC, &ts); \
+ ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int __u32;
+typedef unsigned long long __u64;
+typedef long long __s64;
+
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open 295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+asmlinkage int sys_perf_counter_open(
+ struct perf_counter_hw_event *hw_event_uptr __user,
+ pid_t pid,
+ int cpu,
+ int group_fd,
+ unsigned long flags)
+{
+ int ret;
+
+ ret = syscall(
+ __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+ if (ret < 0 && ret > -4096) {
+ errno = -ret;
+ ret = -1;
+ }
+#endif
+ return ret;
+}
+
#define MAX_COUNTERS 64
#define MAX_NR_CPUS 256
-#define DEF_PERFSTAT_EVENTS { -2, -5, -4, -3, 0, 1, 2, 3}
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
static int run_perfstat = 0;
static int system_wide = 0;
static int nr_counters = 0;
-static __s64 event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS;
-static int event_raw[MAX_COUNTERS];
+static __u64 event_id[MAX_COUNTERS] = {
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int default_interval = 100000;
static int event_count[MAX_COUNTERS];
static int fd[MAX_NR_CPUS][MAX_COUNTERS];
@@ -156,49 +226,63 @@ static char *sw_event_names[] = {
"pagefaults",
"context switches",
"CPU migrations",
+ "minor faults",
+ "major faults",
};
struct event_symbol {
- int event;
+ __u64 event;
char *symbol;
};
static struct event_symbol event_symbols[] = {
- {PERF_COUNT_CPU_CYCLES, "cpu-cycles", },
- {PERF_COUNT_CPU_CYCLES, "cycles", },
- {PERF_COUNT_INSTRUCTIONS, "instructions", },
- {PERF_COUNT_CACHE_REFERENCES, "cache-references", },
- {PERF_COUNT_CACHE_MISSES, "cache-misses", },
- {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", },
- {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", },
- {PERF_COUNT_BRANCH_MISSES, "branch-misses", },
- {PERF_COUNT_BUS_CYCLES, "bus-cycles", },
- {PERF_COUNT_CPU_CLOCK, "cpu-ticks", },
- {PERF_COUNT_CPU_CLOCK, "ticks", },
- {PERF_COUNT_TASK_CLOCK, "task-ticks", },
- {PERF_COUNT_PAGE_FAULTS, "page-faults", },
- {PERF_COUNT_PAGE_FAULTS, "faults", },
- {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", },
- {PERF_COUNT_CONTEXT_SWITCHES, "cs", },
- {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", },
- {PERF_COUNT_CPU_MIGRATIONS, "migrations", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
+
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
};
+#define __PERF_COUNTER_FIELD(config, name) \
+ ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
+
static void display_events_help(void)
{
unsigned int i;
- int e;
+ __u64 e;
printf(
" -e EVENT --event=EVENT # symbolic-name abbreviations");
- for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
- if (e != event_symbols[i].event) {
- e = event_symbols[i].event;
- printf(
- "\n %2d: %-20s", e, event_symbols[i].symbol);
- } else
- printf(" %s", event_symbols[i].symbol);
+ for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+ int type, id;
+
+ e = event_symbols[i].event;
+ type = PERF_COUNTER_TYPE(e);
+ id = PERF_COUNTER_ID(e);
+
+ printf("\n %d:%d: %-20s",
+ type, id, event_symbols[i].symbol);
}
printf("\n"
@@ -249,44 +333,51 @@ static void display_help(void)
exit(0);
}
-static int type_valid(int type)
-{
- if (type >= PERF_HW_EVENTS_MAX)
- return 0;
- if (type <= PERF_SW_EVENTS_MIN)
- return 0;
-
- return 1;
-}
-
static char *event_name(int ctr)
{
- __s64 type = event_id[ctr];
+ __u64 config = event_id[ctr];
+ int type = PERF_COUNTER_TYPE(config);
+ int id = PERF_COUNTER_ID(config);
static char buf[32];
- if (event_raw[ctr]) {
- sprintf(buf, "raw 0x%llx", (long long)type);
+ if (PERF_COUNTER_RAW(config)) {
+ sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
return buf;
}
- if (!type_valid(type))
- return "unknown";
- if (type >= 0)
- return hw_event_names[type];
+ switch (type) {
+ case PERF_TYPE_HARDWARE:
+ if (id < PERF_HW_EVENTS_MAX)
+ return hw_event_names[id];
+ return "unknown-hardware";
+
+ case PERF_TYPE_SOFTWARE:
+ if (id < PERF_SW_EVENTS_MAX)
+ return sw_event_names[id];
+ return "unknown-software";
+
+ default:
+ break;
+ }
- return sw_event_names[-type-1];
+ return "unknown";
}
/*
* Each event can have multiple symbolic names.
* Symbolic names are (almost) exactly matched.
*/
-static int match_event_symbols(char *str)
+static __u64 match_event_symbols(char *str)
{
+ __u64 config, id;
+ int type;
unsigned int i;
- if (isdigit(str[0]) || str[0] == '-')
- return atoi(str);
+ if (sscanf(str, "r%llx", &config) == 1)
+ return config | PERF_COUNTER_RAW_MASK;
+
+ if (sscanf(str, "%d:%llu", &type, &id) == 2)
+ return EID(type, id);
for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
if (!strncmp(str, event_symbols[i].symbol,
@@ -294,31 +385,22 @@ static int match_event_symbols(char *str
return event_symbols[i].event;
}
- return PERF_HW_EVENTS_MAX;
+ return ~0ULL;
}
static int parse_events(char *str)
{
- __s64 type;
- int raw;
+ __u64 config;
again:
if (nr_counters == MAX_COUNTERS)
return -1;
- raw = 0;
- if (*str == 'r') {
- raw = 1;
- ++str;
- type = strtol(str, NULL, 16);
- } else {
- type = match_event_symbols(str);
- if (!type_valid(type))
- return -1;
- }
+ config = match_event_symbols(str);
+ if (config == ~0ULL)
+ return -1;
- event_id[nr_counters] = type;
- event_raw[nr_counters] = raw;
+ event_id[nr_counters] = config;
nr_counters++;
str = strstr(str, ",");
@@ -342,8 +424,7 @@ static void create_perfstat_counter(int
struct perf_counter_hw_event hw_event;
memset(&hw_event, 0, sizeof(hw_event));
- hw_event.type = event_id[counter];
- hw_event.raw = event_raw[counter];
+ hw_event.config = event_id[counter];
hw_event.record_type = PERF_RECORD_SIMPLE;
hw_event.nmi = 0;
@@ -428,7 +509,7 @@ int do_perfstat(int argc, char *argv[])
count += single_count;
}
- if (!event_raw[counter] &&
+ if (!PERF_COUNTER_RAW(event_id[counter]) &&
(event_id[counter] == PERF_COUNT_CPU_CLOCK ||
event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
@@ -911,7 +992,7 @@ static void record_ip(uint64_t ip, int c
assert(left <= middle && middle <= right);
if (!(left <= ip && ip <= right)) {
printf(" left: %016lx\n", left);
- printf(" ip: %016lx\n", ip);
+ printf(" ip: %016llx\n", ip);
printf("right: %016lx\n", right);
}
assert(left <= ip && ip <= right);
@@ -983,7 +1064,7 @@ static void process_options(int argc, ch
switch (c) {
case 'a': system_wide = 1; break;
- case 'c': event_count[nr_counters] = atoi(optarg); break;
+ case 'c': default_interval = atoi(optarg); break;
case 'C':
/* CPU and PID are mutually exclusive */
if (tid != -1) {
@@ -1032,10 +1113,7 @@ static void process_options(int argc, ch
if (event_count[counter])
continue;
- if (event_id[counter] < PERF_HW_EVENTS_MAX)
- event_count[counter] = default_count[event_id[counter]];
- else
- event_count[counter] = 100000;
+ event_count[counter] = default_interval;
}
}
@@ -1070,12 +1148,13 @@ int main(int argc, char *argv[])
cpu = i;
memset(&hw_event, 0, sizeof(hw_event));
- hw_event.type = event_id[counter];
- hw_event.raw = event_raw[counter];
+ hw_event.config = event_id[counter];
hw_event.irq_period = event_count[counter];
hw_event.record_type = PERF_RECORD_IRQ;
hw_event.nmi = nmi;
+ printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
+
fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
if (fd[i][counter] < 0) {
--
* [PATCH 7/7] kerneltop: use mmap() output
From: Peter Zijlstra @ 2009-03-23 17:22 UTC
To: Paul Mackerras, Ingo Molnar; +Cc: linux-kernel, Peter Zijlstra, Wu Fengguang
[-- Attachment #1: kerneltop-mmap.patch --]
[-- Type: text/plain, Size: 4459 bytes --]
Update kerneltop to use the mmap() output to gather overflow information.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Wu Fengguang <fengguang.wu@intel.com>
---
kerneltop.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 79 insertions(+), 14 deletions(-)
Index: linux-2.6/kerneltop.c
===================================================================
--- linux-2.6.orig/kerneltop.c
+++ linux-2.6/kerneltop.c
@@ -84,6 +84,7 @@
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/uio.h>
+#include <sys/mman.h>
#include <linux/unistd.h>
@@ -119,17 +120,25 @@ typedef long long __s64;
#ifdef __x86_64__
-# define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 295
+#define rmb() asm volatile("lfence" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#endif
#ifdef __i386__
-# define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 333
+#define rmb() asm volatile("lfence" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#endif
#ifdef __powerpc__
#define __NR_perf_counter_open 319
+#define rmb() asm volatile ("sync" ::: "memory")
+#define cpu_relax() asm volatile ("" ::: "memory");
#endif
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
asmlinkage int sys_perf_counter_open(
struct perf_counter_hw_event *hw_event_uptr __user,
pid_t pid,
@@ -181,6 +190,7 @@ static int profile_cpu = -1;
static int nr_cpus = 0;
static int nmi = 1;
static int group = 0;
+static unsigned int page_size;
static char *vmlinux;
@@ -1117,16 +1127,68 @@ static void process_options(int argc, ch
}
}
+struct mmap_data {
+ int counter;
+ void *base;
+ unsigned int mask;
+ unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+ struct perf_counter_mmap_page *pc = md->base;
+ unsigned int seq, head;
+
+repeat:
+ rmb();
+ seq = pc->lock;
+
+ if (unlikely(seq & 1)) {
+ cpu_relax();
+ goto repeat;
+ }
+
+ head = pc->data_head;
+
+ rmb();
+ if (pc->lock != seq)
+ goto repeat;
+
+ return head;
+}
+
+static void mmap_read(struct mmap_data *md)
+{
+ unsigned int head = mmap_read_head(md);
+ unsigned int old = md->prev;
+ unsigned char *data = md->base + page_size;
+
+ if (head - old > md->mask) {
+ printf("ERROR: failed to keep up with mmap data\n");
+ exit(-1);
+ }
+
+ for (; old != head;) {
+ __u64 *ptr = (__u64 *)&data[old & md->mask];
+ old += sizeof(__u64);
+
+ process_event(*ptr, md->counter);
+ }
+
+ md->prev = old;
+}
+
int main(int argc, char *argv[])
{
struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+ struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
struct perf_counter_hw_event hw_event;
int i, counter, group_fd;
unsigned int cpu;
- uint64_t ip;
- ssize_t res;
int ret;
+ page_size = sysconf(_SC_PAGE_SIZE);
+
process_options(argc, argv);
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -1153,8 +1215,6 @@ int main(int argc, char *argv[])
hw_event.record_type = PERF_RECORD_IRQ;
hw_event.nmi = nmi;
- printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
-
fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
if (fd[i][counter] < 0) {
@@ -1174,6 +1234,17 @@ int main(int argc, char *argv[])
event_array[i][counter].fd = fd[i][counter];
event_array[i][counter].events = POLLIN;
+
+ mmap_array[i][counter].counter = counter;
+ mmap_array[i][counter].prev = 0;
+ mmap_array[i][counter].mask = 2*page_size - 1;
+ mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+ PROT_READ, MAP_SHARED, fd[i][counter], 0);
+ if (mmap_array[i][counter].base == MAP_FAILED) {
+ printf("kerneltop error: failed to mmap with %d (%s)\n",
+ errno, strerror(errno));
+ exit(-1);
+ }
}
}
@@ -1188,14 +1259,8 @@ int main(int argc, char *argv[])
int hits = events;
for (i = 0; i < nr_cpus; i++) {
- for (counter = 0; counter < nr_counters; counter++) {
- res = read(fd[i][counter], (char *) &ip, sizeof(ip));
- if (res > 0) {
- assert(res == sizeof(ip));
-
- process_event(ip, counter);
- }
- }
+ for (counter = 0; counter < nr_counters; counter++)
+ mmap_read(&mmap_array[i][counter]);
}
if (time(NULL) >= last_refresh + delay_secs) {
--
* [tip:perfcounters/core] perf_counter: remove the event config bitfields
From: Peter Zijlstra @ 2009-03-23 20:56 UTC
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, mingo
Commit-ID: 47211edfdf473ff304450dfaa2c949994a19f698
Gitweb: http://git.kernel.org/tip/47211edfdf473ff304450dfaa2c949994a19f698
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Mon, 23 Mar 2009 18:22:06 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:45:08 +0100
perf_counter: remove the event config bitfields
Since the bitfields turned into a bit of a mess, remove them and rely on
good old masks.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090323172417.059499915@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/powerpc/kernel/perf_counter.c | 6 +-
arch/x86/kernel/cpu/perf_counter.c | 8 ++--
include/linux/perf_counter.h | 74 ++++++++++++++++++++++++------------
kernel/perf_counter.c | 22 ++++++----
4 files changed, 70 insertions(+), 40 deletions(-)
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6413d9c..d056515 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter)
return NULL;
if ((s64)counter->hw_event.irq_period < 0)
return NULL;
- if (!counter->hw_event.raw_type) {
- ev = counter->hw_event.event_id;
+ if (!perf_event_raw(&counter->hw_event)) {
+ ev = perf_event_id(&counter->hw_event);
if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
return NULL;
ev = ppmu->generic_events[ev];
} else {
- ev = counter->hw_event.raw_event_id;
+ ev = perf_event_config(&counter->hw_event);
}
counter->hw.config_base = ev;
counter->hw.idx = 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 902282d..3f95b0c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
/*
* Raw event type provide the config in the event structure
*/
- if (hw_event->raw_type) {
- hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
+ if (perf_event_raw(hw_event)) {
+ hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
} else {
- if (hw_event->event_id >= pmc_ops->max_events)
+ if (perf_event_id(hw_event) >= pmc_ops->max_events)
return -EINVAL;
/*
* The generic map:
*/
- hwc->config |= pmc_ops->event_map(hw_event->event_id);
+ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
}
counter->wakeup_pending = 0;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 98f5990..56099e5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -82,32 +82,37 @@ enum perf_counter_record_type {
PERF_RECORD_GROUP = 2,
};
+#define __PERF_COUNTER_MASK(name) \
+ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
+ PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW_BITS 1
+#define PERF_COUNTER_RAW_SHIFT 63
+#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW)
+
+#define PERF_COUNTER_CONFIG_BITS 63
+#define PERF_COUNTER_CONFIG_SHIFT 0
+#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG)
+
+#define PERF_COUNTER_TYPE_BITS 7
+#define PERF_COUNTER_TYPE_SHIFT 56
+#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE)
+
+#define PERF_COUNTER_EVENT_BITS 56
+#define PERF_COUNTER_EVENT_SHIFT 0
+#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
+
/*
* Hardware event to monitor via a performance monitoring counter:
*/
struct perf_counter_hw_event {
- union {
-#ifndef __BIG_ENDIAN_BITFIELD
- struct {
- __u64 event_id : 56,
- type : 8;
- };
- struct {
- __u64 raw_event_id : 63,
- raw_type : 1;
- };
-#else
- struct {
- __u64 type : 8,
- event_id : 56;
- };
- struct {
- __u64 raw_type : 1,
- raw_event_id : 63;
- };
-#endif /* __BIT_ENDIAN_BITFIELD */
- __u64 event_config;
- };
+ /*
+ * The MSB of the config word signifies if the rest contains cpu
+ * specific (raw) counter configuration data, if unset, the next
+ * 7 bits are an event type and the rest of the bits are the event
+ * identifier.
+ */
+ __u64 config;
__u64 irq_period;
__u64 record_type;
@@ -157,6 +162,27 @@ struct perf_counter_hw_event {
struct task_struct;
+static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
+{
+ return hw_event->config & PERF_COUNTER_RAW_MASK;
+}
+
+static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
+{
+ return hw_event->config & PERF_COUNTER_CONFIG_MASK;
+}
+
+static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
+{
+ return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
+ PERF_COUNTER_TYPE_SHIFT;
+}
+
+static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
+{
+ return hw_event->config & PERF_COUNTER_EVENT_MASK;
+}
+
/**
* struct hw_perf_counter - performance counter hardware details:
*/
@@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter,
*/
static inline int is_software_counter(struct perf_counter *counter)
{
- return !counter->hw_event.raw_type &&
- counter->hw_event.type != PERF_TYPE_HARDWARE;
+ return !perf_event_raw(&counter->hw_event) &&
+ perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
}
extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f054b8c..ca14fc4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter)
list_for_each_entry(sub, &leader->sibling_list, list_entry) {
if (sub != counter)
sub->hw_ops->read(sub);
- perf_counter_store_irq(counter, sub->hw_event.event_config);
+ perf_counter_store_irq(counter, sub->hw_event.config);
perf_counter_store_irq(counter, atomic64_read(&sub->count));
}
}
@@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
return 0;
- if (counter->hw_event.raw_type)
+ if (perf_event_raw(&counter->hw_event))
return 0;
- if (counter->hw_event.type != type)
+ if (perf_event_type(&counter->hw_event) != type)
return 0;
- if (counter->hw_event.event_id != event)
+ if (perf_event_id(&counter->hw_event) != event)
return 0;
if (counter->hw_event.exclude_user && user_mode(regs))
@@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int);
static void tp_perf_counter_destroy(struct perf_counter *counter)
{
- ftrace_profile_disable(counter->hw_event.event_id);
+ ftrace_profile_disable(perf_event_id(&counter->hw_event));
}
static const struct hw_perf_counter_ops *
tp_perf_counter_init(struct perf_counter *counter)
{
- int event_id = counter->hw_event.event_id;
+ int event_id = perf_event_id(&counter->hw_event);
int ret;
ret = ftrace_profile_enable(event_id);
@@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter)
* to be kernel events, and page faults are never hypervisor
* events.
*/
- switch (counter->hw_event.event_id) {
+ switch (perf_event_id(&counter->hw_event)) {
case PERF_COUNT_CPU_CLOCK:
hw_ops = &perf_ops_cpu_clock;
@@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
hw_ops = NULL;
- if (hw_event->raw_type)
+ if (perf_event_raw(hw_event)) {
hw_ops = hw_perf_counter_init(counter);
- else switch (hw_event->type) {
+ goto done;
+ }
+
+ switch (perf_event_type(hw_event)) {
case PERF_TYPE_HARDWARE:
hw_ops = hw_perf_counter_init(counter);
break;
@@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
kfree(counter);
return NULL;
}
+done:
counter->hw_ops = hw_ops;
return counter;
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] perf_counter: avoid recursion
2009-03-23 17:22 ` [PATCH 2/7] perf_counter: avoid recursion Peter Zijlstra
@ 2009-03-23 20:56 ` Peter Zijlstra
0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2009-03-23 20:56 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, mingo
Commit-ID: 4d0ec5f64b60057b395f2d314189ccde71d2d0bd
Gitweb: http://git.kernel.org/tip/4d0ec5f64b60057b395f2d314189ccde71d2d0bd
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Mon, 23 Mar 2009 18:22:07 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:45:09 +0100
perf_counter: avoid recursion
Tracepoint events like lock_acquire and software counters like
pagefaults can recurse into the perf counter code again; avoid that.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090323172417.152096433@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/perf_counter.h | 7 +++++++
kernel/perf_counter.c | 26 ++++++++++++++++++++++++++
2 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 56099e5..18dc17d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -328,6 +328,13 @@ struct perf_cpu_context {
int active_oncpu;
int max_pertask;
int exclusive;
+
+ /*
+ * Recursion avoidance:
+ *
+ * task, softirq, irq, nmi context
+ */
+ int recursion[4];
};
/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ca14fc4..ce34bff 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/rculist.h>
+#include <linux/hardirq.h>
#include <asm/irq_regs.h>
@@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
rcu_read_unlock();
}
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+ if (in_nmi())
+ return &cpuctx->recursion[3];
+
+ if (in_irq())
+ return &cpuctx->recursion[2];
+
+ if (in_softirq())
+ return &cpuctx->recursion[1];
+
+ return &cpuctx->recursion[0];
+}
+
static void __perf_swcounter_event(enum perf_event_types type, u32 event,
u64 nr, int nmi, struct pt_regs *regs)
{
struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ int *recursion = perf_swcounter_recursion_context(cpuctx);
+
+ if (*recursion)
+ goto out;
+
+ (*recursion)++;
+ barrier();
perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
if (cpuctx->task_ctx) {
@@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
nr, nmi, regs);
}
+ barrier();
+ (*recursion)--;
+
+out:
put_cpu_var(perf_cpu_context);
}
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] perf_counter: add an mmap method to allow userspace to read hardware counters
2009-03-23 17:22 ` [PATCH 3/7] perf_counter: add an mmap method to allow userspace to read hardware counters Peter Zijlstra
@ 2009-03-23 20:56 ` Paul Mackerras
0 siblings, 0 replies; 18+ messages in thread
From: Paul Mackerras @ 2009-03-23 20:56 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, mingo
Commit-ID: 36e6cd42be5579128495e7d9e678638f4945de6e
Gitweb: http://git.kernel.org/tip/36e6cd42be5579128495e7d9e678638f4945de6e
Author: Paul Mackerras <paulus@samba.org>
AuthorDate: Mon, 23 Mar 2009 18:22:08 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:45:09 +0100
perf_counter: add an mmap method to allow userspace to read hardware counters
Impact: new feature giving performance improvement
This adds the ability for userspace to do an mmap on a hardware counter
fd and get access to a read-only page that contains the information
needed to translate a hardware counter value to the full 64-bit
counter value that would be returned by a read on the fd. This is
useful on architectures that allow user programs to read the hardware
counters, such as PowerPC.
The mmap will only succeed if the counter is a hardware counter
monitoring the current process.
On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter
and translate it to the full 64-bit value in about 30ns using the
mmapped page, compared to about 830ns for the read syscall on the
counter, so this does give a significant performance improvement.
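For reference, a user-space read pairs the lock field (an even/odd
seqcount) with the hardware counter read; an untested sketch, where
read_pmc() stands in for the architecture's user-level counter-read
instruction and the index convention is per-arch:

  static __u64 read_counter(volatile struct perf_counter_mmap_page *pg)
  {
          __u32 seq;
          __u64 count;

          do {
                  seq = pg->lock;
                  rmb();
                  count = pg->offset;
                  if (pg->index)  /* 0: not currently on a hardware counter */
                          count += read_pmc(pg->index);
                  rmb();
          } while (pg->lock != seq || (seq & 1));

          return count;
  }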
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090323172417.297057964@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/powerpc/kernel/perf_counter.c | 6 +++
include/linux/perf_counter.h | 15 +++++++
kernel/perf_counter.c | 76 ++++++++++++++++++++++++++++++++++++
3 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d056515..e434928 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable)
atomic64_set(&counter->hw.prev_count, val);
counter->hw.idx = hwc_index[i] + 1;
write_pmc(counter->hw.idx, val);
+ if (counter->user_page)
+ perf_counter_update_userpage(counter);
}
mb();
cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter)
ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
write_pmc(counter->hw.idx, 0);
counter->hw.idx = 0;
+ if (counter->user_page)
+ perf_counter_update_userpage(counter);
break;
}
}
@@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
write_pmc(counter->hw.idx, val);
atomic64_set(&counter->hw.prev_count, val);
atomic64_set(&counter->hw.period_left, left);
+ if (counter->user_page)
+ perf_counter_update_userpage(counter);
/*
* Finally record data if requested.
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 18dc17d..40b324e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -143,6 +143,17 @@ struct perf_counter_hw_event {
#define PERF_COUNTER_IOC_ENABLE _IO('$', 0)
#define PERF_COUNTER_IOC_DISABLE _IO('$', 1)
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+ __u32 version; /* version number of this structure */
+ __u32 compat_version; /* lowest version this is compat with */
+ __u32 lock; /* seqlock for synchronization */
+ __u32 index; /* hardware counter identifier */
+ __s64 offset; /* add to hardware counter value */
+};
+
#ifdef __KERNEL__
/*
* Kernel-internal data types and definitions:
@@ -278,6 +289,9 @@ struct perf_counter {
int oncpu;
int cpu;
+ /* pointer to page shared with userspace via mmap */
+ unsigned long user_page;
+
/* read() / irq related data */
wait_queue_head_t waitq;
/* optional: for NMIs */
@@ -361,6 +375,7 @@ extern int perf_counter_task_enable(void);
extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
struct perf_cpu_context *cpuctx,
struct perf_counter_context *ctx, int cpu);
+extern void perf_counter_update_userpage(struct perf_counter *counter);
extern void perf_counter_output(struct perf_counter *counter,
int nmi, struct pt_regs *regs);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ce34bff..d9cfd90 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1177,6 +1177,7 @@ static int perf_release(struct inode *inode, struct file *file)
mutex_unlock(&counter->mutex);
mutex_unlock(&ctx->mutex);
+ free_page(counter->user_page);
free_counter(counter);
put_context(ctx);
@@ -1346,12 +1347,87 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return err;
}
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+ struct perf_counter_mmap_page *userpg;
+
+ if (!counter->user_page)
+ return;
+ userpg = (struct perf_counter_mmap_page *) counter->user_page;
+
+ ++userpg->lock;
+ smp_wmb();
+ userpg->index = counter->hw.idx;
+ userpg->offset = atomic64_read(&counter->count);
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ userpg->offset -= atomic64_read(&counter->hw.prev_count);
+ smp_wmb();
+ ++userpg->lock;
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ if (!counter->user_page)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = virt_to_page(counter->user_page);
+ get_page(vmf->page);
+ return 0;
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+ .fault = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = file->private_data;
+ unsigned long userpg;
+
+ if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+ return -EINVAL;
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ /*
+ * For now, restrict to the case of a hardware counter
+ * on the current task.
+ */
+ if (is_software_counter(counter) || counter->task != current)
+ return -EINVAL;
+
+ userpg = counter->user_page;
+ if (!userpg) {
+ userpg = get_zeroed_page(GFP_KERNEL);
+ mutex_lock(&counter->mutex);
+ if (counter->user_page) {
+ free_page(userpg);
+ userpg = counter->user_page;
+ } else {
+ counter->user_page = userpg;
+ }
+ mutex_unlock(&counter->mutex);
+ if (!userpg)
+ return -ENOMEM;
+ }
+
+ perf_counter_update_userpage(counter);
+
+ vma->vm_flags &= ~VM_MAYWRITE;
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_ops = &perf_mmap_vmops;
+ return 0;
+}
+
static const struct file_operations perf_fops = {
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
.unlocked_ioctl = perf_ioctl,
.compat_ioctl = perf_ioctl,
+ .mmap = perf_mmap,
};
/*
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] mutex: add atomic_dec_and_mutex_lock()
2009-03-23 17:22 ` [PATCH 4/7] mutex: add atomic_dec_and_mutex_lock] Peter Zijlstra
@ 2009-03-23 20:56 ` Eric Paris
2009-04-02 0:42 ` [tip:perfcounters/core] mutex: drop "inline" from mutex_lock() inside kernel/mutex.c H. Peter Anvin
0 siblings, 1 reply; 18+ messages in thread
From: Eric Paris @ 2009-03-23 20:56 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, eparis, tglx, mingo
Commit-ID: 3a2d367d9aabac486ac4444c6c7ec7a1dab16267
Gitweb: http://git.kernel.org/tip/3a2d367d9aabac486ac4444c6c7ec7a1dab16267
Author: Eric Paris <eparis@redhat.com>
AuthorDate: Mon, 23 Mar 2009 18:22:09 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:45:10 +0100
mutex: add atomic_dec_and_mutex_lock()
Much like the atomic_dec_and_lock() function, in which we take and hold
a spin_lock if we drop the atomic to 0, this function takes and holds
the mutex if we dec the atomic to 0.
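A minimal usage sketch (hypothetical object; perf_mmap_close() later in
this series becomes the first in-tree user):

  static void put_object(struct object *obj)
  {
          /* only the final put takes obj->lock and tears the object down */
          if (atomic_dec_and_mutex_lock(&obj->refcount, &obj->lock)) {
                  object_teardown(obj);
                  mutex_unlock(&obj->lock);
          }
  }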
Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090323172417.410913479@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/mutex.h | 23 +++++++++++++++++++++++
1 files changed, 23 insertions(+), 0 deletions(-)
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7..93054fc 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
extern int mutex_trylock(struct mutex *lock);
extern void mutex_unlock(struct mutex *lock);
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+
#endif
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] perf_counter: new output ABI - part 1
2009-03-23 17:22 ` [PATCH 5/7] perf_counter: new output ABI - part 1 Peter Zijlstra
@ 2009-03-23 20:56 ` Peter Zijlstra
0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2009-03-23 20:56 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx, mingo
Commit-ID: 6329237a583dd1d6ef9a8ebfe87eb59cdde26b2a
Gitweb: http://git.kernel.org/tip/6329237a583dd1d6ef9a8ebfe87eb59cdde26b2a
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Mon, 23 Mar 2009 18:22:10 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:45:10 +0100
perf_counter: new output ABI - part 1
Impact: Rework the perfcounter output ABI
Use sys_read() only for instant data and provide mmap() output for all
async overflow data.
The first mmap() determines the size of the output buffer. The mmap()
size must be PAGE_SIZE * (1 + pages), where pages must be a power of 2
or 0. Further mmap()s of the same fd must have the same size. Once all
maps are gone, you can again mmap() with a new size.
In case of 0 extra pages there is no data output and the first page
contains only metadata.
When there are data pages, a poll() event will be generated for each
full page of data. Furthermore, the output is circular. This means
that although 1 page is a valid configuration, it's useless, since
we'll start overwriting it the instant we report a full page.
Future work will focus on the output format (currently maintained)
where we'll likely want each entry denoted by a header which includes a
type and length.
Further future work will allow splicing the fd, which would also carry
the async overflow data -- splice() would be mutually exclusive with
mmap() of the data.
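Concretely, user space maps 1+2^n pages, read-only and shared; a sketch
(kerneltop below uses n = 1, i.e. 3*page_size):

  int n = 3;  /* 2^n = 8 data pages; must be a power of 2 (or 0) */
  size_t len = (size_t)(1 + (1 << n)) * page_size;
  void *base = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);

  if (base == MAP_FAILED)
          perror("mmap");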
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090323172417.470536358@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/powerpc/kernel/perf_counter.c | 9 +-
include/linux/perf_counter.h | 36 ++--
kernel/perf_counter.c | 464 +++++++++++++++++++-----------------
3 files changed, 263 insertions(+), 246 deletions(-)
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index e434928..d48596a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable)
atomic64_set(&counter->hw.prev_count, val);
counter->hw.idx = hwc_index[i] + 1;
write_pmc(counter->hw.idx, val);
- if (counter->user_page)
- perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(counter);
}
mb();
cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter)
ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
write_pmc(counter->hw.idx, 0);
counter->hw.idx = 0;
- if (counter->user_page)
- perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(counter);
break;
}
}
@@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
write_pmc(counter->hw.idx, val);
atomic64_set(&counter->hw.prev_count, val);
atomic64_set(&counter->hw.period_left, left);
- if (counter->user_page)
- perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(counter);
/*
* Finally record data if requested.
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 40b324e..2b5e66d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -152,6 +152,8 @@ struct perf_counter_mmap_page {
__u32 lock; /* seqlock for synchronization */
__u32 index; /* hardware counter identifier */
__s64 offset; /* add to hardware counter value */
+
+ __u32 data_head; /* head in the data section */
};
#ifdef __KERNEL__
@@ -218,21 +220,6 @@ struct hw_perf_counter {
#endif
};
-/*
- * Hardcoded buffer length limit for now, for IRQ-fed events:
- */
-#define PERF_DATA_BUFLEN 2048
-
-/**
- * struct perf_data - performance counter IRQ data sampling ...
- */
-struct perf_data {
- int len;
- int rd_idx;
- int overrun;
- u8 data[PERF_DATA_BUFLEN];
-};
-
struct perf_counter;
/**
@@ -256,6 +243,14 @@ enum perf_counter_active_state {
struct file;
+struct perf_mmap_data {
+ struct rcu_head rcu_head;
+ int nr_pages;
+ atomic_t head;
+ struct perf_counter_mmap_page *user_page;
+ void *data_pages[0];
+};
+
/**
* struct perf_counter - performance counter kernel representation:
*/
@@ -289,16 +284,15 @@ struct perf_counter {
int oncpu;
int cpu;
- /* pointer to page shared with userspace via mmap */
- unsigned long user_page;
+ /* mmap bits */
+ struct mutex mmap_mutex;
+ atomic_t mmap_count;
+ struct perf_mmap_data *data;
- /* read() / irq related data */
+ /* poll related */
wait_queue_head_t waitq;
/* optional: for NMIs */
int wakeup_pending;
- struct perf_data *irqdata;
- struct perf_data *usrdata;
- struct perf_data data[2];
void (*destroy)(struct perf_counter *);
struct rcu_head rcu_head;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d9cfd90..0dfe910 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4,7 +4,8 @@
* Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
*
- * For licencing details see kernel-base/COPYING
+ *
+ * For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
@@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter)
return atomic64_read(&counter->count);
}
-/*
- * Cross CPU call to switch performance data pointers
- */
-static void __perf_switch_irq_data(void *info)
-{
- struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- struct perf_counter *counter = info;
- struct perf_counter_context *ctx = counter->ctx;
- struct perf_data *oldirqdata = counter->irqdata;
-
- /*
- * If this is a task context, we need to check whether it is
- * the current task context of this cpu. If not it has been
- * scheduled out before the smp call arrived.
- */
- if (ctx->task) {
- if (cpuctx->task_ctx != ctx)
- return;
- spin_lock(&ctx->lock);
- }
-
- /* Change the pointer NMI safe */
- atomic_long_set((atomic_long_t *)&counter->irqdata,
- (unsigned long) counter->usrdata);
- counter->usrdata = oldirqdata;
-
- if (ctx->task)
- spin_unlock(&ctx->lock);
-}
-
-static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
-{
- struct perf_counter_context *ctx = counter->ctx;
- struct perf_data *oldirqdata = counter->irqdata;
- struct task_struct *task = ctx->task;
-
- if (!task) {
- smp_call_function_single(counter->cpu,
- __perf_switch_irq_data,
- counter, 1);
- return counter->usrdata;
- }
-
-retry:
- spin_lock_irq(&ctx->lock);
- if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
- counter->irqdata = counter->usrdata;
- counter->usrdata = oldirqdata;
- spin_unlock_irq(&ctx->lock);
- return oldirqdata;
- }
- spin_unlock_irq(&ctx->lock);
- task_oncpu_function_call(task, __perf_switch_irq_data, counter);
- /* Might have failed, because task was scheduled out */
- if (counter->irqdata == oldirqdata)
- goto retry;
-
- return counter->usrdata;
-}
-
static void put_context(struct perf_counter_context *ctx)
{
if (ctx->task)
@@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file)
mutex_unlock(&counter->mutex);
mutex_unlock(&ctx->mutex);
- free_page(counter->user_page);
free_counter(counter);
put_context(ctx);
@@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
u64 cntval;
- if (count != sizeof(cntval))
+ if (count < sizeof(cntval))
return -EINVAL;
/*
@@ -1211,121 +1151,20 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
}
static ssize_t
-perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
-{
- if (!usrdata->len)
- return 0;
-
- count = min(count, (size_t)usrdata->len);
- if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
- return -EFAULT;
-
- /* Adjust the counters */
- usrdata->len -= count;
- if (!usrdata->len)
- usrdata->rd_idx = 0;
- else
- usrdata->rd_idx += count;
-
- return count;
-}
-
-static ssize_t
-perf_read_irq_data(struct perf_counter *counter,
- char __user *buf,
- size_t count,
- int nonblocking)
-{
- struct perf_data *irqdata, *usrdata;
- DECLARE_WAITQUEUE(wait, current);
- ssize_t res, res2;
-
- irqdata = counter->irqdata;
- usrdata = counter->usrdata;
-
- if (usrdata->len + irqdata->len >= count)
- goto read_pending;
-
- if (nonblocking)
- return -EAGAIN;
-
- spin_lock_irq(&counter->waitq.lock);
- __add_wait_queue(&counter->waitq, &wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (usrdata->len + irqdata->len >= count)
- break;
-
- if (signal_pending(current))
- break;
-
- if (counter->state == PERF_COUNTER_STATE_ERROR)
- break;
-
- spin_unlock_irq(&counter->waitq.lock);
- schedule();
- spin_lock_irq(&counter->waitq.lock);
- }
- __remove_wait_queue(&counter->waitq, &wait);
- __set_current_state(TASK_RUNNING);
- spin_unlock_irq(&counter->waitq.lock);
-
- if (usrdata->len + irqdata->len < count &&
- counter->state != PERF_COUNTER_STATE_ERROR)
- return -ERESTARTSYS;
-read_pending:
- mutex_lock(&counter->mutex);
-
- /* Drain pending data first: */
- res = perf_copy_usrdata(usrdata, buf, count);
- if (res < 0 || res == count)
- goto out;
-
- /* Switch irq buffer: */
- usrdata = perf_switch_irq_data(counter);
- res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
- if (res2 < 0) {
- if (!res)
- res = -EFAULT;
- } else {
- res += res2;
- }
-out:
- mutex_unlock(&counter->mutex);
-
- return res;
-}
-
-static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct perf_counter *counter = file->private_data;
- switch (counter->hw_event.record_type) {
- case PERF_RECORD_SIMPLE:
- return perf_read_hw(counter, buf, count);
-
- case PERF_RECORD_IRQ:
- case PERF_RECORD_GROUP:
- return perf_read_irq_data(counter, buf, count,
- file->f_flags & O_NONBLOCK);
- }
- return -EINVAL;
+ return perf_read_hw(counter, buf, count);
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_counter *counter = file->private_data;
- unsigned int events = 0;
- unsigned long flags;
+ unsigned int events = POLLIN;
poll_wait(file, &counter->waitq, wait);
- spin_lock_irqsave(&counter->waitq.lock, flags);
- if (counter->usrdata->len || counter->irqdata->len)
- events |= POLLIN;
- spin_unlock_irqrestore(&counter->waitq.lock, flags);
-
return events;
}
@@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return err;
}
-void perf_counter_update_userpage(struct perf_counter *counter)
+static void __perf_counter_update_userpage(struct perf_counter *counter,
+ struct perf_mmap_data *data)
{
- struct perf_counter_mmap_page *userpg;
-
- if (!counter->user_page)
- return;
- userpg = (struct perf_counter_mmap_page *) counter->user_page;
+ struct perf_counter_mmap_page *userpg = data->user_page;
+ /*
+ * Disable preemption so as to not let the corresponding user-space
+ * spin too long if we get preempted.
+ */
+ preempt_disable();
++userpg->lock;
smp_wmb();
userpg->index = counter->hw.idx;
userpg->offset = atomic64_read(&counter->count);
if (counter->state == PERF_COUNTER_STATE_ACTIVE)
userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+ userpg->data_head = atomic_read(&data->head);
smp_wmb();
++userpg->lock;
+ preempt_enable();
+}
+
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+ struct perf_mmap_data *data;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (data)
+ __perf_counter_update_userpage(counter, data);
+ rcu_read_unlock();
}
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct perf_counter *counter = vma->vm_file->private_data;
+ struct perf_mmap_data *data;
+ int ret = VM_FAULT_SIGBUS;
- if (!counter->user_page)
- return VM_FAULT_SIGBUS;
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto unlock;
+
+ if (vmf->pgoff == 0) {
+ vmf->page = virt_to_page(data->user_page);
+ } else {
+ int nr = vmf->pgoff - 1;
- vmf->page = virt_to_page(counter->user_page);
+ if ((unsigned)nr > data->nr_pages)
+ goto unlock;
+
+ vmf->page = virt_to_page(data->data_pages[nr]);
+ }
get_page(vmf->page);
+ ret = 0;
+unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+ struct perf_mmap_data *data;
+ unsigned long size;
+ int i;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ size = sizeof(struct perf_mmap_data);
+ size += nr_pages * sizeof(void *);
+
+ data = kzalloc(size, GFP_KERNEL);
+ if (!data)
+ goto fail;
+
+ data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->user_page)
+ goto fail_user_page;
+
+ for (i = 0; i < nr_pages; i++) {
+ data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->data_pages[i])
+ goto fail_data_pages;
+ }
+
+ data->nr_pages = nr_pages;
+
+ rcu_assign_pointer(counter->data, data);
+
return 0;
+
+fail_data_pages:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)data->data_pages[i]);
+
+ free_page((unsigned long)data->user_page);
+
+fail_user_page:
+ kfree(data);
+
+fail:
+ return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+ struct perf_mmap_data *data = container_of(rcu_head,
+ struct perf_mmap_data, rcu_head);
+ int i;
+
+ free_page((unsigned long)data->user_page);
+ for (i = 0; i < data->nr_pages; i++)
+ free_page((unsigned long)data->data_pages[i]);
+ kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+ struct perf_mmap_data *data = counter->data;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ rcu_assign_pointer(counter->data, NULL);
+ call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ if (atomic_dec_and_mutex_lock(&counter->mmap_count,
+ &counter->mmap_mutex)) {
+ perf_mmap_data_free(counter);
+ mutex_unlock(&counter->mmap_mutex);
+ }
}
static struct vm_operations_struct perf_mmap_vmops = {
+ .open = perf_mmap_open,
+ .close = perf_mmap_close,
.fault = perf_mmap_fault,
};
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_counter *counter = file->private_data;
- unsigned long userpg;
+ unsigned long vma_size;
+ unsigned long nr_pages;
+ unsigned long locked, lock_limit;
+ int ret = 0;
if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
return -EINVAL;
- if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+ if (nr_pages == 0 || !is_power_of_2(nr_pages))
return -EINVAL;
- /*
- * For now, restrict to the case of a hardware counter
- * on the current task.
- */
- if (is_software_counter(counter) || counter->task != current)
+ if (vma_size != PAGE_SIZE * (1 + nr_pages))
return -EINVAL;
- userpg = counter->user_page;
- if (!userpg) {
- userpg = get_zeroed_page(GFP_KERNEL);
- mutex_lock(&counter->mutex);
- if (counter->user_page) {
- free_page(userpg);
- userpg = counter->user_page;
- } else {
- counter->user_page = userpg;
- }
- mutex_unlock(&counter->mutex);
- if (!userpg)
- return -ENOMEM;
- }
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ locked = vma_size >> PAGE_SHIFT;
+ locked += vma->vm_mm->locked_vm;
- perf_counter_update_userpage(counter);
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+ return -EPERM;
+
+ mutex_lock(&counter->mmap_mutex);
+ if (atomic_inc_not_zero(&counter->mmap_count))
+ goto out;
+
+ WARN_ON(counter->data);
+ ret = perf_mmap_data_alloc(counter, nr_pages);
+ if (!ret)
+ atomic_set(&counter->mmap_count, 1);
+out:
+ mutex_unlock(&counter->mmap_mutex);
vma->vm_flags &= ~VM_MAYWRITE;
vma->vm_flags |= VM_RESERVED;
vma->vm_ops = &perf_mmap_vmops;
- return 0;
+
+ return ret;
}
static const struct file_operations perf_fops = {
@@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = {
* Output
*/
-static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+static int perf_output_write(struct perf_counter *counter, int nmi,
+ void *buf, ssize_t size)
{
- struct perf_data *irqdata = counter->irqdata;
+ struct perf_mmap_data *data;
+ unsigned int offset, head, nr;
+ unsigned int len;
+ int ret, wakeup;
- if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
- irqdata->overrun++;
- } else {
- u64 *p = (u64 *) &irqdata->data[irqdata->len];
+ rcu_read_lock();
+ ret = -ENOSPC;
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto out;
+
+ if (!data->nr_pages)
+ goto out;
+
+ ret = -EINVAL;
+ if (size > PAGE_SIZE)
+ goto out;
+
+ do {
+ offset = head = atomic_read(&data->head);
+ head += sizeof(u64);
+ } while (atomic_cmpxchg(&data->head, offset, head) != offset);
+
+ wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
- *p = data;
- irqdata->len += sizeof(u64);
+ nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
+ offset &= PAGE_SIZE - 1;
+
+ len = min_t(unsigned int, PAGE_SIZE - offset, size);
+ memcpy(data->data_pages[nr] + offset, buf, len);
+ size -= len;
+
+ if (size) {
+ nr = (nr + 1) & (data->nr_pages - 1);
+ memcpy(data->data_pages[nr], buf + len, size);
+ }
+
+ /*
+ * generate a poll() wakeup for every page boundary crossed
+ */
+ if (wakeup) {
+ __perf_counter_update_userpage(counter, data);
+ if (nmi) {
+ counter->wakeup_pending = 1;
+ set_perf_counter_pending();
+ } else
+ wake_up(&counter->waitq);
}
+ ret = 0;
+out:
+ rcu_read_unlock();
+
+ return ret;
}
-static void perf_counter_handle_group(struct perf_counter *counter)
+static void perf_output_simple(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs)
+{
+ u64 entry;
+
+ entry = instruction_pointer(regs);
+
+ perf_output_write(counter, nmi, &entry, sizeof(entry));
+}
+
+struct group_entry {
+ u64 event;
+ u64 counter;
+};
+
+static void perf_output_group(struct perf_counter *counter, int nmi)
{
struct perf_counter *leader, *sub;
leader = counter->group_leader;
list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ struct group_entry entry;
+
if (sub != counter)
sub->hw_ops->read(sub);
- perf_counter_store_irq(counter, sub->hw_event.config);
- perf_counter_store_irq(counter, atomic64_read(&sub->count));
+
+ entry.event = sub->hw_event.config;
+ entry.counter = atomic64_read(&sub->count);
+
+ perf_output_write(counter, nmi, &entry, sizeof(entry));
}
}
@@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter,
return;
case PERF_RECORD_IRQ:
- perf_counter_store_irq(counter, instruction_pointer(regs));
+ perf_output_simple(counter, nmi, regs);
break;
case PERF_RECORD_GROUP:
- perf_counter_handle_group(counter);
+ perf_output_group(counter, nmi);
break;
}
-
- if (nmi) {
- counter->wakeup_pending = 1;
- set_perf_counter_pending();
- } else
- wake_up(&counter->waitq);
}
/*
@@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
INIT_LIST_HEAD(&counter->sibling_list);
init_waitqueue_head(&counter->waitq);
+ mutex_init(&counter->mmap_mutex);
+
INIT_LIST_HEAD(&counter->child_list);
- counter->irqdata = &counter->data[0];
- counter->usrdata = &counter->data[1];
counter->cpu = cpu;
counter->hw_event = *hw_event;
counter->wakeup_pending = 0;
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] kerneltop: update to new syscall ABI
2009-03-23 17:22 ` [PATCH 6/7] kerneltop: update to new syscall ABI Peter Zijlstra
@ 2009-03-23 20:57 ` Peter Zijlstra
0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2009-03-23 20:57 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx,
fengguang.wu, mingo
Commit-ID: d68cd6cac7e4b63ca990e7c4c24aef112f8d6c60
Gitweb: http://git.kernel.org/tip/d68cd6cac7e4b63ca990e7c4c24aef112f8d6c60
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Mon, 23 Mar 2009 18:22:11 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:45:11 +0100
kerneltop: update to new syscall ABI
update the kerneltop userspace to work with the latest syscall ABI
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090323172417.559643732@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
Documentation/perf_counter/kerneltop.c | 235 +++++++++++++++++++++-----------
1 files changed, 157 insertions(+), 78 deletions(-)
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 81a68aa..a72c9bd 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -87,20 +87,90 @@
#include <linux/unistd.h>
-#include "perfcounters.h"
+#include "include/linux/perf_counter.h"
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE 31
+#define PR_TASK_PERF_COUNTERS_ENABLE 32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock() \
+({ \
+ struct timespec ts; \
+ \
+ clock_gettime(CLOCK_MONOTONIC, &ts); \
+ ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int __u32;
+typedef unsigned long long __u64;
+typedef long long __s64;
+
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open 295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+asmlinkage int sys_perf_counter_open(
+ struct perf_counter_hw_event *hw_event_uptr __user,
+ pid_t pid,
+ int cpu,
+ int group_fd,
+ unsigned long flags)
+{
+ int ret;
+
+ ret = syscall(
+ __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+ if (ret < 0 && ret > -4096) {
+ errno = -ret;
+ ret = -1;
+ }
+#endif
+ return ret;
+}
+
#define MAX_COUNTERS 64
#define MAX_NR_CPUS 256
-#define DEF_PERFSTAT_EVENTS { -2, -5, -4, -3, 0, 1, 2, 3}
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
static int run_perfstat = 0;
static int system_wide = 0;
static int nr_counters = 0;
-static __s64 event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS;
-static int event_raw[MAX_COUNTERS];
+static __u64 event_id[MAX_COUNTERS] = {
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int default_interval = 100000;
static int event_count[MAX_COUNTERS];
static int fd[MAX_NR_CPUS][MAX_COUNTERS];
@@ -156,49 +226,63 @@ static char *sw_event_names[] = {
"pagefaults",
"context switches",
"CPU migrations",
+ "minor faults",
+ "major faults",
};
struct event_symbol {
- int event;
+ __u64 event;
char *symbol;
};
static struct event_symbol event_symbols[] = {
- {PERF_COUNT_CPU_CYCLES, "cpu-cycles", },
- {PERF_COUNT_CPU_CYCLES, "cycles", },
- {PERF_COUNT_INSTRUCTIONS, "instructions", },
- {PERF_COUNT_CACHE_REFERENCES, "cache-references", },
- {PERF_COUNT_CACHE_MISSES, "cache-misses", },
- {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", },
- {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", },
- {PERF_COUNT_BRANCH_MISSES, "branch-misses", },
- {PERF_COUNT_BUS_CYCLES, "bus-cycles", },
- {PERF_COUNT_CPU_CLOCK, "cpu-ticks", },
- {PERF_COUNT_CPU_CLOCK, "ticks", },
- {PERF_COUNT_TASK_CLOCK, "task-ticks", },
- {PERF_COUNT_PAGE_FAULTS, "page-faults", },
- {PERF_COUNT_PAGE_FAULTS, "faults", },
- {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", },
- {PERF_COUNT_CONTEXT_SWITCHES, "cs", },
- {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", },
- {PERF_COUNT_CPU_MIGRATIONS, "migrations", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
+ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
+
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
+ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
};
+#define __PERF_COUNTER_FIELD(config, name) \
+ ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
+
static void display_events_help(void)
{
unsigned int i;
- int e;
+ __u64 e;
printf(
" -e EVENT --event=EVENT # symbolic-name abbreviations");
- for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
- if (e != event_symbols[i].event) {
- e = event_symbols[i].event;
- printf(
- "\n %2d: %-20s", e, event_symbols[i].symbol);
- } else
- printf(" %s", event_symbols[i].symbol);
+ for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+ int type, id;
+
+ e = event_symbols[i].event;
+ type = PERF_COUNTER_TYPE(e);
+ id = PERF_COUNTER_ID(e);
+
+ printf("\n %d:%d: %-20s",
+ type, id, event_symbols[i].symbol);
}
printf("\n"
@@ -249,44 +333,51 @@ static void display_help(void)
exit(0);
}
-static int type_valid(int type)
-{
- if (type >= PERF_HW_EVENTS_MAX)
- return 0;
- if (type <= PERF_SW_EVENTS_MIN)
- return 0;
-
- return 1;
-}
-
static char *event_name(int ctr)
{
- __s64 type = event_id[ctr];
+ __u64 config = event_id[ctr];
+ int type = PERF_COUNTER_TYPE(config);
+ int id = PERF_COUNTER_ID(config);
static char buf[32];
- if (event_raw[ctr]) {
- sprintf(buf, "raw 0x%llx", (long long)type);
+ if (PERF_COUNTER_RAW(config)) {
+ sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
return buf;
}
- if (!type_valid(type))
- return "unknown";
- if (type >= 0)
- return hw_event_names[type];
+ switch (type) {
+ case PERF_TYPE_HARDWARE:
+ if (id < PERF_HW_EVENTS_MAX)
+ return hw_event_names[id];
+ return "unknown-hardware";
+
+ case PERF_TYPE_SOFTWARE:
+ if (id < PERF_SW_EVENTS_MAX)
+ return sw_event_names[id];
+ return "unknown-software";
- return sw_event_names[-type-1];
+ default:
+ break;
+ }
+
+ return "unknown";
}
/*
* Each event can have multiple symbolic names.
* Symbolic names are (almost) exactly matched.
*/
-static int match_event_symbols(char *str)
+static __u64 match_event_symbols(char *str)
{
+ __u64 config, id;
+ int type;
unsigned int i;
- if (isdigit(str[0]) || str[0] == '-')
- return atoi(str);
+ if (sscanf(str, "r%llx", &config) == 1)
+ return config | PERF_COUNTER_RAW_MASK;
+
+ if (sscanf(str, "%d:%llu", &type, &id) == 2)
+ return EID(type, id);
for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
if (!strncmp(str, event_symbols[i].symbol,
@@ -294,31 +385,22 @@ static int match_event_symbols(char *str)
return event_symbols[i].event;
}
- return PERF_HW_EVENTS_MAX;
+ return ~0ULL;
}
static int parse_events(char *str)
{
- __s64 type;
- int raw;
+ __u64 config;
again:
if (nr_counters == MAX_COUNTERS)
return -1;
- raw = 0;
- if (*str == 'r') {
- raw = 1;
- ++str;
- type = strtol(str, NULL, 16);
- } else {
- type = match_event_symbols(str);
- if (!type_valid(type))
- return -1;
- }
+ config = match_event_symbols(str);
+ if (config == ~0ULL)
+ return -1;
- event_id[nr_counters] = type;
- event_raw[nr_counters] = raw;
+ event_id[nr_counters] = config;
nr_counters++;
str = strstr(str, ",");
@@ -342,8 +424,7 @@ static void create_perfstat_counter(int counter)
struct perf_counter_hw_event hw_event;
memset(&hw_event, 0, sizeof(hw_event));
- hw_event.type = event_id[counter];
- hw_event.raw = event_raw[counter];
+ hw_event.config = event_id[counter];
hw_event.record_type = PERF_RECORD_SIMPLE;
hw_event.nmi = 0;
@@ -428,7 +509,7 @@ int do_perfstat(int argc, char *argv[])
count += single_count;
}
- if (!event_raw[counter] &&
+ if (!PERF_COUNTER_RAW(event_id[counter]) &&
(event_id[counter] == PERF_COUNT_CPU_CLOCK ||
event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
@@ -911,7 +992,7 @@ static void record_ip(uint64_t ip, int counter)
assert(left <= middle && middle <= right);
if (!(left <= ip && ip <= right)) {
printf(" left: %016lx\n", left);
- printf(" ip: %016lx\n", ip);
+ printf(" ip: %016llx\n", ip);
printf("right: %016lx\n", right);
}
assert(left <= ip && ip <= right);
@@ -983,7 +1064,7 @@ static void process_options(int argc, char *argv[])
switch (c) {
case 'a': system_wide = 1; break;
- case 'c': event_count[nr_counters] = atoi(optarg); break;
+ case 'c': default_interval = atoi(optarg); break;
case 'C':
/* CPU and PID are mutually exclusive */
if (tid != -1) {
@@ -1032,10 +1113,7 @@ static void process_options(int argc, char *argv[])
if (event_count[counter])
continue;
- if (event_id[counter] < PERF_HW_EVENTS_MAX)
- event_count[counter] = default_count[event_id[counter]];
- else
- event_count[counter] = 100000;
+ event_count[counter] = default_interval;
}
}
@@ -1070,12 +1148,13 @@ int main(int argc, char *argv[])
cpu = i;
memset(&hw_event, 0, sizeof(hw_event));
- hw_event.type = event_id[counter];
- hw_event.raw = event_raw[counter];
+ hw_event.config = event_id[counter];
hw_event.irq_period = event_count[counter];
hw_event.record_type = PERF_RECORD_IRQ;
hw_event.nmi = nmi;
+ printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
+
fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
if (fd[i][counter] < 0) {
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] kerneltop: use mmap() output
2009-03-23 17:22 ` [PATCH 7/7] kerneltop: use mmap() output Peter Zijlstra
@ 2009-03-23 20:57 ` Peter Zijlstra
2009-03-23 20:57 ` [tip:perfcounters/core] perf_counter tools: tidy up in-kernel dependencies Ingo Molnar
1 sibling, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2009-03-23 20:57 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx,
fengguang.wu, mingo
Commit-ID: 0488f729768605502accdae7335911b4400a5927
Gitweb: http://git.kernel.org/tip/0488f729768605502accdae7335911b4400a5927
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Mon, 23 Mar 2009 18:22:12 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:45:11 +0100
kerneltop: use mmap() output
update kerneltop to use the mmap() output to gather overflow information
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
Documentation/perf_counter/kerneltop.c | 93 +++++++++++++++++++++++++++-----
1 files changed, 79 insertions(+), 14 deletions(-)
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index a72c9bd..80b7905 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -84,6 +84,7 @@
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/uio.h>
+#include <sys/mman.h>
#include <linux/unistd.h>
@@ -119,17 +120,25 @@ typedef long long __s64;
#ifdef __x86_64__
-# define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 295
+#define rmb() asm volatile("lfence" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#endif
#ifdef __i386__
-# define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 333
+#define rmb() asm volatile("lfence" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#endif
#ifdef __powerpc__
#define __NR_perf_counter_open 319
+#define rmb() asm volatile ("sync" ::: "memory")
+#define cpu_relax() asm volatile ("" ::: "memory");
#endif
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
asmlinkage int sys_perf_counter_open(
struct perf_counter_hw_event *hw_event_uptr __user,
pid_t pid,
@@ -181,6 +190,7 @@ static int profile_cpu = -1;
static int nr_cpus = 0;
static int nmi = 1;
static int group = 0;
+static unsigned int page_size;
static char *vmlinux;
@@ -1117,16 +1127,68 @@ static void process_options(int argc, char *argv[])
}
}
+struct mmap_data {
+ int counter;
+ void *base;
+ unsigned int mask;
+ unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+ struct perf_counter_mmap_page *pc = md->base;
+ unsigned int seq, head;
+
+repeat:
+ rmb();
+ seq = pc->lock;
+
+ if (unlikely(seq & 1)) {
+ cpu_relax();
+ goto repeat;
+ }
+
+ head = pc->data_head;
+
+ rmb();
+ if (pc->lock != seq)
+ goto repeat;
+
+ return head;
+}
+
+static void mmap_read(struct mmap_data *md)
+{
+ unsigned int head = mmap_read_head(md);
+ unsigned int old = md->prev;
+ unsigned char *data = md->base + page_size;
+
+ if (head - old > md->mask) {
+ printf("ERROR: failed to keep up with mmap data\n");
+ exit(-1);
+ }
+
+ for (; old != head;) {
+ __u64 *ptr = (__u64 *)&data[old & md->mask];
+ old += sizeof(__u64);
+
+ process_event(*ptr, md->counter);
+ }
+
+ md->prev = old;
+}
+
int main(int argc, char *argv[])
{
struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+ struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
struct perf_counter_hw_event hw_event;
int i, counter, group_fd;
unsigned int cpu;
- uint64_t ip;
- ssize_t res;
int ret;
+ page_size = sysconf(_SC_PAGE_SIZE);
+
process_options(argc, argv);
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -1153,8 +1215,6 @@ int main(int argc, char *argv[])
hw_event.record_type = PERF_RECORD_IRQ;
hw_event.nmi = nmi;
- printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
-
fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
if (fd[i][counter] < 0) {
@@ -1174,6 +1234,17 @@ int main(int argc, char *argv[])
event_array[i][counter].fd = fd[i][counter];
event_array[i][counter].events = POLLIN;
+
+ mmap_array[i][counter].counter = counter;
+ mmap_array[i][counter].prev = 0;
+ mmap_array[i][counter].mask = 2*page_size - 1;
+ mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+ PROT_READ, MAP_SHARED, fd[i][counter], 0);
+ if (mmap_array[i][counter].base == MAP_FAILED) {
+ printf("kerneltop error: failed to mmap with %d (%s)\n",
+ errno, strerror(errno));
+ exit(-1);
+ }
}
}
@@ -1188,14 +1259,8 @@ int main(int argc, char *argv[])
int hits = events;
for (i = 0; i < nr_cpus; i++) {
- for (counter = 0; counter < nr_counters; counter++) {
- res = read(fd[i][counter], (char *) &ip, sizeof(ip));
- if (res > 0) {
- assert(res == sizeof(ip));
-
- process_event(ip, counter);
- }
- }
+ for (counter = 0; counter < nr_counters; counter++)
+ mmap_read(&mmap_array[i][counter]);
}
if (time(NULL) >= last_refresh + delay_secs) {
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] perf_counter tools: tidy up in-kernel dependencies
2009-03-23 17:22 ` [PATCH 7/7] kerneltop: use mmap() output Peter Zijlstra
2009-03-23 20:57 ` [tip:perfcounters/core] " Peter Zijlstra
@ 2009-03-23 20:57 ` Ingo Molnar
2009-05-20 9:26 ` Jaswinder Singh Rajput
1 sibling, 1 reply; 18+ messages in thread
From: Ingo Molnar @ 2009-03-23 20:57 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, tglx,
fengguang.wu, mingo
Commit-ID: 4d1b100b57e8d472ca3b3a628e03c24d66cb782c
Gitweb: http://git.kernel.org/tip/4d1b100b57e8d472ca3b3a628e03c24d66cb782c
Author: Ingo Molnar <mingo@elte.hu>
AuthorDate: Mon, 23 Mar 2009 21:49:25 +0100
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 23 Mar 2009 21:51:35 +0100
perf_counter tools: tidy up in-kernel dependencies
Remove the now-unified perfstat.c and perfcounters.h, and link to the
in-kernel perf_counter.h.
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
Documentation/perf_counter/Makefile | 2 +-
Documentation/perf_counter/kerneltop.c | 2 +-
Documentation/perf_counter/perfcounters.h | 142 ----------------
Documentation/perf_counter/perfstat.c | 251 -----------------------------
4 files changed, 2 insertions(+), 395 deletions(-)
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index b457497..666da95 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -2,7 +2,7 @@ BINS = kerneltop perfstat
all: $(BINS)
-kerneltop: kerneltop.c perfcounters.h
+kerneltop: kerneltop.c ../../include/linux/perf_counter.h
cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
perfstat: kerneltop
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 80b7905..25e80bc 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -88,7 +88,7 @@
#include <linux/unistd.h>
-#include "include/linux/perf_counter.h"
+#include "../../include/linux/perf_counter.h"
/*
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
deleted file mode 100644
index 32e24b9..0000000
--- a/Documentation/perf_counter/perfcounters.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE _IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE _IO('$', 1)
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE 31
-#define PR_TASK_PERF_COUNTERS_ENABLE 32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock() \
-({ \
- struct timespec ts; \
- \
- clock_gettime(CLOCK_MONOTONIC, &ts); \
- ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-typedef unsigned int __u32;
-typedef unsigned long long __u64;
-typedef long long __s64;
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
- /*
- * Common hardware events, generalized by the kernel:
- */
- PERF_COUNT_CPU_CYCLES = 0,
- PERF_COUNT_INSTRUCTIONS = 1,
- PERF_COUNT_CACHE_REFERENCES = 2,
- PERF_COUNT_CACHE_MISSES = 3,
- PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
- PERF_COUNT_BRANCH_MISSES = 5,
- PERF_COUNT_BUS_CYCLES = 6,
-
- PERF_HW_EVENTS_MAX = 7,
-
- /*
- * Special "software" counters provided by the kernel, even if
- * the hardware does not support performance counters. These
- * counters measure various physical and sw events of the
- * kernel (and allow the profiling of them as well):
- */
- PERF_COUNT_CPU_CLOCK = -1,
- PERF_COUNT_TASK_CLOCK = -2,
- PERF_COUNT_PAGE_FAULTS = -3,
- PERF_COUNT_CONTEXT_SWITCHES = -4,
- PERF_COUNT_CPU_MIGRATIONS = -5,
-
- PERF_SW_EVENTS_MIN = -6,
-};
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
- PERF_RECORD_SIMPLE = 0,
- PERF_RECORD_IRQ = 1,
- PERF_RECORD_GROUP = 2,
-};
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
- __s64 type;
-
- __u64 irq_period;
- __u64 record_type;
- __u64 read_format;
-
- __u64 disabled : 1, /* off by default */
- nmi : 1, /* NMI sampling */
- raw : 1, /* raw event type */
- inherit : 1, /* children inherit it */
- pinned : 1, /* must always be on PMU */
- exclusive : 1, /* only group on PMU */
- exclude_user : 1, /* don't count user */
- exclude_kernel : 1, /* ditto kernel */
- exclude_hv : 1, /* ditto hypervisor */
- exclude_idle : 1, /* don't count when idle */
-
- __reserved_1 : 54;
-
- __u32 extra_config_len;
- __u32 __reserved_4;
-
- __u64 __reserved_2;
- __u64 __reserved_3;
-};
-
-
-#ifdef __x86_64__
-# define __NR_perf_counter_open 295
-#endif
-
-#ifdef __i386__
-# define __NR_perf_counter_open 333
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#endif
-
-asmlinkage int sys_perf_counter_open(
-
- struct perf_counter_hw_event *hw_event_uptr __user,
- pid_t pid,
- int cpu,
- int group_fd,
- unsigned long flags)
-{
- int ret;
-
- ret = syscall(
- __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
- if (ret < 0 && ret > -4096) {
- errno = -ret;
- ret = -1;
- }
-#endif
- return ret;
-}
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
deleted file mode 100644
index fd59446..0000000
--- a/Documentation/perf_counter/perfstat.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * perfstat: /usr/bin/time -alike performance counter statistics utility
- *
- * It summarizes the counter events of all tasks (and child tasks),
- * covering all CPUs that the command (or workload) executes on.
- * It only counts the per-task events of the workload started,
- * independent of how many other tasks run on those CPUs.
- *
- * Build with: cc -O2 -g -lrt -Wall -W -o perfstat perfstat.c
- *
- * Sample output:
- *
-
- $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
- Performance counter stats for 'ls':
-
- 163516953 instructions
- 2295 cache-misses
- 2855182 branch-misses
-
- *
- * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
- *
- * Released under the GPLv2 (not later).
- *
- * Percpu counter support by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
- * Symbolic event options by: Wu Fengguang <fengguang.wu@intel.com>
- */
-#define _GNU_SOURCE
-
-#include <assert.h>
-#include <getopt.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <time.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/prctl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-
-#include <linux/unistd.h>
-
-#include "perfcounters.h"
-
-static int nr_cpus = 0;
-
-static int system_wide = 0;
-
-static void display_help(void)
-{
- unsigned int i;
- int e;
-
- printf(
- "Usage: perfstat [<events...>] <cmd...>\n\n"
- "PerfStat Options (up to %d event types can be specified):\n\n",
- MAX_COUNTERS);
- printf(
- " -e EVENT --event=EVENT # symbolic-name abbreviations");
-
- for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
- if (e != event_symbols[i].event) {
- e = event_symbols[i].event;
- printf(
- "\n %2d: %-20s", e, event_symbols[i].symbol);
- } else
- printf(" %s", event_symbols[i].symbol);
- }
-
- printf("\n"
- " rNNN: raw event type\n\n"
- " -s # system-wide collection\n\n"
- " -c <cmd..> --command=<cmd..> # command+arguments to be timed.\n"
- "\n");
- exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
- for (;;) {
- int option_index = 0;
- /** Options for getopt */
- static struct option long_options[] = {
- {"event", required_argument, NULL, 'e'},
- {"help", no_argument, NULL, 'h'},
- {"command", no_argument, NULL, 'c'},
- {NULL, 0, NULL, 0 }
- };
- int c = getopt_long(argc, argv, "+:e:c:s",
- long_options, &option_index);
- if (c == -1)
- break;
-
- switch (c) {
- case 'c':
- break;
- case 's':
- system_wide = 1;
- break;
- case 'e':
- parse_events(optarg);
- break;
- default:
- break;
- }
- }
- if (optind == argc)
- goto err;
-
- if (!nr_counters)
- nr_counters = 8;
- return;
-
-err:
- display_help();
-}
-
-char fault_here[1000000];
-
-static int fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static void create_counter(int counter)
-{
- struct perf_counter_hw_event hw_event;
-
- memset(&hw_event, 0, sizeof(hw_event));
- hw_event.type = event_id[counter];
- hw_event.raw = event_raw[counter];
- hw_event.record_type = PERF_RECORD_SIMPLE;
- hw_event.nmi = 0;
-
- if (system_wide) {
- int cpu;
- for (cpu = 0; cpu < nr_cpus; cpu ++) {
- fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
- if (fd[cpu][counter] < 0) {
- printf("perfstat error: syscall returned with %d (%s)\n",
- fd[cpu][counter], strerror(errno));
- exit(-1);
- }
-
- }
- } else {
- hw_event.inherit = 1;
- hw_event.disabled = 1;
-
- fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
- if (fd[0][counter] < 0) {
- printf("perfstat error: syscall returned with %d (%s)\n",
- fd[0][counter], strerror(errno));
- exit(-1);
- }
- }
-}
-
-
-int main(int argc, char *argv[])
-{
- unsigned long long t0, t1;
- int counter;
- ssize_t res;
- int status;
- int pid;
-
- process_options(argc, argv);
-
- if (system_wide) {
- nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
- assert(nr_cpus <= MAX_NR_CPUS);
- assert(nr_cpus >= 0);
- } else
- nr_cpus = 1;
-
- for (counter = 0; counter < nr_counters; counter++)
- create_counter(counter);
-
- argc -= optind;
- argv += optind;
-
- /*
- * Enable counters and exec the command:
- */
- t0 = rdclock();
- prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
- if ((pid = fork()) < 0)
- perror("failed to fork");
- if (!pid) {
- if (execvp(argv[0], argv)) {
- perror(argv[0]);
- exit(-1);
- }
- }
- while (wait(&status) >= 0)
- ;
- prctl(PR_TASK_PERF_COUNTERS_DISABLE);
- t1 = rdclock();
-
- fflush(stdout);
-
- fprintf(stderr, "\n");
- fprintf(stderr, " Performance counter stats for \'%s\':\n",
- argv[0]);
- fprintf(stderr, "\n");
-
- for (counter = 0; counter < nr_counters; counter++) {
- int cpu;
- __u64 count, single_count;
-
- count = 0;
- for (cpu = 0; cpu < nr_cpus; cpu ++) {
- res = read(fd[cpu][counter],
- (char *) &single_count, sizeof(single_count));
- assert(res == sizeof(single_count));
- count += single_count;
- }
-
- if (!event_raw[counter] &&
- (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
- event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
-
- double msecs = (double)count / 1000000;
-
- fprintf(stderr, " %14.6f %-20s (msecs)\n",
- msecs, event_name(counter));
- } else {
- fprintf(stderr, " %14Ld %-20s (events)\n",
- count, event_name(counter));
- }
- if (!counter)
- fprintf(stderr, "\n");
- }
- fprintf(stderr, "\n");
- fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
- (double)(t1-t0)/1e6);
- fprintf(stderr, "\n");
-
- return 0;
-}
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [tip:perfcounters/core] mutex: drop "inline" from mutex_lock() inside kernel/mutex.c
2009-03-23 20:56 ` [tip:perfcounters/core] mutex: add atomic_dec_and_mutex_lock() Eric Paris
@ 2009-04-02 0:42 ` H. Peter Anvin
0 siblings, 0 replies; 18+ messages in thread
From: H. Peter Anvin @ 2009-04-02 0:42 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, eparis, tglx, hpa
Commit-ID: 655be4a209fbcae85a745027f426e1f9879b5648
Gitweb: http://git.kernel.org/tip/655be4a209fbcae85a745027f426e1f9879b5648
Author: H. Peter Anvin <hpa@linux.intel.com>
AuthorDate: Wed, 1 Apr 2009 17:21:56 -0700
Committer: H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Wed, 1 Apr 2009 17:21:56 -0700
mutex: drop "inline" from mutex_lock() inside kernel/mutex.c
Impact: build fix
mutex_lock() was defined inline in kernel/mutex.c, but wasn't
declared inline in <linux/mutex.h>. This didn't cause a problem until
checkin 3a2d367d9aabac486ac4444c6c7ec7a1dab16267 added the
atomic_dec_and_mutex_lock() inline in between the declaration and the
definition.
This broke building with CONFIG_ALLOW_WARNINGS=n, e.g. make
allnoconfig.
Neither in the source code nor in the allnoconfig binary output can I
find any internal references to mutex_lock() in kernel/mutex.c, so
presumably this "inline" is now-useless legacy.
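The pattern reduces to a few lines; here is a minimal sketch with
hypothetical names (not the kernel's) that is expected to reproduce the
same diagnostic under gcc's traditional (gnu89) inline semantics:

/* "header": declares the function without inline */
void lock_demo(void);

/* an inline caller added between declaration and definition,
   playing the role of atomic_dec_and_mutex_lock() */
static inline void dec_and_lock_demo(void)
{
	lock_demo();
}

/* the definition is marked inline only here; gcc is expected to warn
   "'lock_demo' declared inline after being called", which breaks
   builds that treat warnings as errors */
inline void lock_demo(void)
{
}

Dropping the "inline" from the definition, as the one-liner below does,
removes the mismatch.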
Cc: Eric Paris <eparis@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <tip-3a2d367d9aabac486ac4444c6c7ec7a1dab16267@git.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
kernel/mutex.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e1fb735..29758eb 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
*
* This function is similar to (but not equivalent to) down().
*/
-void inline __sched mutex_lock(struct mutex *lock)
+void __sched mutex_lock(struct mutex *lock)
{
might_sleep();
/*
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [tip:perfcounters/core] perf_counter tools: tidy up in-kernel dependencies
2009-03-23 20:57 ` [tip:perfcounters/core] perf_counter tools: tidy up in-kernel dependencies Ingo Molnar
@ 2009-05-20 9:26 ` Jaswinder Singh Rajput
0 siblings, 0 replies; 18+ messages in thread
From: Jaswinder Singh Rajput @ 2009-05-20 9:26 UTC (permalink / raw)
To: mingo, hpa, paulus, linux-kernel, a.p.zijlstra, tglx,
fengguang.wu, mingo
Cc: linux-tip-commits
On Mon, 2009-03-23 at 20:57 +0000, Ingo Molnar wrote:
> Commit-ID: 4d1b100b57e8d472ca3b3a628e03c24d66cb782c
> Gitweb: http://git.kernel.org/tip/4d1b100b57e8d472ca3b3a628e03c24d66cb782c
> Author: Ingo Molnar <mingo@elte.hu>
> AuthorDate: Mon, 23 Mar 2009 21:49:25 +0100
> Committer: Ingo Molnar <mingo@elte.hu>
> CommitDate: Mon, 23 Mar 2009 21:51:35 +0100
>
> perf_counter tools: tidy up in-kernel dependencies
>
> Remove the now-unified perfstat.c and perfcounters.h, and include the
> in-kernel perf_counter.h instead.
>
> Cc: Wu Fengguang <fengguang.wu@intel.com>
> Cc: Paul Mackerras <paulus@samba.org>
> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> LKML-Reference: <20090323172417.677932499@chello.nl>
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
>
>
> ---
> Documentation/perf_counter/Makefile | 2 +-
> Documentation/perf_counter/kerneltop.c | 2 +-
> Documentation/perf_counter/perfcounters.h | 142 ----------------
> Documentation/perf_counter/perfstat.c | 251 -----------------------------
> 4 files changed, 2 insertions(+), 395 deletions(-)
>
> diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
> index b457497..666da95 100644
> --- a/Documentation/perf_counter/Makefile
> +++ b/Documentation/perf_counter/Makefile
> @@ -2,7 +2,7 @@ BINS = kerneltop perfstat
>
> all: $(BINS)
>
> -kerneltop: kerneltop.c perfcounters.h
> +kerneltop: kerneltop.c ../../include/linux/perf_counter.h
> cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
>
> perfstat: kerneltop
Can we hook Documentation/perf_counter/Makefile into
Documentation/Makefile, so that these perf_counter user-space tools
also get built along with a new kernel?
--
JSR
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2009-05-20 9:27 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-23 17:22 [PATCH 0/7] perf_counter: syscall ABI cleanup and mmap() interface Peter Zijlstra
2009-03-23 17:22 ` [PATCH 1/7] perf_counter: remove the event config bitfields Peter Zijlstra
2009-03-23 20:56 ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-23 17:22 ` [PATCH 2/7] perf_counter: avoid recursion Peter Zijlstra
2009-03-23 20:56 ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-23 17:22 ` [PATCH 3/7] perf_counter: add an mmap method to allow userspace to read hardware counters Peter Zijlstra
2009-03-23 20:56 ` [tip:perfcounters/core] " Paul Mackerras
2009-03-23 17:22 ` [PATCH 4/7] mutex: add atomic_dec_and_mutex_lock] Peter Zijlstra
2009-03-23 20:56 ` [tip:perfcounters/core] mutex: add atomic_dec_and_mutex_lock() Eric Paris
2009-04-02 0:42 ` [tip:perfcounters/core] mutex: drop "inline" from mutex_lock() inside kernel/mutex.c H. Peter Anvin
2009-03-23 17:22 ` [PATCH 5/7] perf_counter: new output ABI - part 1 Peter Zijlstra
2009-03-23 20:56 ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-23 17:22 ` [PATCH 6/7] kerneltop: update to new syscall ABI Peter Zijlstra
2009-03-23 20:57 ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-23 17:22 ` [PATCH 7/7] kerneltop: use mmap() output Peter Zijlstra
2009-03-23 20:57 ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-23 20:57 ` [tip:perfcounters/core] perf_counter tools: tidy up in-kernel dependencies Ingo Molnar
2009-05-20 9:26 ` Jaswinder Singh Rajput