* Re: [PATCH v2 2/2] perf/core: Remove perf_cpu_context::unique_pmu
2017-01-20 20:30 ` David Carrillo-Cisneros
@ 2017-01-25 15:23 ` Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2017-01-25 15:23 UTC (permalink / raw)
To: David Carrillo-Cisneros
Cc: linux-kernel, x86, Ingo Molnar, Thomas Gleixner, Andi Kleen,
Kan Liang, Borislav Petkov, Srinivas Pandruvada, Dave Hansen,
Vikas Shivappa, Mark Rutland, Arnaldo Carvalho de Melo,
Vince Weaver, Paul Turner, Stephane Eranian
On Fri, Jan 20, 2017 at 12:30:38PM -0800, David Carrillo-Cisneros wrote:
> On Fri, Jan 20, 2017 at 1:20 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Wed, Jan 18, 2017 at 11:24:54AM -0800, David Carrillo-Cisneros wrote:
> >> cpuctx->unique_pmu was originally introduced as a way to identify cpuctxs
> >> with shared pmus in order to avoid visiting the same cpuctx more than once
> >> in a for_each_pmu loop.
> >>
> >> cpuctx->unique_pmu == cpuctx->pmu in non-software task contexts since they
> >> have only one pmu per cpuctx. Since perf_pmu_sched_task is only called in
> >> hw contexts, this patch replaces cpuctx->unique_pmu by cpuctx->pmu in it.
> >>
> >> The change above, together with the previous patch in this series, removed
> >> the remaining uses of cpuctx->unique_pmu, so we remove it altogether.
> >>
> >> Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
> >> Acked-by: Mark Rutland <mark.rutland@arm.com>
> >
> >
> > This very much relies on us never calling perf_pmu_unregister() on the
> > software PMUs afaict. A condition not mentioned in the Changelog.
> >
> What's a good way to solve this? Update the Changelog or add code to
> update ctx->pmu?
I think just update the Changelog and maybe put a comment near
perf_pmu_register() and/or the sw pmu abuse that relies on this.
> This issue would go away cleanly if we were to remove the context
> sharing across pmu's. Would you support work in that direction?
It's something that I've considered; the trivial solution is folding it
all into the one swevent pmu by adding a switch in all the
add/del/start/stop/read methods. It's a wee bit ugly but straightforward.
I've not really found anything less ugly though; and I have to fully
admit to the current situation being rather vile.
I also just found the below patch that I've had bitrotting since 2015.
---
Subject: perf: Move all software PMUs into their own file
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri Apr 17 19:52:17 CEST 2015
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/Makefile | 2
kernel/events/core.c | 1280 +++++------------------------------------------
kernel/events/internal.h | 13
kernel/events/software.c | 1021 +++++++++++++++++++++++++++++++++++++
4 files changed, 1184 insertions(+), 1132 deletions(-)
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,7 +2,7 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif
-obj-y := core.o ring_buffer.o callchain.o
+obj-y := core.o software.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_UPROBES) += uprobes.o
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,14 +36,11 @@
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
-#include <linux/bpf.h>
-#include <linux/filter.h>
#include "internal.h"
@@ -1828,8 +1825,6 @@ static void perf_set_shadow_time(struct
event->shadow_ctx_time = tstamp - ctx->timestamp;
}
-#define MAX_INTERRUPTS (~0ULL)
-
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
@@ -3411,9 +3406,6 @@ find_get_context(struct pmu *pmu, struct
return ERR_PTR(err);
}
-static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
-
static void free_event_rcu(struct rcu_head *head)
{
struct perf_event *event;
@@ -4020,8 +4012,6 @@ static inline int perf_fget_light(int fd
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
-static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -6036,9 +6026,9 @@ static void perf_log_itrace_start(struct
* Generic event overflow handling, sampling.
*/
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
@@ -6111,1155 +6101,223 @@ int perf_event_overflow(struct perf_even
return __perf_event_overflow(event, 1, data, regs);
}
-/*
- * Generic software event infrastructure
- */
-
-struct swevent_htable {
- struct swevent_hlist *swevent_hlist;
- struct mutex hlist_mutex;
- int hlist_refcount;
-
- /* Recursion avoidance in each contexts */
- int recursion[PERF_NR_CONTEXTS];
-
- /* Keeps track of cpu being initialized/exited */
- bool online;
-};
-
-static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
-
-/*
- * We directly increment event->count and keep a second value in
- * event->hw.period_left to count intervals. This period event
- * is kept in the range [-sample_period, 0] so that we can use the
- * sign as trigger.
- */
-
-u64 perf_swevent_set_period(struct perf_event *event)
+static void perf_pmu_nop_void(struct pmu *pmu)
{
- struct hw_perf_event *hwc = &event->hw;
- u64 period = hwc->last_period;
- u64 nr, offset;
- s64 old, val;
-
- hwc->last_period = hwc->sample_period;
-
-again:
- old = val = local64_read(&hwc->period_left);
- if (val < 0)
- return 0;
-
- nr = div64_u64(period + val, period);
- offset = nr * period;
- val -= offset;
- if (local64_cmpxchg(&hwc->period_left, old, val) != old)
- goto again;
-
- return nr;
}
-static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static int perf_pmu_nop_int(struct pmu *pmu)
{
- struct hw_perf_event *hwc = &event->hw;
- int throttle = 0;
-
- if (!overflow)
- overflow = perf_swevent_set_period(event);
-
- if (hwc->interrupts == MAX_INTERRUPTS)
- return;
-
- for (; overflow; overflow--) {
- if (__perf_event_overflow(event, throttle,
- data, regs)) {
- /*
- * We inhibit the overflow from happening when
- * hwc->interrupts == MAX_INTERRUPTS.
- */
- break;
- }
- throttle = 1;
- }
+ return 0;
}
-static void perf_swevent_event(struct perf_event *event, u64 nr,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void perf_pmu_start_txn(struct pmu *pmu)
{
- struct hw_perf_event *hwc = &event->hw;
-
- local64_add(nr, &event->count);
-
- if (!regs)
- return;
-
- if (!is_sampling_event(event))
- return;
-
- if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
- data->period = nr;
- return perf_swevent_overflow(event, 1, data, regs);
- } else
- data->period = event->hw.last_period;
-
- if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
- return perf_swevent_overflow(event, 1, data, regs);
-
- if (local64_add_negative(nr, &hwc->period_left))
- return;
-
- perf_swevent_overflow(event, 0, data, regs);
+ perf_pmu_disable(pmu);
}
-static int perf_exclude_event(struct perf_event *event,
- struct pt_regs *regs)
+static int perf_pmu_commit_txn(struct pmu *pmu)
{
- if (event->hw.state & PERF_HES_STOPPED)
- return 1;
-
- if (regs) {
- if (event->attr.exclude_user && user_mode(regs))
- return 1;
-
- if (event->attr.exclude_kernel && !user_mode(regs))
- return 1;
- }
-
+ perf_pmu_enable(pmu);
return 0;
}
-static int perf_swevent_match(struct perf_event *event,
- enum perf_type_id type,
- u32 event_id,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- if (event->attr.type != type)
- return 0;
-
- if (event->attr.config != event_id)
- return 0;
-
- if (perf_exclude_event(event, regs))
- return 0;
-
- return 1;
-}
-
-static inline u64 swevent_hash(u64 type, u32 event_id)
+static void perf_pmu_cancel_txn(struct pmu *pmu)
{
- u64 val = event_id | (type << 32);
-
- return hash_64(val, SWEVENT_HLIST_BITS);
+ perf_pmu_enable(pmu);
}
-static inline struct hlist_head *
-__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+static int perf_event_idx_default(struct perf_event *event)
{
- u64 hash = swevent_hash(type, event_id);
-
- return &hlist->heads[hash];
+ return 0;
}
-/* For the read side: events when they trigger */
-static inline struct hlist_head *
-find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
- struct swevent_hlist *hlist;
+ struct pmu *pmu;
- hlist = rcu_dereference(swhash->swevent_hlist);
- if (!hlist)
+ if (ctxn < 0)
return NULL;
- return __find_swevent_head(hlist, type, event_id);
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (pmu->task_ctx_nr == ctxn)
+ return pmu->pmu_cpu_context;
+ }
+
+ return NULL;
}
-/* For the event head insertion and removal in the hlist */
-static inline struct hlist_head *
-find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
{
- struct swevent_hlist *hlist;
- u32 event_id = event->attr.config;
- u64 type = event->attr.type;
-
- /*
- * Event scheduling is always serialized against hlist allocation
- * and release. Which makes the protected version suitable here.
- * The context lock guarantees that.
- */
- hlist = rcu_dereference_protected(swhash->swevent_hlist,
- lockdep_is_held(&event->ctx->lock));
- if (!hlist)
- return NULL;
+ int cpu;
- return __find_swevent_head(hlist, type, event_id);
-}
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
- u64 nr,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
- struct perf_event *event;
- struct hlist_head *head;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- rcu_read_lock();
- head = find_swevent_head_rcu(swhash, type, event_id);
- if (!head)
- goto end;
-
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_swevent_match(event, type, event_id, data, regs))
- perf_swevent_event(event, nr, data, regs);
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
}
-end:
- rcu_read_unlock();
}
-DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
-
-int perf_swevent_get_recursion_context(void)
+static void free_pmu_context(struct pmu *pmu)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- return get_recursion_context(swhash->recursion);
-}
-EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+ struct pmu *i;
-inline void perf_swevent_put_recursion_context(int rctx)
-{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ mutex_lock(&pmus_lock);
+ /*
+ * Like a real lame refcount.
+ */
+ list_for_each_entry(i, &pmus, entry) {
+ if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+ update_pmu_context(i, pmu);
+ goto out;
+ }
+ }
- put_recursion_context(swhash->recursion, rctx);
+ free_percpu(pmu->pmu_cpu_context);
+out:
+ mutex_unlock(&pmus_lock);
}
+static struct idr pmu_idr;
-void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
{
- struct perf_sample_data data;
-
- if (WARN_ON_ONCE(!regs))
- return;
+ struct pmu *pmu = dev_get_drvdata(dev);
- perf_sample_data_init(&data, addr, 0);
- do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
+static DEVICE_ATTR_RO(type);
-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
{
- int rctx;
-
- preempt_disable_notrace();
- rctx = perf_swevent_get_recursion_context();
- if (unlikely(rctx < 0))
- goto fail;
-
- ___perf_sw_event(event_id, nr, regs, addr);
+ struct pmu *pmu = dev_get_drvdata(dev);
- perf_swevent_put_recursion_context(rctx);
-fail:
- preempt_enable_notrace();
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}
-static void perf_swevent_read(struct perf_event *event)
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
-}
+ struct pmu *pmu = dev_get_drvdata(dev);
+ int timer, cpu, ret;
-static int perf_swevent_add(struct perf_event *event, int flags)
-{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
- struct hw_perf_event *hwc = &event->hw;
- struct hlist_head *head;
+ ret = kstrtoint(buf, 0, &timer);
+ if (ret)
+ return ret;
- if (is_sampling_event(event)) {
- hwc->last_period = hwc->sample_period;
- perf_swevent_set_period(event);
- }
+ if (timer < 1)
+ return -EINVAL;
- hwc->state = !(flags & PERF_EF_START);
+ /* same value, noting to do */
+ if (timer == pmu->hrtimer_interval_ms)
+ return count;
- head = find_swevent_head(swhash, event);
- if (!head) {
- /*
- * We can race with cpu hotplug code. Do not
- * WARN if the cpu just got unplugged.
- */
- WARN_ON_ONCE(swhash->online);
- return -EINVAL;
- }
+ pmu->hrtimer_interval_ms = timer;
- hlist_add_head_rcu(&event->hlist_entry, head);
- perf_event_update_userpage(event);
+ /* update all cpuctx for this PMU */
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- return 0;
-}
+ if (hrtimer_active(&cpuctx->hrtimer))
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ }
-static void perf_swevent_del(struct perf_event *event, int flags)
-{
- hlist_del_rcu(&event->hlist_entry);
+ return count;
}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-static void perf_swevent_start(struct perf_event *event, int flags)
-{
- event->hw.state = 0;
-}
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(pmu_dev);
-static void perf_swevent_stop(struct perf_event *event, int flags)
-{
- event->hw.state = PERF_HES_STOPPED;
-}
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+ .name = "event_source",
+ .dev_groups = pmu_dev_groups,
+};
-/* Deref the hlist from the update side */
-static inline struct swevent_hlist *
-swevent_hlist_deref(struct swevent_htable *swhash)
+static void pmu_dev_release(struct device *dev)
{
- return rcu_dereference_protected(swhash->swevent_hlist,
- lockdep_is_held(&swhash->hlist_mutex));
+ kfree(dev);
}
-static void swevent_hlist_release(struct swevent_htable *swhash)
+static int pmu_dev_alloc(struct pmu *pmu)
{
- struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
-
- if (!hlist)
- return;
+ int ret = -ENOMEM;
- RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
- kfree_rcu(hlist, rcu_head);
-}
+ pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!pmu->dev)
+ goto out;
-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
-{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ pmu->dev->groups = pmu->attr_groups;
+ device_initialize(pmu->dev);
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
- mutex_lock(&swhash->hlist_mutex);
+ dev_set_drvdata(pmu->dev, pmu);
+ pmu->dev->bus = &pmu_bus;
+ pmu->dev->release = pmu_dev_release;
+ ret = device_add(pmu->dev);
+ if (ret)
+ goto free_dev;
- if (!--swhash->hlist_refcount)
- swevent_hlist_release(swhash);
+out:
+ return ret;
- mutex_unlock(&swhash->hlist_mutex);
+free_dev:
+ put_device(pmu->dev);
+ goto out;
}
-static void swevent_hlist_put(struct perf_event *event)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- swevent_hlist_put_cpu(event, cpu);
-}
+static struct lock_class_key cpuctx_mutex;
+static struct lock_class_key cpuctx_lock;
-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
- int err = 0;
+ int cpu, ret;
- mutex_lock(&swhash->hlist_mutex);
+ mutex_lock(&pmus_lock);
+ ret = -ENOMEM;
+ pmu->pmu_disable_count = alloc_percpu(int);
+ if (!pmu->pmu_disable_count)
+ goto unlock;
- if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
- struct swevent_hlist *hlist;
+ pmu->type = -1;
+ if (!name)
+ goto skip_type;
+ pmu->name = name;
- hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
- if (!hlist) {
- err = -ENOMEM;
- goto exit;
+ if (type < 0) {
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
+ goto free_pdc;
}
- rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
- swhash->hlist_refcount++;
-exit:
- mutex_unlock(&swhash->hlist_mutex);
+ pmu->type = type;
- return err;
-}
-
-static int swevent_hlist_get(struct perf_event *event)
-{
- int err;
- int cpu, failed_cpu;
-
- get_online_cpus();
- for_each_possible_cpu(cpu) {
- err = swevent_hlist_get_cpu(event, cpu);
- if (err) {
- failed_cpu = cpu;
- goto fail;
- }
- }
- put_online_cpus();
-
- return 0;
-fail:
- for_each_possible_cpu(cpu) {
- if (cpu == failed_cpu)
- break;
- swevent_hlist_put_cpu(event, cpu);
- }
-
- put_online_cpus();
- return err;
-}
-
-struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
-
-static void sw_perf_event_destroy(struct perf_event *event)
-{
- u64 event_id = event->attr.config;
-
- WARN_ON(event->parent);
-
- static_key_slow_dec(&perf_swevent_enabled[event_id]);
- swevent_hlist_put(event);
-}
-
-static int perf_swevent_init(struct perf_event *event)
-{
- u64 event_id = event->attr.config;
-
- if (event->attr.type != PERF_TYPE_SOFTWARE)
- return -ENOENT;
-
- /*
- * no branch sampling for software events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- switch (event_id) {
- case PERF_COUNT_SW_CPU_CLOCK:
- case PERF_COUNT_SW_TASK_CLOCK:
- return -ENOENT;
-
- default:
- break;
- }
-
- if (event_id >= PERF_COUNT_SW_MAX)
- return -ENOENT;
-
- if (!event->parent) {
- int err;
-
- err = swevent_hlist_get(event);
- if (err)
- return err;
-
- static_key_slow_inc(&perf_swevent_enabled[event_id]);
- event->destroy = sw_perf_event_destroy;
- }
-
- return 0;
-}
-
-static struct pmu perf_swevent = {
- .task_ctx_nr = perf_sw_context,
-
- .capabilities = PERF_PMU_CAP_NO_NMI,
-
- .event_init = perf_swevent_init,
- .add = perf_swevent_add,
- .del = perf_swevent_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
-#ifdef CONFIG_EVENT_TRACING
-
-static int perf_tp_filter_match(struct perf_event *event,
- struct perf_sample_data *data)
-{
- void *record = data->raw->data;
-
- if (likely(!event->filter) || filter_match_preds(event->filter, record))
- return 1;
- return 0;
-}
-
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- if (event->hw.state & PERF_HES_STOPPED)
- return 0;
- /*
- * All tracepoints are from kernel-space.
- */
- if (event->attr.exclude_kernel)
- return 0;
-
- if (!perf_tp_filter_match(event, data))
- return 0;
-
- return 1;
-}
-
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
- struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
-{
- struct perf_sample_data data;
- struct perf_event *event;
-
- struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
- };
-
- perf_sample_data_init(&data, addr, 0);
- data.raw = &raw;
-
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
-
- /*
- * If we got specified a target task, also iterate its context and
- * deliver this event there too.
- */
- if (task && task != current) {
- struct perf_event_context *ctx;
- struct trace_entry *entry = record;
-
- rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
- if (!ctx)
- goto unlock;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
-unlock:
- rcu_read_unlock();
- }
-
- perf_swevent_put_recursion_context(rctx);
-}
-EXPORT_SYMBOL_GPL(perf_tp_event);
-
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
-static inline void perf_tp_register(void)
-{
- perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
-}
-
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- char *filter_str;
- int ret;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
-
- filter_str = strndup_user(arg, PAGE_SIZE);
- if (IS_ERR(filter_str))
- return PTR_ERR(filter_str);
-
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
- kfree(filter_str);
- return ret;
-}
-
-static void perf_event_free_filter(struct perf_event *event)
-{
- ftrace_profile_free_filter(event);
-}
-
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
-{
- struct bpf_prog *prog;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
-
- if (event->tp_event->prog)
- return -EEXIST;
-
- if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
- /* bpf programs can only be attached to kprobes */
- return -EINVAL;
-
- prog = bpf_prog_get(prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if (prog->type != BPF_PROG_TYPE_KPROBE) {
- /* valid fd, but invalid bpf program type */
- bpf_prog_put(prog);
- return -EINVAL;
- }
-
- event->tp_event->prog = prog;
-
- return 0;
-}
-
-static void perf_event_free_bpf_prog(struct perf_event *event)
-{
- struct bpf_prog *prog;
-
- if (!event->tp_event)
- return;
-
- prog = event->tp_event->prog;
- if (prog) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
- }
-}
-
-#else
-
-static inline void perf_tp_register(void)
-{
-}
-
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- return -ENOENT;
-}
-
-static void perf_event_free_filter(struct perf_event *event)
-{
-}
-
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
-{
- return -ENOENT;
-}
-
-static void perf_event_free_bpf_prog(struct perf_event *event)
-{
-}
-#endif /* CONFIG_EVENT_TRACING */
-
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-void perf_bp_event(struct perf_event *bp, void *data)
-{
- struct perf_sample_data sample;
- struct pt_regs *regs = data;
-
- perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
-
- if (!bp->hw.state && !perf_exclude_event(bp, regs))
- perf_swevent_event(bp, 1, &sample, regs);
-}
-#endif
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
- enum hrtimer_restart ret = HRTIMER_RESTART;
- struct perf_sample_data data;
- struct pt_regs *regs;
- struct perf_event *event;
- u64 period;
-
- event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-
- if (event->state != PERF_EVENT_STATE_ACTIVE)
- return HRTIMER_NORESTART;
-
- event->pmu->read(event);
-
- perf_sample_data_init(&data, 0, event->hw.last_period);
- regs = get_irq_regs();
-
- if (regs && !perf_exclude_event(event, regs)) {
- if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (__perf_event_overflow(event, 1, &data, regs))
- ret = HRTIMER_NORESTART;
- }
-
- period = max_t(u64, 10000, event->hw.sample_period);
- hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
- return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- s64 period;
-
- if (!is_sampling_event(event))
- return;
-
- period = local64_read(&hwc->period_left);
- if (period) {
- if (period < 0)
- period = 10000;
-
- local64_set(&hwc->period_left, 0);
- } else {
- period = max_t(u64, 10000, hwc->sample_period);
- }
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL_PINNED, 0);
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (is_sampling_event(event)) {
- ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
- local64_set(&hwc->period_left, ktime_to_ns(remaining));
-
- hrtimer_cancel(&hwc->hrtimer);
- }
-}
-
-static void perf_swevent_init_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (!is_sampling_event(event))
- return;
-
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
-
- /*
- * Since hrtimers have a fixed rate, we can do a static freq->period
- * mapping and avoid the whole period adjust feedback stuff.
- */
- if (event->attr.freq) {
- long freq = event->attr.sample_freq;
-
- event->attr.sample_period = NSEC_PER_SEC / freq;
- hwc->sample_period = event->attr.sample_period;
- local64_set(&hwc->period_left, hwc->sample_period);
- hwc->last_period = hwc->sample_period;
- event->attr.freq = 0;
- }
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_event_update(struct perf_event *event)
-{
- s64 prev;
- u64 now;
-
- now = local_clock();
- prev = local64_xchg(&event->hw.prev_count, now);
- local64_add(now - prev, &event->count);
-}
-
-static void cpu_clock_event_start(struct perf_event *event, int flags)
-{
- local64_set(&event->hw.prev_count, local_clock());
- perf_swevent_start_hrtimer(event);
-}
-
-static void cpu_clock_event_stop(struct perf_event *event, int flags)
-{
- perf_swevent_cancel_hrtimer(event);
- cpu_clock_event_update(event);
-}
-
-static int cpu_clock_event_add(struct perf_event *event, int flags)
-{
- if (flags & PERF_EF_START)
- cpu_clock_event_start(event, flags);
- perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void cpu_clock_event_del(struct perf_event *event, int flags)
-{
- cpu_clock_event_stop(event, flags);
-}
-
-static void cpu_clock_event_read(struct perf_event *event)
-{
- cpu_clock_event_update(event);
-}
-
-static int cpu_clock_event_init(struct perf_event *event)
-{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
- return -ENOENT;
-
- if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
- return -ENOENT;
-
- /*
- * no branch sampling for software events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- perf_swevent_init_hrtimer(event);
-
- return 0;
-}
-
-static struct pmu perf_cpu_clock = {
- .task_ctx_nr = perf_sw_context,
-
- .capabilities = PERF_PMU_CAP_NO_NMI,
-
- .event_init = cpu_clock_event_init,
- .add = cpu_clock_event_add,
- .del = cpu_clock_event_del,
- .start = cpu_clock_event_start,
- .stop = cpu_clock_event_stop,
- .read = cpu_clock_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_event_update(struct perf_event *event, u64 now)
-{
- u64 prev;
- s64 delta;
-
- prev = local64_xchg(&event->hw.prev_count, now);
- delta = now - prev;
- local64_add(delta, &event->count);
-}
-
-static void task_clock_event_start(struct perf_event *event, int flags)
-{
- local64_set(&event->hw.prev_count, event->ctx->time);
- perf_swevent_start_hrtimer(event);
-}
-
-static void task_clock_event_stop(struct perf_event *event, int flags)
-{
- perf_swevent_cancel_hrtimer(event);
- task_clock_event_update(event, event->ctx->time);
-}
-
-static int task_clock_event_add(struct perf_event *event, int flags)
-{
- if (flags & PERF_EF_START)
- task_clock_event_start(event, flags);
- perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void task_clock_event_del(struct perf_event *event, int flags)
-{
- task_clock_event_stop(event, PERF_EF_UPDATE);
-}
-
-static void task_clock_event_read(struct perf_event *event)
-{
- u64 now = perf_clock();
- u64 delta = now - event->ctx->timestamp;
- u64 time = event->ctx->time + delta;
-
- task_clock_event_update(event, time);
-}
-
-static int task_clock_event_init(struct perf_event *event)
-{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
- return -ENOENT;
-
- if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
- return -ENOENT;
-
- /*
- * no branch sampling for software events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- perf_swevent_init_hrtimer(event);
-
- return 0;
-}
-
-static struct pmu perf_task_clock = {
- .task_ctx_nr = perf_sw_context,
-
- .capabilities = PERF_PMU_CAP_NO_NMI,
-
- .event_init = task_clock_event_init,
- .add = task_clock_event_add,
- .del = task_clock_event_del,
- .start = task_clock_event_start,
- .stop = task_clock_event_stop,
- .read = task_clock_event_read,
-};
-
-static void perf_pmu_nop_void(struct pmu *pmu)
-{
-}
-
-static int perf_pmu_nop_int(struct pmu *pmu)
-{
- return 0;
-}
-
-static void perf_pmu_start_txn(struct pmu *pmu)
-{
- perf_pmu_disable(pmu);
-}
-
-static int perf_pmu_commit_txn(struct pmu *pmu)
-{
- perf_pmu_enable(pmu);
- return 0;
-}
-
-static void perf_pmu_cancel_txn(struct pmu *pmu)
-{
- perf_pmu_enable(pmu);
-}
-
-static int perf_event_idx_default(struct perf_event *event)
-{
- return 0;
-}
-
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
- if (cpuctx->unique_pmu == old_pmu)
- cpuctx->unique_pmu = pmu;
- }
-}
-
-static void free_pmu_context(struct pmu *pmu)
-{
- struct pmu *i;
-
- mutex_lock(&pmus_lock);
- /*
- * Like a real lame refcount.
- */
- list_for_each_entry(i, &pmus, entry) {
- if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
- update_pmu_context(i, pmu);
- goto out;
- }
- }
-
- free_percpu(pmu->pmu_cpu_context);
-out:
- mutex_unlock(&pmus_lock);
-}
-static struct idr pmu_idr;
-
-static ssize_t
-type_show(struct device *dev, struct device_attribute *attr, char *page)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
-
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
-}
-static DEVICE_ATTR_RO(type);
-
-static ssize_t
-perf_event_mux_interval_ms_show(struct device *dev,
- struct device_attribute *attr,
- char *page)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
-
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
-}
-
-static ssize_t
-perf_event_mux_interval_ms_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
- int timer, cpu, ret;
-
- ret = kstrtoint(buf, 0, &timer);
- if (ret)
- return ret;
-
- if (timer < 1)
- return -EINVAL;
-
- /* same value, noting to do */
- if (timer == pmu->hrtimer_interval_ms)
- return count;
-
- pmu->hrtimer_interval_ms = timer;
-
- /* update all cpuctx for this PMU */
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
-
- if (hrtimer_active(&cpuctx->hrtimer))
- hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
- }
-
- return count;
-}
-static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-
-static struct attribute *pmu_dev_attrs[] = {
- &dev_attr_type.attr,
- &dev_attr_perf_event_mux_interval_ms.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(pmu_dev);
-
-static int pmu_bus_running;
-static struct bus_type pmu_bus = {
- .name = "event_source",
- .dev_groups = pmu_dev_groups,
-};
-
-static void pmu_dev_release(struct device *dev)
-{
- kfree(dev);
-}
-
-static int pmu_dev_alloc(struct pmu *pmu)
-{
- int ret = -ENOMEM;
-
- pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
- if (!pmu->dev)
- goto out;
-
- pmu->dev->groups = pmu->attr_groups;
- device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
-
- dev_set_drvdata(pmu->dev, pmu);
- pmu->dev->bus = &pmu_bus;
- pmu->dev->release = pmu_dev_release;
- ret = device_add(pmu->dev);
- if (ret)
- goto free_dev;
-
-out:
- return ret;
-
-free_dev:
- put_device(pmu->dev);
- goto out;
-}
-
-static struct lock_class_key cpuctx_mutex;
-static struct lock_class_key cpuctx_lock;
-
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
-{
- int cpu, ret;
-
- mutex_lock(&pmus_lock);
- ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;
-
- pmu->type = -1;
- if (!name)
- goto skip_type;
- pmu->name = name;
-
- if (type < 0) {
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- if (type < 0) {
- ret = type;
- goto free_pdc;
- }
- }
- pmu->type = type;
-
- if (pmu_bus_running) {
- ret = pmu_dev_alloc(pmu);
- if (ret)
- goto free_idr;
- }
+ if (pmu_bus_running) {
+ ret = pmu_dev_alloc(pmu);
+ if (ret)
+ goto free_idr;
+ }
skip_type:
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
@@ -8808,30 +7866,10 @@ int perf_event_init_task(struct task_str
static void __init perf_event_init_all_cpus(void)
{
- struct swevent_htable *swhash;
int cpu;
- for_each_possible_cpu(cpu) {
- swhash = &per_cpu(swevent_htable, cpu);
- mutex_init(&swhash->hlist_mutex);
+ for_each_possible_cpu(cpu)
INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
- }
-}
-
-static void perf_event_init_cpu(int cpu)
-{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
- mutex_lock(&swhash->hlist_mutex);
- swhash->online = true;
- if (swhash->hlist_refcount > 0) {
- struct swevent_hlist *hlist;
-
- hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
- WARN_ON(!hlist);
- rcu_assign_pointer(swhash->swevent_hlist, hlist);
- }
- mutex_unlock(&swhash->hlist_mutex);
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
@@ -8862,20 +7900,8 @@ static void perf_event_exit_cpu_context(
}
srcu_read_unlock(&pmus_srcu, idx);
}
-
-static void perf_event_exit_cpu(int cpu)
-{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
- perf_event_exit_cpu_context(cpu);
-
- mutex_lock(&swhash->hlist_mutex);
- swhash->online = false;
- swevent_hlist_release(swhash);
- mutex_unlock(&swhash->hlist_mutex);
-}
#else
-static inline void perf_event_exit_cpu(int cpu) { }
+static inline void perf_event_exit_cpu_context(int cpu) { }
#endif
static int
@@ -8884,7 +7910,7 @@ perf_reboot(struct notifier_block *notif
int cpu;
for_each_online_cpu(cpu)
- perf_event_exit_cpu(cpu);
+ perf_event_exit_cpu_context(cpu);
return NOTIFY_OK;
}
@@ -8905,14 +7931,9 @@ perf_cpu_notify(struct notifier_block *s
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- case CPU_DOWN_FAILED:
- perf_event_init_cpu(cpu);
- break;
-
case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
- perf_event_exit_cpu(cpu);
+ perf_event_exit_cpu_context(cpu);
break;
default:
break;
@@ -8929,10 +7950,7 @@ void __init perf_event_init(void)
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
- perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
- perf_tp_register();
+ perf_swevent_register();
perf_cpu_notifier(perf_cpu_notify);
register_reboot_notifier(&perf_reboot_notifier);
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -228,4 +228,17 @@ static inline bool arch_perf_have_user_s
#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+#define MAX_INTERRUPTS (~0ULL)
+
+extern int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs);
+
+extern void perf_event_free_filter(struct perf_event *event);
+extern void perf_event_free_bpf_prog(struct perf_event *event);
+extern int perf_event_set_filter(struct perf_event *event, void __user *arg);
+extern int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+
+extern void perf_swevent_register(void);
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
--- /dev/null
+++ b/kernel/events/software.c
@@ -0,0 +1,1021 @@
+
+#include <linux/perf_event.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/ftrace_event.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+#include "internal.h"
+
+/*
+ * Generic software event infrastructure
+ */
+
+struct swevent_htable {
+ struct swevent_hlist *swevent_hlist;
+ struct mutex hlist_mutex;
+ int hlist_refcount;
+
+ /* Recursion avoidance in each context */
+ int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
+/*
+ * We directly increment event->count and keep a second value in
+ * event->hw.period_left to count intervals. This period value
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+u64 perf_swevent_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
+
+again:
+ old = val = local64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
+
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (local64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
+
+ return nr;
+}
+
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int throttle = 0;
+
+ if (!overflow)
+ overflow = perf_swevent_set_period(event);
+
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
+
+ for (; overflow; overflow--) {
+ if (__perf_event_overflow(event, throttle,
+ data, regs)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ throttle = 1;
+ }
+}
+
+static void perf_swevent_event(struct perf_event *event, u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ local64_add(nr, &event->count);
+
+ if (!regs)
+ return;
+
+ if (!is_sampling_event(event))
+ return;
+
+ if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+ data->period = nr;
+ return perf_swevent_overflow(event, 1, data, regs);
+ } else
+ data->period = event->hw.last_period;
+
+ if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+ return perf_swevent_overflow(event, 1, data, regs);
+
+ if (local64_add_negative(nr, &hwc->period_left))
+ return;
+
+ perf_swevent_overflow(event, 0, data, regs);
+}
+
+static int perf_exclude_event(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 1;
+
+ if (regs) {
+ if (event->attr.exclude_user && user_mode(regs))
+ return 1;
+
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return 1;
+ }
+
+ return 0;
+}
+
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+ struct perf_sample_data sample;
+ struct pt_regs *regs = data;
+
+ perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
+
+ if (!bp->hw.state && !perf_exclude_event(bp, regs))
+ perf_swevent_event(bp, 1, &sample, regs);
+}
+#endif
+
+static int perf_swevent_match(struct perf_event *event,
+ enum perf_type_id type,
+ u32 event_id,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->attr.type != type)
+ return 0;
+
+ if (event->attr.config != event_id)
+ return 0;
+
+ if (perf_exclude_event(event, regs))
+ return 0;
+
+ return 1;
+}
+
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+ u64 val = event_id | (type << 32);
+
+ return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+{
+ u64 hash = swevent_hash(type, event_id);
+
+ return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+{
+ struct swevent_hlist *hlist;
+
+ hlist = rcu_dereference(swhash->swevent_hlist);
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+{
+ struct swevent_hlist *hlist;
+ u32 event_id = event->attr.config;
+ u64 type = event->attr.type;
+
+ /*
+ * Event scheduling is always serialized against hlist allocation
+ * and release, which makes the protected version suitable here.
+ * The context lock guarantees that.
+ */
+ hlist = rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&event->ctx->lock));
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+ u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ struct perf_event *event;
+ struct hlist_head *head;
+
+ rcu_read_lock();
+ head = find_swevent_head_rcu(swhash, type, event_id);
+ if (!head)
+ goto end;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_swevent_match(event, type, event_id, data, regs))
+ perf_swevent_event(event, nr, data, regs);
+ }
+end:
+ rcu_read_unlock();
+}
+
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
+int perf_swevent_get_recursion_context(void)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+ return get_recursion_context(swhash->recursion);
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+inline void perf_swevent_put_recursion_context(int rctx)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+ put_recursion_context(swhash->recursion, rctx);
+}
+
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ struct perf_sample_data data;
+
+ if (WARN_ON_ONCE(!regs))
+ return;
+
+ perf_sample_data_init(&data, addr, 0);
+ do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ int rctx;
+
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+ if (unlikely(rctx < 0))
+ goto fail;
+
+ ___perf_sw_event(event_id, nr, regs, addr);
+
+ perf_swevent_put_recursion_context(rctx);
+fail:
+ preempt_enable_notrace();
+}
+
+static void perf_swevent_read(struct perf_event *event)
+{
+}
+
+static int perf_swevent_add(struct perf_event *event, int flags)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ struct hw_perf_event *hwc = &event->hw;
+ struct hlist_head *head;
+
+ if (is_sampling_event(event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(event);
+ }
+
+ hwc->state = !(flags & PERF_EF_START);
+
+ head = find_swevent_head(swhash, event);
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
+ return -EINVAL;
+ }
+
+ hlist_add_head_rcu(&event->hlist_entry, head);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void perf_swevent_del(struct perf_event *event, int flags)
+{
+ hlist_del_rcu(&event->hlist_entry);
+}
+
+static void perf_swevent_start(struct perf_event *event, int flags)
+{
+ event->hw.state = 0;
+}
+
+static void perf_swevent_stop(struct perf_event *event, int flags)
+{
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct swevent_htable *swhash)
+{
+ return rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&swhash->hlist_mutex));
+}
+
+static void swevent_hlist_release(struct swevent_htable *swhash)
+{
+ struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
+
+ if (!hlist)
+ return;
+
+ RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
+ kfree_rcu(hlist, rcu_head);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!--swhash->hlist_refcount)
+ swevent_hlist_release(swhash);
+
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ int err = 0;
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+ if (!hlist) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ swhash->hlist_refcount++;
+exit:
+ mutex_unlock(&swhash->hlist_mutex);
+
+ return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+ int err;
+ int cpu, failed_cpu;
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu) {
+ err = swevent_hlist_get_cpu(event, cpu);
+ if (err) {
+ failed_cpu = cpu;
+ goto fail;
+ }
+ }
+ put_online_cpus();
+
+ return 0;
+fail:
+ for_each_possible_cpu(cpu) {
+ if (cpu == failed_cpu)
+ break;
+ swevent_hlist_put_cpu(event, cpu);
+ }
+
+ put_online_cpus();
+ return err;
+}
+
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ WARN_ON(event->parent);
+
+ static_key_slow_dec(&perf_swevent_enabled[event_id]);
+ swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ switch (event_id) {
+ case PERF_COUNT_SW_CPU_CLOCK:
+ case PERF_COUNT_SW_TASK_CLOCK:
+ return -ENOENT;
+
+ default:
+ break;
+ }
+
+ if (event_id >= PERF_COUNT_SW_MAX)
+ return -ENOENT;
+
+ if (!event->parent) {
+ int err;
+
+ err = swevent_hlist_get(event);
+ if (err)
+ return err;
+
+ static_key_slow_inc(&perf_swevent_enabled[event_id]);
+ event->destroy = sw_perf_event_destroy;
+ }
+
+ return 0;
+}
+
+static struct pmu perf_swevent = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = perf_swevent_init,
+ .add = perf_swevent_add,
+ .del = perf_swevent_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
+#ifdef CONFIG_EVENT_TRACING
+
+static int perf_tp_filter_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ void *record = data->raw->data;
+
+ if (likely(!event->filter) || filter_match_preds(event->filter, record))
+ return 1;
+ return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
+ /*
+ * All tracepoints are from kernel-space.
+ */
+ if (event->attr.exclude_kernel)
+ return 0;
+
+ if (!perf_tp_filter_match(event, data))
+ return 0;
+
+ return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+ struct pt_regs *regs, struct hlist_head *head, int rctx,
+ struct task_struct *task)
+{
+ struct perf_sample_data data;
+ struct perf_event *event;
+
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
+ perf_sample_data_init(&data, addr, 0);
+ data.raw = &raw;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+
+ /*
+ * If a target task was specified, also iterate its context and
+ * deliver this event there too.
+ */
+ if (task && task != current) {
+ struct perf_event_context *ctx;
+ struct trace_entry *entry = record;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ if (!ctx)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+unlock:
+ rcu_read_unlock();
+ }
+
+ perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+}
+
+int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ char *filter_str;
+ int ret;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
+void perf_event_free_filter(struct perf_event *event)
+{
+ ftrace_profile_free_filter(event);
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ if (event->tp_event->prog)
+ return -EEXIST;
+
+ if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+ /* bpf programs can only be attached to kprobes */
+ return -EINVAL;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != BPF_PROG_TYPE_KPROBE) {
+ /* valid fd, but invalid bpf program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ event->tp_event->prog = prog;
+
+ return 0;
+}
+
+void perf_event_free_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog *prog;
+
+ if (!event->tp_event)
+ return;
+
+ prog = event->tp_event->prog;
+ if (prog) {
+ event->tp_event->prog = NULL;
+ bpf_prog_put(prog);
+ }
+}
+
+#else
+
+static inline void perf_tp_register(void)
+{
+}
+
+int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ return -ENOENT;
+}
+
+void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ return -ENOENT;
+}
+
+void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
+#endif /* CONFIG_EVENT_TRACING */
+
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct pt_regs *regs;
+ struct perf_event *event;
+ u64 period;
+
+ event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
+ event->pmu->read(event);
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ regs = get_irq_regs();
+
+ if (regs && !perf_exclude_event(event, regs)) {
+ if (!(event->attr.exclude_idle && is_idle_task(current)))
+ if (__perf_event_overflow(event, 1, &data, regs))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, event->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+}
+
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 period;
+
+ if (!is_sampling_event(event))
+ return;
+
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
+
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (is_sampling_event(event)) {
+ ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+ local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+ hrtimer_cancel(&hwc->hrtimer);
+ }
+}
+
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_sampling_event(event))
+ return;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
+ event->attr.freq = 0;
+ }
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
+{
+ s64 prev;
+ u64 now;
+
+ now = local_clock();
+ prev = local64_xchg(&event->hw.prev_count, now);
+ local64_add(now - prev, &event->count);
+}
+
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, local_clock());
+ perf_swevent_start_hrtimer(event);
+}
+
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ cpu_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+ cpu_clock_event_stop(event, flags);
+}
+
+static void cpu_clock_event_read(struct perf_event *event)
+{
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_cpu_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = cpu_clock_event_init,
+ .add = cpu_clock_event_add,
+ .del = cpu_clock_event_del,
+ .start = cpu_clock_event_start,
+ .stop = cpu_clock_event_stop,
+ .read = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+ u64 prev;
+ s64 delta;
+
+ prev = local64_xchg(&event->hw.prev_count, now);
+ delta = now - prev;
+ local64_add(delta, &event->count);
+}
+
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, event->ctx->time);
+ perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ task_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+ task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+ u64 now = local_clock(); /* XXX */
+ u64 delta = now - event->ctx->timestamp;
+ u64 time = event->ctx->time + delta;
+
+ task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_task_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = task_clock_event_init,
+ .add = task_clock_event_add,
+ .del = task_clock_event_del,
+ .start = task_clock_event_start,
+ .stop = task_clock_event_stop,
+ .read = task_clock_event_read,
+};
+
+static void __init perf_swevent_init_all_cpus(void)
+{
+ struct swevent_htable *swhash;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ swhash = &per_cpu(swevent_htable, cpu);
+ mutex_init(&swhash->hlist_mutex);
+ }
+}
+
+static void perf_swevent_init_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
+ if (swhash->hlist_refcount > 0) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+ WARN_ON(!hlist);
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void perf_swevent_exit_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
+ swevent_hlist_release(swhash);
+ mutex_unlock(&swhash->hlist_mutex);
+}
+#else
+static inline void perf_swevent_exit_cpu(int cpu) { }
+#endif
+
+static int
+perf_swevent_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_FAILED:
+ perf_swevent_init_cpu(cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_PREPARE:
+ perf_swevent_exit_cpu(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+__init void perf_swevent_register(void)
+{
+ perf_swevent_init_all_cpus();
+
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, NULL, -1);
+ perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_tp_register();
+
+ perf_cpu_notifier(perf_swevent_notify);
+}
+
^ permalink raw reply [flat|nested] 9+ messages in thread