Date: Wed, 25 Jan 2017 16:23:54 +0100
From: Peter Zijlstra
To: David Carrillo-Cisneros
Cc: linux-kernel, x86@kernel.org, Ingo Molnar, Thomas Gleixner, Andi Kleen,
    Kan Liang, Borislav Petkov, Srinivas Pandruvada, Dave Hansen,
    Vikas Shivappa, Mark Rutland, Arnaldo Carvalho de Melo, Vince Weaver,
    Paul Turner, Stephane Eranian
Subject: Re: [PATCH v2 2/2] perf/core: Remove perf_cpu_context::unique_pmu
Message-ID: <20170125152354.GP6500@twins.programming.kicks-ass.net>
References: <20170118192454.58008-1-davidcc@google.com>
 <20170118192454.58008-3-davidcc@google.com>
 <20170120092033.GK6515@twins.programming.kicks-ass.net>

On Fri, Jan 20, 2017 at 12:30:38PM -0800, David Carrillo-Cisneros wrote:
> On Fri, Jan 20, 2017 at 1:20 AM, Peter Zijlstra wrote:
> > On Wed, Jan 18, 2017 at 11:24:54AM -0800, David Carrillo-Cisneros wrote:
> >> cpuctx->unique_pmu was originally introduced as a way to identify cpuctxs
> >> with shared pmus in order to avoid visiting the same cpuctx more than once
> >> in a for_each_pmu loop.
> >>
> >> cpuctx->unique_pmu == cpuctx->pmu in non-software task contexts since they
> >> have only one pmu per cpuctx. Since perf_pmu_sched_task is only called in
> >> hw contexts, this patch replaces cpuctx->unique_pmu by cpuctx->pmu in it.
> >>
> >> The change above, together with the previous patch in this series, removed
> >> the remaining uses of cpuctx->unique_pmu, so we remove it altogether.
> >>
> >> Signed-off-by: David Carrillo-Cisneros
> >> Acked-by: Mark Rutland
> >
> > This very much relies on us never calling perf_pmu_unregister() on the
> > software PMUs afaict. A condition not mentioned in the Changelog.
>
> What's a good way to solve this? Update the Changelog or add code to
> update ctx->pmu?

I think just update the Changelog and maybe put a comment near
perf_pmu_register() and/or the sw pmu abuse that relies on this.

> This issue would go away cleanly if we were to remove the context
> sharing across pmu's. Would you support work in that direction?

It's something that I've considered; the trivial solution is folding it
all into the one swevent pmu by adding a switch in all the
add/del/start/stop/read methods. It's a wee bit ugly but straightforward.

I've not really found anything less ugly though, and I have to fully
admit the current situation is rather vile.

I also just found the below patch that I've had bitrotting since 2015.
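To make the switch idea above concrete, something like the below would be
the shape of it -- a completely untested sketch, separate from the old
patch that follows; sw_event_kind() and the perf_sw_*() wrappers are names
made up for illustration, the callees are the existing per-pmu methods:

	/* Made-up classification of an already-initialized software event. */
	enum sw_event_kind {
		SW_KIND_SWEVENT,
		SW_KIND_CPU_CLOCK,
		SW_KIND_TASK_CLOCK,
		SW_KIND_TRACEPOINT,
	};

	static enum sw_event_kind sw_event_kind(struct perf_event *event)
	{
		if (event->attr.type == PERF_TYPE_TRACEPOINT)
			return SW_KIND_TRACEPOINT;
		if (event->attr.type == PERF_TYPE_SOFTWARE &&
		    event->attr.config == PERF_COUNT_SW_CPU_CLOCK)
			return SW_KIND_CPU_CLOCK;
		if (event->attr.type == PERF_TYPE_SOFTWARE &&
		    event->attr.config == PERF_COUNT_SW_TASK_CLOCK)
			return SW_KIND_TASK_CLOCK;
		return SW_KIND_SWEVENT;
	}

	static int perf_sw_add(struct perf_event *event, int flags)
	{
		/* Dispatch to the methods of what used to be separate pmus. */
		switch (sw_event_kind(event)) {
		case SW_KIND_TRACEPOINT:
			return perf_trace_add(event, flags);
		case SW_KIND_CPU_CLOCK:
			return cpu_clock_event_add(event, flags);
		case SW_KIND_TASK_CLOCK:
			return task_clock_event_add(event, flags);
		default:
			return perf_swevent_add(event, flags);
		}
	}

	/* .del/.start/.stop/.read/.event_init get the exact same treatment. */

	static struct pmu perf_software = {
		.task_ctx_nr	= perf_sw_context,

		.event_init	= perf_sw_event_init,	/* dispatches like perf_sw_add() */
		.add		= perf_sw_add,
		.del		= perf_sw_del,
		.start		= perf_sw_start,
		.stop		= perf_sw_stop,
		.read		= perf_sw_read,
	};

That would leave a single software pmu to register, so the context sharing
(and the unique_pmu question) goes away, but every method grows this table
of special cases, which is the ugly part.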
--- Subject: perf: Move all software PMUs into their own file From: Peter Zijlstra Date: Fri Apr 17 19:52:17 CEST 2015 Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/Makefile | 2 kernel/events/core.c | 1280 +++++------------------------------------------ kernel/events/internal.h | 13 kernel/events/software.c | 1021 +++++++++++++++++++++++++++++++++++++ 4 files changed, 1184 insertions(+), 1132 deletions(-) --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,7 +2,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) endif -obj-y := core.o ring_buffer.o callchain.o +obj-y := core.o software.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_UPROBES) += uprobes.o --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,14 +36,11 @@ #include #include #include -#include #include #include #include #include #include -#include -#include #include "internal.h" @@ -1828,8 +1825,6 @@ static void perf_set_shadow_time(struct event->shadow_ctx_time = tstamp - ctx->timestamp; } -#define MAX_INTERRUPTS (~0ULL) - static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); @@ -3411,9 +3406,6 @@ find_get_context(struct pmu *pmu, struct return ERR_PTR(err); } -static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); - static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; @@ -4020,8 +4012,6 @@ static inline int perf_fget_light(int fd static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); -static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -6036,9 +6026,9 @@ static void perf_log_itrace_start(struct * Generic event overflow handling, sampling. */ -static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) +int __perf_event_overflow(struct perf_event *event, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; @@ -6111,1155 +6101,223 @@ int perf_event_overflow(struct perf_even return __perf_event_overflow(event, 1, data, regs); } -/* - * Generic software event infrastructure - */ - -struct swevent_htable { - struct swevent_hlist *swevent_hlist; - struct mutex hlist_mutex; - int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; - - /* Keeps track of cpu being initialized/exited */ - bool online; -}; - -static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); - -/* - * We directly increment event->count and keep a second value in - * event->hw.period_left to count intervals. This period event - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. 
- */ - -u64 perf_swevent_set_period(struct perf_event *event) +static void perf_pmu_nop_void(struct pmu *pmu) { - struct hw_perf_event *hwc = &event->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = local64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (local64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; } -static void perf_swevent_overflow(struct perf_event *event, u64 overflow, - struct perf_sample_data *data, - struct pt_regs *regs) +static int perf_pmu_nop_int(struct pmu *pmu) { - struct hw_perf_event *hwc = &event->hw; - int throttle = 0; - - if (!overflow) - overflow = perf_swevent_set_period(event); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (__perf_event_overflow(event, throttle, - data, regs)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - throttle = 1; - } + return 0; } -static void perf_swevent_event(struct perf_event *event, u64 nr, - struct perf_sample_data *data, - struct pt_regs *regs) +static void perf_pmu_start_txn(struct pmu *pmu) { - struct hw_perf_event *hwc = &event->hw; - - local64_add(nr, &event->count); - - if (!regs) - return; - - if (!is_sampling_event(event)) - return; - - if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { - data->period = nr; - return perf_swevent_overflow(event, 1, data, regs); - } else - data->period = event->hw.last_period; - - if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) - return perf_swevent_overflow(event, 1, data, regs); - - if (local64_add_negative(nr, &hwc->period_left)) - return; - - perf_swevent_overflow(event, 0, data, regs); + perf_pmu_disable(pmu); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +static int perf_pmu_commit_txn(struct pmu *pmu) { - if (event->hw.state & PERF_HES_STOPPED) - return 1; - - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 1; - - if (event->attr.exclude_kernel && !user_mode(regs)) - return 1; - } - + perf_pmu_enable(pmu); return 0; } -static int perf_swevent_match(struct perf_event *event, - enum perf_type_id type, - u32 event_id, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->attr.type != type) - return 0; - - if (event->attr.config != event_id) - return 0; - - if (perf_exclude_event(event, regs)) - return 0; - - return 1; -} - -static inline u64 swevent_hash(u64 type, u32 event_id) +static void perf_pmu_cancel_txn(struct pmu *pmu) { - u64 val = event_id | (type << 32); - - return hash_64(val, SWEVENT_HLIST_BITS); + perf_pmu_enable(pmu); } -static inline struct hlist_head * -__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +static int perf_event_idx_default(struct perf_event *event) { - u64 hash = swevent_hash(type, event_id); - - return &hlist->heads[hash]; + return 0; } -/* For the read side: events when they trigger */ -static inline struct hlist_head * -find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) +/* + * Ensures all contexts with the same task_ctx_nr have the same + * pmu_cpu_context too. 
+ */ +static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) { - struct swevent_hlist *hlist; + struct pmu *pmu; - hlist = rcu_dereference(swhash->swevent_hlist); - if (!hlist) + if (ctxn < 0) return NULL; - return __find_swevent_head(hlist, type, event_id); + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; } -/* For the event head insertion and removal in the hlist */ -static inline struct hlist_head * -find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) +static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) { - struct swevent_hlist *hlist; - u32 event_id = event->attr.config; - u64 type = event->attr.type; - - /* - * Event scheduling is always serialized against hlist allocation - * and release. Which makes the protected version suitable here. - * The context lock guarantees that. - */ - hlist = rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&event->ctx->lock)); - if (!hlist) - return NULL; + int cpu; - return __find_swevent_head(hlist, type, event_id); -} + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - struct perf_event *event; - struct hlist_head *head; + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - rcu_read_lock(); - head = find_swevent_head_rcu(swhash, type, event_id); - if (!head) - goto end; - - hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_event(event, nr, data, regs); + if (cpuctx->unique_pmu == old_pmu) + cpuctx->unique_pmu = pmu; } -end: - rcu_read_unlock(); } -DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); - -int perf_swevent_get_recursion_context(void) +static void free_pmu_context(struct pmu *pmu) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - return get_recursion_context(swhash->recursion); -} -EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); + struct pmu *i; -inline void perf_swevent_put_recursion_context(int rctx) -{ - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); + mutex_lock(&pmus_lock); + /* + * Like a real lame refcount. 
+ */ + list_for_each_entry(i, &pmus, entry) { + if (i->pmu_cpu_context == pmu->pmu_cpu_context) { + update_pmu_context(i, pmu); + goto out; + } + } - put_recursion_context(swhash->recursion, rctx); + free_percpu(pmu->pmu_cpu_context); +out: + mutex_unlock(&pmus_lock); } +static struct idr pmu_idr; -void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *page) { - struct perf_sample_data data; - - if (WARN_ON_ONCE(!regs)) - return; + struct pmu *pmu = dev_get_drvdata(dev); - perf_sample_data_init(&data, addr, 0); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } +static DEVICE_ATTR_RO(type); -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +static ssize_t +perf_event_mux_interval_ms_show(struct device *dev, + struct device_attribute *attr, + char *page) { - int rctx; - - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (unlikely(rctx < 0)) - goto fail; - - ___perf_sw_event(event_id, nr, regs, addr); + struct pmu *pmu = dev_get_drvdata(dev); - perf_swevent_put_recursion_context(rctx); -fail: - preempt_enable_notrace(); + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); } -static void perf_swevent_read(struct perf_event *event) +static ssize_t +perf_event_mux_interval_ms_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { -} + struct pmu *pmu = dev_get_drvdata(dev); + int timer, cpu, ret; -static int perf_swevent_add(struct perf_event *event, int flags) -{ - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - struct hw_perf_event *hwc = &event->hw; - struct hlist_head *head; + ret = kstrtoint(buf, 0, &timer); + if (ret) + return ret; - if (is_sampling_event(event)) { - hwc->last_period = hwc->sample_period; - perf_swevent_set_period(event); - } + if (timer < 1) + return -EINVAL; - hwc->state = !(flags & PERF_EF_START); + /* same value, noting to do */ + if (timer == pmu->hrtimer_interval_ms) + return count; - head = find_swevent_head(swhash, event); - if (!head) { - /* - * We can race with cpu hotplug code. Do not - * WARN if the cpu just got unplugged. 
- */ - WARN_ON_ONCE(swhash->online); - return -EINVAL; - } + pmu->hrtimer_interval_ms = timer; - hlist_add_head_rcu(&event->hlist_entry, head); - perf_event_update_userpage(event); + /* update all cpuctx for this PMU */ + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - return 0; -} + if (hrtimer_active(&cpuctx->hrtimer)) + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + } -static void perf_swevent_del(struct perf_event *event, int flags) -{ - hlist_del_rcu(&event->hlist_entry); + return count; } +static DEVICE_ATTR_RW(perf_event_mux_interval_ms); -static void perf_swevent_start(struct perf_event *event, int flags) -{ - event->hw.state = 0; -} +static struct attribute *pmu_dev_attrs[] = { + &dev_attr_type.attr, + &dev_attr_perf_event_mux_interval_ms.attr, + NULL, +}; +ATTRIBUTE_GROUPS(pmu_dev); -static void perf_swevent_stop(struct perf_event *event, int flags) -{ - event->hw.state = PERF_HES_STOPPED; -} +static int pmu_bus_running; +static struct bus_type pmu_bus = { + .name = "event_source", + .dev_groups = pmu_dev_groups, +}; -/* Deref the hlist from the update side */ -static inline struct swevent_hlist * -swevent_hlist_deref(struct swevent_htable *swhash) +static void pmu_dev_release(struct device *dev) { - return rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&swhash->hlist_mutex)); + kfree(dev); } -static void swevent_hlist_release(struct swevent_htable *swhash) +static int pmu_dev_alloc(struct pmu *pmu) { - struct swevent_hlist *hlist = swevent_hlist_deref(swhash); - - if (!hlist) - return; + int ret = -ENOMEM; - RCU_INIT_POINTER(swhash->swevent_hlist, NULL); - kfree_rcu(hlist, rcu_head); -} + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!pmu->dev) + goto out; -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + pmu->dev->groups = pmu->attr_groups; + device_initialize(pmu->dev); + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; - mutex_lock(&swhash->hlist_mutex); + dev_set_drvdata(pmu->dev, pmu); + pmu->dev->bus = &pmu_bus; + pmu->dev->release = pmu_dev_release; + ret = device_add(pmu->dev); + if (ret) + goto free_dev; - if (!--swhash->hlist_refcount) - swevent_hlist_release(swhash); +out: + return ret; - mutex_unlock(&swhash->hlist_mutex); +free_dev: + put_device(pmu->dev); + goto out; } -static void swevent_hlist_put(struct perf_event *event) -{ - int cpu; - - for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); -} +static struct lock_class_key cpuctx_mutex; +static struct lock_class_key cpuctx_lock; -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +int perf_pmu_register(struct pmu *pmu, const char *name, int type) { - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - int err = 0; + int cpu, ret; - mutex_lock(&swhash->hlist_mutex); + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { - struct swevent_hlist *hlist; + pmu->type = -1; + if (!name) + goto skip_type; + pmu->name = name; - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - if (!hlist) { - err = -ENOMEM; - goto exit; + if (type < 0) { + type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); + if (type < 0) { + ret = type; + goto free_pdc; 
} - rcu_assign_pointer(swhash->swevent_hlist, hlist); } - swhash->hlist_refcount++; -exit: - mutex_unlock(&swhash->hlist_mutex); + pmu->type = type; - return err; -} - -static int swevent_hlist_get(struct perf_event *event) -{ - int err; - int cpu, failed_cpu; - - get_online_cpus(); - for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); - if (err) { - failed_cpu = cpu; - goto fail; - } - } - put_online_cpus(); - - return 0; -fail: - for_each_possible_cpu(cpu) { - if (cpu == failed_cpu) - break; - swevent_hlist_put_cpu(event, cpu); - } - - put_online_cpus(); - return err; -} - -struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_event_destroy(struct perf_event *event) -{ - u64 event_id = event->attr.config; - - WARN_ON(event->parent); - - static_key_slow_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); -} - -static int perf_swevent_init(struct perf_event *event) -{ - u64 event_id = event->attr.config; - - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - /* - * no branch sampling for software events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - case PERF_COUNT_SW_TASK_CLOCK: - return -ENOENT; - - default: - break; - } - - if (event_id >= PERF_COUNT_SW_MAX) - return -ENOENT; - - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return err; - - static_key_slow_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; - } - - return 0; -} - -static struct pmu perf_swevent = { - .task_ctx_nr = perf_sw_context, - - .capabilities = PERF_PMU_CAP_NO_NMI, - - .event_init = perf_swevent_init, - .add = perf_swevent_add, - .del = perf_swevent_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -#ifdef CONFIG_EVENT_TRACING - -static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 0; - /* - * All tracepoints are from kernel-space. - */ - if (event->attr.exclude_kernel) - return 0; - - if (!perf_tp_filter_match(event, data)) - return 0; - - return 1; -} - -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task) -{ - struct perf_sample_data data; - struct perf_event *event; - - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr, 0); - data.raw = &raw; - - hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } - - /* - * If we got specified a target task, also iterate its context and - * deliver this event there too. 
- */ - if (task && task != current) { - struct perf_event_context *ctx; - struct trace_entry *entry = record; - - rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); - if (!ctx) - goto unlock; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->attr.type != PERF_TYPE_TRACEPOINT) - continue; - if (event->attr.config != entry->type) - continue; - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } -unlock: - rcu_read_unlock(); - } - - perf_swevent_put_recursion_context(rctx); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - /* - * no branch sampling for tracepoint events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -static inline void perf_tp_register(void) -{ - perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - -static void perf_event_free_filter(struct perf_event *event) -{ - ftrace_profile_free_filter(event); -} - -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) -{ - struct bpf_prog *prog; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - if (event->tp_event->prog) - return -EEXIST; - - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ - return -EINVAL; - - prog = bpf_prog_get(prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - if (prog->type != BPF_PROG_TYPE_KPROBE) { - /* valid fd, but invalid bpf program type */ - bpf_prog_put(prog); - return -EINVAL; - } - - event->tp_event->prog = prog; - - return 0; -} - -static void perf_event_free_bpf_prog(struct perf_event *event) -{ - struct bpf_prog *prog; - - if (!event->tp_event) - return; - - prog = event->tp_event->prog; - if (prog) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); - } -} - -#else - -static inline void perf_tp_register(void) -{ -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - -static void perf_event_free_filter(struct perf_event *event) -{ -} - -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) -{ - return -ENOENT; -} - -static void perf_event_free_bpf_prog(struct perf_event *event) -{ -} -#endif /* CONFIG_EVENT_TRACING */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -void perf_bp_event(struct perf_event *bp, void *data) -{ - struct perf_sample_data sample; - struct pt_regs *regs = data; - - perf_sample_data_init(&sample, bp->attr.bp_addr, 0); - - if (!bp->hw.state && !perf_exclude_event(bp, regs)) - 
perf_swevent_event(bp, 1, &sample, regs); -} -#endif - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return HRTIMER_NORESTART; - - event->pmu->read(event); - - perf_sample_data_init(&data, 0, event->hw.last_period); - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && is_idle_task(current))) - if (__perf_event_overflow(event, 1, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -static void perf_swevent_start_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - s64 period; - - if (!is_sampling_event(event)) - return; - - period = local64_read(&hwc->period_left); - if (period) { - if (period < 0) - period = 10000; - - local64_set(&hwc->period_left, 0); - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (is_sampling_event(event)) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - local64_set(&hwc->period_left, ktime_to_ns(remaining)); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -static void perf_swevent_init_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (!is_sampling_event(event)) - return; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - - /* - * Since hrtimers have a fixed rate, we can do a static freq->period - * mapping and avoid the whole period adjust feedback stuff. 
- */ - if (event->attr.freq) { - long freq = event->attr.sample_freq; - - event->attr.sample_period = NSEC_PER_SEC / freq; - hwc->sample_period = event->attr.sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - hwc->last_period = hwc->sample_period; - event->attr.freq = 0; - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_event_update(struct perf_event *event) -{ - s64 prev; - u64 now; - - now = local_clock(); - prev = local64_xchg(&event->hw.prev_count, now); - local64_add(now - prev, &event->count); -} - -static void cpu_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, local_clock()); - perf_swevent_start_hrtimer(event); -} - -static void cpu_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); -} - -static int cpu_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - cpu_clock_event_start(event, flags); - perf_event_update_userpage(event); - - return 0; -} - -static void cpu_clock_event_del(struct perf_event *event, int flags) -{ - cpu_clock_event_stop(event, flags); -} - -static void cpu_clock_event_read(struct perf_event *event) -{ - cpu_clock_event_update(event); -} - -static int cpu_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) - return -ENOENT; - - /* - * no branch sampling for software events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_cpu_clock = { - .task_ctx_nr = perf_sw_context, - - .capabilities = PERF_PMU_CAP_NO_NMI, - - .event_init = cpu_clock_event_init, - .add = cpu_clock_event_add, - .del = cpu_clock_event_del, - .start = cpu_clock_event_start, - .stop = cpu_clock_event_stop, - .read = cpu_clock_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = local64_xchg(&event->hw.prev_count, now); - delta = now - prev; - local64_add(delta, &event->count); -} - -static void task_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, event->ctx->time); - perf_swevent_start_hrtimer(event); -} - -static void task_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); -} - -static int task_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - task_clock_event_start(event, flags); - perf_event_update_userpage(event); - - return 0; -} - -static void task_clock_event_del(struct perf_event *event, int flags) -{ - task_clock_event_stop(event, PERF_EF_UPDATE); -} - -static void task_clock_event_read(struct perf_event *event) -{ - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; - - task_clock_event_update(event, time); -} - -static int task_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) - return -ENOENT; - - /* - * no branch sampling for software events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_task_clock = { - .task_ctx_nr = 
perf_sw_context, - - .capabilities = PERF_PMU_CAP_NO_NMI, - - .event_init = task_clock_event_init, - .add = task_clock_event_add, - .del = task_clock_event_del, - .start = task_clock_event_start, - .stop = task_clock_event_stop, - .read = task_clock_event_read, -}; - -static void perf_pmu_nop_void(struct pmu *pmu) -{ -} - -static int perf_pmu_nop_int(struct pmu *pmu) -{ - return 0; -} - -static void perf_pmu_start_txn(struct pmu *pmu) -{ - perf_pmu_disable(pmu); -} - -static int perf_pmu_commit_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); - return 0; -} - -static void perf_pmu_cancel_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); -} - -static int perf_event_idx_default(struct perf_event *event) -{ - return 0; -} - -/* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. - */ -static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->unique_pmu == old_pmu) - cpuctx->unique_pmu = pmu; - } -} - -static void free_pmu_context(struct pmu *pmu) -{ - struct pmu *i; - - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. - */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - - free_percpu(pmu->pmu_cpu_context); -out: - mutex_unlock(&pmus_lock); -} -static struct idr pmu_idr; - -static ssize_t -type_show(struct device *dev, struct device_attribute *attr, char *page) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); -} -static DEVICE_ATTR_RO(type); - -static ssize_t -perf_event_mux_interval_ms_show(struct device *dev, - struct device_attribute *attr, - char *page) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); -} - -static ssize_t -perf_event_mux_interval_ms_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct pmu *pmu = dev_get_drvdata(dev); - int timer, cpu, ret; - - ret = kstrtoint(buf, 0, &timer); - if (ret) - return ret; - - if (timer < 1) - return -EINVAL; - - /* same value, noting to do */ - if (timer == pmu->hrtimer_interval_ms) - return count; - - pmu->hrtimer_interval_ms = timer; - - /* update all cpuctx for this PMU */ - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - - if (hrtimer_active(&cpuctx->hrtimer)) - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); - } - - return count; -} -static DEVICE_ATTR_RW(perf_event_mux_interval_ms); - -static struct attribute *pmu_dev_attrs[] = { - &dev_attr_type.attr, - &dev_attr_perf_event_mux_interval_ms.attr, - NULL, -}; -ATTRIBUTE_GROUPS(pmu_dev); - -static int pmu_bus_running; -static struct bus_type pmu_bus = { - .name = "event_source", - .dev_groups = pmu_dev_groups, -}; - -static void pmu_dev_release(struct device *dev) -{ - kfree(dev); -} - -static int pmu_dev_alloc(struct pmu *pmu) -{ - int ret = -ENOMEM; - - pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if 
(!pmu->dev) - goto out; - - pmu->dev->groups = pmu->attr_groups; - device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; - - dev_set_drvdata(pmu->dev, pmu); - pmu->dev->bus = &pmu_bus; - pmu->dev->release = pmu_dev_release; - ret = device_add(pmu->dev); - if (ret) - goto free_dev; - -out: - return ret; - -free_dev: - put_device(pmu->dev); - goto out; -} - -static struct lock_class_key cpuctx_mutex; -static struct lock_class_key cpuctx_lock; - -int perf_pmu_register(struct pmu *pmu, const char *name, int type) -{ - int cpu, ret; - - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; - - pmu->type = -1; - if (!name) - goto skip_type; - pmu->name = name; - - if (type < 0) { - type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); - if (type < 0) { - ret = type; - goto free_pdc; - } - } - pmu->type = type; - - if (pmu_bus_running) { - ret = pmu_dev_alloc(pmu); - if (ret) - goto free_idr; - } + if (pmu_bus_running) { + ret = pmu_dev_alloc(pmu); + if (ret) + goto free_idr; + } skip_type: pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); @@ -8808,30 +7866,10 @@ int perf_event_init_task(struct task_str static void __init perf_event_init_all_cpus(void) { - struct swevent_htable *swhash; int cpu; - for_each_possible_cpu(cpu) { - swhash = &per_cpu(swevent_htable, cpu); - mutex_init(&swhash->hlist_mutex); + for_each_possible_cpu(cpu) INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); - } -} - -static void perf_event_init_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - swhash->online = true; - if (swhash->hlist_refcount > 0) { - struct swevent_hlist *hlist; - - hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); - WARN_ON(!hlist); - rcu_assign_pointer(swhash->swevent_hlist, hlist); - } - mutex_unlock(&swhash->hlist_mutex); } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC @@ -8862,20 +7900,8 @@ static void perf_event_exit_cpu_context( } srcu_read_unlock(&pmus_srcu, idx); } - -static void perf_event_exit_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - perf_event_exit_cpu_context(cpu); - - mutex_lock(&swhash->hlist_mutex); - swhash->online = false; - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); -} #else -static inline void perf_event_exit_cpu(int cpu) { } +static inline void perf_event_exit_cpu_context(int cpu) { } #endif static int @@ -8884,7 +7910,7 @@ perf_reboot(struct notifier_block *notif int cpu; for_each_online_cpu(cpu) - perf_event_exit_cpu(cpu); + perf_event_exit_cpu_context(cpu); return NOTIFY_OK; } @@ -8905,14 +7931,9 @@ perf_cpu_notify(struct notifier_block *s switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: - perf_event_init_cpu(cpu); - break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: - perf_event_exit_cpu(cpu); + perf_event_exit_cpu_context(cpu); break; default: break; @@ -8929,10 +7950,7 @@ void __init perf_event_init(void) perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); - perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); - perf_tp_register(); + perf_swevent_register(); perf_cpu_notifier(perf_cpu_notify); register_reboot_notifier(&perf_reboot_notifier); --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -228,4 
+228,17 @@ static inline bool arch_perf_have_user_s #define perf_user_stack_pointer(regs) 0 #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ +#define MAX_INTERRUPTS (~0ULL) + +extern int __perf_event_overflow(struct perf_event *event, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs); + +extern void perf_event_free_filter(struct perf_event *event); +extern void perf_event_free_bpf_prog(struct perf_event *event); +extern int perf_event_set_filter(struct perf_event *event, void __user *arg); +extern int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); + +extern void perf_swevent_register(void); + #endif /* _KERNEL_EVENTS_INTERNAL_H */ --- /dev/null +++ b/kernel/events/software.c @@ -0,0 +1,1021 @@ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Generic software event infrastructure + */ + +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; + + /* Keeps track of cpu being initialized/exited */ + bool online; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = local64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (local64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + + if (!overflow) + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. 
+ */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_event(struct perf_event *event, u64 nr, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + local64_add(nr, &event->count); + + if (!regs) + return; + + if (!is_sampling_event(event)) + return; + + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { + data->period = nr; + return perf_swevent_overflow(event, 1, data, regs); + } else + data->period = event->hw.last_period; + + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, data, regs); + + if (local64_add_negative(nr, &hwc->period_left)) + return; + + perf_swevent_overflow(event, 0, data, regs); +} + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 1; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + perf_sample_data_init(&sample, bp->attr.bp_addr, 0); + + if (!bp->hw.state && !perf_exclude_event(bp, regs)) + perf_swevent_event(bp, 1, &sample, regs); +} +#endif + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (event->attr.type != type) + return 0; + + if (event->attr.config != event_id) + return 0; + + if (perf_exclude_event(event, regs)) + return 0; + + return 1; +} + +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ + u64 hash = swevent_hash(type, event_id); + + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; + + hlist = rcu_dereference(swhash->swevent_hlist); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) +{ + struct swevent_hlist *hlist; + u32 event_id = event->attr.config; + u64 type = event->attr.type; + + /* + * Event scheduling is always serialized against hlist allocation + * and release. Which makes the protected version suitable here. + * The context lock guarantees that. 
+ */ + hlist = rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&event->ctx->lock)); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); + struct perf_event *event; + struct hlist_head *head; + + rcu_read_lock(); + head = find_swevent_head_rcu(swhash, type, event_id); + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, head, hlist_entry) { + if (perf_swevent_match(event, type, event_id, data, regs)) + perf_swevent_event(event, nr, data, regs); + } +end: + rcu_read_unlock(); +} + +DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); + +int perf_swevent_get_recursion_context(void) +{ + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); + + return get_recursion_context(swhash->recursion); +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); + +inline void perf_swevent_put_recursion_context(int rctx) +{ + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); + + put_recursion_context(swhash->recursion, rctx); +} + +void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data; + + if (WARN_ON_ONCE(!regs)) + return; + + perf_sample_data_init(&data, addr, 0); + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); +} + +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (unlikely(rctx < 0)) + goto fail; + + ___perf_sw_event(event_id, nr, regs, addr); + + perf_swevent_put_recursion_context(rctx); +fail: + preempt_enable_notrace(); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_add(struct perf_event *event, int flags) +{ + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); + struct hw_perf_event *hwc = &event->hw; + struct hlist_head *head; + + if (is_sampling_event(event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + + hwc->state = !(flags & PERF_EF_START); + + head = find_swevent_head(swhash, event); + if (!head) { + /* + * We can race with cpu hotplug code. Do not + * WARN if the cpu just got unplugged. 
+ */ + WARN_ON_ONCE(swhash->online); + return -EINVAL; + } + + hlist_add_head_rcu(&event->hlist_entry, head); + perf_event_update_userpage(event); + + return 0; +} + +static void perf_swevent_del(struct perf_event *event, int flags) +{ + hlist_del_rcu(&event->hlist_entry); +} + +static void perf_swevent_start(struct perf_event *event, int flags) +{ + event->hw.state = 0; +} + +static void perf_swevent_stop(struct perf_event *event, int flags) +{ + event->hw.state = PERF_HES_STOPPED; +} + +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct swevent_htable *swhash) +{ + return rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&swhash->hlist_mutex)); +} + +static void swevent_hlist_release(struct swevent_htable *swhash) +{ + struct swevent_hlist *hlist = swevent_hlist_deref(swhash); + + if (!hlist) + return; + + RCU_INIT_POINTER(swhash->swevent_hlist, NULL); + kfree_rcu(hlist, rcu_head); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + + if (!--swhash->hlist_refcount) + swevent_hlist_release(swhash); + + mutex_unlock(&swhash->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + int err = 0; + + mutex_lock(&swhash->hlist_mutex); + + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + swhash->hlist_refcount++; +exit: + mutex_unlock(&swhash->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; +fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + static_key_slow_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; + + default: + break; + } + + if (event_id >= PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return err; + + static_key_slow_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, + + .capabilities = PERF_PMU_CAP_NO_NMI, + + .event_init = perf_swevent_init, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = 
perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +#ifdef CONFIG_EVENT_TRACING + +static int perf_tp_filter_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 0; + /* + * All tracepoints are from kernel-space. + */ + if (event->attr.exclude_kernel) + return 0; + + if (!perf_tp_filter_match(event, data)) + return 0; + + return 1; +} + +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + struct pt_regs *regs, struct hlist_head *head, int rctx, + struct task_struct *task) +{ + struct perf_sample_data data; + struct perf_event *event; + + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr, 0); + data.raw = &raw; + + hlist_for_each_entry_rcu(event, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } + + /* + * If we got specified a target task, also iterate its context and + * deliver this event there too. + */ + if (task && task != current) { + struct perf_event_context *ctx; + struct trace_entry *entry = record; + + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + if (!ctx) + goto unlock; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->attr.type != PERF_TYPE_TRACEPOINT) + continue; + if (event->attr.config != entry->type) + continue; + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } +unlock: + rcu_read_unlock(); + } + + perf_swevent_put_recursion_context(rctx); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + err = perf_trace_init(event); + if (err) + return err; + + event->destroy = tp_perf_event_destroy; + + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +static inline void perf_tp_register(void) +{ + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +} + +int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + +int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & 
TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + +#else + +static inline void perf_tp_register(void) +{ +} + +int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +void perf_event_free_filter(struct perf_event *event) +{ +} + +int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +void perf_event_free_bpf_prog(struct perf_event *event) +{ +} +#endif /* CONFIG_EVENT_TRACING */ + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + + event->pmu->read(event); + + perf_sample_data_init(&data, 0, event->hw.last_period); + regs = get_irq_regs(); + + if (regs && !perf_exclude_event(event, regs)) { + if (!(event->attr.exclude_idle && is_idle_task(current))) + if (__perf_event_overflow(event, 1, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 period; + + if (!is_sampling_event(event)) + return; + + period = local64_read(&hwc->period_left); + if (period) { + if (period < 0) + period = 10000; + + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (is_sampling_event(event)) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + local64_set(&hwc->period_left, ktime_to_ns(remaining)); + + hrtimer_cancel(&hwc->hrtimer); + } +} + +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. 
+ */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; + event->attr.freq = 0; + } +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_event_update(struct perf_event *event) +{ + s64 prev; + u64 now; + + now = local_clock(); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); +} + +static void cpu_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, local_clock()); + perf_swevent_start_hrtimer(event); +} + +static void cpu_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + cpu_clock_event_update(event); +} + +static int cpu_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + cpu_clock_event_start(event, flags); + perf_event_update_userpage(event); + + return 0; +} + +static void cpu_clock_event_del(struct perf_event *event, int flags) +{ + cpu_clock_event_stop(event, flags); +} + +static void cpu_clock_event_read(struct perf_event *event) +{ + cpu_clock_event_update(event); +} + +static int cpu_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) + return -ENOENT; + + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + perf_swevent_init_hrtimer(event); + + return 0; +} + +static struct pmu perf_cpu_clock = { + .task_ctx_nr = perf_sw_context, + + .capabilities = PERF_PMU_CAP_NO_NMI, + + .event_init = cpu_clock_event_init, + .add = cpu_clock_event_add, + .del = cpu_clock_event_del, + .start = cpu_clock_event_start, + .stop = cpu_clock_event_stop, + .read = cpu_clock_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = local64_xchg(&event->hw.prev_count, now); + delta = now - prev; + local64_add(delta, &event->count); +} + +static void task_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, event->ctx->time); + perf_swevent_start_hrtimer(event); +} + +static void task_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + task_clock_event_update(event, event->ctx->time); +} + +static int task_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + task_clock_event_start(event, flags); + perf_event_update_userpage(event); + + return 0; +} + +static void task_clock_event_del(struct perf_event *event, int flags) +{ + task_clock_event_stop(event, PERF_EF_UPDATE); +} + +static void task_clock_event_read(struct perf_event *event) +{ + u64 now = local_clock(); /* XXX */ + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; + + task_clock_event_update(event, time); +} + +static int task_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) + return -ENOENT; + + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + perf_swevent_init_hrtimer(event); + + return 0; +} + +static struct pmu perf_task_clock = { + 
.task_ctx_nr = perf_sw_context, + + .capabilities = PERF_PMU_CAP_NO_NMI, + + .event_init = task_clock_event_init, + .add = task_clock_event_add, + .del = task_clock_event_del, + .start = task_clock_event_start, + .stop = task_clock_event_stop, + .read = task_clock_event_read, +}; + +static void __init perf_swevent_init_all_cpus(void) +{ + struct swevent_htable *swhash; + int cpu; + + for_each_possible_cpu(cpu) { + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + } +} + +static void perf_swevent_init_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + swhash->online = true; + if (swhash->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + mutex_unlock(&swhash->hlist_mutex); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void perf_swevent_exit_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + swhash->online = false; + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); +} +#else +static inline void perf_swevent_exit_cpu(int cpu) { } +#endif + +static int +perf_swevent_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + + case CPU_UP_PREPARE: + case CPU_DOWN_FAILED: + perf_swevent_init_cpu(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + perf_swevent_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +__init void perf_swevent_register(void) +{ + perf_swevent_init_all_cpus(); + + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); + perf_pmu_register(&perf_cpu_clock, NULL, -1); + perf_pmu_register(&perf_task_clock, NULL, -1); + perf_tp_register(); + + perf_cpu_notifier(perf_swevent_notify); +} +