perf_event_mux_interval_ms_store() tries to apply an updated hrtimer_interval to all possible cpus in a completely unsafe way. The changelog of the offending commit says: "In the 5th version, we handle the reprogramming of the hrtimer using hrtimer_forward_now(). That way, we sync up to new timer value quickly (suggested by Jiri Olsa)." The hrtimer reprogramming is completely unrelated to hrtimer_forward_now(). hrtimer_forward_now() merily forwards the expiry time of the hrtimer past now and that's where things get really bad in this code: The forward function is called on enqueued timers. That means the update of the expiry time corrupts the ordering of the hrtimer rbtree. The proper way to update hrtimers on remote cpus is to use a smp function call on all online cpus and perform the update locally by canceling the timer, forwarding and restarting it. Fixes: 62b856397927 "perf: Add sysfs entry to adjust multiplexing interval per PMU" Signed-off-by: Thomas Gleixner Cc: Stephane Eranian Cc: Jiri Olsa Cc: stable@vger.kernel.org Cc: Arnaldo Carvalho de Melo --- kernel/events/core.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) Index: linux/kernel/events/core.c =================================================================== --- linux.orig/kernel/events/core.c +++ linux/kernel/events/core.c @@ -6827,13 +6827,27 @@ perf_event_mux_interval_ms_show(struct d return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); } +static void perf_event_mux_update_interval(void *info) +{ + struct pmu *pmu = info; + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + int was_armed = hrtimer_cancel(&cpuctx->hrtimer); + + /* Update the interval and restart the timer with the new interval */ + cpuctx->hrtimer_interval = ms_to_ktime(pmu->hrtimer_interval_ms); + if (was_armed) { + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + hrtimer_start_expires(&cpuctx->hrtimer, HRTIMER_MODE_ABS); + } +} + static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pmu *pmu = dev_get_drvdata(dev); - int timer, cpu, ret; + int timer, ret; ret = kstrtoint(buf, 0, &timer); if (ret) @@ -6842,22 +6856,12 @@ perf_event_mux_interval_ms_store(struct if (timer < 1) return -EINVAL; - /* same value, noting to do */ - if (timer == pmu->hrtimer_interval_ms) - return count; - - pmu->hrtimer_interval_ms = timer; - - /* update all cpuctx for this PMU */ - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - - if (hrtimer_active(&cpuctx->hrtimer)) - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + if (timer != pmu->hrtimer_interval_ms) { + get_online_cpus(); + pmu->hrtimer_interval_ms = timer; + on_each_cpu(perf_event_mux_update_interval, pmu, 1); + put_online_cpus(); } - return count; } static DEVICE_ATTR_RW(perf_event_mux_interval_ms);