In order to allow optimizing perf_pmu_sched_task() we must ensure that
perf_sched_cb_{inc,dec} are no longer called from NMI context; this
means that pmu::{start,stop}() can no longer use them.

Prepare for this by reworking the whole large PEBS setup code.

The current code relied on the cpuc->pebs_enabled state; however, since
that reflects the current active state as per pmu::{start,stop}(), we
can no longer rely on it.

Introduce two counters: cpuc->n_pebs, which counts the total number of
PEBS events, and cpuc->n_large_pebs, which counts the number of PEBS
events that have FREERUNNING set. With these we can tell whether the
current setup requires a single-record interrupt threshold or can use
a larger buffer.

This also improves the code in that it re-enables the large threshold
once the PEBS event that required single-record mode gets removed.

Signed-off-by: Peter Zijlstra (Intel)
---
 arch/x86/events/intel/ds.c   | 96 +++++++++++++++++++++++++++----------------
 arch/x86/events/perf_event.h |  2 +
 kernel/events/core.c         |  4 +
 3 files changed, 67 insertions(+), 35 deletions(-)

--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -806,9 +806,45 @@ struct event_constraint *intel_pebs_cons
 	return &emptyconstraint;
 }
 
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
 {
-	return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+	return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+	struct debug_store *ds = cpuc->ds;
+	u64 threshold;
+
+	if (cpuc->n_pebs == cpuc->n_large_pebs) {
+		threshold = ds->pebs_absolute_maximum -
+			x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+	} else {
+		threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+	}
+
+	ds->pebs_interrupt_threshold = threshold;
+}
+
+static void intel_pmu_pebs_add(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	bool needs_cb = pebs_needs_sched_cb(cpuc);
+
+	cpuc->n_pebs++;
+	if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+		cpuc->n_large_pebs++;
+
+	if (!needs_cb && pebs_needs_sched_cb(cpuc))
+		perf_sched_cb_inc(event->ctx->pmu);
+
+	pebs_update_threshold(cpuc);
 }
 
 void intel_pmu_pebs_enable(struct perf_event *event)
@@ -816,12 +852,11 @@ void intel_pmu_pebs_enable(struct perf_e
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	struct debug_store *ds = cpuc->ds;
-	bool first_pebs;
-	u64 threshold;
+
+	intel_pmu_pebs_add(event);
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
-	first_pebs = !pebs_is_enabled(cpuc);
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 
 	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -830,46 +865,38 @@ void intel_pmu_pebs_enable(struct perf_e
 		cpuc->pebs_enabled |= 1ULL << 63;
 
 	/*
-	 * When the event is constrained enough we can use a larger
-	 * threshold and run the event with less frequent PMI.
+	 * Use auto-reload if possible to save a MSR write in the PMI.
+	 * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
 	 */
-	if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
-		threshold = ds->pebs_absolute_maximum -
-			x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
-		if (first_pebs)
-			perf_sched_cb_inc(event->ctx->pmu);
-	} else {
-		threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-		/*
-		 * If not all events can use larger buffer,
-		 * roll back to threshold = 1
-		 */
-		if (!first_pebs &&
-		    (ds->pebs_interrupt_threshold > threshold))
-			perf_sched_cb_dec(event->ctx->pmu);
-	}
-
-	/* Use auto-reload if possible to save a MSR write in the PMI */
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 		ds->pebs_event_reset[hwc->idx] =
 			(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
 	}
+}
 
-	if (first_pebs || ds->pebs_interrupt_threshold > threshold)
-		ds->pebs_interrupt_threshold = threshold;
+static void intel_pmu_pebs_del(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	bool needs_cb = pebs_needs_sched_cb(cpuc);
+
+	cpuc->n_pebs--;
+	if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+		cpuc->n_large_pebs--;
+
+	if (needs_cb && !pebs_needs_sched_cb(cpuc))
+		perf_sched_cb_dec(event->ctx->pmu);
+
+	if (cpuc->n_pebs)
+		pebs_update_threshold(cpuc);
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	struct debug_store *ds = cpuc->ds;
-	bool large_pebs = ds->pebs_interrupt_threshold >
-		ds->pebs_buffer_base + x86_pmu.pebs_record_size;
 
-	if (large_pebs)
+	if (cpuc->n_pebs == cpuc->n_large_pebs)
 		intel_pmu_drain_pebs_buffer();
 
 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -879,13 +906,12 @@ void intel_pmu_pebs_disable(struct perf_
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled &= ~(1ULL << 63);
 
-	if (large_pebs && !pebs_is_enabled(cpuc))
-		perf_sched_cb_dec(event->ctx->pmu);
-
 	if (cpuc->enabled)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+	intel_pmu_pebs_del(event);
 }
 
 void intel_pmu_pebs_enable_all(void)
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -194,6 +194,8 @@ struct cpu_hw_events {
 	 */
 	struct debug_store	*ds;
 	u64			pebs_enabled;
+	int			n_pebs;
+	int			n_large_pebs;
 
 	/*
 	 * Intel LBR bits
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2791,6 +2791,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 /*
  * This function provides the context switch callback to the lower code
  * layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
  */
 static void perf_pmu_sched_task(struct task_struct *prev,
 				struct task_struct *next,
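
For anyone who wants to poke at the counting scheme in isolation, below is a
minimal userspace C model of the above. It is not kernel code: model_cpuc,
pebs_add()/pebs_del() and all the constants are made up for illustration. It
mirrors intel_pmu_pebs_add()/intel_pmu_pebs_del(): the large threshold is
chosen only while every PEBS event has FREERUNNING set, perf_sched_cb_inc()
is modeled as firing when an add makes that condition true, and the dec when
a del makes it false. The demo sequence shows the behaviour the changelog
advertises: adding a plain PEBS event drops the threshold to a single record,
and removing it restores the large one.

/* Userspace model -- every name and constant here is hypothetical. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BUF_BASE	0x1000ULL	/* stands in for ds->pebs_buffer_base */
#define BUF_MAX		0x9000ULL	/* stands in for ds->pebs_absolute_maximum */
#define RECORD_SIZE	0x100ULL	/* stands in for x86_pmu.pebs_record_size */
#define MAX_EVENTS	8		/* stands in for x86_pmu.max_pebs_events */

struct model_cpuc {
	int n_pebs;		/* all PEBS events on this CPU */
	int n_large_pebs;	/* those with FREERUNNING set */
	int sched_cb;		/* models the perf_sched_cb_{inc,dec} count */
	uint64_t threshold;	/* models ds->pebs_interrupt_threshold */
};

/* The large buffer is only usable when *every* PEBS event allows it. */
static bool needs_sched_cb(struct model_cpuc *c)
{
	return c->n_pebs && (c->n_pebs == c->n_large_pebs);
}

static void update_threshold(struct model_cpuc *c)
{
	if (c->n_pebs == c->n_large_pebs)
		c->threshold = BUF_MAX - MAX_EVENTS * RECORD_SIZE;
	else
		c->threshold = BUF_BASE + RECORD_SIZE;	/* single record */
}

static void pebs_add(struct model_cpuc *c, bool freerunning)
{
	bool needs_cb = needs_sched_cb(c);

	c->n_pebs++;
	if (freerunning)
		c->n_large_pebs++;

	if (!needs_cb && needs_sched_cb(c))
		c->sched_cb++;

	update_threshold(c);
}

static void pebs_del(struct model_cpuc *c, bool freerunning)
{
	bool needs_cb = needs_sched_cb(c);

	c->n_pebs--;
	if (freerunning)
		c->n_large_pebs--;

	if (needs_cb && !needs_sched_cb(c))
		c->sched_cb--;

	if (c->n_pebs)
		update_threshold(c);
}

static void show(struct model_cpuc *c, const char *what)
{
	printf("%-16s threshold=%#6llx sched_cb=%d\n",
	       what, (unsigned long long)c->threshold, c->sched_cb);
}

int main(void)
{
	struct model_cpuc c = { 0, 0, 0, 0 };

	pebs_add(&c, true);	/* all large: big threshold */
	show(&c, "add FREERUNNING");
	pebs_add(&c, false);	/* mixed: falls back to single record */
	show(&c, "add plain PEBS");
	pebs_del(&c, false);	/* plain event gone: large again */
	show(&c, "del plain PEBS");
	return 0;
}

Compiled and run (e.g. gcc pebs_model.c && ./a.out), the threshold goes
large -> single record -> large across the three steps. Keying the state off
the two explicit counters, rather than off cpuc->pebs_enabled or the current
threshold value as the old code did, is what lets the del path restore the
large threshold: once the single-record event is gone, n_pebs ==
n_large_pebs holds again and pebs_update_threshold() picks the big buffer.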