All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
@ 2016-03-23 18:24 kan.liang
  2016-03-29 12:06 ` Peter Zijlstra
  2016-06-03 10:49 ` [tip:perf/core] perf/core: Optimize side-band event delivery tip-bot for Kan Liang
  0 siblings, 2 replies; 13+ messages in thread
From: kan.liang @ 2016-03-23 18:24 UTC (permalink / raw)
  To: peterz
  Cc: ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

perf_event_aux function goes through all pmus and all events in whatever
contexts to find the side-band event to output, which is unnecessary and
expensive.

For example, the brk test case in lkp triggers many mmap operations, at
the time, perf with cycles:pp is also running on the system. As a
result, many perf_event_aux are invoked, and each would search all pmus
and all events. If we enable the uncore support (even when uncore event
are not really used), dozens of uncore pmus will be added into pmus
list, which can significantly decrease brk_test's ops_per_sec. Based on
our test, the ops_per_sec without uncore patch is 2647573, while the
ops_per_sec with uncore patch is only 1768444, which is a 33.2%
reduction.

To get at the per-CPU side-band events, this patch puts the side-band
events into four categories, which are tracked by 4 per-cpu lists. It only
searches the events of interest in the masked categories.
To get at the per-task side-band events, each task context for the current
task will be searched, because we don't want to go update more global state
on context switch.

Reported-by: Huang, Ying <ying.huang@linux.intel.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@intel.com>

---

The V1 patch is "perf/core: find auxiliary events in running pmus list"
https://lkml.org/lkml/2016/2/24/961.
This V2 patch almost changes everything compared with V1.
The V2 patch is mainly based on Peter's suggestion. But I didn't rename
perf_event_aux to perf_event_sb. Because it looks there are many aux things
in the codes, e.g. AUX area in ring buffer. I'm not sure if we need to change
all aux to sb. We may do the rename later in separate patch.

 include/linux/perf_event.h |  26 +++++++++
 kernel/events/core.c       | 135 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 144 insertions(+), 17 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 15588d4..953113e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -437,6 +437,31 @@ struct swevent_hlist {
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+
+/*
+ * {mmap,mmap_data,mmap2} -> mmap
+ * {comm,comm_exec} -> comm
+ * task
+ * context_switch
+ */
+enum event_sb_channel {
+	sb_mmap = 0,
+	sb_comm,
+	sb_task,
+	sb_switch,
+
+	sb_nr,
+};
+
+#define IS_SB_MMAP(attr)		\
+	(attr.mmap || attr.mmap_data || attr.mmap2)
+#define IS_SB_COMM(attr)		\
+	(attr.comm || attr.comm_exec)
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -589,6 +614,7 @@ struct perf_event {
 	int				cgrp_defer_enabled;
 #endif
 
+	struct list_head		sb_list[sb_nr];
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbc..bff49d0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -333,6 +333,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events[sb_nr]);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -3560,6 +3561,37 @@ static void free_event_rcu(struct rcu_head *head)
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event, enum event_sb_channel sb)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events[sb], event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_del_rcu(&event->sb_list[sb]);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (IS_SB_MMAP(event->attr))
+		detach_sb_event(event, sb_mmap);
+
+	if (IS_SB_COMM(event->attr))
+		detach_sb_event(event, sb_comm);
+
+	if (event->attr.task)
+		detach_sb_event(event, sb_task);
+
+	if (event->attr.context_switch)
+		detach_sb_event(event, sb_switch);
+
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -3623,6 +3655,8 @@ static void unaccount_event(struct perf_event *event)
 	}
 
 	unaccount_event_cpu(event, event->cpu);
+
+	unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5720,13 +5754,41 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
 	rcu_read_unlock();
 }
 
+static void perf_event_sb_iterate(enum event_sb_channel sb,
+				  perf_event_aux_output_cb output,
+				  void *data)
+{
+	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events[sb]);
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &pel->list, sb_list[sb]) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		output(event, data);
+	}
+}
+
+static void perf_event_sb_mask(unsigned int sb_mask,
+			       perf_event_aux_output_cb output,
+			       void *data)
+{
+	int sb;
+
+	for (sb = 0; sb < sb_nr; sb++) {
+		if (!(sb_mask & (1 << sb)))
+			continue;
+		perf_event_sb_iterate(sb, output, data);
+	}
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
-	       struct perf_event_context *task_ctx)
+	       struct perf_event_context *task_ctx,
+	       unsigned int sb_mask)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct pmu *pmu;
 	int ctxn;
 
 	/*
@@ -5741,21 +5803,17 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
 	}
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data);
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
+	preempt_disable();
+	perf_event_sb_mask(sb_mask, output, data);
+
+	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
 			perf_event_aux_ctx(ctx, output, data);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	preempt_enable();
 	rcu_read_unlock();
+
 }
 
 /*
@@ -5852,7 +5910,8 @@ static void perf_event_task(struct task_struct *task,
 
 	perf_event_aux(perf_event_task_output,
 		       &task_event,
-		       task_ctx);
+		       task_ctx,
+		       (1 << sb_task) | (1 << sb_mmap) | (1 << sb_comm));
 }
 
 void perf_event_fork(struct task_struct *task)
@@ -5931,7 +5990,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
 	perf_event_aux(perf_event_comm_output,
 		       comm_event,
-		       NULL);
+		       NULL,
+		       1 << sb_comm);
 }
 
 void perf_event_comm(struct task_struct *task, bool exec)
@@ -6162,7 +6222,8 @@ got_name:
 
 	perf_event_aux(perf_event_mmap_output,
 		       mmap_event,
-		       NULL);
+		       NULL,
+		       1 << sb_mmap);
 
 	kfree(buf);
 }
@@ -6350,7 +6411,8 @@ static void perf_event_switch(struct task_struct *task,
 
 	perf_event_aux(perf_event_switch_output,
 		       &switch_event,
-		       NULL);
+		       NULL,
+		       1 << sb_switch);
 }
 
 /*
@@ -7841,6 +7903,37 @@ unlock:
 	return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event, enum event_sb_channel sb)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events[sb], event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_add_rcu(&event->sb_list[sb], &pel->list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (IS_SB_MMAP(event->attr))
+		attach_sb_event(event, sb_mmap);
+
+	if (IS_SB_COMM(event->attr))
+		attach_sb_event(event, sb_comm);
+
+	if (event->attr.task)
+		attach_sb_event(event, sb_task);
+
+	if (event->attr.context_switch)
+		attach_sb_event(event, sb_switch);
+
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -7921,6 +8014,8 @@ static void account_event(struct perf_event *event)
 enabled:
 
 	account_event_cpu(event, event->cpu);
+
+	account_pmu_sb_event(event);
 }
 
 /*
@@ -7938,6 +8033,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	long err = -EINVAL;
+	int i;
 
 	if ((unsigned)cpu >= nr_cpu_ids) {
 		if (!task || cpu != -1)
@@ -7965,6 +8061,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->active_entry);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
+	for (i = 0; i < sb_nr; i++)
+		INIT_LIST_HEAD(&event->sb_list[i]);
 
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
@@ -9360,11 +9458,14 @@ static void __init perf_event_init_all_cpus(void)
 {
 	struct swevent_htable *swhash;
 	int cpu;
+	int i;
 
 	for_each_possible_cpu(cpu) {
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+		for (i = 0; i < sb_nr; i++)
+			INIT_LIST_HEAD(&per_cpu(pmu_sb_events[i].list, cpu));
 	}
 }
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-23 18:24 [PATCH V2 1/1] perf/core: don't find side-band event from all pmus kan.liang
@ 2016-03-29 12:06 ` Peter Zijlstra
  2016-03-31 14:05   ` Liang, Kan
  2016-03-31 14:44   ` Arnaldo Carvalho de Melo
  2016-06-03 10:49 ` [tip:perf/core] perf/core: Optimize side-band event delivery tip-bot for Kan Liang
  1 sibling, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2016-03-29 12:06 UTC (permalink / raw)
  To: kan.liang
  Cc: ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel

On Wed, Mar 23, 2016 at 11:24:37AM -0700, kan.liang@intel.com wrote:
> The V2 patch is mainly based on Peter's suggestion. But I didn't rename
> perf_event_aux to perf_event_sb. Because it looks there are many aux things
> in the codes, e.g. AUX area in ring buffer. I'm not sure if we need to change
> all aux to sb. We may do the rename later in separate patch.

Right.. no problem doing that in a separate patch.

> +static void perf_event_sb_mask(unsigned int sb_mask,
> +			       perf_event_aux_output_cb output,
> +			       void *data)
> +{
> +	int sb;
> +
> +	for (sb = 0; sb < sb_nr; sb++) {
> +		if (!(sb_mask & (1 << sb)))
> +			continue;
> +		perf_event_sb_iterate(sb, output, data);
> +	}
> +}

> @@ -5852,7 +5910,8 @@ static void perf_event_task(struct task_struct *task,
>  
>  	perf_event_aux(perf_event_task_output,
>  		       &task_event,
> -		       task_ctx);
> +		       task_ctx,
> +		       (1 << sb_task) | (1 << sb_mmap) | (1 << sb_comm));
>  }

So one side-effect of this change is that the above event can be
delivered 3 times if you're 'lucky'.

Acme; does userspace care?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-29 12:06 ` Peter Zijlstra
@ 2016-03-31 14:05   ` Liang, Kan
  2016-03-31 14:25     ` acme
  2016-03-31 14:44   ` Arnaldo Carvalho de Melo
  1 sibling, 1 reply; 13+ messages in thread
From: Liang, Kan @ 2016-03-31 14:05 UTC (permalink / raw)
  To: acme, Peter Zijlstra
  Cc: ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel



> > +static void perf_event_sb_mask(unsigned int sb_mask,
> > +			       perf_event_aux_output_cb output,
> > +			       void *data)
> > +{
> > +	int sb;
> > +
> > +	for (sb = 0; sb < sb_nr; sb++) {
> > +		if (!(sb_mask & (1 << sb)))
> > +			continue;
> > +		perf_event_sb_iterate(sb, output, data);
> > +	}
> > +}
> 
> > @@ -5852,7 +5910,8 @@ static void perf_event_task(struct task_struct
> > *task,
> >
> >  	perf_event_aux(perf_event_task_output,
> >  		       &task_event,
> > -		       task_ctx);
> > +		       task_ctx,
> > +		       (1 << sb_task) | (1 << sb_mmap) | (1 << sb_comm));
> >  }
> 
> So one side-effect of this change is that the above event can be delivered
> 3 times if you're 'lucky'.
> 
> Acme; does userspace care?

Hi Arnaldo,

Do you think if it's an issue for userspace?

Thanks,
Kan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-31 14:05   ` Liang, Kan
@ 2016-03-31 14:25     ` acme
  0 siblings, 0 replies; 13+ messages in thread
From: acme @ 2016-03-31 14:25 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, ak, eranian, vincent.weaver, tglx, mingo, acme,
	jolsa, alexander.shishkin, ying.huang, linux-kernel

Em Thu, Mar 31, 2016 at 02:05:46PM +0000, Liang, Kan escreveu:
> 
> 
> > > +static void perf_event_sb_mask(unsigned int sb_mask,
> > > +			       perf_event_aux_output_cb output,
> > > +			       void *data)
> > > +{
> > > +	int sb;
> > > +
> > > +	for (sb = 0; sb < sb_nr; sb++) {
> > > +		if (!(sb_mask & (1 << sb)))
> > > +			continue;
> > > +		perf_event_sb_iterate(sb, output, data);
> > > +	}
> > > +}
> > 
> > > @@ -5852,7 +5910,8 @@ static void perf_event_task(struct task_struct
> > > *task,
> > >
> > >  	perf_event_aux(perf_event_task_output,
> > >  		       &task_event,
> > > -		       task_ctx);
> > > +		       task_ctx,
> > > +		       (1 << sb_task) | (1 << sb_mmap) | (1 << sb_comm));
> > >  }
> > 
> > So one side-effect of this change is that the above event can be delivered
> > 3 times if you're 'lucky'.
> > 
> > Acme; does userspace care?
> 
> Hi Arnaldo,
> 
> Do you think if it's an issue for userspace?

Trying to get context and decode what you guys wrote...

- Arnaldo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-29 12:06 ` Peter Zijlstra
  2016-03-31 14:05   ` Liang, Kan
@ 2016-03-31 14:44   ` Arnaldo Carvalho de Melo
  2016-03-31 14:56     ` Peter Zijlstra
  1 sibling, 1 reply; 13+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-03-31 14:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kan.liang, ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel

Em Tue, Mar 29, 2016 at 02:06:09PM +0200, Peter Zijlstra escreveu:
> On Wed, Mar 23, 2016 at 11:24:37AM -0700, kan.liang@intel.com wrote:
> > The V2 patch is mainly based on Peter's suggestion. But I didn't rename
> > perf_event_aux to perf_event_sb. Because it looks there are many aux things
> > in the codes, e.g. AUX area in ring buffer. I'm not sure if we need to change
> > all aux to sb. We may do the rename later in separate patch.
> 
> Right.. no problem doing that in a separate patch.
> 
> > +static void perf_event_sb_mask(unsigned int sb_mask,
> > +			       perf_event_aux_output_cb output,
> > +			       void *data)
> > +{
> > +	int sb;
> > +
> > +	for (sb = 0; sb < sb_nr; sb++) {
> > +		if (!(sb_mask & (1 << sb)))
> > +			continue;
> > +		perf_event_sb_iterate(sb, output, data);
> > +	}
> > +}
> 
> > @@ -5852,7 +5910,8 @@ static void perf_event_task(struct task_struct *task,
> >  
> >  	perf_event_aux(perf_event_task_output,
> >  		       &task_event,
> > -		       task_ctx);
> > +		       task_ctx,
> > +		       (1 << sb_task) | (1 << sb_mmap) | (1 << sb_comm));
> >  }
> 
> So one side-effect of this change is that the above event can be
> delivered 3 times if you're 'lucky'.
> 
> Acme; does userspace care?

So, when processing a PERF_RECORD_FORK there is some sanity checks in
machine__process_fork_event() (tools/perf/util/machine.c), and one that
is affected by copies is:

        struct thread *thread = machine__find_thread(machine,
                                                     event->fork.pid,
                                                     event->fork.tid);
<SNIP>
        /* if a thread currently exists for the thread id remove it */
        if (thread != NULL) {
                machine__remove_thread(machine, thread);
                thread__put(thread);
        }

        thread = machine__findnew_thread(machine, event->fork.pid,
                                         event->fork.tid);
<SNIP>

So we conceivably may end up with multiple 'struct thread' pointing to
the same kernel thread, being held as references somewhere (in a
hist_entry, for instance, if a PERF_RECORD_SAMPLE happens mid sentence).

It probably will cope, but can't we just emit one single record?

PERF_RECORD_EXIT are ok:

int machine__process_exit_event(struct machine *machine, union perf_event *event,
                                struct perf_sample *sample __maybe_unused)
{
        struct thread *thread = machine__find_thread(machine,
                                                     event->fork.pid,
                                                     event->fork.tid);

        if (dump_trace)
                perf_event__fprintf_task(event, stdout);

        if (thread != NULL) {
                thread__exited(thread);
                thread__put(thread);
        }

        return 0;
}

PERF_RECORD_COMM will unnecessarily add a new COMM to its list, leading tooling
to think that there was a prctl(PR_SET_NAME). This could conceivably be worked
around by checking if the "new" COMM is the current one; again, can't the kernel
emit just one record?

- Arnaldo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-31 14:44   ` Arnaldo Carvalho de Melo
@ 2016-03-31 14:56     ` Peter Zijlstra
  2016-03-31 16:21       ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2016-03-31 14:56 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: kan.liang, ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel

On Thu, Mar 31, 2016 at 11:44:39AM -0300, Arnaldo Carvalho de Melo wrote:
> It probably will cope, but can't we just emit one single record?

I'll try and figure something out...

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-31 14:56     ` Peter Zijlstra
@ 2016-03-31 16:21       ` Peter Zijlstra
  2016-03-31 16:22         ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2016-03-31 16:21 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: kan.liang, ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel

On Thu, Mar 31, 2016 at 04:56:21PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 31, 2016 at 11:44:39AM -0300, Arnaldo Carvalho de Melo wrote:
> > It probably will cope, but can't we just emit one single record?
> 
> I'll try and figure something out...

less clever but probably good enough..

---
Subject: perf/core: don't find side-band event from all pmus 
From: Kan Liang <kan.liang@intel.com>
Date: Wed, 23 Mar 2016 11:24:37 -0700

perf_event_aux function goes through all pmus and all events in whatever
contexts to find the side-band event to output, which is unnecessary and
expensive.

For example, the brk test case in lkp triggers many mmap operations, at
the time, perf with cycles:pp is also running on the system. As a
result, many perf_event_aux are invoked, and each would search all pmus
and all events. If we enable the uncore support (even when uncore event
are not really used), dozens of uncore pmus will be added into pmus
list, which can significantly decrease brk_test's ops_per_sec. Based on
our test, the ops_per_sec without uncore patch is 2647573, while the
ops_per_sec with uncore patch is only 1768444, which is a 33.2%
reduction.

To get at the per-CPU side-band events, this patch puts the side-band
events into four categories, which are tracked by 4 per-cpu lists. It only
searches the events of interest in the masked categories.
To get at the per-task side-band events, each task context for the current
task will be searched, because we don't want to go update more global state
on context switch.


Cc: vincent.weaver@maine.edu
Cc: mingo@kernel.org
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: ak@linux.intel.com
Cc: tglx@linutronix.de
Cc: eranian@google.com
Cc: alexander.shishkin@linux.intel.com
Reported-by: Huang, Ying <ying.huang@linux.intel.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-kan.liang@intel.com
---

The V1 patch is "perf/core: find auxiliary events in running pmus list"
https://lkml.org/lkml/2016/2/24/961.
This V2 patch almost changes everything compared with V1.
The V2 patch is mainly based on Peter's suggestion. But I didn't rename
perf_event_aux to perf_event_sb. Because it looks there are many aux things
in the codes, e.g. AUX area in ring buffer. I'm not sure if we need to change
all aux to sb. We may do the rename later in separate patch.

 include/linux/perf_event.h |   26 ++++++++
 kernel/events/core.c       |  134 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 143 insertions(+), 17 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -437,6 +437,31 @@ struct swevent_hlist {
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+
+/*
+ * {mmap,mmap_data,mmap2} -> mmap
+ * {comm,comm_exec} -> comm
+ * task
+ * context_switch
+ */
+enum event_sb_channel {
+	sb_mmap = 0,
+	sb_comm,
+	sb_task,
+	sb_switch,
+
+	sb_nr,
+};
+
+#define IS_SB_MMAP(attr)		\
+	(attr.mmap || attr.mmap_data || attr.mmap2)
+#define IS_SB_COMM(attr)		\
+	(attr.comm || attr.comm_exec)
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -589,6 +614,7 @@ struct perf_event {
 	int				cgrp_defer_enabled;
 #endif
 
+	struct list_head		sb_list[sb_nr];
 #endif /* CONFIG_PERF_EVENTS */
 };
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -333,6 +333,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events[sb_nr]);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -3598,6 +3599,37 @@ static void free_event_rcu(struct rcu_he
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event, enum event_sb_channel sb)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events[sb], event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_del_rcu(&event->sb_list[sb]);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (IS_SB_MMAP(event->attr))
+		detach_sb_event(event, sb_mmap);
+
+	if (IS_SB_COMM(event->attr))
+		detach_sb_event(event, sb_comm);
+
+	if (event->attr.task)
+		detach_sb_event(event, sb_task);
+
+	if (event->attr.context_switch)
+		detach_sb_event(event, sb_switch);
+
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -3661,6 +3693,8 @@ static void unaccount_event(struct perf_
 	}
 
 	unaccount_event_cpu(event, event->cpu);
+
+	unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5785,13 +5819,41 @@ perf_event_aux_task_ctx(perf_event_aux_o
 	rcu_read_unlock();
 }
 
+static void perf_event_sb_iterate(enum event_sb_channel sb,
+				  perf_event_aux_output_cb output,
+				  void *data)
+{
+	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events[sb]);
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &pel->list, sb_list[sb]) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		output(event, data);
+	}
+}
+
+static void perf_event_sb_mask(unsigned int sb_mask,
+			       perf_event_aux_output_cb output,
+			       void *data)
+{
+	int sb;
+
+	for (sb = 0; sb < sb_nr; sb++) {
+		if (!(sb_mask & (1 << sb)))
+			continue;
+		perf_event_sb_iterate(sb, output, data);
+	}
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
-	       struct perf_event_context *task_ctx)
+	       struct perf_event_context *task_ctx,
+	       unsigned int sb_mask)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct pmu *pmu;
 	int ctxn;
 
 	/*
@@ -5806,20 +5868,15 @@ perf_event_aux(perf_event_aux_output_cb
 	}
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data);
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
+	preempt_disable();
+	perf_event_sb_mask(sb_mask, output, data);
+
+	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
 			perf_event_aux_ctx(ctx, output, data);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	preempt_enable();
 	rcu_read_unlock();
 }
 
@@ -5991,7 +6048,8 @@ static void perf_event_task(struct task_
 
 	perf_event_aux(perf_event_task_output,
 		       &task_event,
-		       task_ctx);
+		       task_ctx,
+		       (1 << sb_task) | (1 << sb_mmap) | (1 << sb_comm));
 }
 
 void perf_event_fork(struct task_struct *task)
@@ -6070,7 +6128,8 @@ static void perf_event_comm_event(struct
 
 	perf_event_aux(perf_event_comm_output,
 		       comm_event,
-		       NULL);
+		       NULL,
+		       1 << sb_comm);
 }
 
 void perf_event_comm(struct task_struct *task, bool exec)
@@ -6301,7 +6360,8 @@ static void perf_event_mmap_event(struct
 
 	perf_event_aux(perf_event_mmap_output,
 		       mmap_event,
-		       NULL);
+		       NULL,
+		       1 << sb_mmap);
 
 	kfree(buf);
 }
@@ -6489,7 +6549,8 @@ static void perf_event_switch(struct tas
 
 	perf_event_aux(perf_event_switch_output,
 		       &switch_event,
-		       NULL);
+		       NULL,
+		       1 << sb_switch);
 }
 
 /*
@@ -7986,6 +8047,37 @@ static struct pmu *perf_init_event(struc
 	return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event, enum event_sb_channel sb)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events[sb], event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_add_rcu(&event->sb_list[sb], &pel->list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (IS_SB_MMAP(event->attr))
+		attach_sb_event(event, sb_mmap);
+
+	if (IS_SB_COMM(event->attr))
+		attach_sb_event(event, sb_comm);
+
+	if (event->attr.task)
+		attach_sb_event(event, sb_task);
+
+	if (event->attr.context_switch)
+		attach_sb_event(event, sb_switch);
+
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -8066,6 +8158,8 @@ static void account_event(struct perf_ev
 enabled:
 
 	account_event_cpu(event, event->cpu);
+
+	account_pmu_sb_event(event);
 }
 
 /*
@@ -8083,6 +8177,7 @@ perf_event_alloc(struct perf_event_attr
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	long err = -EINVAL;
+	int i;
 
 	if ((unsigned)cpu >= nr_cpu_ids) {
 		if (!task || cpu != -1)
@@ -8110,6 +8205,8 @@ perf_event_alloc(struct perf_event_attr
 	INIT_LIST_HEAD(&event->active_entry);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
+	for (i = 0; i < sb_nr; i++)
+		INIT_LIST_HEAD(&event->sb_list[i]);
 
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
@@ -9511,11 +9608,14 @@ static void __init perf_event_init_all_c
 {
 	struct swevent_htable *swhash;
 	int cpu;
+	int i;
 
 	for_each_possible_cpu(cpu) {
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+		for (i = 0; i < sb_nr; i++)
+			INIT_LIST_HEAD(&per_cpu(pmu_sb_events[i].list, cpu));
 	}
 }
 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-31 16:21       ` Peter Zijlstra
@ 2016-03-31 16:22         ` Peter Zijlstra
  2016-04-06  8:09           ` Peter Zijlstra
  2016-05-12 13:30           ` Liang, Kan
  0 siblings, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2016-03-31 16:22 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: kan.liang, ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel

On Thu, Mar 31, 2016 at 06:21:41PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 31, 2016 at 04:56:21PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 31, 2016 at 11:44:39AM -0300, Arnaldo Carvalho de Melo wrote:
> > > It probably will cope, but can't we just emit one single record?
> > 
> > I'll try and figure something out...
> 
> less clever but probably good enough..

I'm an idiot; quilt refresh is needed..

---
Subject: perf/core: don't find side-band event from all pmus 
From: Kan Liang <kan.liang@intel.com>
Date: Wed, 23 Mar 2016 11:24:37 -0700

perf_event_aux function goes through all pmus and all events in whatever
contexts to find the side-band event to output, which is unnecessary and
expensive.

For example, the brk test case in lkp triggers many mmap operations, at
the time, perf with cycles:pp is also running on the system. As a
result, many perf_event_aux are invoked, and each would search all pmus
and all events. If we enable the uncore support (even when uncore event
are not really used), dozens of uncore pmus will be added into pmus
list, which can significantly decrease brk_test's ops_per_sec. Based on
our test, the ops_per_sec without uncore patch is 2647573, while the
ops_per_sec with uncore patch is only 1768444, which is a 33.2%
reduction.

To get at the per-CPU side-band events, this patch puts the side-band
events into four categories, which are tracked by 4 per-cpu lists. It only
searches the events of interest in the masked categories.
To get at the per-task side-band events, each task context for the current
task will be searched, because we don't want to go update more global state
on context switch.


Cc: vincent.weaver@maine.edu
Cc: mingo@kernel.org
Cc: acme@redhat.com
Cc: ak@linux.intel.com
Cc: jolsa@redhat.com
Cc: tglx@linutronix.de
Cc: eranian@google.com
Cc: alexander.shishkin@linux.intel.com
Reported-by: Huang, Ying <ying.huang@linux.intel.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-kan.liang@intel.com
---

 include/linux/perf_event.h |    6 +++
 kernel/events/core.c       |   87 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 80 insertions(+), 13 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -437,6 +437,11 @@ struct swevent_hlist {
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -589,6 +594,7 @@ struct perf_event {
 	int				cgrp_defer_enabled;
 #endif
 
+	struct list_head		sb_list;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -333,6 +333,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -3598,6 +3599,26 @@ static void free_event_rcu(struct rcu_he
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_del_rcu(&event->sb_list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -3661,6 +3682,8 @@ static void unaccount_event(struct perf_
 	}
 
 	unaccount_event_cpu(event, event->cpu);
+
+	unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5785,13 +5808,25 @@ perf_event_aux_task_ctx(perf_event_aux_o
 	rcu_read_unlock();
 }
 
+static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data)
+{
+	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		output(event, data);
+	}
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
 	       struct perf_event_context *task_ctx)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct pmu *pmu;
 	int ctxn;
 
 	/*
@@ -5806,20 +5841,15 @@ perf_event_aux(perf_event_aux_output_cb
 	}
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data);
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
+	preempt_disable();
+	perf_event_sb_iterate(output, data);
+
+	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
 			perf_event_aux_ctx(ctx, output, data);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	preempt_enable();
 	rcu_read_unlock();
 }
 
@@ -7986,6 +8016,32 @@ static struct pmu *perf_init_event(struc
 	return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_add_rcu(&event->sb_list, &pel->list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+	    attr->comm || attr->comm_exec ||
+	    attr->task ||
+	    attr->context_switch)
+		attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -8066,6 +8122,8 @@ static void account_event(struct perf_ev
 enabled:
 
 	account_event_cpu(event, event->cpu);
+
+	account_pmu_sb_event(event);
 }
 
 /*
@@ -8109,7 +8167,7 @@ perf_event_alloc(struct perf_event_attr
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
 	INIT_HLIST_NODE(&event->hlist_entry);
-
+	INIT_LIST_HEAD(&event->sb_list);
 
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
@@ -9516,6 +9574,9 @@ static void __init perf_event_init_all_c
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 	}
 }
 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-31 16:22         ` Peter Zijlstra
@ 2016-04-06  8:09           ` Peter Zijlstra
  2016-04-06 14:25             ` Liang, Kan
  2016-05-12 13:30           ` Liang, Kan
  1 sibling, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2016-04-06  8:09 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: kan.liang, ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel

On Thu, Mar 31, 2016 at 06:22:26PM +0200, Peter Zijlstra wrote:
> I'm an idiot; quilt refresh is needed..
> 
> ---
> Subject: perf/core: don't find side-band event from all pmus 
> From: Kan Liang <kan.liang@intel.com>
> Date: Wed, 23 Mar 2016 11:24:37 -0700
> 

Kan, does this patch work for you?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-04-06  8:09           ` Peter Zijlstra
@ 2016-04-06 14:25             ` Liang, Kan
  0 siblings, 0 replies; 13+ messages in thread
From: Liang, Kan @ 2016-04-06 14:25 UTC (permalink / raw)
  To: Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel



> -----Original Message-----
> From: Peter Zijlstra [mailto:peterz@infradead.org]
> Sent: Wednesday, April 06, 2016 4:10 AM
> To: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Liang, Kan <kan.liang@intel.com>; ak@linux.intel.com;
> eranian@google.com; vincent.weaver@maine.edu; tglx@linutronix.de;
> mingo@kernel.org; acme@redhat.com; jolsa@redhat.com;
> alexander.shishkin@linux.intel.com; ying.huang@linux.intel.com; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH V2 1/1] perf/core: don't find side-band event from all
> pmus
> 
> On Thu, Mar 31, 2016 at 06:22:26PM +0200, Peter Zijlstra wrote:
> > I'm an idiot; quilt refresh is needed..
> >
> > ---
> > Subject: perf/core: don't find side-band event from all pmus
> > From: Kan Liang <kan.liang@intel.com>
> > Date: Wed, 23 Mar 2016 11:24:37 -0700
> >
> 
> Kan, does this patch work for you?

Yes. I did some tests on my machine. It looks good.

Thanks,
Kan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-03-31 16:22         ` Peter Zijlstra
  2016-04-06  8:09           ` Peter Zijlstra
@ 2016-05-12 13:30           ` Liang, Kan
  2016-05-12 14:00             ` Peter Zijlstra
  1 sibling, 1 reply; 13+ messages in thread
From: Liang, Kan @ 2016-05-12 13:30 UTC (permalink / raw)
  To: Peter Zijlstra, Arnaldo Carvalho de Melo
  Cc: ak, eranian, vincent.weaver, tglx, mingo, acme, jolsa,
	alexander.shishkin, ying.huang, linux-kernel



> ---
> Subject: perf/core: don't find side-band event from all pmus
> From: Kan Liang <kan.liang@intel.com>
> Date: Wed, 23 Mar 2016 11:24:37 -0700
> 
Hi Peter,

Is there something wrong with the patch?
The last time I saw this patch was in your personal tree (kernel/git/peterz/queue.git).
But now I cannot find it anymore. 

Thanks,
Kan
> perf_event_aux function goes through all pmus and all events in whatever
> contexts to find the side-band event to output, which is unnecessary and
> expensive.
> 
> For example, the brk test case in lkp triggers many mmap operations, at the
> time, perf with cycles:pp is also running on the system. As a result, many
> perf_event_aux are invoked, and each would search all pmus and all events.
> If we enable the uncore support (even when uncore events are not really
> used), dozens of uncore pmus will be added into pmus list, which can
> significantly decrease brk_test's ops_per_sec. Based on our test, the
> ops_per_sec without uncore patch is 2647573, while the ops_per_sec with
> uncore patch is only 1768444, which is a 33.2% reduction.
> 
> To get at the per cpu side-band event, this patch puts the side-band events
> into four categories, which are tracked by 4 per-cpu lists. It only finds the
> events of interest in the masked category.
> To get at the per task side-band event, each task context for the current task
> will be searched, because we don't want to update more global state on
> context switch.
> 
> 
> Cc: vincent.weaver@maine.edu
> Cc: mingo@kernel.org
> Cc: acme@redhat.com
> Cc: ak@linux.intel.com
> Cc: jolsa@redhat.com
> Cc: tglx@linutronix.de
> Cc: eranian@google.com
> Cc: alexander.shishkin@linux.intel.com
> Reported-by: Huang, Ying <ying.huang@linux.intel.com>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Kan Liang <kan.liang@intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-
> kan.liang@intel.com 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
  2016-05-12 13:30           ` Liang, Kan
@ 2016-05-12 14:00             ` Peter Zijlstra
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2016-05-12 14:00 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Arnaldo Carvalho de Melo, ak, eranian, vincent.weaver, tglx,
	mingo, acme, jolsa, alexander.shishkin, ying.huang, linux-kernel

On Thu, May 12, 2016 at 01:30:36PM +0000, Liang, Kan wrote:
> 
> 
> > ---
> > Subject: perf/core: don't find side-band event from all pmus
> > From: Kan Liang <kan.liang@intel.com>
> > Date: Wed, 23 Mar 2016 11:24:37 -0700
> > 
> Hi Peter,
> 
> Is there something wrong with the patch?
> The last time I saw this patch was in your personal tree (kernel/git/peterz/queue.git).
> But now I cannot find it anymore. 

Hurm; I seem to have lost it, sorry about that.

Let me go find where its gone.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tip:perf/core] perf/core: Optimize side-band event delivery
  2016-03-23 18:24 [PATCH V2 1/1] perf/core: don't find side-band event from all pmus kan.liang
  2016-03-29 12:06 ` Peter Zijlstra
@ 2016-06-03 10:49 ` tip-bot for Kan Liang
  1 sibling, 0 replies; 13+ messages in thread
From: tip-bot for Kan Liang @ 2016-06-03 10:49 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, torvalds, ying.huang, jolsa, hpa, kan.liang,
	vincent.weaver, alexander.shishkin, linux-kernel, peterz, tglx,
	eranian, mingo

Commit-ID:  f2fb6bef92514432398a653df1c2f1041d79ac46
Gitweb:     http://git.kernel.org/tip/f2fb6bef92514432398a653df1c2f1041d79ac46
Author:     Kan Liang <kan.liang@intel.com>
AuthorDate: Wed, 23 Mar 2016 11:24:37 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 3 Jun 2016 09:40:15 +0200

perf/core: Optimize side-band event delivery

The perf_event_aux() function iterates all PMUs and all events in
their respective per-CPU contexts to find the events to deliver
side-band records to.

For example, the brk test case in lkp triggers many mmap() operations,
which, if we're also running perf, results in many perf_event_aux()
invocations.

If we enable uncore PMU support (even when uncore events are not used),
dozens of uncore PMUs will be iterated, which can significantly
decrease brk_test's throughput.

For example, the brk throughput:

  without uncore PMUs: 2647573 ops_per_sec
  with    uncore PMUs: 1768444 ops_per_sec

... a 33% reduction.

To get at the per-CPU events that need side-band records, this patch
puts these events on a per-CPU list, this avoids iterating the PMUs
and any events that do not need side-band records.

Per task events are unchanged to avoid extra overhead on the context
switch paths.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Huang, Ying <ying.huang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  6 ++++
 kernel/events/core.c       | 85 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0e43355..92e9ce7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -517,6 +517,11 @@ struct swevent_hlist {
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -675,6 +680,7 @@ struct perf_event {
 	int				cgrp_defer_enabled;
 #endif
 
+	struct list_head		sb_list;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 79363f2..6615c89 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -3665,6 +3666,26 @@ static void free_event_rcu(struct rcu_head *head)
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_del_rcu(&event->sb_list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -3728,6 +3749,8 @@ static void unaccount_event(struct perf_event *event)
 	}
 
 	unaccount_event_cpu(event, event->cpu);
+
+	unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5888,13 +5911,25 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
 	rcu_read_unlock();
 }
 
+static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data)
+{
+	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		output(event, data);
+	}
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
 	       struct perf_event_context *task_ctx)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct pmu *pmu;
 	int ctxn;
 
 	/*
@@ -5909,20 +5944,15 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
 	}
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
+	preempt_disable();
+	perf_event_sb_iterate(output, data);
+
+	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
 			perf_event_aux_ctx(ctx, output, data, false);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	preempt_enable();
 	rcu_read_unlock();
 }
 
@@ -8615,6 +8645,32 @@ unlock:
 	return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_add_rcu(&event->sb_list, &pel->list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+	    attr->comm || attr->comm_exec ||
+	    attr->task ||
+	    attr->context_switch)
+		attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -8695,6 +8751,8 @@ static void account_event(struct perf_event *event)
 enabled:
 
 	account_event_cpu(event, event->cpu);
+
+	account_pmu_sb_event(event);
 }
 
 /*
@@ -10203,6 +10261,9 @@ static void __init perf_event_init_all_cpus(void)
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 	}
 }
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2016-06-03 10:50 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-23 18:24 [PATCH V2 1/1] perf/core: don't find side-band event from all pmus kan.liang
2016-03-29 12:06 ` Peter Zijlstra
2016-03-31 14:05   ` Liang, Kan
2016-03-31 14:25     ` acme
2016-03-31 14:44   ` Arnaldo Carvalho de Melo
2016-03-31 14:56     ` Peter Zijlstra
2016-03-31 16:21       ` Peter Zijlstra
2016-03-31 16:22         ` Peter Zijlstra
2016-04-06  8:09           ` Peter Zijlstra
2016-04-06 14:25             ` Liang, Kan
2016-05-12 13:30           ` Liang, Kan
2016-05-12 14:00             ` Peter Zijlstra
2016-06-03 10:49 ` [tip:perf/core] perf/core: Optimize side-band event delivery tip-bot for Kan Liang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.