* [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events
@ 2020-11-30 19:38 kan.liang
  2020-11-30 19:38 ` [PATCH V2 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR kan.liang
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: kan.liang @ 2020-11-30 19:38 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel
  Cc: namhyung, eranian, irogers, gmx, acme, jolsa, ak, benh, paulus,
	mpe, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Sometimes the PMU internal buffers have to be flushed for per-CPU events
during a context switch, e.g., large PEBS. Otherwise, the perf tool may
report samples in locations that do not belong to the process in which
the samples were taken, because PEBS does not tag samples with a PID/TID.

The current code only flushes the buffers for a per-task event. It
doesn't check per-CPU events.

Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
PMU internal buffers have to be flushed for this event during a context
switch.

Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
whose buffers need to be flushed.

This patch only needs to invoke sched_task() for per-CPU events. The
per-task events are already handled in perf_event_context_sched_in/out.
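
For reference, a minimal sketch of how a PMU driver opts into this
scheme; the function and helper names below are illustrative
placeholders, not actual kernel functions:

	/* in the PMU's event configuration path, e.g. when large PEBS is used */
	static void example_pmu_enable_flush(struct perf_event *event)
	{
		event->attach_state |= PERF_ATTACH_SCHED_CB;
		perf_sched_cb_inc(event->ctx->pmu);
	}

	/* ->sched_task callback, invoked by the core on context switches */
	static void example_pmu_sched_task(struct perf_event_context *ctx,
					   bool sched_in)
	{
		if (!sched_in)
			example_pmu_drain_buffer();	/* illustrative flush helper */
	}

A fixed-period system-wide session, e.g. 'perf record -a -c 100001 -e
cycles:pp', is one way to end up with such a per-CPU large PEBS event.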

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Reported-by: Gabriel Marin <gmx@google.com>
Reported-by: Namhyung Kim <namhyung@kernel.org>
Originally-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

No changes since V1

 include/linux/perf_event.h |  2 ++
 kernel/events/core.c       | 42 ++++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0c19d279b97f..530a505e1c7e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
 #define PERF_ATTACH_ITRACE	0x10
+#define PERF_ATTACH_SCHED_CB	0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -872,6 +873,7 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
+	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
 
 	int				online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index da467e1dd49a..c0f7d84a4245 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -383,6 +383,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3474,11 +3475,16 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	--cpuctx->sched_cb_usage;
+	this_cpu_dec(perf_sched_cb_usages);
+
+	if (!--cpuctx->sched_cb_usage)
+		list_del(&cpuctx->sched_cb_entry);
 }
 
 
@@ -3486,7 +3492,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	cpuctx->sched_cb_usage++;
+	if (!cpuctx->sched_cb_usage++)
+		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+	this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
@@ -3515,6 +3524,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (prev == next)
+		return;
+
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		/* will be handled in perf_event_context_sched_in/out */
+		if (cpuctx->task_ctx)
+			continue;
+
+		__perf_pmu_sched_task(cpuctx, sched_in);
+	}
+}
+
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3537,6 +3564,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -3844,6 +3874,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
+
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4668,7 +4701,7 @@ static void unaccount_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		dec = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_dec(&nr_mmap_events);
@@ -11065,7 +11098,7 @@ static void account_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		inc = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_inc(&nr_mmap_events);
@@ -12857,6 +12890,7 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 
-- 
2.17.1



* [PATCH V2 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
  2020-11-30 19:38 [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
@ 2020-11-30 19:38 ` kan.liang
  2021-03-01 10:16   ` [tip: perf/urgent] " tip-bot2 for Kan Liang
  2021-03-06 11:54   ` tip-bot2 for Kan Liang
  2020-11-30 19:38 ` [PATCH V2 3/3] perf: Optimize sched_task() in a context switch kan.liang
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 18+ messages in thread
From: kan.liang @ 2020-11-30 19:38 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel
  Cc: namhyung, eranian, irogers, gmx, acme, jolsa, ak, benh, paulus,
	mpe, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Supplying a PID/TID for large PEBS requires flushing the PEBS buffer on
a context switch.

For normal LBRs, a context switch can flip the address space, and LBR
entries are not tagged with an identifier, so the LBR has to be wiped,
even for per-CPU events.

For LBR call stack, the stack has to be saved/restored during a context
switch.

Set PERF_ATTACH_SCHED_CB for events with large PEBS and LBR.
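
A rough sketch of the work the ->sched_task callback has to do for
these events is shown below; the helper names are illustrative
placeholders, not the actual functions in arch/x86/events/intel/:

	static void sched_task_sketch(struct perf_event_context *ctx, bool sched_in)
	{
		if (!sched_in) {
			drain_large_pebs_buffer();	/* flush PEBS on switch out */
			save_lbr_call_stack(ctx);	/* save call-stack LBRs on switch out */
		} else {
			reset_or_restore_lbrs(ctx);	/* wipe normal LBRs / restore call stack */
		}
	}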

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V1
- Set PERF_ATTACH_SCHED_CB for a LBR event as well

 arch/x86/events/intel/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f1926e9f2143..534e0c84f7f8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3564,8 +3564,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
 			if (!(event->attr.sample_type &
-			      ~intel_pmu_large_pebs_flags(event)))
+			      ~intel_pmu_large_pebs_flags(event))) {
 				event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+				event->attach_state |= PERF_ATTACH_SCHED_CB;
+			}
 		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
@@ -3578,6 +3580,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
+		event->attach_state |= PERF_ATTACH_SCHED_CB;
 
 		/*
 		 * BTS is set up earlier in this path, so don't account twice
-- 
2.17.1



* [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-11-30 19:38 [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
  2020-11-30 19:38 ` [PATCH V2 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR kan.liang
@ 2020-11-30 19:38 ` kan.liang
  2020-12-01 17:29   ` Peter Zijlstra
  2020-12-01 17:21 ` [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 18+ messages in thread
From: kan.liang @ 2020-11-30 19:38 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel
  Cc: namhyung, eranian, irogers, gmx, acme, jolsa, ak, benh, paulus,
	mpe, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Some calls to sched_task() in a context switch can be avoided. For
example, large PEBS only requires flushing the buffer on context switch
out, but the current code still invokes sched_task() for large PEBS on
context switch in as well.

The current code doesn't track or check the state of an event, so it
has no idea which calls are unnecessary.

Add a new variable, sched_cb_state, to track the required callback
state per CPU context. Only invoke sched_task() when the relevant state
bits are set.

Split sched_cb_usage into sched_cb_task_usage and sched_cb_cpu_usage to
track per-task and per-CPU events respectively. Avoid going through the
sched_cb_list for pure per-task events. Avoid invoking sched_task() in
perf_event_context_sched_in/out for pure per-CPU events.

Since the interface functions change, the architecture-specific code is
updated here as well:
- For PowerPC, the BHRB entries are reset on context switch in.
- For x86 large PEBS, the PEBS buffer is flushed on context switch out.
- For x86 normal LBR, the LBR registers are reset on context switch in.
- For x86 LBR call stack, the LBR registers are saved on context switch
  out and restored on context switch in.
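
For example, with the new interface the x86 LBR add path (taken from
the lbr.c hunk below) requests the callback like this:

	int state = PERF_SCHED_CB_SW_IN;

	if (!(event->attach_state & PERF_ATTACH_TASK))
		state |= PERF_SCHED_CB_CPU;	/* per-CPU event: flush via the cpu list */
	if (event->attach_state & PERF_ATTACH_TASK_DATA)
		state |= PERF_SCHED_CB_SW_OUT;	/* LBR call stack: save on switch out too */

	perf_sched_cb_inc(event->ctx->pmu, state);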

Suggested-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V1:
- Fix the missed changes in the power_pmu_bhrb_disable()

This patch still does not split the PPC change into a separate patch,
because both PPC and x86 invoke perf_sched_cb_inc() directly. The patch
changes the parameters of perf_sched_cb_inc(), so the PPC and x86 code
has to be updated together.

 arch/powerpc/perf/core-book3s.c | 14 ++++++--
 arch/x86/events/intel/ds.c      |  8 +++--
 arch/x86/events/intel/lbr.c     | 15 ++++++--
 include/linux/perf_event.h      | 12 +++++--
 kernel/events/core.c            | 62 +++++++++++++++++++++++++++------
 5 files changed, 92 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 08643cba1494..63477002bfcb 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -370,6 +370,7 @@ static void power_pmu_bhrb_reset(void)
 static void power_pmu_bhrb_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+	int state = PERF_SCHED_CB_SW_IN;
 
 	if (!ppmu->bhrb_nr)
 		return;
@@ -380,19 +381,28 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
 		cpuhw->bhrb_context = event->ctx;
 	}
 	cpuhw->bhrb_users++;
-	perf_sched_cb_inc(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+
+	perf_sched_cb_inc(event->ctx->pmu, state);
 }
 
 static void power_pmu_bhrb_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+	int state = PERF_SCHED_CB_SW_IN;
 
 	if (!ppmu->bhrb_nr)
 		return;
 
 	WARN_ON_ONCE(!cpuhw->bhrb_users);
 	cpuhw->bhrb_users--;
-	perf_sched_cb_dec(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+
+	perf_sched_cb_dec(event->ctx->pmu, state);
 
 	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
 		/* BHRB cannot be turned off when other
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 404315df1e16..3c124c203264 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1018,12 +1018,16 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
 	 * that does not hurt:
 	 */
 	bool update = cpuc->n_pebs == 1;
+	int state = PERF_SCHED_CB_SW_OUT;
 
 	if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+		if (!(event->attach_state & PERF_ATTACH_TASK))
+			state |= PERF_SCHED_CB_CPU;
+
 		if (!needed_cb)
-			perf_sched_cb_inc(pmu);
+			perf_sched_cb_inc(pmu, state);
 		else
-			perf_sched_cb_dec(pmu);
+			perf_sched_cb_dec(pmu, state);
 
 		update = true;
 	}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653c5dd2..e4c500580df5 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -660,6 +660,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
 {
 	struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int state = PERF_SCHED_CB_SW_IN;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -693,7 +694,13 @@ void intel_pmu_lbr_add(struct perf_event *event)
 	 */
 	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
 		cpuc->lbr_pebs_users++;
-	perf_sched_cb_inc(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+	if (event->attach_state & PERF_ATTACH_TASK_DATA)
+		state |= PERF_SCHED_CB_SW_OUT;
+
+	perf_sched_cb_inc(event->ctx->pmu, state);
 	if (!cpuc->lbr_users++ && !event->total_time_running)
 		intel_pmu_lbr_reset();
 
@@ -724,6 +731,7 @@ void release_lbr_buffers(void)
 void intel_pmu_lbr_del(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int state = 0;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -740,7 +748,10 @@ void intel_pmu_lbr_del(struct perf_event *event)
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
 	WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
-	perf_sched_cb_dec(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+	perf_sched_cb_dec(event->ctx->pmu, state);
 }
 
 static inline bool vlbr_exclude_host(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 530a505e1c7e..35f741a610bd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -854,6 +854,10 @@ struct perf_event_context {
  */
 #define PERF_NR_CONTEXTS	4
 
+#define PERF_SCHED_CB_CPU	0x1
+#define PERF_SCHED_CB_SW_IN	0x2
+#define PERF_SCHED_CB_SW_OUT	0x4
+
 /**
  * struct perf_event_cpu_context - per cpu event context structure
  */
@@ -874,7 +878,9 @@ struct perf_cpu_context {
 #endif
 
 	struct list_head		sched_cb_entry;
-	int				sched_cb_usage;
+	int				sched_cb_task_usage;
+	int				sched_cb_cpu_usage;
+	int				sched_cb_state;
 
 	int				online;
 	/*
@@ -967,8 +973,8 @@ extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
-extern void perf_sched_cb_dec(struct pmu *pmu);
-extern void perf_sched_cb_inc(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu, int state);
+extern void perf_sched_cb_inc(struct pmu *pmu, int state);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c0f7d84a4245..a642494f4ce2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3377,6 +3377,32 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
+static inline bool perf_pmu_sched_in_ctx(struct perf_cpu_context *cpuctx)
+{
+	return !!(cpuctx->sched_cb_state & PERF_SCHED_CB_SW_IN);
+}
+
+static inline bool perf_pmu_sched_out_ctx(struct perf_cpu_context *cpuctx)
+{
+	return !!(cpuctx->sched_cb_state & PERF_SCHED_CB_SW_OUT);
+}
+
+static inline bool perf_pmu_has_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+{
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	if (!pmu->sched_task)
+		return false;
+
+	if (sched_in && perf_pmu_sched_in_ctx(cpuctx))
+		return true;
+
+	if (!sched_in && perf_pmu_sched_out_ctx(cpuctx))
+		return true;
+
+	return false;
+}
+
 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 					 struct task_struct *next)
 {
@@ -3426,7 +3452,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 			perf_pmu_disable(pmu);
 
-			if (cpuctx->sched_cb_usage && pmu->sched_task)
+			if (cpuctx->sched_cb_task_usage && perf_pmu_has_sched_task(cpuctx, false))
 				pmu->sched_task(ctx, false);
 
 			/*
@@ -3466,7 +3492,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		perf_pmu_disable(pmu);
 
-		if (cpuctx->sched_cb_usage && pmu->sched_task)
+		if (cpuctx->sched_cb_task_usage && perf_pmu_has_sched_task(cpuctx, false))
 			pmu->sched_task(ctx, false);
 		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
 
@@ -3477,22 +3503,37 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
 
-void perf_sched_cb_dec(struct pmu *pmu)
+void perf_sched_cb_dec(struct pmu *pmu, int state)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
+	if (!(state & PERF_SCHED_CB_CPU)) {
+		--cpuctx->sched_cb_task_usage;
+		goto end;
+	}
+
 	this_cpu_dec(perf_sched_cb_usages);
 
-	if (!--cpuctx->sched_cb_usage)
+	if (!--cpuctx->sched_cb_cpu_usage)
 		list_del(&cpuctx->sched_cb_entry);
+end:
+	if (!cpuctx->sched_cb_cpu_usage && !cpuctx->sched_cb_task_usage)
+		cpuctx->sched_cb_state = 0;
 }
 
 
-void perf_sched_cb_inc(struct pmu *pmu)
+void perf_sched_cb_inc(struct pmu *pmu, int state)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	if (!cpuctx->sched_cb_usage++)
+	cpuctx->sched_cb_state |= state;
+
+	if (!(state & PERF_SCHED_CB_CPU)) {
+		cpuctx->sched_cb_task_usage++;
+		return;
+	}
+
+	if (!cpuctx->sched_cb_cpu_usage++)
 		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
 
 	this_cpu_inc(perf_sched_cb_usages);
@@ -3537,8 +3578,8 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 		/* will be handled in perf_event_context_sched_in/out */
 		if (cpuctx->task_ctx)
 			continue;
-
-		__perf_pmu_sched_task(cpuctx, sched_in);
+		if (perf_pmu_has_sched_task(cpuctx, sched_in))
+			__perf_pmu_sched_task(cpuctx, sched_in);
 	}
 }
 
@@ -3802,7 +3843,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 
 	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx) {
-		if (cpuctx->sched_cb_usage)
+		if (cpuctx->sched_cb_task_usage &&
+		    (cpuctx->sched_cb_state & PERF_SCHED_CB_SW_IN))
 			__perf_pmu_sched_task(cpuctx, true);
 		return;
 	}
@@ -3828,7 +3870,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 
-	if (cpuctx->sched_cb_usage && pmu->sched_task)
+	if (cpuctx->sched_cb_task_usage && perf_pmu_has_sched_task(cpuctx, true))
 		pmu->sched_task(cpuctx->task_ctx, true);
 
 	perf_pmu_enable(pmu);
-- 
2.17.1



* Re: [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-30 19:38 [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
  2020-11-30 19:38 ` [PATCH V2 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR kan.liang
  2020-11-30 19:38 ` [PATCH V2 3/3] perf: Optimize sched_task() in a context switch kan.liang
@ 2020-12-01 17:21 ` Peter Zijlstra
  2021-03-01 10:16 ` [tip: perf/urgent] " tip-bot2 for Kan Liang
  2021-03-06 11:54 ` tip-bot2 for Kan Liang
  4 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2020-12-01 17:21 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme,
	jolsa, ak, benh, paulus, mpe

On Mon, Nov 30, 2020 at 11:38:40AM -0800, kan.liang@linux.intel.com wrote:
> +static DEFINE_PER_CPU(int, perf_sched_cb_usages);

% s/perf_sched_cb_usages/perf_sched_cb_usage/g


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-11-30 19:38 ` [PATCH V2 3/3] perf: Optimize sched_task() in a context switch kan.liang
@ 2020-12-01 17:29   ` Peter Zijlstra
  2020-12-02 10:26     ` Peter Zijlstra
                       ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Peter Zijlstra @ 2020-12-01 17:29 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme,
	jolsa, ak, benh, paulus, mpe

On Mon, Nov 30, 2020 at 11:38:42AM -0800, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
> 
> Some calls to sched_task() in a context switch can be avoided. For
> example, large PEBS only requires flushing the buffer in context switch
> out. The current code still invokes the sched_task() for large PEBS in
> context switch in.

I still hate this one, how's something like this then?
Which I still don't really like.. but at least its simpler.

(completely untested, may contain spurious edits, might ICE the
compiler and set your pets on fire if it doesn't)

And given this is an optimization, can we actually measure it to improve
matters?

---

--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -380,7 +380,7 @@ static void power_pmu_bhrb_enable(struct
 		cpuhw->bhrb_context = event->ctx;
 	}
 	cpuhw->bhrb_users++;
-	perf_sched_cb_inc(event->ctx->pmu);
+	perf_sched_cb_inc(event);
 }
 
 static void power_pmu_bhrb_disable(struct perf_event *event)
@@ -392,7 +392,7 @@ static void power_pmu_bhrb_disable(struc
 
 	WARN_ON_ONCE(!cpuhw->bhrb_users);
 	cpuhw->bhrb_users--;
-	perf_sched_cb_dec(event->ctx->pmu);
+	perf_sched_cb_dec(event);
 
 	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
 		/* BHRB cannot be turned off when other
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3567,7 +3567,7 @@ static int intel_pmu_hw_config(struct pe
 			if (!(event->attr.sample_type &
 			      ~intel_pmu_large_pebs_flags(event))) {
 				event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
-				event->attach_state |= PERF_ATTACH_SCHED_CB;
+				event->attach_state |= PERF_ATTACH_SCHED_CB_OUT;
 			}
 		}
 		if (x86_pmu.pebs_aliases)
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1014,7 +1014,6 @@ static void
 pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
 		  struct perf_event *event, bool add)
 {
-	struct pmu *pmu = event->ctx->pmu;
 	/*
 	 * Make sure we get updated with the first PEBS
 	 * event. It will trigger also during removal, but
@@ -1024,9 +1023,9 @@ pebs_update_state(bool needed_cb, struct
 
 	if (needed_cb != pebs_needs_sched_cb(cpuc)) {
 		if (!needed_cb)
-			perf_sched_cb_inc(pmu);
+			perf_sched_cb_inc(event);
 		else
-			perf_sched_cb_dec(pmu);
+			perf_sched_cb_dec(event);
 
 		update = true;
 	}
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -693,7 +693,7 @@ void intel_pmu_lbr_add(struct perf_event
 	 */
 	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
 		cpuc->lbr_pebs_users++;
-	perf_sched_cb_inc(event->ctx->pmu);
+	perf_sched_cb_inc(event);
 	if (!cpuc->lbr_users++ && !event->total_time_running)
 		intel_pmu_lbr_reset();
 
@@ -740,7 +740,7 @@ void intel_pmu_lbr_del(struct perf_event
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
 	WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
-	perf_sched_cb_dec(event->ctx->pmu);
+	perf_sched_cb_dec(event);
 }
 
 static inline bool vlbr_exclude_host(void)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -601,12 +601,14 @@ struct swevent_hlist {
 	struct rcu_head			rcu_head;
 };
 
-#define PERF_ATTACH_CONTEXT	0x01
-#define PERF_ATTACH_GROUP	0x02
-#define PERF_ATTACH_TASK	0x04
-#define PERF_ATTACH_TASK_DATA	0x08
-#define PERF_ATTACH_ITRACE	0x10
-#define PERF_ATTACH_SCHED_CB	0x20
+#define PERF_ATTACH_CONTEXT		0x01
+#define PERF_ATTACH_GROUP		0x02
+#define PERF_ATTACH_TASK		0x04
+#define PERF_ATTACH_TASK_DATA		0x08
+#define PERF_ATTACH_ITRACE		0x10
+#define PERF_ATTACH_SCHED_CB_IN		0x20
+#define PERF_ATTACH_SCHED_CB_OUT	0x40
+#define PERF_ATTACH_SCHED_CB		0x60
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -875,6 +877,7 @@ struct perf_cpu_context {
 
 	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
+	int				sched_cb_dir[2];
 
 	int				online;
 	/*
@@ -967,8 +970,8 @@ extern const struct perf_event_attr *per
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
-extern void perf_sched_cb_dec(struct pmu *pmu);
-extern void perf_sched_cb_inc(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct perf_event *event);
+extern void perf_sched_cb_inc(struct perf_event *event);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -384,7 +384,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usage);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3376,6 +3375,16 @@ static void perf_event_sync_stat(struct
 	}
 }
 
+static void context_sched_task(struct perf_cpu_context *cpuctx,
+			       struct perf_event_context *ctx,
+			       bool sched_in)
+{
+	struct pmu *pmu = ctx->pmu;
+
+	if (cpuctx->sched_cb_dir[sched_in] && pmu->sched_task)
+		pmu->sched_task(ctx, false);
+}
+
 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 					 struct task_struct *next)
 {
@@ -3424,9 +3433,7 @@ static void perf_event_context_sched_out
 			WRITE_ONCE(next_ctx->task, task);
 
 			perf_pmu_disable(pmu);
-
-			if (cpuctx->sched_cb_usage && pmu->sched_task)
-				pmu->sched_task(ctx, false);
+			context_sched_task(cpuctx, ctx, false);
 
 			/*
 			 * PMU specific parts of task perf context can require
@@ -3465,8 +3472,7 @@ static void perf_event_context_sched_out
 		raw_spin_lock(&ctx->lock);
 		perf_pmu_disable(pmu);
 
-		if (cpuctx->sched_cb_usage && pmu->sched_task)
-			pmu->sched_task(ctx, false);
+		context_sched_task(cpuctx, ctx, false);
 		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
 
 		perf_pmu_enable(pmu);
@@ -3476,25 +3482,38 @@ static void perf_event_context_sched_out
 
 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
 
-void perf_sched_cb_dec(struct pmu *pmu)
+void perf_sched_cb_dec(struct perf_event *event)
 {
+	struct perf_event_context *ctx = event->ctx;
+	struct pmu *pmu = ctx->pmu;
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	this_cpu_dec(perf_sched_cb_usage);
-
-	if (!--cpuctx->sched_cb_usage)
+	if (!ctx->task && !--cpuctx->sched_cb_usage)
 		list_del(&cpuctx->sched_cb_entry);
+
+	if (event->attach_state & PERF_ATTACH_SCHED_CB_OUT)
+		cpuctx->sched_cb_dir[0]--;
+	if (event->attach_state & PERF_ATTACH_SCHED_CB_IN)
+		cpuctx->sched_cb_dir[1]--;
 }
 
 
-void perf_sched_cb_inc(struct pmu *pmu)
+void perf_sched_cb_inc(struct perf_event *event)
 {
+	struct perf_event_context *ctx = event->ctx;
+	struct pmu *pmu = ctx->pmu;
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	if (!cpuctx->sched_cb_usage++)
+	if (!ctx->task && !cpuctx->sched_cb_usage++)
 		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
 
-	this_cpu_inc(perf_sched_cb_usage);
+	if (!(event->attach_state & PERF_ATTACH_SCHED_CB))
+		event->attach_state |= PERF_ATTACH_SCHED_CB;
+
+	if (event->attach_state & PERF_ATTACH_SCHED_CB_OUT)
+		cpuctx->sched_cb_dir[0]++;
+	if (event->attach_state & PERF_ATTACH_SCHED_CB_IN)
+		cpuctx->sched_cb_dir[1]++;
 }
 
 /*
@@ -3523,9 +3542,9 @@ static void __perf_pmu_sched_task(struct
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
-static void perf_pmu_sched_task(struct task_struct *prev,
-				struct task_struct *next,
-				bool sched_in)
+static inline void perf_pmu_sched_task(struct task_struct *prev,
+				       struct task_struct *next,
+				       bool sched_in)
 {
 	struct perf_cpu_context *cpuctx;
 
@@ -3563,8 +3582,7 @@ void __perf_event_task_sched_out(struct
 {
 	int ctxn;
 
-	if (__this_cpu_read(perf_sched_cb_usage))
-		perf_pmu_sched_task(task, next, false);
+	perf_pmu_sched_task(task, next, false);
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
@@ -3828,8 +3846,7 @@ static void perf_event_context_sched_in(
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 
-	if (cpuctx->sched_cb_usage && pmu->sched_task)
-		pmu->sched_task(cpuctx->task_ctx, true);
+	context_sched_task(cpuctx, ctx, true);
 
 	perf_pmu_enable(pmu);
 
@@ -3875,8 +3892,7 @@ void __perf_event_task_sched_in(struct t
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
 
-	if (__this_cpu_read(perf_sched_cb_usage))
-		perf_pmu_sched_task(prev, task, true);
+	perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-12-01 17:29   ` Peter Zijlstra
@ 2020-12-02 10:26     ` Peter Zijlstra
  2020-12-02 14:40     ` Namhyung Kim
  2020-12-04  7:14     ` Namhyung Kim
  2 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2020-12-02 10:26 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme,
	jolsa, ak, benh, paulus, mpe

On Tue, Dec 01, 2020 at 06:29:03PM +0100, Peter Zijlstra wrote:
> +static void context_sched_task(struct perf_cpu_context *cpuctx,
> +			       struct perf_event_context *ctx,
> +			       bool sched_in)
> +{
> +	struct pmu *pmu = ctx->pmu;
> +
> +	if (cpuctx->sched_cb_dir[sched_in] && pmu->sched_task)
> +		pmu->sched_task(ctx, false);
> +}

s/false/sched_in/
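
With that substitution applied, the helper reads:

	static void context_sched_task(struct perf_cpu_context *cpuctx,
				       struct perf_event_context *ctx,
				       bool sched_in)
	{
		struct pmu *pmu = ctx->pmu;

		if (cpuctx->sched_cb_dir[sched_in] && pmu->sched_task)
			pmu->sched_task(ctx, sched_in);
	}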




* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-12-01 17:29   ` Peter Zijlstra
  2020-12-02 10:26     ` Peter Zijlstra
@ 2020-12-02 14:40     ` Namhyung Kim
  2020-12-04  7:14     ` Namhyung Kim
  2 siblings, 0 replies; 18+ messages in thread
From: Namhyung Kim @ 2020-12-02 14:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kan.liang, mingo, linux-kernel, eranian, irogers, gmx, acme,
	jolsa, ak, benh, paulus, mpe

Hi Peter and Kan,

On Tue, Dec 01, 2020 at 06:29:03PM +0100, Peter Zijlstra wrote:
> On Mon, Nov 30, 2020 at 11:38:42AM -0800, kan.liang@linux.intel.com wrote:
> > From: Kan Liang <kan.liang@linux.intel.com>
> > 
> > Some calls to sched_task() in a context switch can be avoided. For
> > example, large PEBS only requires flushing the buffer in context switch
> > out. The current code still invokes the sched_task() for large PEBS in
> > context switch in.
> 
> I still hate this one, how's something like this then?
> Which I still don't really like.. but at least its simpler.
> 
> (completely untested, may contain spurious edits, might ICE the
> compiler and set your pets on fire if it doesn't)

I've tested Kan's v2 patches and it worked well.  Will test your
version (with the fix in the other email) too.


> 
> And given this is an optimization, can we actually measure it to improve
> matters?

I just checked the perf bench sched pipe results.  Without perf record
running, it usually takes less than 7 seconds.  Note that this (and the
numbers below) is the median value of 10 runs.

  # perf bench sched pipe
  # Running 'sched/pipe' benchmark:
  # Executed 1000000 pipe operations between two processes

     Total time: 6.875 [sec]

       6.875700 usecs/op
         145439 ops/sec


And I ran it again with perf record like below.  This is the result
with only patches 1 and 2 applied.

  # perf record -aB -c 100001 -e cycles:pp perf bench sched pipe
  # Running 'sched/pipe' benchmark:
  # Executed 1000000 pipe operations between two processes

     Total time: 8.198 [sec]

       8.198952 usecs/op
         121966 ops/sec
  [ perf record: Woken up 10 times to write data ]
  [ perf record: Captured and wrote 4.972 MB perf.data ]


With patch 3 applied, the total time went down a little bit.

  # perf record -aB -c 100001 -e cycles:pp perf bench sched pipe
  # Running 'sched/pipe' benchmark:
  # Executed 1000000 pipe operations between two processes

     Total time: 7.785 [sec]

       7.785119 usecs/op
         128450 ops/sec
  [ perf record: Woken up 12 times to write data ]
  [ perf record: Captured and wrote 4.622 MB perf.data ]


Thanks,
Namhyung


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-12-01 17:29   ` Peter Zijlstra
  2020-12-02 10:26     ` Peter Zijlstra
  2020-12-02 14:40     ` Namhyung Kim
@ 2020-12-04  7:14     ` Namhyung Kim
  2020-12-10  7:13       ` Namhyung Kim
  2 siblings, 1 reply; 18+ messages in thread
From: Namhyung Kim @ 2020-12-04  7:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Kan Liang, Ingo Molnar, linux-kernel, Stephane Eranian,
	Ian Rogers, Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa,
	Andi Kleen, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman

Hi Peter,

On Wed, Dec 2, 2020 at 2:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Nov 30, 2020 at 11:38:42AM -0800, kan.liang@linux.intel.com wrote:
> > From: Kan Liang <kan.liang@linux.intel.com>
> >
> > Some calls to sched_task() in a context switch can be avoided. For
> > example, large PEBS only requires flushing the buffer in context switch
> > out. The current code still invokes the sched_task() for large PEBS in
> > context switch in.
>
> I still hate this one, how's something like this then?
> Which I still don't really like.. but at least its simpler.
>
> (completely untested, may contain spurious edits, might ICE the
> compiler and set your pets on fire if it doesn't)

I've tested this version... and it worked well besides the optimization.. :)

[SNIP]
> +static void context_sched_task(struct perf_cpu_context *cpuctx,
> +                              struct perf_event_context *ctx,
> +                              bool sched_in)
> +{
> +       struct pmu *pmu = ctx->pmu;
> +
> +       if (cpuctx->sched_cb_dir[sched_in] && pmu->sched_task)
> +               pmu->sched_task(ctx, false);

applied: s/false/sched_in/


> +}
> +
>  static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
>                                          struct task_struct *next)
>  {
> @@ -3424,9 +3433,7 @@ static void perf_event_context_sched_out
>                         WRITE_ONCE(next_ctx->task, task);
>
>                         perf_pmu_disable(pmu);
> -
> -                       if (cpuctx->sched_cb_usage && pmu->sched_task)
> -                               pmu->sched_task(ctx, false);
> +                       context_sched_task(cpuctx, ctx, false);
>
>                         /*
>                          * PMU specific parts of task perf context can require
> @@ -3465,8 +3472,7 @@ static void perf_event_context_sched_out
>                 raw_spin_lock(&ctx->lock);
>                 perf_pmu_disable(pmu);
>
> -               if (cpuctx->sched_cb_usage && pmu->sched_task)
> -                       pmu->sched_task(ctx, false);
> +               context_sched_task(cpuctx, ctx, false);
>                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
>
>                 perf_pmu_enable(pmu);

[SNIP]
> @@ -3563,8 +3582,7 @@ void __perf_event_task_sched_out(struct
>  {
>         int ctxn;
>
> -       if (__this_cpu_read(perf_sched_cb_usage))
> -               perf_pmu_sched_task(task, next, false);
> +       perf_pmu_sched_task(task, next, false);

I think the reason is this change.  It now calls perf_pmu_sched_task()
without checking the counter.  And this is for per-cpu events.

>
>         if (atomic_read(&nr_switch_events))
>                 perf_event_switch(task, next, false);
> @@ -3828,8 +3846,7 @@ static void perf_event_context_sched_in(
>                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>         perf_event_sched_in(cpuctx, ctx, task);
>
> -       if (cpuctx->sched_cb_usage && pmu->sched_task)
> -               pmu->sched_task(cpuctx->task_ctx, true);
> +       context_sched_task(cpuctx, ctx, true);
>
>         perf_pmu_enable(pmu);
>
> @@ -3875,8 +3892,7 @@ void __perf_event_task_sched_in(struct t
>         if (atomic_read(&nr_switch_events))
>                 perf_event_switch(task, prev, true);
>
> -       if (__this_cpu_read(perf_sched_cb_usage))
> -               perf_pmu_sched_task(prev, task, true);
> +       perf_pmu_sched_task(prev, task, true);

Ditto.

>  }
>
>  static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)

So I made a change like below.. and it could bring the optimization back.

Thanks,
Namhyung


diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9107e7c3ccfb..a30243a9fab5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3528,6 +3528,9 @@ static void __perf_pmu_sched_task(struct
perf_cpu_context *cpuctx, bool sched_in
 {
        struct pmu *pmu;

+       if (!cpuctx->sched_cb_dir[sched_in])
+               return;
+
        pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */

        if (WARN_ON_ONCE(!pmu->sched_task))


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-12-04  7:14     ` Namhyung Kim
@ 2020-12-10  7:13       ` Namhyung Kim
  2020-12-10 13:52         ` Liang, Kan
  0 siblings, 1 reply; 18+ messages in thread
From: Namhyung Kim @ 2020-12-10  7:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Kan Liang, Ingo Molnar, linux-kernel, Stephane Eranian,
	Ian Rogers, Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa,
	Andi Kleen, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman

Hi Peter and Kan,

How can we move this forward?

Thanks,
Namhyung

On Fri, Dec 4, 2020 at 4:14 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hi Peter,
>
> On Wed, Dec 2, 2020 at 2:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Nov 30, 2020 at 11:38:42AM -0800, kan.liang@linux.intel.com wrote:
> > > From: Kan Liang <kan.liang@linux.intel.com>
> > >
> > > Some calls to sched_task() in a context switch can be avoided. For
> > > example, large PEBS only requires flushing the buffer in context switch
> > > out. The current code still invokes the sched_task() for large PEBS in
> > > context switch in.
> >
> > I still hate this one, how's something like this then?
> > Which I still don't really like.. but at least its simpler.
> >
> > (completely untested, may contain spurious edits, might ICE the
> > compiler and set your pets on fire if it doesn't)
>
> I've tested this version... and it worked well besides the optimization.. :)
>
> [SNIP]
> > +static void context_sched_task(struct perf_cpu_context *cpuctx,
> > +                              struct perf_event_context *ctx,
> > +                              bool sched_in)
> > +{
> > +       struct pmu *pmu = ctx->pmu;
> > +
> > +       if (cpuctx->sched_cb_dir[sched_in] && pmu->sched_task)
> > +               pmu->sched_task(ctx, false);
>
> applied: s/false/sched_in/
>
>
> > +}
> > +
> >  static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
> >                                          struct task_struct *next)
> >  {
> > @@ -3424,9 +3433,7 @@ static void perf_event_context_sched_out
> >                         WRITE_ONCE(next_ctx->task, task);
> >
> >                         perf_pmu_disable(pmu);
> > -
> > -                       if (cpuctx->sched_cb_usage && pmu->sched_task)
> > -                               pmu->sched_task(ctx, false);
> > +                       context_sched_task(cpuctx, ctx, false);
> >
> >                         /*
> >                          * PMU specific parts of task perf context can require
> > @@ -3465,8 +3472,7 @@ static void perf_event_context_sched_out
> >                 raw_spin_lock(&ctx->lock);
> >                 perf_pmu_disable(pmu);
> >
> > -               if (cpuctx->sched_cb_usage && pmu->sched_task)
> > -                       pmu->sched_task(ctx, false);
> > +               context_sched_task(cpuctx, ctx, false);
> >                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
> >
> >                 perf_pmu_enable(pmu);
>
> [SNIP]
> > @@ -3563,8 +3582,7 @@ void __perf_event_task_sched_out(struct
> >  {
> >         int ctxn;
> >
> > -       if (__this_cpu_read(perf_sched_cb_usage))
> > -               perf_pmu_sched_task(task, next, false);
> > +       perf_pmu_sched_task(task, next, false);
>
> I think the reason is this change.  It now calls perf_pmu_sched_task()
> without checking the counter.  And this is for per-cpu events.
>
> >
> >         if (atomic_read(&nr_switch_events))
> >                 perf_event_switch(task, next, false);
> > @@ -3828,8 +3846,7 @@ static void perf_event_context_sched_in(
> >                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> >         perf_event_sched_in(cpuctx, ctx, task);
> >
> > -       if (cpuctx->sched_cb_usage && pmu->sched_task)
> > -               pmu->sched_task(cpuctx->task_ctx, true);
> > +       context_sched_task(cpuctx, ctx, true);
> >
> >         perf_pmu_enable(pmu);
> >
> > @@ -3875,8 +3892,7 @@ void __perf_event_task_sched_in(struct t
> >         if (atomic_read(&nr_switch_events))
> >                 perf_event_switch(task, prev, true);
> >
> > -       if (__this_cpu_read(perf_sched_cb_usage))
> > -               perf_pmu_sched_task(prev, task, true);
> > +       perf_pmu_sched_task(prev, task, true);
>
> Ditto.
>
> >  }
> >
> >  static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
>
> So I made a change like below.. and it could bring the optimization back.
>
> Thanks,
> Namhyung
>
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 9107e7c3ccfb..a30243a9fab5 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3528,6 +3528,9 @@ static void __perf_pmu_sched_task(struct
> perf_cpu_context *cpuctx, bool sched_in
>  {
>         struct pmu *pmu;
>
> +       if (!cpuctx->sched_cb_dir[sched_in])
> +               return;
> +
>         pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
>
>         if (WARN_ON_ONCE(!pmu->sched_task))


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-12-10  7:13       ` Namhyung Kim
@ 2020-12-10 13:52         ` Liang, Kan
  2020-12-10 14:25           ` Peter Zijlstra
  0 siblings, 1 reply; 18+ messages in thread
From: Liang, Kan @ 2020-12-10 13:52 UTC (permalink / raw)
  To: Namhyung Kim, Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Stephane Eranian, Ian Rogers,
	Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa, Andi Kleen,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman



On 12/10/2020 2:13 AM, Namhyung Kim wrote:
> Hi Peter and Kan,
> 
> How can we move this forward?

Hi Namhyung,

Thanks for the test. The changes look good to me.

Hi Peter,

Should we resend the patch set for further review?


Thanks,
Kan


> 
> Thanks,
> Namhyung
> 
> On Fri, Dec 4, 2020 at 4:14 PM Namhyung Kim <namhyung@kernel.org> wrote:
>>
>> Hi Peter,
>>
>> On Wed, Dec 2, 2020 at 2:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> On Mon, Nov 30, 2020 at 11:38:42AM -0800, kan.liang@linux.intel.com wrote:
>>>> From: Kan Liang <kan.liang@linux.intel.com>
>>>>
>>>> Some calls to sched_task() in a context switch can be avoided. For
>>>> example, large PEBS only requires flushing the buffer in context switch
>>>> out. The current code still invokes the sched_task() for large PEBS in
>>>> context switch in.
>>>
>>> I still hate this one, how's something like this then?
>>> Which I still don't really like.. but at least its simpler.
>>>
>>> (completely untested, may contain spurious edits, might ICE the
>>> compiler and set your pets on fire if it doesn't)
>>
>> I've tested this version... and it worked well besides the optimization.. :)
>>
>> [SNIP]
>>> +static void context_sched_task(struct perf_cpu_context *cpuctx,
>>> +                              struct perf_event_context *ctx,
>>> +                              bool sched_in)
>>> +{
>>> +       struct pmu *pmu = ctx->pmu;
>>> +
>>> +       if (cpuctx->sched_cb_dir[sched_in] && pmu->sched_task)
>>> +               pmu->sched_task(ctx, false);
>>
>> applied: s/false/sched_in/
>>
>>
>>> +}
>>> +
>>>   static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
>>>                                           struct task_struct *next)
>>>   {
>>> @@ -3424,9 +3433,7 @@ static void perf_event_context_sched_out
>>>                          WRITE_ONCE(next_ctx->task, task);
>>>
>>>                          perf_pmu_disable(pmu);
>>> -
>>> -                       if (cpuctx->sched_cb_usage && pmu->sched_task)
>>> -                               pmu->sched_task(ctx, false);
>>> +                       context_sched_task(cpuctx, ctx, false);
>>>
>>>                          /*
>>>                           * PMU specific parts of task perf context can require
>>> @@ -3465,8 +3472,7 @@ static void perf_event_context_sched_out
>>>                  raw_spin_lock(&ctx->lock);
>>>                  perf_pmu_disable(pmu);
>>>
>>> -               if (cpuctx->sched_cb_usage && pmu->sched_task)
>>> -                       pmu->sched_task(ctx, false);
>>> +               context_sched_task(cpuctx, ctx, false);
>>>                  task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
>>>
>>>                  perf_pmu_enable(pmu);
>>
>> [SNIP]
>>> @@ -3563,8 +3582,7 @@ void __perf_event_task_sched_out(struct
>>>   {
>>>          int ctxn;
>>>
>>> -       if (__this_cpu_read(perf_sched_cb_usage))
>>> -               perf_pmu_sched_task(task, next, false);
>>> +       perf_pmu_sched_task(task, next, false);
>>
>> I think the reason is this change.  It now calls perf_pmu_sched_task()
>> without checking the counter.  And this is for per-cpu events.
>>
>>>
>>>          if (atomic_read(&nr_switch_events))
>>>                  perf_event_switch(task, next, false);
>>> @@ -3828,8 +3846,7 @@ static void perf_event_context_sched_in(
>>>                  cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>>>          perf_event_sched_in(cpuctx, ctx, task);
>>>
>>> -       if (cpuctx->sched_cb_usage && pmu->sched_task)
>>> -               pmu->sched_task(cpuctx->task_ctx, true);
>>> +       context_sched_task(cpuctx, ctx, true);
>>>
>>>          perf_pmu_enable(pmu);
>>>
>>> @@ -3875,8 +3892,7 @@ void __perf_event_task_sched_in(struct t
>>>          if (atomic_read(&nr_switch_events))
>>>                  perf_event_switch(task, prev, true);
>>>
>>> -       if (__this_cpu_read(perf_sched_cb_usage))
>>> -               perf_pmu_sched_task(prev, task, true);
>>> +       perf_pmu_sched_task(prev, task, true);
>>
>> Ditto.
>>
>>>   }
>>>
>>>   static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
>>
>> So I made a change like below.. and it could bring the optimization back.
>>
>> Thanks,
>> Namhyung
>>
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 9107e7c3ccfb..a30243a9fab5 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -3528,6 +3528,9 @@ static void __perf_pmu_sched_task(struct
>> perf_cpu_context *cpuctx, bool sched_in
>>   {
>>          struct pmu *pmu;
>>
>> +       if (!cpuctx->sched_cb_dir[sched_in])
>> +               return;
>> +
>>          pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
>>
>>          if (WARN_ON_ONCE(!pmu->sched_task))


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-12-10 13:52         ` Liang, Kan
@ 2020-12-10 14:25           ` Peter Zijlstra
  2021-01-18  7:04             ` Namhyung Kim
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2020-12-10 14:25 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Namhyung Kim, Ingo Molnar, linux-kernel, Stephane Eranian,
	Ian Rogers, Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa,
	Andi Kleen, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman

On Thu, Dec 10, 2020 at 08:52:55AM -0500, Liang, Kan wrote:
> 
> 
> On 12/10/2020 2:13 AM, Namhyung Kim wrote:
> > Hi Peter and Kan,
> > 
> > How can we move this forward?
> 
> Hi Namhyung,
> 
> Thanks for the test. The changes look good to me.
> 
> Hi Peter,
> 
> Should we resend the patch set for further review?

I've not yet seen a coherent replacement of #3, what I send was just a
PoC.


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2020-12-10 14:25           ` Peter Zijlstra
@ 2021-01-18  7:04             ` Namhyung Kim
  2021-01-27  4:41               ` Namhyung Kim
  0 siblings, 1 reply; 18+ messages in thread
From: Namhyung Kim @ 2021-01-18  7:04 UTC (permalink / raw)
  To: Peter Zijlstra, Liang, Kan
  Cc: Ingo Molnar, linux-kernel, Stephane Eranian, Ian Rogers,
	Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa, Andi Kleen,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman

Hi Peter and Kan,

On Thu, Dec 10, 2020 at 11:25 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 10, 2020 at 08:52:55AM -0500, Liang, Kan wrote:
> >
> >
> > On 12/10/2020 2:13 AM, Namhyung Kim wrote:
> > > Hi Peter and Kan,
> > >
> > > How can we move this forward?
> >
> > Hi Namhyung,
> >
> > Thanks for the test. The changes look good to me.
> >
> > Hi Peter,
> >
> > Should we resend the patch set for further review?
>
> I've not yet seen a coherent replacement of #3, what I send was just a
> PoC.

Any updates?

Thanks,
Namhyung


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2021-01-18  7:04             ` Namhyung Kim
@ 2021-01-27  4:41               ` Namhyung Kim
  2021-02-22  9:46                 ` Namhyung Kim
  0 siblings, 1 reply; 18+ messages in thread
From: Namhyung Kim @ 2021-01-27  4:41 UTC (permalink / raw)
  To: Peter Zijlstra, Liang, Kan
  Cc: Ingo Molnar, linux-kernel, Stephane Eranian, Ian Rogers,
	Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa, Andi Kleen,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman

Hi,

On Mon, Jan 18, 2021 at 4:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hi Peter and Kan,
>
> On Thu, Dec 10, 2020 at 11:25 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Thu, Dec 10, 2020 at 08:52:55AM -0500, Liang, Kan wrote:
> > >
> > >
> > > On 12/10/2020 2:13 AM, Namhyung Kim wrote:
> > > > Hi Peter and Kan,
> > > >
> > > > How can we move this forward?
> > >
> > > Hi Namhyung,
> > >
> > > Thanks for the test. The changes look good to me.
> > >
> > > Hi Peter,
> > >
> > > Should we resend the patch set for further review?
> >
> > I've not yet seen a coherent replacement of #3, what I send was just a
> > PoC.

If that's the only remaining problem with #3, which is an optimization,
can we merge the actual fixes in #1 and #2 first?

I know some people are waiting for the fix.

Thanks,
Namhyung


* Re: [PATCH V2 3/3] perf: Optimize sched_task() in a context switch
  2021-01-27  4:41               ` Namhyung Kim
@ 2021-02-22  9:46                 ` Namhyung Kim
  0 siblings, 0 replies; 18+ messages in thread
From: Namhyung Kim @ 2021-02-22  9:46 UTC (permalink / raw)
  To: Peter Zijlstra, Liang, Kan
  Cc: Ingo Molnar, linux-kernel, Stephane Eranian, Ian Rogers,
	Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa, Andi Kleen,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman

On Wed, Jan 27, 2021 at 1:41 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hi,
>
> On Mon, Jan 18, 2021 at 4:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > Hi Peter and Kan,
> >
> > On Thu, Dec 10, 2020 at 11:25 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Thu, Dec 10, 2020 at 08:52:55AM -0500, Liang, Kan wrote:
> > > >
> > > >
> > > > On 12/10/2020 2:13 AM, Namhyung Kim wrote:
> > > > > Hi Peter and Kan,
> > > > >
> > > > > How can we move this forward?
> > > >
> > > > Hi Namhyung,
> > > >
> > > > Thanks for the test. The changes look good to me.
> > > >
> > > > Hi Peter,
> > > >
> > > > Should we resend the patch set for further review?
> > >
> > > I've not yet seen a coherent replacement of #3, what I send was just a
> > > PoC.
>
> If it's the only problem of #3 which is an optimization,
> can we merge the actual fixes in #1 and #2 first?
>
> I know some people waiting for the fix..

Ping again...

Thanks,
Namhyung


* [tip: perf/urgent] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
  2020-11-30 19:38 ` [PATCH V2 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR kan.liang
@ 2021-03-01 10:16   ` tip-bot2 for Kan Liang
  2021-03-06 11:54   ` tip-bot2 for Kan Liang
  1 sibling, 0 replies; 18+ messages in thread
From: tip-bot2 for Kan Liang @ 2021-03-01 10:16 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Kan Liang, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     a8abc881981762631a22568d5e4b2c0ce4aeb15c
Gitweb:        https://git.kernel.org/tip/a8abc881981762631a22568d5e4b2c0ce4aeb15c
Author:        Kan Liang <kan.liang@linux.intel.com>
AuthorDate:    Mon, 30 Nov 2020 11:38:41 -08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 01 Mar 2021 11:02:19 +01:00

perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR

Supplying a PID/TID for large PEBS requires flushing the PEBS buffer on
a context switch.

For normal LBRs, a context switch can flip the address space, and LBR
entries are not tagged with an identifier, so the LBR has to be wiped,
even for per-CPU events.

For LBR call stack, the stack has to be saved/restored during a context
switch.

Set PERF_ATTACH_SCHED_CB for events with large PEBS and LBR.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201130193842.10569-2-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5bac48d..7bbb5bb 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
 			if (!(event->attr.sample_type &
-			      ~intel_pmu_large_pebs_flags(event)))
+			      ~intel_pmu_large_pebs_flags(event))) {
 				event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+				event->attach_state |= PERF_ATTACH_SCHED_CB;
+			}
 		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
@@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
+		event->attach_state |= PERF_ATTACH_SCHED_CB;
 
 		/*
 		 * BTS is set up earlier in this path, so don't account twice


* [tip: perf/urgent] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-30 19:38 [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
                   ` (2 preceding siblings ...)
  2020-12-01 17:21 ` [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events Peter Zijlstra
@ 2021-03-01 10:16 ` tip-bot2 for Kan Liang
  2021-03-06 11:54 ` tip-bot2 for Kan Liang
  4 siblings, 0 replies; 18+ messages in thread
From: tip-bot2 for Kan Liang @ 2021-03-01 10:16 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Gabriel Marin, Namhyung Kim, Kan Liang, Peter Zijlstra (Intel),
	x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     e748d3716e0e581401630d36d3ef0fc8fa8f830d
Gitweb:        https://git.kernel.org/tip/e748d3716e0e581401630d36d3ef0fc8fa8f830d
Author:        Kan Liang <kan.liang@linux.intel.com>
AuthorDate:    Mon, 30 Nov 2020 11:38:40 -08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 01 Mar 2021 11:02:18 +01:00

perf/core: Flush PMU internal buffers for per-CPU events

Sometimes the PMU internal buffers have to be flushed for per-CPU events
during a context switch, e.g., for large PEBS. Otherwise, the perf tool may
report samples at locations that do not belong to the process the samples
are attributed to, because PEBS does not tag samples with a PID/TID.

The current code only flushes the buffers for per-task events; it doesn't
check per-CPU events.

Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
PMU internal buffers have to be flushed for this event during a context
switch.

Add sched_cb_entry and perf_sched_cb_usages back to track the PMUs/cpuctxs
that need to be flushed.

This patch only needs to invoke sched_task() for per-CPU events; per-task
events are already handled in perf_event_context_sched_in/out.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Reported-by: Gabriel Marin <gmx@google.com>
Originally-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.liang@linux.intel.com
---
 include/linux/perf_event.h |  2 ++
 kernel/events/core.c       | 42 +++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fab42cf..3f7f89e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
 #define PERF_ATTACH_ITRACE	0x10
+#define PERF_ATTACH_SCHED_CB	0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -872,6 +873,7 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
+	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
 
 	int				online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f..03db40f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3461,11 +3462,16 @@ unlock:
 	}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	--cpuctx->sched_cb_usage;
+	this_cpu_dec(perf_sched_cb_usages);
+
+	if (!--cpuctx->sched_cb_usage)
+		list_del(&cpuctx->sched_cb_entry);
 }
 
 
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	cpuctx->sched_cb_usage++;
+	if (!cpuctx->sched_cb_usage++)
+		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+	this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (prev == next)
+		return;
+
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		/* will be handled in perf_event_context_sched_in/out */
+		if (cpuctx->task_ctx)
+			continue;
+
+		__perf_pmu_sched_task(cpuctx, sched_in);
+	}
+}
+
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
+
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		dec = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_dec(&nr_mmap_events);
@@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		inc = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_inc(&nr_mmap_events);
@@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
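
A deliberately simplified, hypothetical PMU driver is sketched below to show
the contract this change relies on: a driver bumps the callback count with
perf_sched_cb_inc() when one of its events needs buffer maintenance at
context switch, and the core then invokes its ->sched_task() method, now for
per-CPU events too. Only perf_sched_cb_inc(), perf_sched_cb_dec() and the
->sched_task() callback are real interfaces (their exact signatures vary by
kernel version); everything prefixed toy_pmu_ is invented for illustration.

#include <linux/perf_event.h>

static void toy_pmu_drain_buffer(void);	/* hypothetical hardware-buffer flush */

/* Wired up as the .sched_task method of the hypothetical driver's struct pmu. */
static void toy_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	if (!sched_in)
		toy_pmu_drain_buffer();	/* flush before another task's samples mix in */
}

/* Typically called from the driver's event add path. */
static void toy_pmu_track_event(struct perf_event *event)
{
	perf_sched_cb_inc(event->ctx->pmu);
}

/* ... and from the matching del path. */
static void toy_pmu_untrack_event(struct perf_event *event)
{
	perf_sched_cb_dec(event->ctx->pmu);
}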
 

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [tip: perf/urgent] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
  2020-11-30 19:38 ` [PATCH V2 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR kan.liang
  2021-03-01 10:16   ` [tip: perf/urgent] " tip-bot2 for Kan Liang
@ 2021-03-06 11:54   ` tip-bot2 for Kan Liang
  1 sibling, 0 replies; 18+ messages in thread
From: tip-bot2 for Kan Liang @ 2021-03-06 11:54 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Kan Liang, Peter Zijlstra (Intel), Ingo Molnar, x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     afbef30149587ad46f4780b1e0cc5e219745ce90
Gitweb:        https://git.kernel.org/tip/afbef30149587ad46f4780b1e0cc5e219745ce90
Author:        Kan Liang <kan.liang@linux.intel.com>
AuthorDate:    Mon, 30 Nov 2020 11:38:41 -08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Sat, 06 Mar 2021 12:52:44 +01:00

perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR

Supplying a PID/TID for large PEBS samples requires flushing the PEBS
buffer at context switch.

For normal LBRs, a context switch can flip the address space, and LBR
entries are not tagged with an identifier, so the LBR has to be wiped, even
for per-CPU events.

For LBR call stack, the stack has to be saved/restored during a context
switch.

Set PERF_ATTACH_SCHED_CB for events with large PEBS and LBR.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20201130193842.10569-2-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5bac48d..7bbb5bb 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
 			if (!(event->attr.sample_type &
-			      ~intel_pmu_large_pebs_flags(event)))
+			      ~intel_pmu_large_pebs_flags(event))) {
 				event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+				event->attach_state |= PERF_ATTACH_SCHED_CB;
+			}
 		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
@@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
+		event->attach_state |= PERF_ATTACH_SCHED_CB;
 
 		/*
 		 * BTS is set up earlier in this path, so don't account twice
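
The second hunk covers the branch-stack path, so LBR users get the flag as
well. The attribute setup below (a sketch, not from the patch) asks for
user-space LBR call stacks; the constants are real UAPI, but whether the PMU
accepts PERF_SAMPLE_BRANCH_CALL_STACK depends on the hardware, and the
surrounding open/close boilerplate would match the earlier PEBS sketch.

#include <linux/perf_event.h>
#include <string.h>

/* Fill @attr for a cycles event sampling user-space LBR call stacks. */
static void lbr_callstack_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	attr->sample_period = 100003;
	attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER |
				   PERF_SAMPLE_BRANCH_CALL_STACK;
	attr->exclude_kernel = 1;
}

/*
 * Opened with pid == -1 and an explicit cpu, this becomes a per-CPU event,
 * which is exactly the case for which the driver now sets
 * PERF_ATTACH_SCHED_CB.
 */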

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [tip: perf/urgent] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-30 19:38 [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
                   ` (3 preceding siblings ...)
  2021-03-01 10:16 ` [tip: perf/urgent] " tip-bot2 for Kan Liang
@ 2021-03-06 11:54 ` tip-bot2 for Kan Liang
  4 siblings, 0 replies; 18+ messages in thread
From: tip-bot2 for Kan Liang @ 2021-03-06 11:54 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Gabriel Marin, Namhyung Kim, Kan Liang, Peter Zijlstra (Intel),
	Ingo Molnar, x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     a5398bffc01fe044848c5024e5e867e407f239b8
Gitweb:        https://git.kernel.org/tip/a5398bffc01fe044848c5024e5e867e407f239b8
Author:        Kan Liang <kan.liang@linux.intel.com>
AuthorDate:    Mon, 30 Nov 2020 11:38:40 -08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Sat, 06 Mar 2021 12:52:39 +01:00

perf/core: Flush PMU internal buffers for per-CPU events

Sometimes the PMU internal buffers have to be flushed for per-CPU events
during a context switch, e.g., for large PEBS. Otherwise, the perf tool may
report samples at locations that do not belong to the process the samples
are attributed to, because PEBS does not tag samples with a PID/TID.

The current code only flushes the buffers for per-task events; it doesn't
check per-CPU events.

Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
PMU internal buffers have to be flushed for this event during a context
switch.

Add sched_cb_entry and perf_sched_cb_usages back to track the PMUs/cpuctxs
that need to be flushed.

This patch only needs to invoke sched_task() for per-CPU events; per-task
events are already handled in perf_event_context_sched_in/out.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Reported-by: Gabriel Marin <gmx@google.com>
Originally-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.liang@linux.intel.com
---
 include/linux/perf_event.h |  2 ++
 kernel/events/core.c       | 42 +++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fab42cf..3f7f89e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
 #define PERF_ATTACH_ITRACE	0x10
+#define PERF_ATTACH_SCHED_CB	0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -872,6 +873,7 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
+	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
 
 	int				online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f..03db40f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3461,11 +3462,16 @@ unlock:
 	}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	--cpuctx->sched_cb_usage;
+	this_cpu_dec(perf_sched_cb_usages);
+
+	if (!--cpuctx->sched_cb_usage)
+		list_del(&cpuctx->sched_cb_entry);
 }
 
 
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	cpuctx->sched_cb_usage++;
+	if (!cpuctx->sched_cb_usage++)
+		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+	this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (prev == next)
+		return;
+
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		/* will be handled in perf_event_context_sched_in/out */
+		if (cpuctx->task_ctx)
+			continue;
+
+		__perf_pmu_sched_task(cpuctx, sched_in);
+	}
+}
+
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
+
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		dec = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_dec(&nr_mmap_events);
@@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		inc = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_inc(&nr_mmap_events);
@@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 

^ permalink raw reply related	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2021-03-06 11:56 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-30 19:38 [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
2020-11-30 19:38 ` [PATCH V2 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR kan.liang
2021-03-01 10:16   ` [tip: perf/urgent] " tip-bot2 for Kan Liang
2021-03-06 11:54   ` tip-bot2 for Kan Liang
2020-11-30 19:38 ` [PATCH V2 3/3] perf: Optimize sched_task() in a context switch kan.liang
2020-12-01 17:29   ` Peter Zijlstra
2020-12-02 10:26     ` Peter Zijlstra
2020-12-02 14:40     ` Namhyung Kim
2020-12-04  7:14     ` Namhyung Kim
2020-12-10  7:13       ` Namhyung Kim
2020-12-10 13:52         ` Liang, Kan
2020-12-10 14:25           ` Peter Zijlstra
2021-01-18  7:04             ` Namhyung Kim
2021-01-27  4:41               ` Namhyung Kim
2021-02-22  9:46                 ` Namhyung Kim
2020-12-01 17:21 ` [PATCH V2 1/3] perf/core: Flush PMU internal buffers for per-CPU events Peter Zijlstra
2021-03-01 10:16 ` [tip: perf/urgent] " tip-bot2 for Kan Liang
2021-03-06 11:54 ` tip-bot2 for Kan Liang
