linux-kernel.vger.kernel.org archive mirror
* [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
@ 2020-11-06 21:29 kan.liang
  2020-11-06 21:29 ` [PATCH 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS kan.liang
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: kan.liang @ 2020-11-06 21:29 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel
  Cc: namhyung, eranian, irogers, gmx, acme, jolsa, ak, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Sometimes the PMU internal buffers have to be flushed for per-CPU events
during a context switch, e.g., with large PEBS. Otherwise, the perf tool may
report samples in locations that do not belong to the process in which the
samples were actually taken, because PEBS does not tag samples with a PID/TID.

The current code only flushes the buffers for per-task events. It doesn't
check per-CPU events.

Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
PMU internal buffers have to be flushed for this event during a context
switch.

Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
that needs to be flushed.

This patch only needs to invoke sched_task() for per-CPU events. Per-task
events are already handled in perf_event_context_sched_in/out.
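
For illustration only (not part of the patch): a minimal sketch of how a
PMU driver is expected to use the new flag together with the existing
perf_sched_cb_inc()/perf_sched_cb_dec() interface. The example_* hooks
below are hypothetical.

	/* hypothetical driver whose hardware buffer must be drained on context switch */
	static int example_pmu_event_init(struct perf_event *event)
	{
		/* ask the core to call sched_task() even for per-CPU events */
		event->attach_state |= PERF_ATTACH_SCHED_CB;
		return 0;
	}

	static void example_pmu_add(struct perf_event *event, int flags)
	{
		/* track this PMU/cpuctx so perf_pmu_sched_task() will call back */
		perf_sched_cb_inc(event->ctx->pmu);
	}

	static void example_pmu_del(struct perf_event *event, int flags)
	{
		perf_sched_cb_dec(event->ctx->pmu);
	}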

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Reported-by: Gabriel Marin <gmx@google.com>
Reported-by: Namhyung Kim <namhyung@kernel.org>
Originally-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 include/linux/perf_event.h |  2 ++
 kernel/events/core.c       | 42 ++++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0defb526cd0c..f7a84d1048b9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
 #define PERF_ATTACH_ITRACE	0x10
+#define PERF_ATTACH_SCHED_CB	0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -872,6 +873,7 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
+	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
 
 	int				online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dba4ea4e648b..df0df5514097 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -384,6 +384,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3481,11 +3482,16 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	--cpuctx->sched_cb_usage;
+	this_cpu_dec(perf_sched_cb_usages);
+
+	if (!--cpuctx->sched_cb_usage)
+		list_del(&cpuctx->sched_cb_entry);
 }
 
 
@@ -3493,7 +3499,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	cpuctx->sched_cb_usage++;
+	if (!cpuctx->sched_cb_usage++)
+		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+	this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
@@ -3522,6 +3531,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (prev == next)
+		return;
+
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		/* will be handled in perf_event_context_sched_in/out */
+		if (cpuctx->task_ctx)
+			continue;
+
+		__perf_pmu_sched_task(cpuctx, sched_in);
+	}
+}
+
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3544,6 +3571,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -3851,6 +3881,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
+
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4675,7 +4708,7 @@ static void unaccount_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		dec = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_dec(&nr_mmap_events);
@@ -11204,7 +11237,7 @@ static void account_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		inc = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_inc(&nr_mmap_events);
@@ -12996,6 +13029,7 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 
-- 
2.17.1



* [PATCH 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS
  2020-11-06 21:29 [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
@ 2020-11-06 21:29 ` kan.liang
  2020-11-06 21:29 ` [PATCH 3/3] perf: Optimize sched_task() in a context switch kan.liang
  2020-11-09  9:52 ` [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events Peter Zijlstra
  2 siblings, 0 replies; 18+ messages in thread
From: kan.liang @ 2020-11-06 21:29 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel
  Cc: namhyung, eranian, irogers, gmx, acme, jolsa, ak, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Supplying a PID/TID for large PEBS requires flushing the PEBS buffer on a
context switch.

Set PERF_ATTACH_SCHED_CB for events that use large PEBS.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/x86/events/intel/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c79748f6921d..fe922be94b6d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3565,8 +3565,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
 			if (!(event->attr.sample_type &
-			      ~intel_pmu_large_pebs_flags(event)))
+			      ~intel_pmu_large_pebs_flags(event))) {
 				event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+				event->attach_state |= PERF_ATTACH_SCHED_CB;
+			}
 		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
-- 
2.17.1



* [PATCH 3/3] perf: Optimize sched_task() in a context switch
  2020-11-06 21:29 [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
  2020-11-06 21:29 ` [PATCH 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS kan.liang
@ 2020-11-06 21:29 ` kan.liang
  2020-11-09  9:52 ` [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events Peter Zijlstra
  2 siblings, 0 replies; 18+ messages in thread
From: kan.liang @ 2020-11-06 21:29 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel
  Cc: namhyung, eranian, irogers, gmx, acme, jolsa, ak, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Some calls to sched_task() in a context switch can be avoided. For
example, large PEBS only requires flushing the buffer on context switch
out, but the current code still invokes sched_task() for large PEBS on
context switch in.

The current code doesn't track or check the state of an event, so it has
no idea which calls are unnecessary.

Add a new variable, sched_cb_state, to track the required callback states
per CPU context. Only invoke sched_task() when the relevant states are
set.

Split sched_cb_usage into sched_cb_task_usage and sched_cb_cpu_usage to
track per-task and per-CPU events respectively. Avoid going through the
sched_cb_list for pure per-task events. Avoid invoking sched_task() in
perf_event_context_sched_in/out for pure per-CPU events.

Since the interface functions change, the architecture-specific code is
updated accordingly:
- For power, the BHRB entries are reset on context switch in.
- For X86 large PEBS, the PEBS buffer is flushed on context switch out.
- For X86 normal LBR, the LBR registers are reset on context switch in.
- For X86 LBR call stack, the LBR registers are saved on context switch
out, and restored on context switch in.
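
For illustration only, a condensed sketch of the patch-3 form of the
interface (the flag values and the two-argument perf_sched_cb_inc() are
from this series; the example_pmu_add() wrapper is hypothetical):

	static void example_pmu_add(struct perf_event *event, int flags)
	{
		/* this PMU only needs to drain its buffer on context switch out */
		int state = PERF_SCHED_CB_SW_OUT;

		/* per-CPU event: must be tracked via the sched_cb_list */
		if (!(event->attach_state & PERF_ATTACH_TASK))
			state |= PERF_SCHED_CB_CPU;

		perf_sched_cb_inc(event->ctx->pmu, state);
	}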

Suggested-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/powerpc/perf/core-book3s.c |  7 +++-
 arch/x86/events/intel/ds.c      |  8 +++--
 arch/x86/events/intel/lbr.c     | 15 ++++++--
 include/linux/perf_event.h      | 12 +++++--
 kernel/events/core.c            | 62 +++++++++++++++++++++++++++------
 5 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6586f7e71cfb..e5837fdf82e3 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -370,6 +370,7 @@ static void power_pmu_bhrb_reset(void)
 static void power_pmu_bhrb_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+	int state = PERF_SCHED_CB_SW_IN;
 
 	if (!ppmu->bhrb_nr)
 		return;
@@ -380,7 +381,11 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
 		cpuhw->bhrb_context = event->ctx;
 	}
 	cpuhw->bhrb_users++;
-	perf_sched_cb_inc(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+
+	perf_sched_cb_inc(event->ctx->pmu, state);
 }
 
 static void power_pmu_bhrb_disable(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 444e5f061d04..f4f9d737ca85 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1019,12 +1019,16 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
 	 * that does not hurt:
 	 */
 	bool update = cpuc->n_pebs == 1;
+	int state = PERF_SCHED_CB_SW_OUT;
 
 	if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+		if (!(event->attach_state & PERF_ATTACH_TASK))
+			state |= PERF_SCHED_CB_CPU;
+
 		if (!needed_cb)
-			perf_sched_cb_inc(pmu);
+			perf_sched_cb_inc(pmu, state);
 		else
-			perf_sched_cb_dec(pmu);
+			perf_sched_cb_dec(pmu, state);
 
 		update = true;
 	}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653c5dd2..e4c500580df5 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -660,6 +660,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
 {
 	struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int state = PERF_SCHED_CB_SW_IN;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -693,7 +694,13 @@ void intel_pmu_lbr_add(struct perf_event *event)
 	 */
 	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
 		cpuc->lbr_pebs_users++;
-	perf_sched_cb_inc(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+	if (event->attach_state & PERF_ATTACH_TASK_DATA)
+		state |= PERF_SCHED_CB_SW_OUT;
+
+	perf_sched_cb_inc(event->ctx->pmu, state);
 	if (!cpuc->lbr_users++ && !event->total_time_running)
 		intel_pmu_lbr_reset();
 
@@ -724,6 +731,7 @@ void release_lbr_buffers(void)
 void intel_pmu_lbr_del(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int state = 0;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -740,7 +748,10 @@ void intel_pmu_lbr_del(struct perf_event *event)
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
 	WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
-	perf_sched_cb_dec(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+	perf_sched_cb_dec(event->ctx->pmu, state);
 }
 
 static inline bool vlbr_exclude_host(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f7a84d1048b9..839dd65dd1f3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -854,6 +854,10 @@ struct perf_event_context {
  */
 #define PERF_NR_CONTEXTS	4
 
+#define PERF_SCHED_CB_CPU	0x1
+#define PERF_SCHED_CB_SW_IN	0x2
+#define PERF_SCHED_CB_SW_OUT	0x4
+
 /**
  * struct perf_event_cpu_context - per cpu event context structure
  */
@@ -874,7 +878,9 @@ struct perf_cpu_context {
 #endif
 
 	struct list_head		sched_cb_entry;
-	int				sched_cb_usage;
+	int				sched_cb_task_usage;
+	int				sched_cb_cpu_usage;
+	int				sched_cb_state;
 
 	int				online;
 	/*
@@ -967,8 +973,8 @@ extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
-extern void perf_sched_cb_dec(struct pmu *pmu);
-extern void perf_sched_cb_inc(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu, int state);
+extern void perf_sched_cb_inc(struct pmu *pmu, int state);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index df0df5514097..6a19b8fa0dd9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3384,6 +3384,32 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
+static inline bool perf_pmu_sched_in_ctx(struct perf_cpu_context *cpuctx)
+{
+	return !!(cpuctx->sched_cb_state & PERF_SCHED_CB_SW_IN);
+}
+
+static inline bool perf_pmu_sched_out_ctx(struct perf_cpu_context *cpuctx)
+{
+	return !!(cpuctx->sched_cb_state & PERF_SCHED_CB_SW_OUT);
+}
+
+static inline bool perf_pmu_has_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+{
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	if (!pmu->sched_task)
+		return false;
+
+	if (sched_in && perf_pmu_sched_in_ctx(cpuctx))
+		return true;
+
+	if (!sched_in && perf_pmu_sched_out_ctx(cpuctx))
+		return true;
+
+	return false;
+}
+
 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 					 struct task_struct *next)
 {
@@ -3433,7 +3459,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 			perf_pmu_disable(pmu);
 
-			if (cpuctx->sched_cb_usage && pmu->sched_task)
+			if (cpuctx->sched_cb_task_usage && perf_pmu_has_sched_task(cpuctx, false))
 				pmu->sched_task(ctx, false);
 
 			/*
@@ -3473,7 +3499,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		perf_pmu_disable(pmu);
 
-		if (cpuctx->sched_cb_usage && pmu->sched_task)
+		if (cpuctx->sched_cb_task_usage && perf_pmu_has_sched_task(cpuctx, false))
 			pmu->sched_task(ctx, false);
 		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
 
@@ -3484,22 +3510,37 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
 
-void perf_sched_cb_dec(struct pmu *pmu)
+void perf_sched_cb_dec(struct pmu *pmu, int state)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
+	if (!(state & PERF_SCHED_CB_CPU)) {
+		--cpuctx->sched_cb_task_usage;
+		goto end;
+	}
+
 	this_cpu_dec(perf_sched_cb_usages);
 
-	if (!--cpuctx->sched_cb_usage)
+	if (!--cpuctx->sched_cb_cpu_usage)
 		list_del(&cpuctx->sched_cb_entry);
+end:
+	if (!cpuctx->sched_cb_cpu_usage && !cpuctx->sched_cb_task_usage)
+		cpuctx->sched_cb_state = 0;
 }
 
 
-void perf_sched_cb_inc(struct pmu *pmu)
+void perf_sched_cb_inc(struct pmu *pmu, int state)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	if (!cpuctx->sched_cb_usage++)
+	cpuctx->sched_cb_state |= state;
+
+	if (!(state & PERF_SCHED_CB_CPU)) {
+		cpuctx->sched_cb_task_usage++;
+		return;
+	}
+
+	if (!cpuctx->sched_cb_cpu_usage++)
 		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
 
 	this_cpu_inc(perf_sched_cb_usages);
@@ -3544,8 +3585,8 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 		/* will be handled in perf_event_context_sched_in/out */
 		if (cpuctx->task_ctx)
 			continue;
-
-		__perf_pmu_sched_task(cpuctx, sched_in);
+		if (perf_pmu_has_sched_task(cpuctx, sched_in))
+			__perf_pmu_sched_task(cpuctx, sched_in);
 	}
 }
 
@@ -3809,7 +3850,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 
 	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx) {
-		if (cpuctx->sched_cb_usage)
+		if (cpuctx->sched_cb_task_usage &&
+		    (cpuctx->sched_cb_state & PERF_SCHED_CB_SW_IN))
 			__perf_pmu_sched_task(cpuctx, true);
 		return;
 	}
@@ -3835,7 +3877,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 
-	if (cpuctx->sched_cb_usage && pmu->sched_task)
+	if (cpuctx->sched_cb_task_usage && perf_pmu_has_sched_task(cpuctx, true))
 		pmu->sched_task(cpuctx->task_ctx, true);
 
 	perf_pmu_enable(pmu);
-- 
2.17.1



* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-06 21:29 [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
  2020-11-06 21:29 ` [PATCH 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS kan.liang
  2020-11-06 21:29 ` [PATCH 3/3] perf: Optimize sched_task() in a context switch kan.liang
@ 2020-11-09  9:52 ` Peter Zijlstra
  2020-11-09 11:04   ` Peter Zijlstra
  2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2020-11-09  9:52 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak

On Fri, Nov 06, 2020 at 01:29:33PM -0800, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
> 
> Sometimes the PMU internal buffers have to be flushed for per-CPU events
> during a context switch, e.g., large PEBS. Otherwise, the perf tool may
> report samples in locations that do not belong to the process where the
> samples are processed in, because PEBS does not tag samples with PID/TID.
> 
> The current code only flush the buffers for a per-task event. It doesn't
> check a per-CPU event.
> 
> Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
> PMU internal buffers have to be flushed for this event during a context
> switch.
> 
> Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
> which is required to be flushed.
> 
> Only need to invoke the sched_task() for per-CPU events in this patch.
> The per-task events have been handled in perf_event_context_sched_in/out
> already.
> 
> Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")

Are you sure? In part this patch looks like a revert of:

  44fae179ce73a26733d9e2d346da4e1a1cb94647
  556cccad389717d6eb4f5a24b45ff41cad3aaabf


> +static void perf_pmu_sched_task(struct task_struct *prev,
> +				struct task_struct *next,
> +				bool sched_in)
> +{
> +	struct perf_cpu_context *cpuctx;
> +
> +	if (prev == next)
> +		return;
> +
> +	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
> +		/* will be handled in perf_event_context_sched_in/out */
> +		if (cpuctx->task_ctx)
> +			continue;

This seems wrong; cpuctx->task_ctx merely indicates that there is a
task-ctx for this CPU. Not that the event you're interested in is in
fact there.

So consider the case where the event is on the CPU context, but we also
have a task context. Then we'll not issue this call.

> +
> +		__perf_pmu_sched_task(cpuctx, sched_in);
> +	}
> +}


* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-09  9:52 ` [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events Peter Zijlstra
@ 2020-11-09 11:04   ` Peter Zijlstra
  2020-11-09 14:49     ` Liang, Kan
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2020-11-09 11:04 UTC (permalink / raw)
  To: kan.liang
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak

On Mon, Nov 09, 2020 at 10:52:35AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 06, 2020 at 01:29:33PM -0800, kan.liang@linux.intel.com wrote:
> > From: Kan Liang <kan.liang@linux.intel.com>
> > 
> > Sometimes the PMU internal buffers have to be flushed for per-CPU events
> > during a context switch, e.g., large PEBS. Otherwise, the perf tool may
> > report samples in locations that do not belong to the process where the
> > samples are processed in, because PEBS does not tag samples with PID/TID.
> > 
> > The current code only flush the buffers for a per-task event. It doesn't
> > check a per-CPU event.
> > 
> > Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
> > PMU internal buffers have to be flushed for this event during a context
> > switch.
> > 
> > Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
> > which is required to be flushed.
> > 
> > Only need to invoke the sched_task() for per-CPU events in this patch.
> > The per-task events have been handled in perf_event_context_sched_in/out
> > already.
> > 
> > Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
> 
> Are you sure? In part this patch looks like a revert of:
> 
>   44fae179ce73a26733d9e2d346da4e1a1cb94647
>   556cccad389717d6eb4f5a24b45ff41cad3aaabf

*groan*... I think I might've made a mistake with those two patches. I
assumed the whole cpuctx->task_ctx thing was relevant; it is not.

As per perf_sched_cb_{inc,dec}(struct pmu *), the thing we care about is
that the *PMU* gets a context switch callback, we don't give a crap
about the actual task context. Except that LBR code does, but I'm
thinking that started the whole confusion -- and I'm still not sure it's
actually correct either.

Now,.. how did we end up with the above two patches anyway... /me frobs
around in the inbox... Ah! that daft user RDPMC thing. I wanted to avoid
yet another pmu::method().

Hmm.. and the reason I proposed that change is because we'd end up with
the sched_cb for every context switch now, not just large-pebs and/or
lbr crud. And this form avoids the double perf_pmu_disable() and all
that.

Maybe we can frob x86_pmu_enable()...


* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-09 11:04   ` Peter Zijlstra
@ 2020-11-09 14:49     ` Liang, Kan
  2020-11-09 17:33       ` Peter Zijlstra
                         ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Liang, Kan @ 2020-11-09 14:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak



On 11/9/2020 6:04 AM, Peter Zijlstra wrote:
> On Mon, Nov 09, 2020 at 10:52:35AM +0100, Peter Zijlstra wrote:
>> On Fri, Nov 06, 2020 at 01:29:33PM -0800, kan.liang@linux.intel.com wrote:
>>> From: Kan Liang <kan.liang@linux.intel.com>
>>>
>>> Sometimes the PMU internal buffers have to be flushed for per-CPU events
>>> during a context switch, e.g., large PEBS. Otherwise, the perf tool may
>>> report samples in locations that do not belong to the process where the
>>> samples are processed in, because PEBS does not tag samples with PID/TID.
>>>
>>> The current code only flush the buffers for a per-task event. It doesn't
>>> check a per-CPU event.
>>>
>>> Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
>>> PMU internal buffers have to be flushed for this event during a context
>>> switch.
>>>
>>> Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
>>> which is required to be flushed.
>>>
>>> Only need to invoke the sched_task() for per-CPU events in this patch.
>>> The per-task events have been handled in perf_event_context_sched_in/out
>>> already.
>>>
>>> Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
>>
>> Are you sure? In part this patch looks like a revert of:
>>
>>    44fae179ce73a26733d9e2d346da4e1a1cb94647
>>    556cccad389717d6eb4f5a24b45ff41cad3aaabf
> 

The patch tries to fix a long-term PEBS bug with per-CPU events. It's not 
a revert. I think we still want to keep the optimization for per-task 
events from 44fae179ce73 and 556cccad3897.

Here is the story.

- When large PEBS was introduced (9c964efa4330), sched_task() should have 
been invoked to flush the PEBS buffer on each context switch. However, 
perf_sched_events in account_event() was not updated accordingly, so 
perf_event_task_sched_*() is never invoked for a pure per-CPU context. 
Only per-task events work.
    At that time, perf_pmu_sched_task() was outside of 
perf_event_context_sched_in/out, which meant perf had to call 
perf_pmu_disable() twice for per-task events.


- The recent optimization (44fae179ce73 and 556cccad3897) only optimizes 
per-task events, by moving sched_task() into 
perf_event_context_sched_in/out so that perf only needs to invoke 
perf_pmu_disable() once.
   But it doesn't fix the issue in a pure per-CPU context. Per-CPU 
events are still broken.


- Patch 1 tries to fix the broken per-CPU events. The CPU context cannot 
be retrieved from task->perf_event_ctxp, so it has to be tracked in the 
sched_cb_list. Yes, the code is very similar to the original code, but it 
is actually new code for per-CPU events. The optimization for per-task 
events is still kept.
   For the case which has both a CPU context and a task context, yes, 
__perf_pmu_sched_task() in this patch is not invoked, because sched_task() 
only needs to be invoked once per context switch; it will eventually be 
invoked from the task context.


- The following patch 3 tries to optimize sched_task() further.
   - We handle per-CPU events and per-task events in different places. 
In a pure per-task context, we don't need to go through the 
sched_cb_list; we can get the information from task->perf_event_ctxp. We 
introduce a flag to distinguish them.
   - For most cases, we don't need to invoke sched_task() on both 
sched in and sched out. We also introduce a flag for PMUs to avoid 
unnecessary calls.


All the implementation and optimizations are generic. For example, on 
power, the BHRB entries are reset on context switch in, so it can also 
benefit from the optimization.


> *groan*... I think I might've made a mistake with those two patches. I
> assumed the whole cpuctx->task_ctx thing was relevant, it is not.
> 
> As per perf_sched_cb_{inc,dec}(struct pmu *), the thing we care about is
> that the *PMU* gets a context switch callback, we don't give a crap
> about the actual task context. Except that LBR code does, but I'm
> thinking that started the whole confusion -- and I'm still not sure it's
> actually correct either.
> 
> Now,.. how did we end up with the above two patches anyway... /me frobs
> around in the inbox... Ah! that daft user RDPMC thing. I wanted to avoid
> yet another pmu::method().
> 
> Hmm.. and the reason I proposed that change is because we'd end up with
> the sched_cb for every context switch now, not just large-pebs and or
> lbr crud. And this form avoids the double perf_pmu_disable() and all
> that.

With this patch set, I think the double perf_pmu_disable() is still avoided.

> 
> Maybe we can frob x86_pmu_enable()...

Could you please elaborate?

Thanks,
Kan



* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-09 14:49     ` Liang, Kan
@ 2020-11-09 17:33       ` Peter Zijlstra
  2020-11-09 19:52         ` Liang, Kan
  2020-11-09 17:40       ` Peter Zijlstra
  2020-11-11 16:25       ` Peter Zijlstra
  2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2020-11-09 17:33 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak

On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
> > Maybe we can frob x86_pmu_enable()...
> 
> Could you please elaborate?

Something horrible like this. It will detect the first time we enable
the PMU on a new task (IOW we did a context switch) and wipe the
counters when user RDPMC is on...

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 77b963e5e70a..d862927baaef 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1289,6 +1289,15 @@ static void x86_pmu_enable(struct pmu *pmu)
 		perf_events_lapic_init();
 	}
 
+	if (cpuc->current != current) {
+		struct mm_struct *mm = current->mm;
+
+		cpuc->current = current;
+
+		if (mm && atomic_read(&mm->context.perf_rdpmc_allowed))
+			wipe_dirty_counters();
+	}
+
 	cpuc->enabled = 1;
 	barrier();
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7895cf4c59a7..d16118cb3bd0 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -248,6 +248,8 @@ struct cpu_hw_events {
 	unsigned int		txn_flags;
 	int			is_fake;
 
+	void			*current;
+
 	/*
 	 * Intel DebugStore bits
 	 */


* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-09 14:49     ` Liang, Kan
  2020-11-09 17:33       ` Peter Zijlstra
@ 2020-11-09 17:40       ` Peter Zijlstra
  2020-11-11 16:25       ` Peter Zijlstra
  2 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2020-11-09 17:40 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak

On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:

> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
> sched_cb_list. Yes, the code is very similar to the original codes, but it
> is actually the new code for per-CPU events. The optimization for per-task
> events is still kept.
>   For the case, which has both a CPU context and a task context, yes, the
> __perf_pmu_sched_task() in this patch is not invoked. Because the
> sched_task() only need to be invoked once in a context switch. The
> sched_task() will be eventually invoked in the task context.
> 
> 
> - The following patch 3 tries to optimize the sched_task() further.
>   - We handle the per-CPU event and per-task event in different places. In a
> pure per-task context, we don't need to go through the sched_cb_list. We can
> get the information from task->perf_event_ctxp. We introduce a flag to
> distinguish them.
>   - For most of the cases, we don't need to invoke the sched_task() in both
> sched in and sched out. We also introduce a flag for PMUs to avoid
> unnecessary calls.
> 

There's a further optimisation possible; we can have pmu::swap_task_ctx()
imply pmu::sched_task() (both in and out).


It's all a bit of a mess though.. let me consider.


* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-09 17:33       ` Peter Zijlstra
@ 2020-11-09 19:52         ` Liang, Kan
  0 siblings, 0 replies; 18+ messages in thread
From: Liang, Kan @ 2020-11-09 19:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak



On 11/9/2020 12:33 PM, Peter Zijlstra wrote:
> On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
>>> Maybe we can frob x86_pmu_enable()...
>>
>> Could you please elaborate?
> 
> Something horrible like this. It will detect the first time we enable
> the PMU on a new task (IOW we did a context switch) and wipe the
> counters when user RDPMC is on...
>

Oh, you mean the RDPMC patch. It should be doable, but I think 
sched_task() may be a better place, especially with the new optimization 
(patch 3). We can set the PERF_SCHED_CB_SW_IN bit for this case, so we 
only do the check for per-task events on sched in.
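
Something like the rough sketch below (purely illustrative, not a real 
patch, and simplified: the real callback also handles PEBS/LBR). It keeps 
the check on the sched-in side and reuses the hypothetical 
wipe_dirty_counters() placeholder from the mail above.

	static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
	{
		struct mm_struct *mm = current->mm;

		/* only the sched-in direction needs the RDPMC wipe */
		if (!sched_in)
			return;

		if (mm && atomic_read(&mm->context.perf_rdpmc_allowed))
			wipe_dirty_counters();	/* placeholder from Peter's sketch */
	}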

It looks like the below patch has to unconditionally do the check (even 
for the non-RDPMC cases), which should be unnecessary.

Anyway, I think the RDPMC patch should depend on the implementation of 
sched_task(). We can discuss it further once the design of sched_task() 
is finalized.


Thanks,
Kan

> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 77b963e5e70a..d862927baaef 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -1289,6 +1289,15 @@ static void x86_pmu_enable(struct pmu *pmu)
>   		perf_events_lapic_init();
>   	}
>   
> +	if (cpuc->current != current) {
> +		struct mm_struct *mm = current->mm;
> +
> +		cpuc->current = current;
> +
> +		if (mm && atomic_read(&mm->context.perf_rdpmc_allowed))
> +			wipe_dirty_counters();
> +	}
> +
>   	cpuc->enabled = 1;
>   	barrier();
>   
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index 7895cf4c59a7..d16118cb3bd0 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -248,6 +248,8 @@ struct cpu_hw_events {
>   	unsigned int		txn_flags;
>   	int			is_fake;
>   
> +	void			*current;
> +
>   	/*
>   	 * Intel DebugStore bits
>   	 */
> 


* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-09 14:49     ` Liang, Kan
  2020-11-09 17:33       ` Peter Zijlstra
  2020-11-09 17:40       ` Peter Zijlstra
@ 2020-11-11 16:25       ` Peter Zijlstra
  2020-11-11 19:54         ` Liang, Kan
  2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2020-11-11 16:25 UTC (permalink / raw)
  To: Liang, Kan
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak

On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:

> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
> be invoked to flush the PEBS buffer in each context switch. However, The
> perf_sched_events in account_event() is not updated accordingly. The
> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
> per-task event works.
>    At that time, the perf_pmu_sched_task() is outside of
> perf_event_context_sched_in/out. It means that perf has to double
> perf_pmu_disable() for per-task event.

> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
> sched_cb_list. Yes, the code is very similar to the original codes, but it
> is actually the new code for per-CPU events. The optimization for per-task
> events is still kept.
>   For the case, which has both a CPU context and a task context, yes, the
> __perf_pmu_sched_task() in this patch is not invoked. Because the
> sched_task() only need to be invoked once in a context switch. The
> sched_task() will be eventually invoked in the task context.

The thing is: your first two patches rely on PERF_ATTACH_SCHED_CB and
only set that for large pebs. Are you sure the other users (Intel LBR
and PowerPC BHRB) don't need it?

If they indeed do not require the pmu::sched_task() callback for CPU
events, then I still think the whole perf_sched_cb_{inc,dec}() interface
is confusing at best.

Can't we do something like this instead?

---
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 546cc89217bb..672d6f039fce 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3565,8 +3565,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
 			if (!(event->attr.sample_type &
-			      ~intel_pmu_large_pebs_flags(event)))
+			      ~intel_pmu_large_pebs_flags(event))) {
 				event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+				event->attach_state |= PERF_ATTACH_SCHED_CB;
+			}
 		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9a38f579bc76..af9ee539c179 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
 #define PERF_ATTACH_ITRACE	0x10
+#define PERF_ATTACH_SCHED_CB	0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -817,6 +818,7 @@ struct perf_event_context {
 	int				is_active;
 	int				nr_stat;
 	int				nr_freq;
+	int				nr_sched_task;
 	int				rotate_disable;
 	/*
 	 * Set when nr_events != nr_active, except tolerant to events not
@@ -872,7 +874,7 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
-	int				sched_cb_usage;
+	struct list_head		sched_cb_entry;
 
 	int				online;
 	/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d2f3ca792936..0a5dfed6bb46 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -384,6 +384,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usage);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -2292,6 +2293,12 @@ event_sched_out(struct perf_event *event,
 		perf_event_ctx_deactivate(ctx);
 	if (event->attr.freq && event->attr.sample_freq)
 		ctx->nr_freq--;
+	if (event->attach_state & PERF_ATTACH_SCHED_CB) {
+		if (!--ctx->nr_sched_task && &cpuctx->ctx == ctx) {
+			list_del(&cpuctx->sched_cb_entry);
+			this_cpu_dec(perf_sched_cb_usage);
+		}
+	}
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
 
@@ -2564,6 +2571,12 @@ event_sched_in(struct perf_event *event,
 		perf_event_ctx_activate(ctx);
 	if (event->attr.freq && event->attr.sample_freq)
 		ctx->nr_freq++;
+	if (event->attach_state & PERF_ATTACH_SCHED_CB) {
+		if (!ctx->nr_sched_task++ && &cpuctx->ctx == ctx) {
+			list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+			this_cpu_inc(perf_sched_cb_usage);
+		}
+	}
 
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
@@ -3424,7 +3437,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 			perf_pmu_disable(pmu);
 
-			if (cpuctx->sched_cb_usage && pmu->sched_task)
+			if (ctx->nr_sched_task)
 				pmu->sched_task(ctx, false);
 
 			/*
@@ -3464,7 +3477,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		perf_pmu_disable(pmu);
 
-		if (cpuctx->sched_cb_usage && pmu->sched_task)
+		if (ctx->nr_sched_task)
 			pmu->sched_task(ctx, false);
 		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
 
@@ -3473,20 +3486,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	}
 }
 
-void perf_sched_cb_dec(struct pmu *pmu)
-{
-	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-	--cpuctx->sched_cb_usage;
-}
-
-
-void perf_sched_cb_inc(struct pmu *pmu)
-{
-	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-	cpuctx->sched_cb_usage++;
-}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
 
 /*
  * This function provides the context switch callback to the lower code
@@ -3514,6 +3514,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (prev == next)
+		return;
+
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		/* will be handled in perf_event_context_sched_in/out */
+		if (cpuctx->task_ctx)
+			continue;
+
+		__perf_pmu_sched_task(cpuctx, sched_in);
+	}
+}
+
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3536,6 +3554,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usage))
+		perf_pmu_sched_task(task, next, false);
+
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -3772,7 +3793,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 
 	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx) {
-		if (cpuctx->sched_cb_usage)
+		if (ctx->nr_sched_task)
 			__perf_pmu_sched_task(cpuctx, true);
 		return;
 	}
@@ -3798,8 +3819,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 
-	if (cpuctx->sched_cb_usage && pmu->sched_task)
-		pmu->sched_task(cpuctx->task_ctx, true);
+	if (ctx->nr_sched_task)
+		pmu->sched_task(ctx, true);
 
 	perf_pmu_enable(pmu);
 
@@ -3844,6 +3865,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
+
+	if (__this_cpu_read(perf_sched_cb_usage))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4668,7 +4692,7 @@ static void unaccount_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		dec = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_dec(&nr_mmap_events);
@@ -11195,7 +11219,7 @@ static void account_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		inc = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_inc(&nr_mmap_events);
@@ -12987,6 +13011,7 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 


* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-11 16:25       ` Peter Zijlstra
@ 2020-11-11 19:54         ` Liang, Kan
  2020-11-17  5:01           ` Namhyung Kim
  0 siblings, 1 reply; 18+ messages in thread
From: Liang, Kan @ 2020-11-11 19:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, namhyung, eranian, irogers, gmx, acme, jolsa, ak



On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
> On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
> 
>> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
>> be invoked to flush the PEBS buffer in each context switch. However, The
>> perf_sched_events in account_event() is not updated accordingly. The
>> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
>> per-task event works.
>>     At that time, the perf_pmu_sched_task() is outside of
>> perf_event_context_sched_in/out. It means that perf has to double
>> perf_pmu_disable() for per-task event.
> 
>> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
>> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
>> sched_cb_list. Yes, the code is very similar to the original codes, but it
>> is actually the new code for per-CPU events. The optimization for per-task
>> events is still kept.
>>    For the case, which has both a CPU context and a task context, yes, the
>> __perf_pmu_sched_task() in this patch is not invoked. Because the
>> sched_task() only need to be invoked once in a context switch. The
>> sched_task() will be eventually invoked in the task context.
> 
> The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB and
> only set that for large pebs. Are you sure the other users (Intel LBR
> and PowerPC BHRB) don't need it?

I didn't set it for LBR, because perf_sched_events is always enabled 
for LBR. But, yes, we should explicitly set PERF_ATTACH_SCHED_CB 
for LBR as well:

	if (has_branch_stack(event))
		inc = true;

> 
> If they indeed do not require the pmu::sched_task() callback for CPU
> events, then I still think the whole perf_sched_cb_{inc,dec}() interface

No, LBR requires the pmu::sched_task() callback for CPU events.

Now, the LBR registers have to be reset on sched in even for CPU events.

To fix the shorter LBR call stack issue for CPU events, we also need to 
save/restore the LBRs in pmu::sched_task():
https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/

> is confusing at best.
> 
> Can't we do something like this instead?
> 
I think the patch below may have two issues:
- PERF_ATTACH_SCHED_CB is now required for LBR (and maybe PowerPC BHRB as well).
- We may disable large PEBS later if not all PEBS events support 
large PEBS. The PMU needs a way to notify the generic code to decrease 
nr_sched_task.

The current related code in Intel PMU is as below.

static void
pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
		  struct perf_event *event, bool add)
{
	struct pmu *pmu = event->ctx->pmu;
	/*
	 * Make sure we get updated with the first PEBS
	 * event. It will trigger also during removal, but
	 * that does not hurt:
	 */
	bool update = cpuc->n_pebs == 1;

	if (needed_cb != pebs_needs_sched_cb(cpuc)) {
		if (!needed_cb)
			perf_sched_cb_inc(pmu);
		else
			perf_sched_cb_dec(pmu);

		update = true;
	}

Thanks,
Kan

> ---
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 546cc89217bb..672d6f039fce 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3565,8 +3565,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
>   		if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
>   			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
>   			if (!(event->attr.sample_type &
> -			      ~intel_pmu_large_pebs_flags(event)))
> +			      ~intel_pmu_large_pebs_flags(event))) {
>   				event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
> +				event->attach_state |= PERF_ATTACH_SCHED_CB;
> +			}
>   		}
>   		if (x86_pmu.pebs_aliases)
>   			x86_pmu.pebs_aliases(event);
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 9a38f579bc76..af9ee539c179 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -606,6 +606,7 @@ struct swevent_hlist {
>   #define PERF_ATTACH_TASK	0x04
>   #define PERF_ATTACH_TASK_DATA	0x08
>   #define PERF_ATTACH_ITRACE	0x10
> +#define PERF_ATTACH_SCHED_CB	0x20
>   
>   struct perf_cgroup;
>   struct perf_buffer;
> @@ -817,6 +818,7 @@ struct perf_event_context {
>   	int				is_active;
>   	int				nr_stat;
>   	int				nr_freq;
> +	int				nr_sched_task;
>   	int				rotate_disable;
>   	/*
>   	 * Set when nr_events != nr_active, except tolerant to events not
> @@ -872,7 +874,7 @@ struct perf_cpu_context {
>   	struct list_head		cgrp_cpuctx_entry;
>   #endif
>   
> -	int				sched_cb_usage;
> +	struct list_head		sched_cb_entry;
>   
>   	int				online;
>   	/*
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index d2f3ca792936..0a5dfed6bb46 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -384,6 +384,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
>   static atomic_t perf_sched_count;
>   
>   static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
> +static DEFINE_PER_CPU(int, perf_sched_cb_usage);
>   static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
>   
>   static atomic_t nr_mmap_events __read_mostly;
> @@ -2292,6 +2293,12 @@ event_sched_out(struct perf_event *event,
>   		perf_event_ctx_deactivate(ctx);
>   	if (event->attr.freq && event->attr.sample_freq)
>   		ctx->nr_freq--;
> +	if (event->attach_state & PERF_ATTACH_SCHED_CB) {
> +		if (!--ctx->nr_sched_task && &cpuctx->ctx == ctx) {
> +			list_del(&cpuctx->sched_cb_entry);
> +			this_cpu_dec(perf_sched_cb_usage);
> +		}
> +	}
>   	if (event->attr.exclusive || !cpuctx->active_oncpu)
>   		cpuctx->exclusive = 0;
>   
> @@ -2564,6 +2571,12 @@ event_sched_in(struct perf_event *event,
>   		perf_event_ctx_activate(ctx);
>   	if (event->attr.freq && event->attr.sample_freq)
>   		ctx->nr_freq++;
> +	if (event->attach_state & PERF_ATTACH_SCHED_CB) {
> +		if (!ctx->nr_sched_task++ && &cpuctx->ctx == ctx) {
> +			list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> +			this_cpu_inc(perf_sched_cb_usage);
> +		}
> +	}
>   
>   	if (event->attr.exclusive)
>   		cpuctx->exclusive = 1;
> @@ -3424,7 +3437,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
>   
>   			perf_pmu_disable(pmu);
>   
> -			if (cpuctx->sched_cb_usage && pmu->sched_task)
> +			if (ctx->nr_sched_task)
>   				pmu->sched_task(ctx, false);
>   
>   			/*
> @@ -3464,7 +3477,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
>   		raw_spin_lock(&ctx->lock);
>   		perf_pmu_disable(pmu);
>   
> -		if (cpuctx->sched_cb_usage && pmu->sched_task)
> +		if (ctx->nr_sched_task)
>   			pmu->sched_task(ctx, false);
>   		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
>   
> @@ -3473,20 +3486,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
>   	}
>   }
>   
> -void perf_sched_cb_dec(struct pmu *pmu)
> -{
> -	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> -
> -	--cpuctx->sched_cb_usage;
> -}
> -
> -
> -void perf_sched_cb_inc(struct pmu *pmu)
> -{
> -	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> -
> -	cpuctx->sched_cb_usage++;
> -}
> +static DEFINE_PER_CPU(struct list_head, sched_cb_list);
>   
>   /*
>    * This function provides the context switch callback to the lower code
> @@ -3514,6 +3514,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
>   	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>   }
>   
> +static void perf_pmu_sched_task(struct task_struct *prev,
> +				struct task_struct *next,
> +				bool sched_in)
> +{
> +	struct perf_cpu_context *cpuctx;
> +
> +	if (prev == next)
> +		return;
> +
> +	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
> +		/* will be handled in perf_event_context_sched_in/out */
> +		if (cpuctx->task_ctx)
> +			continue;
> +
> +		__perf_pmu_sched_task(cpuctx, sched_in);
> +	}
> +}
> +
>   static void perf_event_switch(struct task_struct *task,
>   			      struct task_struct *next_prev, bool sched_in);
>   
> @@ -3536,6 +3554,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
>   {
>   	int ctxn;
>   
> +	if (__this_cpu_read(perf_sched_cb_usage))
> +		perf_pmu_sched_task(task, next, false);
> +
>   	if (atomic_read(&nr_switch_events))
>   		perf_event_switch(task, next, false);
>   
> @@ -3772,7 +3793,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>   
>   	cpuctx = __get_cpu_context(ctx);
>   	if (cpuctx->task_ctx == ctx) {
> -		if (cpuctx->sched_cb_usage)
> +		if (ctx->nr_sched_task)
>   			__perf_pmu_sched_task(cpuctx, true);
>   		return;
>   	}
> @@ -3798,8 +3819,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>   		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>   	perf_event_sched_in(cpuctx, ctx, task);
>   
> -	if (cpuctx->sched_cb_usage && pmu->sched_task)
> -		pmu->sched_task(cpuctx->task_ctx, true);
> +	if (ctx->nr_sched_task)
> +		pmu->sched_task(ctx, true);
>   
>   	perf_pmu_enable(pmu);
>   
> @@ -3844,6 +3865,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
>   
>   	if (atomic_read(&nr_switch_events))
>   		perf_event_switch(task, prev, true);
> +
> +	if (__this_cpu_read(perf_sched_cb_usage))
> +		perf_pmu_sched_task(prev, task, true);
>   }
>   
>   static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
> @@ -4668,7 +4692,7 @@ static void unaccount_event(struct perf_event *event)
>   	if (event->parent)
>   		return;
>   
> -	if (event->attach_state & PERF_ATTACH_TASK)
> +	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
>   		dec = true;
>   	if (event->attr.mmap || event->attr.mmap_data)
>   		atomic_dec(&nr_mmap_events);
> @@ -11195,7 +11219,7 @@ static void account_event(struct perf_event *event)
>   	if (event->parent)
>   		return;
>   
> -	if (event->attach_state & PERF_ATTACH_TASK)
> +	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
>   		inc = true;
>   	if (event->attr.mmap || event->attr.mmap_data)
>   		atomic_inc(&nr_mmap_events);
> @@ -12987,6 +13011,7 @@ static void __init perf_event_init_all_cpus(void)
>   #ifdef CONFIG_CGROUP_PERF
>   		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
>   #endif
> +		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
>   	}
>   }
>   
> 


* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-11 19:54         ` Liang, Kan
@ 2020-11-17  5:01           ` Namhyung Kim
  2020-11-20 11:24             ` Namhyung Kim
  0 siblings, 1 reply; 18+ messages in thread
From: Namhyung Kim @ 2020-11-17  5:01 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Stephane Eranian,
	Ian Rogers, Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa,
	Andi Kleen

Hello,

On Thu, Nov 12, 2020 at 4:54 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>
>
>
> On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
> > On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
> >
> >> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
> >> be invoked to flush the PEBS buffer in each context switch. However, The
> >> perf_sched_events in account_event() is not updated accordingly. The
> >> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
> >> per-task event works.
> >>     At that time, the perf_pmu_sched_task() is outside of
> >> perf_event_context_sched_in/out. It means that perf has to double
> >> perf_pmu_disable() for per-task event.
> >
> >> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
> >> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
> >> sched_cb_list. Yes, the code is very similar to the original codes, but it
> >> is actually the new code for per-CPU events. The optimization for per-task
> >> events is still kept.
> >>    For the case, which has both a CPU context and a task context, yes, the
> >> __perf_pmu_sched_task() in this patch is not invoked. Because the
> >> sched_task() only need to be invoked once in a context switch. The
> >> sched_task() will be eventually invoked in the task context.
> >
> > The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB and
> > only set that for large pebs. Are you sure the other users (Intel LBR
> > and PowerPC BHRB) don't need it?
>
> I didn't set it for LBR, because the perf_sched_events is always enabled
> for LBR. But, yes, we should explicitly set the PERF_ATTACH_SCHED_CB
> for LBR.
>
>         if (has_branch_stack(event))
>                 inc = true;
>
> >
> > If they indeed do not require the pmu::sched_task() callback for CPU
> > events, then I still think the whole perf_sched_cb_{inc,dec}() interface
>
> No, LBR requires the pmu::sched_task() callback for CPU events.
>
> Now, The LBR registers have to be reset in sched in even for CPU events.
>
> To fix the shorter LBR callstack issue for CPU events, we also need to
> save/restore LBRs in pmu::sched_task().
> https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/
>
> > is confusing at best.
> >
> > Can't we do something like this instead?
> >
> I think the below patch may have two issues.
> - PERF_ATTACH_SCHED_CB is required for LBR (maybe PowerPC BHRB as well) now.
> - We may disable the large PEBS later if not all PEBS events support
> large PEBS. The PMU need a way to notify the generic code to decrease
> the nr_sched_task.

Any updates on this?  I've reviewed and tested Kan's patches
and they all look good.

Maybe we can talk to PPC folks to confirm the BHRB case?

Thanks,
Namhyung

* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-17  5:01           ` Namhyung Kim
@ 2020-11-20 11:24             ` Namhyung Kim
  2020-11-23 11:00               ` Michael Ellerman
  0 siblings, 1 reply; 18+ messages in thread
From: Namhyung Kim @ 2020-11-20 11:24 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Stephane Eranian,
	Ian Rogers, Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa,
	Andi Kleen, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, linuxppc-dev

Hi Peter and Kan,

(Adding PPC folks)

On Tue, Nov 17, 2020 at 2:01 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello,
>
> On Thu, Nov 12, 2020 at 4:54 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >
> >
> >
> > On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
> > > On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
> > >
> > >> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
> > >> be invoked to flush the PEBS buffer in each context switch. However, The
> > >> perf_sched_events in account_event() is not updated accordingly. The
> > >> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
> > >> per-task event works.
> > >>     At that time, the perf_pmu_sched_task() is outside of
> > >> perf_event_context_sched_in/out. It means that perf has to double
> > >> perf_pmu_disable() for per-task event.
> > >
> > >> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
> > >> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
> > >> sched_cb_list. Yes, the code is very similar to the original codes, but it
> > >> is actually the new code for per-CPU events. The optimization for per-task
> > >> events is still kept.
> > >>    For the case, which has both a CPU context and a task context, yes, the
> > >> __perf_pmu_sched_task() in this patch is not invoked. Because the
> > >> sched_task() only need to be invoked once in a context switch. The
> > >> sched_task() will be eventually invoked in the task context.
> > >
> > > The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB and
> > > only set that for large pebs. Are you sure the other users (Intel LBR
> > > and PowerPC BHRB) don't need it?
> >
> > I didn't set it for LBR, because the perf_sched_events is always enabled
> > for LBR. But, yes, we should explicitly set the PERF_ATTACH_SCHED_CB
> > for LBR.
> >
> >         if (has_branch_stack(event))
> >                 inc = true;
> >
> > >
> > > If they indeed do not require the pmu::sched_task() callback for CPU
> > > events, then I still think the whole perf_sched_cb_{inc,dec}() interface
> >
> > No, LBR requires the pmu::sched_task() callback for CPU events.
> >
> > Now, The LBR registers have to be reset in sched in even for CPU events.
> >
> > To fix the shorter LBR callstack issue for CPU events, we also need to
> > save/restore LBRs in pmu::sched_task().
> > https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/
> >
> > > is confusing at best.
> > >
> > > Can't we do something like this instead?
> > >
> > I think the below patch may have two issues.
> > - PERF_ATTACH_SCHED_CB is required for LBR (maybe PowerPC BHRB as well) now.
> > - We may disable the large PEBS later if not all PEBS events support
> > large PEBS. The PMU need a way to notify the generic code to decrease
> > the nr_sched_task.
>
> Any updates on this?  I've reviewed and tested Kan's patches
> and they all look good.
>
> Maybe we can talk to PPC folks to confirm the BHRB case?

Can we move this forward?  I saw that patch 3/3 also adds PERF_ATTACH_SCHED_CB
for PowerPC.  But it'd be nice if the ppc folks could confirm the change.

Thanks,
Namhyung

* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-20 11:24             ` Namhyung Kim
@ 2020-11-23 11:00               ` Michael Ellerman
  2020-11-24  4:51                 ` Namhyung Kim
  0 siblings, 1 reply; 18+ messages in thread
From: Michael Ellerman @ 2020-11-23 11:00 UTC (permalink / raw)
  To: Namhyung Kim, Liang, Kan
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Stephane Eranian,
	Ian Rogers, Gabriel Marin, Arnaldo Carvalho de Melo, Jiri Olsa,
	Andi Kleen, Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev

Namhyung Kim <namhyung@kernel.org> writes:
> Hi Peter and Kan,
>
> (Adding PPC folks)
>
> On Tue, Nov 17, 2020 at 2:01 PM Namhyung Kim <namhyung@kernel.org> wrote:
>>
>> Hello,
>>
>> On Thu, Nov 12, 2020 at 4:54 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>> >
>> >
>> >
>> > On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
>> > > On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
>> > >
>> > >> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
>> > >> be invoked to flush the PEBS buffer in each context switch. However, The
>> > >> perf_sched_events in account_event() is not updated accordingly. The
>> > >> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
>> > >> per-task event works.
>> > >>     At that time, the perf_pmu_sched_task() is outside of
>> > >> perf_event_context_sched_in/out. It means that perf has to double
>> > >> perf_pmu_disable() for per-task event.
>> > >
>> > >> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
>> > >> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
>> > >> sched_cb_list. Yes, the code is very similar to the original codes, but it
>> > >> is actually the new code for per-CPU events. The optimization for per-task
>> > >> events is still kept.
>> > >>    For the case, which has both a CPU context and a task context, yes, the
>> > >> __perf_pmu_sched_task() in this patch is not invoked. Because the
>> > >> sched_task() only need to be invoked once in a context switch. The
>> > >> sched_task() will be eventually invoked in the task context.
>> > >
>> > > The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB and
>> > > only set that for large pebs. Are you sure the other users (Intel LBR
>> > > and PowerPC BHRB) don't need it?
>> >
>> > I didn't set it for LBR, because the perf_sched_events is always enabled
>> > for LBR. But, yes, we should explicitly set the PERF_ATTACH_SCHED_CB
>> > for LBR.
>> >
>> >         if (has_branch_stack(event))
>> >                 inc = true;
>> >
>> > >
>> > > If they indeed do not require the pmu::sched_task() callback for CPU
>> > > events, then I still think the whole perf_sched_cb_{inc,dec}() interface
>> >
>> > No, LBR requires the pmu::sched_task() callback for CPU events.
>> >
>> > Now, The LBR registers have to be reset in sched in even for CPU events.
>> >
>> > To fix the shorter LBR callstack issue for CPU events, we also need to
>> > save/restore LBRs in pmu::sched_task().
>> > https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/
>> >
>> > > is confusing at best.
>> > >
>> > > Can't we do something like this instead?
>> > >
>> > I think the below patch may have two issues.
>> > - PERF_ATTACH_SCHED_CB is required for LBR (maybe PowerPC BHRB as well) now.
>> > - We may disable the large PEBS later if not all PEBS events support
>> > large PEBS. The PMU need a way to notify the generic code to decrease
>> > the nr_sched_task.
>>
>> Any updates on this?  I've reviewed and tested Kan's patches
>> and they all look good.
>>
>> Maybe we can talk to PPC folks to confirm the BHRB case?
>
> Can we move this forward?  I saw patch 3/3 also adds PERF_ATTACH_SCHED_CB
> for PowerPC too.  But it'd be nice if ppc folks can confirm the change.

Sorry I've read the whole thread, but I'm still not entirely sure I
understand the question.

cheers

* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-23 11:00               ` Michael Ellerman
@ 2020-11-24  4:51                 ` Namhyung Kim
  2020-11-24  5:42                   ` Madhavan Srinivasan
  2020-11-25  8:12                   ` Michael Ellerman
  0 siblings, 2 replies; 18+ messages in thread
From: Namhyung Kim @ 2020-11-24  4:51 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Liang, Kan, Peter Zijlstra, Ingo Molnar, linux-kernel,
	Stephane Eranian, Ian Rogers, Gabriel Marin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Andi Kleen,
	Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev

Hello,

On Mon, Nov 23, 2020 at 8:00 PM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Namhyung Kim <namhyung@kernel.org> writes:
> > Hi Peter and Kan,
> >
> > (Adding PPC folks)
> >
> > On Tue, Nov 17, 2020 at 2:01 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >>
> >> Hello,
> >>
> >> On Thu, Nov 12, 2020 at 4:54 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
> >> >
> >> >
> >> >
> >> > On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
> >> > > On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
> >> > >
> >> > >> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
> >> > >> be invoked to flush the PEBS buffer in each context switch. However, The
> >> > >> perf_sched_events in account_event() is not updated accordingly. The
> >> > >> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
> >> > >> per-task event works.
> >> > >>     At that time, the perf_pmu_sched_task() is outside of
> >> > >> perf_event_context_sched_in/out. It means that perf has to double
> >> > >> perf_pmu_disable() for per-task event.
> >> > >
> >> > >> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
> >> > >> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
> >> > >> sched_cb_list. Yes, the code is very similar to the original codes, but it
> >> > >> is actually the new code for per-CPU events. The optimization for per-task
> >> > >> events is still kept.
> >> > >>    For the case, which has both a CPU context and a task context, yes, the
> >> > >> __perf_pmu_sched_task() in this patch is not invoked. Because the
> >> > >> sched_task() only need to be invoked once in a context switch. The
> >> > >> sched_task() will be eventually invoked in the task context.
> >> > >
> >> > > The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB and
> >> > > only set that for large pebs. Are you sure the other users (Intel LBR
> >> > > and PowerPC BHRB) don't need it?
> >> >
> >> > I didn't set it for LBR, because the perf_sched_events is always enabled
> >> > for LBR. But, yes, we should explicitly set the PERF_ATTACH_SCHED_CB
> >> > for LBR.
> >> >
> >> >         if (has_branch_stack(event))
> >> >                 inc = true;
> >> >
> >> > >
> >> > > If they indeed do not require the pmu::sched_task() callback for CPU
> >> > > events, then I still think the whole perf_sched_cb_{inc,dec}() interface
> >> >
> >> > No, LBR requires the pmu::sched_task() callback for CPU events.
> >> >
> >> > Now, The LBR registers have to be reset in sched in even for CPU events.
> >> >
> >> > To fix the shorter LBR callstack issue for CPU events, we also need to
> >> > save/restore LBRs in pmu::sched_task().
> >> > https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/
> >> >
> >> > > is confusing at best.
> >> > >
> >> > > Can't we do something like this instead?
> >> > >
> >> > I think the below patch may have two issues.
> >> > - PERF_ATTACH_SCHED_CB is required for LBR (maybe PowerPC BHRB as well) now.
> >> > - We may disable the large PEBS later if not all PEBS events support
> >> > large PEBS. The PMU need a way to notify the generic code to decrease
> >> > the nr_sched_task.
> >>
> >> Any updates on this?  I've reviewed and tested Kan's patches
> >> and they all look good.
> >>
> >> Maybe we can talk to PPC folks to confirm the BHRB case?
> >
> > Can we move this forward?  I saw patch 3/3 also adds PERF_ATTACH_SCHED_CB
> > for PowerPC too.  But it'd be nice if ppc folks can confirm the change.
>
> Sorry I've read the whole thread, but I'm still not entirely sure I
> understand the question.

Thanks for your time and sorry about not being clear enough.

We found that per-CPU events are not calling pmu::sched_task()
on context switches.  So PERF_ATTACH_SCHED_CB was
added to indicate to the core logic that it needs to
invoke the callback.
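
For reference, on the driver side this boils down to something like the
sketch below (the helper and function names here are placeholders, the
real hunks are in patches 2/3 and 3/3):

	/* Sketch: tag an event whose PMU buffer must be flushed on a
	 * context switch, so the core keeps the sched_task() callback
	 * armed even for per-CPU events. */
	static int my_pmu_event_init(struct perf_event *event)
	{
		if (event_needs_ctxsw_flush(event))	/* e.g. large PEBS, BHRB */
			event->attach_state |= PERF_ATTACH_SCHED_CB;

		return 0;
	}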

Patch 3/3 added the flag to PPC (for BHRB) along with other
changes (I think it should be split out, like in patch 2/3), and
we want to get ACKs from the PPC folks.

Thanks,
Namhyung

* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-24  4:51                 ` Namhyung Kim
@ 2020-11-24  5:42                   ` Madhavan Srinivasan
  2020-11-24 16:04                     ` Liang, Kan
  2020-11-25  8:12                   ` Michael Ellerman
  1 sibling, 1 reply; 18+ messages in thread
From: Madhavan Srinivasan @ 2020-11-24  5:42 UTC (permalink / raw)
  To: Namhyung Kim, Michael Ellerman
  Cc: Ian Rogers, Andi Kleen, Peter Zijlstra, linuxppc-dev,
	linux-kernel, Stephane Eranian, Paul Mackerras,
	Arnaldo Carvalho de Melo, Jiri Olsa, Ingo Molnar, Gabriel Marin,
	Liang, Kan


On 11/24/20 10:21 AM, Namhyung Kim wrote:
> Hello,
>
> On Mon, Nov 23, 2020 at 8:00 PM Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Namhyung Kim <namhyung@kernel.org> writes:
>>> Hi Peter and Kan,
>>>
>>> (Adding PPC folks)
>>>
>>> On Tue, Nov 17, 2020 at 2:01 PM Namhyung Kim <namhyung@kernel.org> wrote:
>>>> Hello,
>>>>
>>>> On Thu, Nov 12, 2020 at 4:54 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>>>>>
>>>>>
>>>>> On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
>>>>>> On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
>>>>>>
>>>>>>> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
>>>>>>> be invoked to flush the PEBS buffer in each context switch. However, The
>>>>>>> perf_sched_events in account_event() is not updated accordingly. The
>>>>>>> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
>>>>>>> per-task event works.
>>>>>>>      At that time, the perf_pmu_sched_task() is outside of
>>>>>>> perf_event_context_sched_in/out. It means that perf has to double
>>>>>>> perf_pmu_disable() for per-task event.
>>>>>>> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
>>>>>>> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
>>>>>>> sched_cb_list. Yes, the code is very similar to the original codes, but it
>>>>>>> is actually the new code for per-CPU events. The optimization for per-task
>>>>>>> events is still kept.
>>>>>>>     For the case, which has both a CPU context and a task context, yes, the
>>>>>>> __perf_pmu_sched_task() in this patch is not invoked. Because the
>>>>>>> sched_task() only need to be invoked once in a context switch. The
>>>>>>> sched_task() will be eventually invoked in the task context.
>>>>>> The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB and
>>>>>> only set that for large pebs. Are you sure the other users (Intel LBR
>>>>>> and PowerPC BHRB) don't need it?
>>>>> I didn't set it for LBR, because the perf_sched_events is always enabled
>>>>> for LBR. But, yes, we should explicitly set the PERF_ATTACH_SCHED_CB
>>>>> for LBR.
>>>>>
>>>>>          if (has_branch_stack(event))
>>>>>                  inc = true;
>>>>>
>>>>>> If they indeed do not require the pmu::sched_task() callback for CPU
>>>>>> events, then I still think the whole perf_sched_cb_{inc,dec}() interface
>>>>> No, LBR requires the pmu::sched_task() callback for CPU events.
>>>>>
>>>>> Now, The LBR registers have to be reset in sched in even for CPU events.
>>>>>
>>>>> To fix the shorter LBR callstack issue for CPU events, we also need to
>>>>> save/restore LBRs in pmu::sched_task().
>>>>> https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/
>>>>>
>>>>>> is confusing at best.
>>>>>>
>>>>>> Can't we do something like this instead?
>>>>>>
>>>>> I think the below patch may have two issues.
>>>>> - PERF_ATTACH_SCHED_CB is required for LBR (maybe PowerPC BHRB as well) now.
>>>>> - We may disable the large PEBS later if not all PEBS events support
>>>>> large PEBS. The PMU need a way to notify the generic code to decrease
>>>>> the nr_sched_task.
>>>> Any updates on this?  I've reviewed and tested Kan's patches
>>>> and they all look good.
>>>>
>>>> Maybe we can talk to PPC folks to confirm the BHRB case?
>>> Can we move this forward?  I saw patch 3/3 also adds PERF_ATTACH_SCHED_CB
>>> for PowerPC too.  But it'd be nice if ppc folks can confirm the change.
>> Sorry I've read the whole thread, but I'm still not entirely sure I
>> understand the question.
> Thanks for your time and sorry about not being clear enough.
>
> We found per-cpu events are not calling pmu::sched_task()
> on context switches.  So PERF_ATTACH_SCHED_CB was
> added to indicate the core logic that it needs to invoke the
> callback.
>
> The patch 3/3 added the flag to PPC (for BHRB) with other
> changes (I think it should be split like in the patch 2/3) and
> want to get ACKs from the PPC folks.

Sorry for the delay.

First, I guess it will be better to split the ppc change into a separate patch.

Secondly, we are missing the changes needed in power_pmu_bhrb_disable(),
where perf_sched_cb_dec() needs the "state" to be included.


Maddy


>
> Thanks,
> Namhyung

* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-24  5:42                   ` Madhavan Srinivasan
@ 2020-11-24 16:04                     ` Liang, Kan
  0 siblings, 0 replies; 18+ messages in thread
From: Liang, Kan @ 2020-11-24 16:04 UTC (permalink / raw)
  To: Madhavan Srinivasan, Namhyung Kim, Michael Ellerman
  Cc: Ian Rogers, Andi Kleen, Peter Zijlstra, linuxppc-dev,
	linux-kernel, Stephane Eranian, Paul Mackerras,
	Arnaldo Carvalho de Melo, Jiri Olsa, Ingo Molnar, Gabriel Marin



On 11/24/2020 12:42 AM, Madhavan Srinivasan wrote:
> 
> On 11/24/20 10:21 AM, Namhyung Kim wrote:
>> Hello,
>>
>> On Mon, Nov 23, 2020 at 8:00 PM Michael Ellerman <mpe@ellerman.id.au> 
>> wrote:
>>> Namhyung Kim <namhyung@kernel.org> writes:
>>>> Hi Peter and Kan,
>>>>
>>>> (Adding PPC folks)
>>>>
>>>> On Tue, Nov 17, 2020 at 2:01 PM Namhyung Kim <namhyung@kernel.org> 
>>>> wrote:
>>>>> Hello,
>>>>>
>>>>> On Thu, Nov 12, 2020 at 4:54 AM Liang, Kan 
>>>>> <kan.liang@linux.intel.com> wrote:
>>>>>>
>>>>>>
>>>>>> On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
>>>>>>> On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
>>>>>>>
>>>>>>>> - When the large PEBS was introduced (9c964efa4330), the 
>>>>>>>> sched_task() should
>>>>>>>> be invoked to flush the PEBS buffer in each context switch. 
>>>>>>>> However, The
>>>>>>>> perf_sched_events in account_event() is not updated accordingly. 
>>>>>>>> The
>>>>>>>> perf_event_task_sched_* never be invoked for a pure per-CPU 
>>>>>>>> context. Only
>>>>>>>> per-task event works.
>>>>>>>>      At that time, the perf_pmu_sched_task() is outside of
>>>>>>>> perf_event_context_sched_in/out. It means that perf has to double
>>>>>>>> perf_pmu_disable() for per-task event.
>>>>>>>> - The patch 1 tries to fix broken per-CPU events. The CPU 
>>>>>>>> context cannot be
>>>>>>>> retrieved from the task->perf_event_ctxp. So it has to be 
>>>>>>>> tracked in the
>>>>>>>> sched_cb_list. Yes, the code is very similar to the original 
>>>>>>>> codes, but it
>>>>>>>> is actually the new code for per-CPU events. The optimization 
>>>>>>>> for per-task
>>>>>>>> events is still kept.
>>>>>>>>     For the case, which has both a CPU context and a task 
>>>>>>>> context, yes, the
>>>>>>>> __perf_pmu_sched_task() in this patch is not invoked. Because the
>>>>>>>> sched_task() only need to be invoked once in a context switch. The
>>>>>>>> sched_task() will be eventually invoked in the task context.
>>>>>>> The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB 
>>>>>>> and
>>>>>>> only set that for large pebs. Are you sure the other users (Intel 
>>>>>>> LBR
>>>>>>> and PowerPC BHRB) don't need it?
>>>>>> I didn't set it for LBR, because the perf_sched_events is always 
>>>>>> enabled
>>>>>> for LBR. But, yes, we should explicitly set the PERF_ATTACH_SCHED_CB
>>>>>> for LBR.
>>>>>>
>>>>>>          if (has_branch_stack(event))
>>>>>>                  inc = true;
>>>>>>
>>>>>>> If they indeed do not require the pmu::sched_task() callback for CPU
>>>>>>> events, then I still think the whole perf_sched_cb_{inc,dec}() 
>>>>>>> interface
>>>>>> No, LBR requires the pmu::sched_task() callback for CPU events.
>>>>>>
>>>>>> Now, The LBR registers have to be reset in sched in even for CPU 
>>>>>> events.
>>>>>>
>>>>>> To fix the shorter LBR callstack issue for CPU events, we also 
>>>>>> need to
>>>>>> save/restore LBRs in pmu::sched_task().
>>>>>> https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/ 
>>>>>>
>>>>>>
>>>>>>> is confusing at best.
>>>>>>>
>>>>>>> Can't we do something like this instead?
>>>>>>>
>>>>>> I think the below patch may have two issues.
>>>>>> - PERF_ATTACH_SCHED_CB is required for LBR (maybe PowerPC BHRB as 
>>>>>> well) now.
>>>>>> - We may disable the large PEBS later if not all PEBS events support
>>>>>> large PEBS. The PMU need a way to notify the generic code to decrease
>>>>>> the nr_sched_task.
>>>>> Any updates on this?  I've reviewed and tested Kan's patches
>>>>> and they all look good.
>>>>>
>>>>> Maybe we can talk to PPC folks to confirm the BHRB case?
>>>> Can we move this forward?  I saw patch 3/3 also adds 
>>>> PERF_ATTACH_SCHED_CB
>>>> for PowerPC too.  But it'd be nice if ppc folks can confirm the change.
>>> Sorry I've read the whole thread, but I'm still not entirely sure I
>>> understand the question.
>> Thanks for your time and sorry about not being clear enough.
>>
>> We found per-cpu events are not calling pmu::sched_task()
>> on context switches.  So PERF_ATTACH_SCHED_CB was
>> added to indicate the core logic that it needs to invoke the
>> callback.
>>
>> The patch 3/3 added the flag to PPC (for BHRB) with other
>> changes (I think it should be split like in the patch 2/3) and
>> want to get ACKs from the PPC folks.
> 
> Sorry for delay.
> 
> I guess first it will be better to split the ppc change to a separate 
> patch,

Both PPC and X86 invoke perf_sched_cb_inc() directly, and the patch
changes the parameters of perf_sched_cb_inc(). I think we have to
update the PPC and X86 code together. Otherwise, there will be a
compile error if someone applies the change to perf_sched_cb_inc()
but forgets to apply the corresponding changes in the PPC or X86
specific code.
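
For reference, the interface change in question looks roughly like this
(the state flag names are taken from the BHRB hunk below, so treat these
prototypes as a sketch rather than the exact patch 3/3 declarations):

	void perf_sched_cb_inc(struct pmu *pmu, int state);
	void perf_sched_cb_dec(struct pmu *pmu, int state);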

> 
> secondly, we are missing the changes needed in the power_pmu_bhrb_disable()
> 
> where perf_sched_cb_dec() needs the "state" to be included.
> 

Ah, right. The below patch should fix the issue.

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index bced502f64a1..6756d1602a67 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -391,13 +391,18 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
  static void power_pmu_bhrb_disable(struct perf_event *event)
  {
  	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+	int state = PERF_SCHED_CB_SW_IN;

  	if (!ppmu->bhrb_nr)
  		return;

  	WARN_ON_ONCE(!cpuhw->bhrb_users);
  	cpuhw->bhrb_users--;
-	perf_sched_cb_dec(event->ctx->pmu);
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		state |= PERF_SCHED_CB_CPU;
+
+	perf_sched_cb_dec(event->ctx->pmu, state);

  	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
  		/* BHRB cannot be turned off when other
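
For reference, this pairs with the enable side, which passes the same
state via perf_sched_cb_inc(). Roughly, just the relevant lines (a
sketch, not the exact patch 3/3 hunk):

	/* in power_pmu_bhrb_enable(), surrounding code elided */
	int state = PERF_SCHED_CB_SW_IN;

	if (!(event->attach_state & PERF_ATTACH_TASK))
		state |= PERF_SCHED_CB_CPU;

	perf_sched_cb_inc(event->ctx->pmu, state);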



Thanks,
Kan

* Re: [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events
  2020-11-24  4:51                 ` Namhyung Kim
  2020-11-24  5:42                   ` Madhavan Srinivasan
@ 2020-11-25  8:12                   ` Michael Ellerman
  1 sibling, 0 replies; 18+ messages in thread
From: Michael Ellerman @ 2020-11-25  8:12 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Liang, Kan, Peter Zijlstra, Ingo Molnar, linux-kernel,
	Stephane Eranian, Ian Rogers, Gabriel Marin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Andi Kleen,
	Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev

Namhyung Kim <namhyung@kernel.org> writes:
> Hello,
>
> On Mon, Nov 23, 2020 at 8:00 PM Michael Ellerman <mpe@ellerman.id.au> wrote:
>>
>> Namhyung Kim <namhyung@kernel.org> writes:
>> > Hi Peter and Kan,
>> >
>> > (Adding PPC folks)
>> >
>> > On Tue, Nov 17, 2020 at 2:01 PM Namhyung Kim <namhyung@kernel.org> wrote:
>> >>
>> >> Hello,
>> >>
>> >> On Thu, Nov 12, 2020 at 4:54 AM Liang, Kan <kan.liang@linux.intel.com> wrote:
>> >> >
>> >> >
>> >> >
>> >> > On 11/11/2020 11:25 AM, Peter Zijlstra wrote:
>> >> > > On Mon, Nov 09, 2020 at 09:49:31AM -0500, Liang, Kan wrote:
>> >> > >
>> >> > >> - When the large PEBS was introduced (9c964efa4330), the sched_task() should
>> >> > >> be invoked to flush the PEBS buffer in each context switch. However, The
>> >> > >> perf_sched_events in account_event() is not updated accordingly. The
>> >> > >> perf_event_task_sched_* never be invoked for a pure per-CPU context. Only
>> >> > >> per-task event works.
>> >> > >>     At that time, the perf_pmu_sched_task() is outside of
>> >> > >> perf_event_context_sched_in/out. It means that perf has to double
>> >> > >> perf_pmu_disable() for per-task event.
>> >> > >
>> >> > >> - The patch 1 tries to fix broken per-CPU events. The CPU context cannot be
>> >> > >> retrieved from the task->perf_event_ctxp. So it has to be tracked in the
>> >> > >> sched_cb_list. Yes, the code is very similar to the original codes, but it
>> >> > >> is actually the new code for per-CPU events. The optimization for per-task
>> >> > >> events is still kept.
>> >> > >>    For the case, which has both a CPU context and a task context, yes, the
>> >> > >> __perf_pmu_sched_task() in this patch is not invoked. Because the
>> >> > >> sched_task() only need to be invoked once in a context switch. The
>> >> > >> sched_task() will be eventually invoked in the task context.
>> >> > >
>> >> > > The thing is; your first two patches rely on PERF_ATTACH_SCHED_CB and
>> >> > > only set that for large pebs. Are you sure the other users (Intel LBR
>> >> > > and PowerPC BHRB) don't need it?
>> >> >
>> >> > I didn't set it for LBR, because the perf_sched_events is always enabled
>> >> > for LBR. But, yes, we should explicitly set the PERF_ATTACH_SCHED_CB
>> >> > for LBR.
>> >> >
>> >> >         if (has_branch_stack(event))
>> >> >                 inc = true;
>> >> >
>> >> > >
>> >> > > If they indeed do not require the pmu::sched_task() callback for CPU
>> >> > > events, then I still think the whole perf_sched_cb_{inc,dec}() interface
>> >> >
>> >> > No, LBR requires the pmu::sched_task() callback for CPU events.
>> >> >
>> >> > Now, The LBR registers have to be reset in sched in even for CPU events.
>> >> >
>> >> > To fix the shorter LBR callstack issue for CPU events, we also need to
>> >> > save/restore LBRs in pmu::sched_task().
>> >> > https://lore.kernel.org/lkml/1578495789-95006-4-git-send-email-kan.liang@linux.intel.com/
>> >> >
>> >> > > is confusing at best.
>> >> > >
>> >> > > Can't we do something like this instead?
>> >> > >
>> >> > I think the below patch may have two issues.
>> >> > - PERF_ATTACH_SCHED_CB is required for LBR (maybe PowerPC BHRB as well) now.
>> >> > - We may disable the large PEBS later if not all PEBS events support
>> >> > large PEBS. The PMU need a way to notify the generic code to decrease
>> >> > the nr_sched_task.
>> >>
>> >> Any updates on this?  I've reviewed and tested Kan's patches
>> >> and they all look good.
>> >>
>> >> Maybe we can talk to PPC folks to confirm the BHRB case?
>> >
>> > Can we move this forward?  I saw patch 3/3 also adds PERF_ATTACH_SCHED_CB
>> > for PowerPC too.  But it'd be nice if ppc folks can confirm the change.
>>
>> Sorry I've read the whole thread, but I'm still not entirely sure I
>> understand the question.
>
> Thanks for your time and sorry about not being clear enough.
>
> We found per-cpu events are not calling pmu::sched_task()
> on context switches.  So PERF_ATTACH_SCHED_CB was
> added to indicate the core logic that it needs to invoke the
> callback.

OK. TBH I've never thought of using branch stack with a per-cpu event,
but I guess you can do it.

I think the same logic applies as for LBR: we need to read the BHRB entries
in the context of the task that they were recorded for.

> The patch 3/3 added the flag to PPC (for BHRB) with other
> changes (I think it should be split like in the patch 2/3) and
> want to get ACKs from the PPC folks.

If you post a new version with Maddy's comments addressed then he or I
can ack it.

cheers

Thread overview: 18+ messages
2020-11-06 21:29 [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events kan.liang
2020-11-06 21:29 ` [PATCH 2/3] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS kan.liang
2020-11-06 21:29 ` [PATCH 3/3] perf: Optimize sched_task() in a context switch kan.liang
2020-11-09  9:52 ` [PATCH 1/3] perf/core: Flush PMU internal buffers for per-CPU events Peter Zijlstra
2020-11-09 11:04   ` Peter Zijlstra
2020-11-09 14:49     ` Liang, Kan
2020-11-09 17:33       ` Peter Zijlstra
2020-11-09 19:52         ` Liang, Kan
2020-11-09 17:40       ` Peter Zijlstra
2020-11-11 16:25       ` Peter Zijlstra
2020-11-11 19:54         ` Liang, Kan
2020-11-17  5:01           ` Namhyung Kim
2020-11-20 11:24             ` Namhyung Kim
2020-11-23 11:00               ` Michael Ellerman
2020-11-24  4:51                 ` Namhyung Kim
2020-11-24  5:42                   ` Madhavan Srinivasan
2020-11-24 16:04                     ` Liang, Kan
2020-11-25  8:12                   ` Michael Ellerman
