linux-kernel.vger.kernel.org archive mirror
* [PATCH V2 1/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in()
@ 2020-08-21 19:57 kan.liang
  2020-08-21 19:57 ` [PATCH V2 2/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out() kan.liang
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: kan.liang @ 2020-08-21 19:57 UTC (permalink / raw)
  To: peterz, mingo, acme, linux-kernel; +Cc: ak, mark.rutland, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

The pmu::sched_task() is a context switch callback. It passes the
cpuctx->task_ctx as a parameter to the lower-level code. To find the
cpuctx->task_ctx, the current code iterates a cpuctx list.

The same context was just iterated in perf_event_context_sched_in(),
which is invoked right before the pmu::sched_task().

Reusing the cpuctx->task_ctx from perf_event_context_sched_in() avoids
the unnecessary iteration of the cpuctx list.

Both pmu::sched_task() and perf_event_context_sched_in() have to
disable the PMU. Pulling pmu::sched_task() into
perf_event_context_sched_in() also saves the overhead of an extra PMU
disable and re-enable.

The new and old tasks may have equivalent contexts. The current code
optimizes this case by swapping the contexts, which avoids rescheduling
the events. Even in this case, pmu::sched_task() is still required,
e.g., to restore the LBR content.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

New patch for V2

 kernel/events/core.c | 51 +++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0fbf17ff7cc5..c785cbadbdb8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3491,30 +3491,36 @@ void perf_sched_cb_inc(struct pmu *pmu)
  * PEBS requires this to provide PID/TID information. This requires we flush
  * all queued PEBS records before we context switch to a new task.
  */
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+{
+	struct pmu *pmu;
+
+	pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+
+	if (WARN_ON_ONCE(!pmu->sched_task))
+		return;
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+	perf_pmu_disable(pmu);
+
+	pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+	perf_pmu_enable(pmu);
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
 static void perf_pmu_sched_task(struct task_struct *prev,
 				struct task_struct *next,
 				bool sched_in)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
 
 	if (prev == next)
 		return;
 
-	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-		pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
-
-		if (WARN_ON_ONCE(!pmu->sched_task))
-			continue;
-
-		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-		perf_pmu_disable(pmu);
-
-		pmu->sched_task(cpuctx->task_ctx, sched_in);
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+		__perf_pmu_sched_task(cpuctx, sched_in);
 
-		perf_pmu_enable(pmu);
-		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-	}
 }
 
 static void perf_event_switch(struct task_struct *task,
@@ -3773,10 +3779,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 					struct task_struct *task)
 {
 	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu = ctx->pmu;
 
 	cpuctx = __get_cpu_context(ctx);
-	if (cpuctx->task_ctx == ctx)
+	if (cpuctx->task_ctx == ctx) {
+		if (cpuctx->sched_cb_usage)
+			__perf_pmu_sched_task(cpuctx, true);
 		return;
+	}
 
 	perf_ctx_lock(cpuctx, ctx);
 	/*
@@ -3786,7 +3796,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	if (!ctx->nr_events)
 		goto unlock;
 
-	perf_pmu_disable(ctx->pmu);
+	perf_pmu_disable(pmu);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -3798,7 +3808,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
-	perf_pmu_enable(ctx->pmu);
+
+	if (cpuctx->sched_cb_usage && pmu->sched_task)
+		pmu->sched_task(cpuctx->task_ctx, true);
+
+	perf_pmu_enable(pmu);
 
 unlock:
 	perf_ctx_unlock(cpuctx, ctx);
@@ -3841,9 +3855,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
-
-	if (__this_cpu_read(perf_sched_cb_usages))
-		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
-- 
2.17.1



* [PATCH V2 2/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()
  2020-08-21 19:57 [PATCH V2 1/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() kan.liang
@ 2020-08-21 19:57 ` kan.liang
  2020-09-11  7:02   ` [tip: perf/core] " tip-bot2 for Kan Liang
  2020-08-21 19:57 ` [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task kan.liang
  2020-09-11  7:02 ` [tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() tip-bot2 for Kan Liang
  2 siblings, 1 reply; 8+ messages in thread
From: kan.liang @ 2020-08-21 19:57 UTC (permalink / raw)
  To: peterz, mingo, acme, linux-kernel; +Cc: ak, mark.rutland, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

The pmu::sched_task() is a context switch callback. It passes the
cpuctx->task_ctx as a parameter to the lower-level code. To find the
cpuctx->task_ctx, the current code iterates a cpuctx list.
The same context will be iterated over in perf_event_context_sched_out()
soon after. Sharing the cpuctx->task_ctx avoids the unnecessary
iteration of the cpuctx list.

The pmu::sched_task() is also required for the optimization case for
equivalent contexts.

task_ctx_sched_out() will eventually disable and re-enable the PMU when
scheduling out events, so adding perf_pmu_disable() and
perf_pmu_enable() around task_ctx_sched_out() doesn't break anything.

Drop the cpuctx->ctx.lock for pmu::sched_task(). The lock is for the
per-CPU context and is not necessary for the per-task context switch.

No one uses sched_cb_entry, perf_sched_cb_usages, sched_cb_list, or
perf_pmu_sched_task() anymore.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

New patch for V2

 include/linux/perf_event.h |  1 -
 kernel/events/core.c       | 47 ++++++++++++++------------------------
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3737e653f47e..9357720df7de 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -872,7 +872,6 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
-	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
 
 	int				online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c785cbadbdb8..d0393d1970b7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -382,7 +382,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3384,10 +3383,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	struct perf_event_context *parent, *next_parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
+	struct pmu *pmu;
 
 	if (likely(!ctx))
 		return;
 
+	pmu = ctx->pmu;
 	cpuctx = __get_cpu_context(ctx);
 	if (!cpuctx->task_ctx)
 		return;
@@ -3417,11 +3418,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
-			struct pmu *pmu = ctx->pmu;
 
 			WRITE_ONCE(ctx->task, next);
 			WRITE_ONCE(next_ctx->task, task);
 
+			perf_pmu_disable(pmu);
+
+			if (cpuctx->sched_cb_usage && pmu->sched_task)
+				pmu->sched_task(ctx, false);
+
 			/*
 			 * PMU specific parts of task perf context can require
 			 * additional synchronization. As an example of such
@@ -3433,6 +3438,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 			else
 				swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+			perf_pmu_enable(pmu);
+
 			/*
 			 * RCU_INIT_POINTER here is safe because we've not
 			 * modified the ctx and the above modification of
@@ -3455,21 +3462,22 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
+		perf_pmu_disable(pmu);
+
+		if (cpuctx->sched_cb_usage && pmu->sched_task)
+			pmu->sched_task(ctx, false);
 		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+
+		perf_pmu_enable(pmu);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
 
-static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-
 void perf_sched_cb_dec(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	this_cpu_dec(perf_sched_cb_usages);
-
-	if (!--cpuctx->sched_cb_usage)
-		list_del(&cpuctx->sched_cb_entry);
+	--cpuctx->sched_cb_usage;
 }
 
 
@@ -3477,10 +3485,7 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	if (!cpuctx->sched_cb_usage++)
-		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
-
-	this_cpu_inc(perf_sched_cb_usages);
+	cpuctx->sched_cb_usage++;
 }
 
 /*
@@ -3509,20 +3514,6 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
-static void perf_pmu_sched_task(struct task_struct *prev,
-				struct task_struct *next,
-				bool sched_in)
-{
-	struct perf_cpu_context *cpuctx;
-
-	if (prev == next)
-		return;
-
-	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
-		__perf_pmu_sched_task(cpuctx, sched_in);
-
-}
-
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3545,9 +3536,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
-	if (__this_cpu_read(perf_sched_cb_usages))
-		perf_pmu_sched_task(task, next, false);
-
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -12850,7 +12838,6 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
-		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 
-- 
2.17.1



* [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
  2020-08-21 19:57 [PATCH V2 1/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() kan.liang
  2020-08-21 19:57 ` [PATCH V2 2/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out() kan.liang
@ 2020-08-21 19:57 ` kan.liang
  2020-09-07 16:01   ` peterz
  2020-09-11  7:02 ` [tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() tip-bot2 for Kan Liang
  2 siblings, 1 reply; 8+ messages in thread
From: kan.liang @ 2020-08-21 19:57 UTC (permalink / raw)
  To: peterz, mingo, acme, linux-kernel; +Cc: ak, mark.rutland, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

The counter value of a perf task may leak to another RDPMC task.
For example, a perf stat task as below is running on CPU 0.

    perf stat -e 'branches,cycles' -- taskset -c 0 ./workload

In the meantime, an RDPMC task, which is also running on CPU 0, may read
the GP counters periodically. (The RDPMC task creates a fixed event but
reads the four GP counters.)

    $ taskset -c 0 ./rdpmc_read_all_counters
    index 0x0 value 0x8001e5970f99
    index 0x1 value 0x8005d750edb6
    index 0x2 value 0x0
    index 0x3 value 0x0

    index 0x0 value 0x8002358e48a5
    index 0x1 value 0x8006bd1e3bc9
    index 0x2 value 0x0
    index 0x3 value 0x0
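
A minimal user-space sketch of such a reader is below (hypothetical:
the actual rdpmc_read_all_counters source is not shown, and this opens
a generic hardware event rather than a fixed one; mmap-ing the event is
what enables user-space RDPMC for the task's mm):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Read hardware counter 'idx' directly from user space. */
    static inline uint64_t rdpmc(uint32_t idx)
    {
            uint32_t lo, hi;

            asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
            return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
            struct perf_event_attr attr;
            uint32_t i;
            void *p;
            int fd;

            /* Open and mmap any counting event; mapping it is what
             * turns on user-space RDPMC (CR4.PCE) for this mm. */
            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.exclude_kernel = 1;
            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            p = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED,
                     fd, 0);
            if (fd < 0 || p == MAP_FAILED)
                    return 1;

            /* Read the first four GP counters, owned or not. */
            for (i = 0; i < 4; i++)
                    printf("index %#x value %#llx\n", i,
                           (unsigned long long)rdpmc(i));

            munmap(p, sysconf(_SC_PAGESIZE));
            close(fd);
            return 0;
    }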

This is a potential security issue: once an attacker knows what the
other thread is counting, the PerfMon counters can be used as a side
channel to attack cryptosystems.

The counter value of the perf stat task leaks to the RDPMC task because
perf never clears the counter when it's stopped.

Two methods were considered to address the issue.
- Unconditionally reset the counter in x86_pmu_del(). This can bring
  extra overhead even when there is no RDPMC task running.
- Only reset the unassigned dirty counters when an RDPMC task is
  scheduled in. This is the method implemented here.

A dirty counter is a counter whose assigned event has been deleted but
which has not been reset. To track the dirty counters, add a 'dirty'
bitmap to the struct cpu_hw_events. The 'running' bitmap is only used
by the P4 PMU, which doesn't support RDPMC, so share its memory space
with the new 'dirty' bitmap.

The current code doesn't reset the counter when the assigned event is
deleted. Set the corresponding bit in the 'dirty' variable in
x86_pmu_del(), if the RDPMC feature is available on the system.

The security issue can only be observed with an RDPMC task. The event
for an RDPMC task is a non-sampling event that requires an mmap buffer;
this can be used to detect an RDPMC task. Once the event is detected in
event_mapped(), enable sched_task(), which is invoked on each context
switch. Add a check in sched_task() to clear the dirty counters when
the RDPMC task is scheduled in. Only the currently unassigned dirty
counters are reset, because the RDPMC-assigned dirty counters will be
updated soon.

The RDPMC instruction is also supported on older platforms, so add
sched_task() for core_pmu as well. Since core_pmu doesn't support large
PEBS or the LBR callstack, intel_pmu_pebs/lbr_sched_task() will be
ignored there.

After applying the patch,

        $ taskset -c 0 ./rdpmc_read_all_counters
        index 0x0 value 0x0
        index 0x1 value 0x0
        index 0x2 value 0x0
        index 0x3 value 0x0

        index 0x0 value 0x0
        index 0x1 value 0x0
        index 0x2 value 0x0
        index 0x3 value 0x0

Performance

Context switch performance is only impacted when there are two or more
perf users and one of them is an RDPMC user. In other cases, there is
no performance impact.

The worst case occurs when there are two users: the RDPMC user uses
only one counter, while the other user uses all available counters.
When the RDPMC task is scheduled in, all the counters other than the
one assigned to the RDPMC task have to be reset.

Here is the test result for the worst case.

The test is implemented on an Ice Lake platform, which has 8 GP
counters and 3 fixed counters (not including the SLOTS counter).

The lat_ctx is used to measure the context switching time.

    lat_ctx -s 128K -N 1000 processes 2

It creates 2 tasks. One task opens all 8 GP counters and 3 fixed
counters. The other task opens a fixed counter and enables RDPMC.

Without the patch:
The context switch time is 4.74 us
The context switch number per second is ~125K (from vmstat 1)

With the patch:
The context switch time is 5.19 us
The context switch number per second is ~118K

There is a ~9% increase in the context switch time in the worst case
((5.19 us - 4.74 us) / 4.74 us ≈ 9.5%).

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V1:
- Drop the old method, which unconditionally reset the counter in
  x86_pmu_del().
  Only reset the dirty counters when an RDPMC task is scheduled in.


 arch/x86/events/core.c       | 45 +++++++++++++++++++++++++++++++++++-
 arch/x86/events/intel/core.c | 29 +++++++++++++++++++++++
 arch/x86/events/perf_event.h |  7 +++++-
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 0f3d01562ded..fa08d810dcd2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1440,7 +1440,10 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 
 	cpuc->events[idx] = event;
 	__set_bit(idx, cpuc->active_mask);
-	__set_bit(idx, cpuc->running);
+	/* The cpuc->running is only used by the P4 PMU */
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON) &&
+	    (boot_cpu_data.x86 == 0xf))
+		__set_bit(idx, cpuc->running);
 	x86_pmu.enable(event);
 	perf_event_update_userpage(event);
 }
@@ -1544,6 +1547,9 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		goto do_del;
 
+	if (READ_ONCE(x86_pmu.attr_rdpmc) && x86_pmu.sched_task &&
+	    test_bit(event->hw.idx, cpuc->active_mask))
+		__set_bit(event->hw.idx, cpuc->dirty);
 	/*
 	 * Not a TXN, therefore cleanup properly.
 	 */
@@ -2219,11 +2225,45 @@ static int x86_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
+void x86_pmu_clear_dirty_counters(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int i;
+
+	if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
+		return;
+
+	 /* Don't need to clear the assigned counter. */
+	for (i = 0; i < cpuc->n_events; i++)
+		__clear_bit(cpuc->assign[i], cpuc->dirty);
+
+	for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
+		/* Metrics events don't have corresponding HW counters. */
+		if (is_metric_idx(i))
+			continue;
+		else if (i >= INTEL_PMC_IDX_FIXED)
+			wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
+		else
+			wrmsrl(x86_pmu_event_addr(i), 0);
+	}
+
+	bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
+}
+
 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 {
 	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return;
 
+	/*
+	 * Enable sched_task() for the RDPMC task,
+	 * and clear the existing dirty counters.
+	 */
+	if (x86_pmu.sched_task && event->hw.target && !is_sampling_event(event)) {
+		perf_sched_cb_inc(event->ctx->pmu);
+		x86_pmu_clear_dirty_counters();
+	}
+
 	/*
 	 * This function relies on not being called concurrently in two
 	 * tasks in the same mm.  Otherwise one task could observe
@@ -2246,6 +2286,9 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m
 	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return;
 
+	if (x86_pmu.sched_task && event->hw.target && !is_sampling_event(event))
+		perf_sched_cb_dec(event->ctx->pmu);
+
 	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
 		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
 }
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c72e4904e056..e67713bfa33a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4166,11 +4166,39 @@ static void intel_pmu_cpu_dead(int cpu)
 	intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
 }
 
+static void intel_pmu_rdpmc_sched_task(struct perf_event_context *ctx)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct perf_event *event;
+
+	if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
+		return;
+
+	/*
+	 * If the new task has the RDPMC enabled, clear the dirty counters to
+	 * prevent the potential leak. If the new task doesn't have the RDPMC
+	 * enabled, do nothing.
+	 */
+	list_for_each_entry(event, &ctx->event_list, event_entry) {
+		if (event->hw.target &&
+		    (event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED) &&
+		    !is_sampling_event(event) &&
+		    atomic_read(&event->mmap_count))
+			break;
+	}
+	if (&event->event_entry == &ctx->event_list)
+		return;
+
+	x86_pmu_clear_dirty_counters();
+}
+
 static void intel_pmu_sched_task(struct perf_event_context *ctx,
 				 bool sched_in)
 {
 	intel_pmu_pebs_sched_task(ctx, sched_in);
 	intel_pmu_lbr_sched_task(ctx, sched_in);
+	if (sched_in && READ_ONCE(x86_pmu.attr_rdpmc))
+		intel_pmu_rdpmc_sched_task(ctx);
 }
 
 static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
@@ -4273,6 +4301,7 @@ static __initconst const struct x86_pmu core_pmu = {
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.cpu_dead		= intel_pmu_cpu_dead,
 
+	.sched_task		= intel_pmu_sched_task,
 	.check_period		= intel_pmu_check_period,
 
 	.lbr_reset		= intel_pmu_lbr_reset_64,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 345442410a4d..52e7650cece5 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -227,7 +227,10 @@ struct cpu_hw_events {
 	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	union {
+		unsigned long	running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+		unsigned long	dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	};
 	int			enabled;
 
 	int			n_events; /* the # of events in the below arrays */
@@ -1016,6 +1019,8 @@ void x86_pmu_enable_event(struct perf_event *event);
 
 int x86_pmu_handle_irq(struct pt_regs *regs);
 
+void x86_pmu_clear_dirty_counters(void);
+
 extern struct event_constraint emptyconstraint;
 
 extern struct event_constraint unconstrained;
-- 
2.17.1



* Re: [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
  2020-08-21 19:57 ` [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task kan.liang
@ 2020-09-07 16:01   ` peterz
  2020-09-08 15:58     ` peterz
  2020-09-09 13:24     ` Liang, Kan
  0 siblings, 2 replies; 8+ messages in thread
From: peterz @ 2020-09-07 16:01 UTC (permalink / raw)
  To: kan.liang; +Cc: mingo, acme, linux-kernel, ak, mark.rutland, luto

On Fri, Aug 21, 2020 at 12:57:54PM -0700, kan.liang@linux.intel.com wrote:
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 0f3d01562ded..fa08d810dcd2 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -1440,7 +1440,10 @@ static void x86_pmu_start(struct perf_event *event, int flags)
>  
>  	cpuc->events[idx] = event;
>  	__set_bit(idx, cpuc->active_mask);
> -	__set_bit(idx, cpuc->running);
> +	/* The cpuc->running is only used by the P4 PMU */
> +	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON) &&
> +	    (boot_cpu_data.x86 == 0xf))
> +		__set_bit(idx, cpuc->running);
>  	x86_pmu.enable(event);
>  	perf_event_update_userpage(event);
>  }

Yuck! Use a static_branch() or something. This is a gnarly nest of code
that runs 99.9% of the time to conclude a negative. IOW a complete waste
of cycles for everybody not running a P4 space heater.
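
For illustration, a minimal sketch of that suggestion ('perf_is_p4' is
a hypothetical key name, enabled once from the P4 PMU init code):

	DEFINE_STATIC_KEY_FALSE(perf_is_p4);

	/* during P4 PMU init: static_branch_enable(&perf_is_p4); */

	static void x86_pmu_start(struct perf_event *event, int flags)
	{
		/* ... */
		__set_bit(idx, cpuc->active_mask);
		/* patched out to a NOP everywhere except on P4 */
		if (static_branch_unlikely(&perf_is_p4))
			__set_bit(idx, cpuc->running);
		x86_pmu.enable(event);
		/* ... */
	}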

> @@ -1544,6 +1547,9 @@ static void x86_pmu_del(struct perf_event *event, int flags)
>  	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
>  		goto do_del;
>  
> +	if (READ_ONCE(x86_pmu.attr_rdpmc) && x86_pmu.sched_task &&
> +	    test_bit(event->hw.idx, cpuc->active_mask))
> +		__set_bit(event->hw.idx, cpuc->dirty);

And that too seems like an overly complicated set of tests and branches.
This should be effectively true for the 99% common case.

> @@ -2219,11 +2225,45 @@ static int x86_pmu_event_init(struct perf_event *event)
>  	return err;
>  }
>  
> +void x86_pmu_clear_dirty_counters(void)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +	int i;
> +
> +	if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
> +		return;
> +
> +	 /* Don't need to clear the assigned counter. */
> +	for (i = 0; i < cpuc->n_events; i++)
> +		__clear_bit(cpuc->assign[i], cpuc->dirty);
> +
> +	for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
> +		/* Metrics events don't have corresponding HW counters. */
> +		if (is_metric_idx(i))
> +			continue;
> +		else if (i >= INTEL_PMC_IDX_FIXED)
> +			wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
> +		else
> +			wrmsrl(x86_pmu_event_addr(i), 0);
> +	}
> +
> +	bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
> +}

The bitmap_{empty,zero}() do indeed compile into single instructions,
neat!

>  static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
>  {
>  	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
>  		return;
>  
> +	/*
> +	 * Enable sched_task() for the RDPMC task,
> +	 * and clear the existing dirty counters.
> +	 */
> +	if (x86_pmu.sched_task && event->hw.target && !is_sampling_event(event)) {
> +		perf_sched_cb_inc(event->ctx->pmu);
> +		x86_pmu_clear_dirty_counters();
> +	}

I'm failing to see the correctness of the !is_sampling_event() part
there.

>  	/*
>  	 * This function relies on not being called concurrently in two
>  	 * tasks in the same mm.  Otherwise one task could observe
> @@ -2246,6 +2286,9 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m
>  	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
>  		return;
>  
> +	if (x86_pmu.sched_task && event->hw.target && !is_sampling_event(event))
> +		perf_sched_cb_dec(event->ctx->pmu);
> +

Idem.

>  	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
>  		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
>  }
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index c72e4904e056..e67713bfa33a 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -4166,11 +4166,39 @@ static void intel_pmu_cpu_dead(int cpu)
>  	intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
>  }
>  
> +static void intel_pmu_rdpmc_sched_task(struct perf_event_context *ctx)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +	struct perf_event *event;
> +
> +	if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
> +		return;
> +
> +	/*
> +	 * If the new task has the RDPMC enabled, clear the dirty counters to
> +	 * prevent the potential leak. If the new task doesn't have the RDPMC
> +	 * enabled, do nothing.
> +	 */
> +	list_for_each_entry(event, &ctx->event_list, event_entry) {
> +		if (event->hw.target &&
> +		    (event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED) &&
> +		    !is_sampling_event(event) &&
> +		    atomic_read(&event->mmap_count))
> +			break;
> +	}
> +	if (&event->event_entry == &ctx->event_list)
> +		return;

That's horrific, what's wrong with something like:

	if (!atomic_read(&current->mm->context.perf_rdpmc_allowed))
		return;

> +
> +	x86_pmu_clear_dirty_counters();
> +}

How is this Intel specific code? IIRC AMD has RDPMC too.


* Re: [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
  2020-09-07 16:01   ` peterz
@ 2020-09-08 15:58     ` peterz
  2020-09-09 13:24     ` Liang, Kan
  1 sibling, 0 replies; 8+ messages in thread
From: peterz @ 2020-09-08 15:58 UTC (permalink / raw)
  To: kan.liang; +Cc: mingo, acme, linux-kernel, ak, mark.rutland, luto

On Mon, Sep 07, 2020 at 06:01:15PM +0200, peterz@infradead.org wrote:
> On Fri, Aug 21, 2020 at 12:57:54PM -0700, kan.liang@linux.intel.com wrote:
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > index 0f3d01562ded..fa08d810dcd2 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -1440,7 +1440,10 @@ static void x86_pmu_start(struct perf_event *event, int flags)
> >  
> >  	cpuc->events[idx] = event;
> >  	__set_bit(idx, cpuc->active_mask);
> > -	__set_bit(idx, cpuc->running);
> > +	/* The cpuc->running is only used by the P4 PMU */
> > +	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON) &&
> > +	    (boot_cpu_data.x86 == 0xf))
> > +		__set_bit(idx, cpuc->running);
> >  	x86_pmu.enable(event);
> >  	perf_event_update_userpage(event);
> >  }
> 
> Yuck! Use a static_branch() or something. This is a gnarly nest of code
> that runs 99.9% of the time to conclude a negative. IOW a complete waste
> of cycles for everybody not running a P4 space heater.

Better still, move it into p4_pmu_enable_event().
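
A sketch of that move, keeping cpuc->running entirely inside the P4
driver (the existing ESCR/CCCR programming is elided):

	static void p4_pmu_enable_event(struct perf_event *event)
	{
		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
		struct hw_perf_event *hwc = &event->hw;

		/* only the P4 PMU ever consumes cpuc->running */
		__set_bit(hwc->idx, cpuc->running);

		/* ... existing event programming ... */
	}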


* Re: [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
  2020-09-07 16:01   ` peterz
  2020-09-08 15:58     ` peterz
@ 2020-09-09 13:24     ` Liang, Kan
  1 sibling, 0 replies; 8+ messages in thread
From: Liang, Kan @ 2020-09-09 13:24 UTC (permalink / raw)
  To: peterz; +Cc: mingo, acme, linux-kernel, ak, mark.rutland, luto



On 9/8/2020 11:58 AM, peterz@infradead.org wrote:
 > On Mon, Sep 07, 2020 at 06:01:15PM +0200, peterz@infradead.org wrote:
 >> On Fri, Aug 21, 2020 at 12:57:54PM -0700, kan.liang@linux.intel.com
 >>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
 >>> index 0f3d01562ded..fa08d810dcd2 100644
 >>> --- a/arch/x86/events/core.c
 >>> +++ b/arch/x86/events/core.c
 >>> @@ -1440,7 +1440,10 @@ static void x86_pmu_start(struct perf_event
 >>> *event, int flags)
 >>>
 >>>   	cpuc->events[idx] = event;
 >>>   	__set_bit(idx, cpuc->active_mask);
 >>> -	__set_bit(idx, cpuc->running);
 >>> +	/* The cpuc->running is only used by the P4 PMU */
 >>> +	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON) &&
 >>> +	    (boot_cpu_data.x86 == 0xf))
 >>> +		__set_bit(idx, cpuc->running);
 >>>   	x86_pmu.enable(event);
 >>>   	perf_event_update_userpage(event);
 >>>   }
 >>
 >> Yuck! Use a static_branch() or something. This is a gnarly nest of
 >> code
 >> that runs 99.9% of the time to conclude a negative. IOW a complete
 >> waste
 >> of cycles for everybody not running a P4 space heater.
 >
 > Better still, move it into p4_pmu_enable_event().
 >

Sure, I will move the "cpuc->running" into P4 specific code.

On 9/7/2020 12:01 PM, peterz@infradead.org wrote:
>> @@ -1544,6 +1547,9 @@ static void x86_pmu_del(struct perf_event *event, int flags)
>>   	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
>>   		goto do_del;
>>   
>> +	if (READ_ONCE(x86_pmu.attr_rdpmc) && x86_pmu.sched_task &&
>> +	    test_bit(event->hw.idx, cpuc->active_mask))
>> +		__set_bit(event->hw.idx, cpuc->dirty);
> 
> And that too seems like an overly complicated set of tests and branches.
> This should be effectivly true for the 99% common case.

I made a mistake in the V2 version regarding whether the P4 PMU
supports RDPMC. The SDM explicitly documents that the RDPMC instruction
was introduced in P6. I once thought P4 was older than P6, but I was
wrong. P4 uses the NetBurst microarchitecture, which is the successor
to P6, so P4 should also support the RDPMC instruction.

We should not share the memory space between the 'dirty' and the
'running' variables. I will modify it in V3.

I will also unconditionally set cpuc->dirty here. The worst case for an
RDPMC task is that we may have to clear all counters the first time in
x86_pmu_event_mapped(). After that, sched_task() will clear/update the
'dirty' bitmap, and only the truly dirty counters are cleared. For a
non-RDPMC task, it's harmless to unconditionally set cpuc->dirty.


>>   static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
>>   {
>>   	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
>>   		return;
>>   
>> +	/*
>> +	 * Enable sched_task() for the RDPMC task,
>> +	 * and clear the existing dirty counters.
>> +	 */
>> +	if (x86_pmu.sched_task && event->hw.target && !is_sampling_event(event)) {
>> +		perf_sched_cb_inc(event->ctx->pmu);
>> +		x86_pmu_clear_dirty_counters();
>> +	}
> 
> I'm failing to see the correctness of the !is_sampling_event() part
> there.

It looks like an RDPMC task can do sampling as well? I once thought it
only did counting. I will fix it in V3.


> 
>>   	/*
>>   	 * This function relies on not being called concurrently in two
>>   	 * tasks in the same mm.  Otherwise one task could observe
>> @@ -2246,6 +2286,9 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m
>>   	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
>>   		return;
>>   
>> +	if (x86_pmu.sched_task && event->hw.target && !is_sampling_event(event))
>> +		perf_sched_cb_dec(event->ctx->pmu);
>> +
> 
> Idem.
> 
>>   	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
>>   		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
>>   }
>> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
>> index c72e4904e056..e67713bfa33a 100644
>> --- a/arch/x86/events/intel/core.c
>> +++ b/arch/x86/events/intel/core.c
>> @@ -4166,11 +4166,39 @@ static void intel_pmu_cpu_dead(int cpu)
>>   	intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
>>   }
>>   
>> +static void intel_pmu_rdpmc_sched_task(struct perf_event_context *ctx)
>> +{
>> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>> +	struct perf_event *event;
>> +
>> +	if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
>> +		return;
>> +
>> +	/*
>> +	 * If the new task has the RDPMC enabled, clear the dirty counters to
>> +	 * prevent the potential leak. If the new task doesn't have the RDPMC
>> +	 * enabled, do nothing.
>> +	 */
>> +	list_for_each_entry(event, &ctx->event_list, event_entry) {
>> +		if (event->hw.target &&
>> +		    (event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED) &&
>> +		    !is_sampling_event(event) &&
>> +		    atomic_read(&event->mmap_count))
>> +			break;
>> +	}
>> +	if (&event->event_entry == &ctx->event_list)
>> +		return;
> 
> That's horrific, what's wrong with something like:
> 
> 	if (!atomic_read(&current->mm->context.perf_rdpmc_allowed))
> 		return;
>

After removing the !is_sampling_event() part, I think we can use this.
I will use it in V3.
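
A sketch of how the simplified helper could look (the function name is
illustrative; kernel threads have no mm, so that likely needs a guard
too):

	static void x86_pmu_rdpmc_sched_task(struct perf_event_context *ctx)
	{
		/* kernel threads have no mm */
		if (!current->mm)
			return;

		/* only an RDPMC-enabled mm can observe the stale counts */
		if (!atomic_read(&current->mm->context.perf_rdpmc_allowed))
			return;

		x86_pmu_clear_dirty_counters();
	}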

>> +
>> +	x86_pmu_clear_dirty_counters();
>> +}
> 
> How is this Intel specific code? IIRC AMD has RDPMC too.
> 

Sure, I will move the code to X86 generic code.

Thanks,
Kan


* [tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in()
  2020-08-21 19:57 [PATCH V2 1/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() kan.liang
  2020-08-21 19:57 ` [PATCH V2 2/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out() kan.liang
  2020-08-21 19:57 ` [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task kan.liang
@ 2020-09-11  7:02 ` tip-bot2 for Kan Liang
  2 siblings, 0 replies; 8+ messages in thread
From: tip-bot2 for Kan Liang @ 2020-09-11  7:02 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Peter Zijlstra (Intel), Kan Liang, x86, LKML

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     556cccad389717d6eb4f5a24b45ff41cad3aaabf
Gitweb:        https://git.kernel.org/tip/556cccad389717d6eb4f5a24b45ff41cad3aaabf
Author:        Kan Liang <kan.liang@linux.intel.com>
AuthorDate:    Fri, 21 Aug 2020 12:57:52 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 10 Sep 2020 11:19:34 +02:00

perf/core: Pull pmu::sched_task() into perf_event_context_sched_in()

The pmu::sched_task() is a context switch callback. It passes the
cpuctx->task_ctx as a parameter to the lower-level code. To find the
cpuctx->task_ctx, the current code iterates a cpuctx list.

The same context was just iterated in perf_event_context_sched_in(),
which is invoked right before the pmu::sched_task().

Reusing the cpuctx->task_ctx from perf_event_context_sched_in() avoids
the unnecessary iteration of the cpuctx list.

Both pmu::sched_task() and perf_event_context_sched_in() have to
disable the PMU. Pulling pmu::sched_task() into
perf_event_context_sched_in() also saves the overhead of an extra PMU
disable and re-enable.

The new and old tasks may have equivalent contexts. The current code
optimizes this case by swapping the contexts, which avoids rescheduling
the events. Even in this case, pmu::sched_task() is still required,
e.g., to restore the LBR content.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200821195754.20159-1-kan.liang@linux.intel.com
---
 kernel/events/core.c | 51 ++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57efe3b..3f5fec4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3491,30 +3491,36 @@ void perf_sched_cb_inc(struct pmu *pmu)
  * PEBS requires this to provide PID/TID information. This requires we flush
  * all queued PEBS records before we context switch to a new task.
  */
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+{
+	struct pmu *pmu;
+
+	pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+
+	if (WARN_ON_ONCE(!pmu->sched_task))
+		return;
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+	perf_pmu_disable(pmu);
+
+	pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+	perf_pmu_enable(pmu);
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
 static void perf_pmu_sched_task(struct task_struct *prev,
 				struct task_struct *next,
 				bool sched_in)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
 
 	if (prev == next)
 		return;
 
-	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-		pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
-
-		if (WARN_ON_ONCE(!pmu->sched_task))
-			continue;
-
-		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-		perf_pmu_disable(pmu);
-
-		pmu->sched_task(cpuctx->task_ctx, sched_in);
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+		__perf_pmu_sched_task(cpuctx, sched_in);
 
-		perf_pmu_enable(pmu);
-		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-	}
 }
 
 static void perf_event_switch(struct task_struct *task,
@@ -3773,10 +3779,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 					struct task_struct *task)
 {
 	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu = ctx->pmu;
 
 	cpuctx = __get_cpu_context(ctx);
-	if (cpuctx->task_ctx == ctx)
+	if (cpuctx->task_ctx == ctx) {
+		if (cpuctx->sched_cb_usage)
+			__perf_pmu_sched_task(cpuctx, true);
 		return;
+	}
 
 	perf_ctx_lock(cpuctx, ctx);
 	/*
@@ -3786,7 +3796,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	if (!ctx->nr_events)
 		goto unlock;
 
-	perf_pmu_disable(ctx->pmu);
+	perf_pmu_disable(pmu);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -3798,7 +3808,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
-	perf_pmu_enable(ctx->pmu);
+
+	if (cpuctx->sched_cb_usage && pmu->sched_task)
+		pmu->sched_task(cpuctx->task_ctx, true);
+
+	perf_pmu_enable(pmu);
 
 unlock:
 	perf_ctx_unlock(cpuctx, ctx);
@@ -3841,9 +3855,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
-
-	if (__this_cpu_read(perf_sched_cb_usages))
-		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)


* [tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()
  2020-08-21 19:57 ` [PATCH V2 2/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out() kan.liang
@ 2020-09-11  7:02   ` tip-bot2 for Kan Liang
  0 siblings, 0 replies; 8+ messages in thread
From: tip-bot2 for Kan Liang @ 2020-09-11  7:02 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Peter Zijlstra (Intel), Kan Liang, x86, LKML

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     44fae179ce73a26733d9e2d346da4e1a1cb94647
Gitweb:        https://git.kernel.org/tip/44fae179ce73a26733d9e2d346da4e1a1cb94647
Author:        Kan Liang <kan.liang@linux.intel.com>
AuthorDate:    Fri, 21 Aug 2020 12:57:53 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 10 Sep 2020 11:19:34 +02:00

perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()

The pmu::sched_task() is a context switch callback. It passes the
cpuctx->task_ctx as a parameter to the lower-level code. To find the
cpuctx->task_ctx, the current code iterates a cpuctx list.
The same context will be iterated over in perf_event_context_sched_out()
soon after. Sharing the cpuctx->task_ctx avoids the unnecessary
iteration of the cpuctx list.

The pmu::sched_task() is also required for the optimization case for
equivalent contexts.

task_ctx_sched_out() will eventually disable and re-enable the PMU when
scheduling out events, so adding perf_pmu_disable() and
perf_pmu_enable() around task_ctx_sched_out() doesn't break anything.

Drop the cpuctx->ctx.lock for pmu::sched_task(). The lock is for the
per-CPU context and is not necessary for the per-task context switch.

No one uses sched_cb_entry, perf_sched_cb_usages, sched_cb_list, or
perf_pmu_sched_task() anymore.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200821195754.20159-2-kan.liang@linux.intel.com
---
 include/linux/perf_event.h |  1 +-
 kernel/events/core.c       | 47 +++++++++++++------------------------
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 46a3974..0c19d27 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -872,7 +872,6 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
-	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
 
 	int				online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3f5fec4..45edb85 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -382,7 +382,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3384,10 +3383,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	struct perf_event_context *parent, *next_parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
+	struct pmu *pmu;
 
 	if (likely(!ctx))
 		return;
 
+	pmu = ctx->pmu;
 	cpuctx = __get_cpu_context(ctx);
 	if (!cpuctx->task_ctx)
 		return;
@@ -3417,11 +3418,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
-			struct pmu *pmu = ctx->pmu;
 
 			WRITE_ONCE(ctx->task, next);
 			WRITE_ONCE(next_ctx->task, task);
 
+			perf_pmu_disable(pmu);
+
+			if (cpuctx->sched_cb_usage && pmu->sched_task)
+				pmu->sched_task(ctx, false);
+
 			/*
 			 * PMU specific parts of task perf context can require
 			 * additional synchronization. As an example of such
@@ -3433,6 +3438,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 			else
 				swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+			perf_pmu_enable(pmu);
+
 			/*
 			 * RCU_INIT_POINTER here is safe because we've not
 			 * modified the ctx and the above modification of
@@ -3455,21 +3462,22 @@ unlock:
 
 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
+		perf_pmu_disable(pmu);
+
+		if (cpuctx->sched_cb_usage && pmu->sched_task)
+			pmu->sched_task(ctx, false);
 		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+
+		perf_pmu_enable(pmu);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
 
-static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-
 void perf_sched_cb_dec(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	this_cpu_dec(perf_sched_cb_usages);
-
-	if (!--cpuctx->sched_cb_usage)
-		list_del(&cpuctx->sched_cb_entry);
+	--cpuctx->sched_cb_usage;
 }
 
 
@@ -3477,10 +3485,7 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	if (!cpuctx->sched_cb_usage++)
-		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
-
-	this_cpu_inc(perf_sched_cb_usages);
+	cpuctx->sched_cb_usage++;
 }
 
 /*
@@ -3509,20 +3514,6 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
-static void perf_pmu_sched_task(struct task_struct *prev,
-				struct task_struct *next,
-				bool sched_in)
-{
-	struct perf_cpu_context *cpuctx;
-
-	if (prev == next)
-		return;
-
-	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
-		__perf_pmu_sched_task(cpuctx, sched_in);
-
-}
-
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3545,9 +3536,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
-	if (__this_cpu_read(perf_sched_cb_usages))
-		perf_pmu_sched_task(task, next, false);
-
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -12867,7 +12855,6 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
-		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 



Thread overview: 8+ messages
2020-08-21 19:57 [PATCH V2 1/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() kan.liang
2020-08-21 19:57 ` [PATCH V2 2/3] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out() kan.liang
2020-09-11  7:02   ` [tip: perf/core] " tip-bot2 for Kan Liang
2020-08-21 19:57 ` [PATCH V2 3/3] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task kan.liang
2020-09-07 16:01   ` peterz
2020-09-08 15:58     ` peterz
2020-09-09 13:24     ` Liang, Kan
2020-09-11  7:02 ` [tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() tip-bot2 for Kan Liang
