* [RESEND PATCH 1/2] perf/x86: extract code to assign perf events for both core and uncore
@ 2022-03-04 11:03 Wen Yang
  2022-03-04 11:03 ` [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start Wen Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Wen Yang @ 2022-03-04 11:03 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner
  Cc: Wen Yang, Mark Rutland, Jiri Olsa, Namhyung Kim, Borislav Petkov,
	x86, Wen Yang, H. Peter Anvin, linux-perf-users, linux-kernel

The following two patterns are duplicated in multiple places in the x86
perf code:
- fast path: try to reuse the previously assigned register
- slow path: assign a counter for each event

To reduce this duplication and to prepare for the following patch, which
uses the same patterns, extract the code into perf_assign_events().

This commit doesn't change functionality.

Signed-off-by: Wen Yang <simon.wy@alibaba-inc.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: Wen Yang <wenyang@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-perf-users@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/events/core.c         | 141 ++++++++++++++++++---------------
 arch/x86/events/intel/uncore.c |  31 +-------
 arch/x86/events/perf_event.h   |   6 +-
 3 files changed, 82 insertions(+), 96 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index eef816fc216d..9846d422f06d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -950,10 +950,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 	return true;
 }
 
-/*
- * Assign a counter for each event.
- */
-int perf_assign_events(struct event_constraint **constraints, int n,
+int _perf_assign_events(struct event_constraint **constraints, int n,
 			int wmin, int wmax, int gpmax, int *assign)
 {
 	struct perf_sched sched;
@@ -969,16 +966,66 @@ int perf_assign_events(struct event_constraint **constraints, int n,
 
 	return sched.state.unassigned;
 }
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct perf_event **event_list,
+		struct event_constraint **constraints, int n,
+		int wmin, int wmax, int gpmax, int *assign)
+{
+	struct event_constraint *c;
+	struct hw_perf_event *hwc;
+	u64 used_mask = 0;
+	int unsched = 0;
+	int i;
+
+	/*
+	 * fastpath, try to reuse previous register
+	 */
+	for (i = 0; i < n; i++) {
+		u64 mask;
+
+		hwc = &event_list[i]->hw;
+		c = constraints[i];
+
+		/* never assigned */
+		if (hwc->idx == -1)
+			break;
+
+		/* constraint still honored */
+		if (!test_bit(hwc->idx, c->idxmsk))
+			break;
+
+		mask = BIT_ULL(hwc->idx);
+		if (is_counter_pair(hwc))
+			mask |= mask << 1;
+
+		/* not already used */
+		if (used_mask & mask)
+			break;
+
+		used_mask |= mask;
+
+		if (assign)
+			assign[i] = hwc->idx;
+	}
+
+	/* slow path */
+	if (i != n)
+		unsched = _perf_assign_events(constraints, n,
+				wmin, wmax, gpmax, assign);
+
+	return unsched;
+}
 EXPORT_SYMBOL_GPL(perf_assign_events);
 
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	int num_counters = hybrid(cpuc->pmu, num_counters);
-	struct event_constraint *c;
-	struct perf_event *e;
 	int n0, i, wmin, wmax, unsched = 0;
-	struct hw_perf_event *hwc;
-	u64 used_mask = 0;
+	struct event_constraint *c;
+	int gpmax = num_counters;
 
 	/*
 	 * Compute the number of events already present; see x86_pmu_add(),
@@ -1017,66 +1064,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	}
 
 	/*
-	 * fastpath, try to reuse previous register
+	 * Do not allow scheduling of more than half the available
+	 * generic counters.
+	 *
+	 * This helps avoid counter starvation of sibling thread by
+	 * ensuring at most half the counters cannot be in exclusive
+	 * mode. There is no designated counters for the limits. Any
+	 * N/2 counters can be used. This helps with events with
+	 * specific counter constraints.
 	 */
-	for (i = 0; i < n; i++) {
-		u64 mask;
-
-		hwc = &cpuc->event_list[i]->hw;
-		c = cpuc->event_constraint[i];
-
-		/* never assigned */
-		if (hwc->idx == -1)
-			break;
-
-		/* constraint still honored */
-		if (!test_bit(hwc->idx, c->idxmsk))
-			break;
-
-		mask = BIT_ULL(hwc->idx);
-		if (is_counter_pair(hwc))
-			mask |= mask << 1;
-
-		/* not already used */
-		if (used_mask & mask)
-			break;
+	if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+			READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+		gpmax /= 2;
 
-		used_mask |= mask;
-
-		if (assign)
-			assign[i] = hwc->idx;
+	/*
+	 * Reduce the amount of available counters to allow fitting
+	 * the extra Merge events needed by large increment events.
+	 */
+	if (x86_pmu.flags & PMU_FL_PAIR) {
+		gpmax = num_counters - cpuc->n_pair;
+		WARN_ON(gpmax <= 0);
 	}
 
-	/* slow path */
-	if (i != n) {
-		int gpmax = num_counters;
-
-		/*
-		 * Do not allow scheduling of more than half the available
-		 * generic counters.
-		 *
-		 * This helps avoid counter starvation of sibling thread by
-		 * ensuring at most half the counters cannot be in exclusive
-		 * mode. There is no designated counters for the limits. Any
-		 * N/2 counters can be used. This helps with events with
-		 * specific counter constraints.
-		 */
-		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
-		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
-			gpmax /= 2;
-
-		/*
-		 * Reduce the amount of available counters to allow fitting
-		 * the extra Merge events needed by large increment events.
-		 */
-		if (x86_pmu.flags & PMU_FL_PAIR) {
-			gpmax = num_counters - cpuc->n_pair;
-			WARN_ON(gpmax <= 0);
-		}
-
-		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
-					     wmax, gpmax, assign);
-	}
+	unsched = perf_assign_events(cpuc->event_list, cpuc->event_constraint,
+			n, wmin, wmax, gpmax, assign);
 
 	/*
 	 * In case of success (unsched = 0), mark events as committed,
@@ -1093,7 +1104,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
 	} else {
 		for (i = n0; i < n; i++) {
-			e = cpuc->event_list[i];
+			struct perf_event *e = cpuc->event_list[i];
 
 			/*
 			 * release events that failed scheduling
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e497da9bf427..101358ae2814 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -442,12 +442,8 @@ static void uncore_put_event_constraint(struct intel_uncore_box *box,
 
 static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
 {
-	unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
 	struct event_constraint *c;
 	int i, wmin, wmax, ret = 0;
-	struct hw_perf_event *hwc;
-
-	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
 
 	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		c = uncore_get_event_constraint(box, box->event_list[i]);
@@ -456,31 +452,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 		wmax = max(wmax, c->weight);
 	}
 
-	/* fastpath, try to reuse previous register */
-	for (i = 0; i < n; i++) {
-		hwc = &box->event_list[i]->hw;
-		c = box->event_constraint[i];
-
-		/* never assigned */
-		if (hwc->idx == -1)
-			break;
-
-		/* constraint still honored */
-		if (!test_bit(hwc->idx, c->idxmsk))
-			break;
-
-		/* not already used */
-		if (test_bit(hwc->idx, used_mask))
-			break;
-
-		__set_bit(hwc->idx, used_mask);
-		if (assign)
-			assign[i] = hwc->idx;
-	}
-	/* slow path */
-	if (i != n)
-		ret = perf_assign_events(box->event_constraint, n,
-					 wmin, wmax, n, assign);
+	ret = perf_assign_events(box->event_list,
+			box->event_constraint, n, wmin, wmax, n, assign);
 
 	if (!assign || ret) {
 		for (i = 0; i < n; i++)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 150261d929b9..f1acd1ded001 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1130,8 +1130,10 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 
 void x86_pmu_enable_all(int added);
 
-int perf_assign_events(struct event_constraint **constraints, int n,
-			int wmin, int wmax, int gpmax, int *assign);
+int perf_assign_events(struct perf_event **event_list,
+		struct event_constraint **constraints, int n,
+		int wmin, int wmax, int gpmax, int *assign);
+
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
 
 void x86_pmu_stop(struct perf_event *event, int flags);
-- 
2.19.1.6.gb485710b



* [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-04 11:03 [RESEND PATCH 1/2] perf/x86: extract code to assign perf events for both core and uncore Wen Yang
@ 2022-03-04 11:03 ` Wen Yang
  2022-03-04 15:39   ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Wen Yang @ 2022-03-04 11:03 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner
  Cc: Wen Yang, Stephane Eranian, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, Wen Yang, h. peter anvin,
	linux-perf-users, linux-kernel

This issue has been present for a long time; it can be reproduced as follows:

1. Run a script that periodically collects perf data, e.g.:
while true
do
    perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2
    perf stat -e cache-misses -C 1 sleep 2
    sleep 1
done

2. Run another one to capture the IPC, e.g.:
perf stat -e cycles:D,instructions:D -C 1 -I 1000

We can then observe that the counter used by cycles:D changes frequently:
crash> struct  cpu_hw_events.n_events,assign,event_list,events 0xffff88bf7f44f420
  n_events = 3
  assign = {33, 1, 32, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  event_list = {0xffff88bf77b85000, 0xffff88b72db82000, 0xffff88b72db85800, 0xffff88ff6cfcb000, 0xffff88ff609f1800, 0xffff88ff609f1800, 0xffff88ff5f46a800, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
  events = {0x0, 0xffff88b72db82000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffff88b72db85800, 0xffff88bf77b85000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}

crash> struct  cpu_hw_events.n_events,assign,event_list,events  0xffff88bf7f44f420
  n_events = 6
  assign = {33, 3, 32, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  event_list = {0xffff88bf77b85000, 0xffff88b72db82000, 0xffff88b72db85800, 0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000, 0xffff88ff5f46a800, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
  events = {0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000, 0xffff88b72db82000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffff88b72db85800, 0xffff88bf77b85000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}

The reason is that the NMI watchdog permanently consumes one fixed
counter (*cycles*). Therefore, when the above shell script requests
*cycles* again, only a general-purpose (GP) counter can be used, and its
weight is 5. Other events (like *cache-misses*) have a weight of 4, so
the counter used by *cycles* is often taken away, as in the raw data
above:
[1]
  n_events = 3
  assign = {33, 1, 32, ...}
-->
  n_events = 6
  assign = {33, 3, 32, 0, 1, 2, ...}

This causes unnecessary pmu_stop/start churn and also produces abnormal
CPI data.

Cloud servers usually monitor the CPI data of some important services
continuously. This issue affects performance and misleads monitoring.

The current event scheduling algorithm is more than 10 years old:
commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")

We wish to optimize it a bit.
The fields msk_counters and msk_events are added to track the currently
used counters and events so that the already-used ones can be skipped in
__perf_sched_find_counter() and perf_sched_next_event(), avoiding
unnecessary pmu_stop/start.

[1]:
32: INTEL_PMC_IDX_FIXED_INSTRUCTIONS
33: INTEL_PMC_IDX_FIXED_CPU_CYCLES
34: INTEL_PMC_IDX_FIXED_REF_CYCLES
0-3: general-purpose (GP) counters

Signed-off-by: Wen Yang <simon.wy@alibaba-inc.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: Wen Yang <wenyang@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-perf-users@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/events/core.c | 93 +++++++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9846d422f06d..88313d669756 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -796,33 +796,67 @@ struct perf_sched {
 	int			max_events;
 	int			max_gp;
 	int			saved_states;
+	u64			msk_counters;
+	u64			msk_events;
 	struct event_constraint	**constraints;
 	struct sched_state	state;
 	struct sched_state	saved[SCHED_STATES_MAX];
 };
 
+static int perf_sched_calc_weight(struct event_constraint **constraints,
+		int num, int wmin, u64 msk_events)
+{
+	int tmp = wmin;
+	int idx;
+
+	if (!msk_events)
+		goto out;
+
+	for (idx = 0; idx < num; idx++) {
+		if (msk_events & BIT_ULL(idx))
+			continue;
+
+		tmp = min(tmp, constraints[idx]->weight);
+	}
+
+out:
+	return tmp;
+}
+
+static int perf_sched_calc_event(struct event_constraint **constraints,
+		int num, int weight, u64 msk_events)
+{
+	int idx;
+
+	for (idx = 0; idx < num; idx++) {
+		if (msk_events & BIT_ULL(idx))
+			continue;
+
+		if (constraints[idx]->weight == weight)
+			break;
+	}
+
+	/* start with min weight */
+	return idx;
+}
+
 /*
  * Initialize iterator that runs through all events and counters.
  */
 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
-			    int num, int wmin, int wmax, int gpmax)
+			    int num, int wmin, int wmax, int gpmax, u64 cntm, u64 evtm)
 {
-	int idx;
-
 	memset(sched, 0, sizeof(*sched));
 	sched->max_events	= num;
 	sched->max_weight	= wmax;
 	sched->max_gp		= gpmax;
 	sched->constraints	= constraints;
+	sched->msk_counters = cntm;
+	sched->msk_events   = evtm;
 
-	for (idx = 0; idx < num; idx++) {
-		if (constraints[idx]->weight == wmin)
-			break;
-	}
-
-	sched->state.event	= idx;		/* start with min weight */
-	sched->state.weight	= wmin;
-	sched->state.unassigned	= num;
+	sched->state.weight = perf_sched_calc_weight(constraints, num, wmin, cntm);
+	sched->state.event = perf_sched_calc_event(constraints, num, sched->state.weight, evtm);
+	sched->state.unassigned = num - hweight_long(evtm);
 }
 
 static void perf_sched_save_state(struct perf_sched *sched)
@@ -874,6 +908,9 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
 			u64 mask = BIT_ULL(idx);
 
+			if (sched->msk_counters & mask)
+				continue;
+
 			if (sched->state.used & mask)
 				continue;
 
@@ -890,6 +927,9 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 		if (c->flags & PERF_X86_EVENT_PAIR)
 			mask |= mask << 1;
 
+		if (sched->msk_counters & mask)
+			continue;
+
 		if (sched->state.used & mask)
 			continue;
 
@@ -921,6 +961,12 @@ static bool perf_sched_find_counter(struct perf_sched *sched)
 	return true;
 }
 
+static void ignore_used_index(u64 mask, int *index)
+{
+	while (mask & BIT_ULL(*index))
+		++*index;
+}
+
 /*
  * Go through all unassigned events and find the next one to schedule.
  * Take events with the least weight first. Return true on success.
@@ -935,9 +981,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 	do {
 		/* next event */
 		sched->state.event++;
+		ignore_used_index(sched->msk_events, &sched->state.event);
 		if (sched->state.event >= sched->max_events) {
 			/* next weight */
 			sched->state.event = 0;
+			ignore_used_index(sched->msk_events, &sched->state.event);
+
 			sched->state.weight++;
 			if (sched->state.weight > sched->max_weight)
 				return false;
@@ -951,11 +1000,11 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 }
 
 int _perf_assign_events(struct event_constraint **constraints, int n,
-			int wmin, int wmax, int gpmax, int *assign)
+			int wmin, int wmax, int gpmax, u64 cntm, u64 evtm, int *assign)
 {
 	struct perf_sched sched;
 
-	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax, cntm, evtm);
 
 	do {
 		if (!perf_sched_find_counter(&sched))
@@ -976,7 +1025,8 @@ int perf_assign_events(struct perf_event **event_list,
 {
 	struct event_constraint *c;
 	struct hw_perf_event *hwc;
-	u64 used_mask = 0;
+	u64 msk_counters = 0;
+	u64 msk_event = 0;
 	int unsched = 0;
 	int i;
 
@@ -1002,19 +1052,24 @@ int perf_assign_events(struct perf_event **event_list,
 			mask |= mask << 1;
 
 		/* not already used */
-		if (used_mask & mask)
+		if (msk_counters & mask)
 			break;
 
-		used_mask |= mask;
+		msk_counters |= mask;
+		msk_event |= BIT_ULL(i);
 
 		if (assign)
 			assign[i] = hwc->idx;
 	}
 
 	/* slow path */
-	if (i != n)
-		unsched = _perf_assign_events(constraints, n,
-				wmin, wmax, gpmax, assign);
+	if (i != n) {
+		unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+				msk_counters, msk_event, assign);
+		if (unsched)
+			unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+					0, 0, assign);
+	}
 
 	return unsched;
 }
-- 
2.19.1.6.gb485710b



* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-04 11:03 ` [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start Wen Yang
@ 2022-03-04 15:39   ` Peter Zijlstra
  2022-03-06 14:36     ` Wen Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-03-04 15:39 UTC (permalink / raw)
  To: Wen Yang
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Thomas Gleixner, Stephane Eranian, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, Wen Yang, h. peter anvin,
	linux-perf-users, linux-kernel

On Fri, Mar 04, 2022 at 07:03:51PM +0800, Wen Yang wrote:
> this issue has been there for a long time, we could reproduce it as follows:

What issue? You've not described an issue. So you cannot reference one.

This is still completely unreadable gibberish.

> 1, run a script that periodically collects perf data, eg:
> while true
> do
>     perf stat -e cache-misses,cache-misses,cache-misses -c 1 sleep 2
>     perf stat -e cache-misses -c 1 sleep 2
>     sleep 1
> done
> 
> 2, run another one to capture the ipc, eg:
> perf stat -e cycles:d,instructions:d  -c 1 -i 1000

<snip line noise>

> the reason is that the nmi watchdog permanently consumes one fp
> (*cycles*). therefore, when the above shell script obtains *cycles*
> again, only one gp can be used, and its weight is 5.
> but other events (like *cache-misses*) have a weight of 4,
> so the counter used by *cycles* will often be taken away, as in
> the raw data above:
> [1]
>   n_events = 3
>   assign = {33, 1, 32, ...}
> -->
>   n_events = 6
>   assign = {33, 3, 32, 0, 1, 2, ...}

Again unreadable... what do any of those numbers mean?

> 
> so it will cause unnecessary pmu_stop/start and also cause abnormal cpi.

How?!?

> Cloud servers usually continuously monitor the cpi data of some important
> services. This issue affects performance and misleads monitoring.
> 
> The current event scheduling algorithm is more than 10 years old:
> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")

irrelevant

> we wish it could be optimized a bit.

I wish for a unicorn ...

> The fields msk_counters and msk_events are added to indicate currently
> used counters and events so that the used ones can be skipped
> in __perf_sched_find_counter and perf_sched_next_event functions to avoid
> unnecessary pmu_stop/start.

Still not sure what your actual problem is, nor what the actual proposal
is.

Why should I attempt to reverse engineer your code without basic
understanding of what you're actually trying to achieve?


* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-04 15:39   ` Peter Zijlstra
@ 2022-03-06 14:36     ` Wen Yang
  2022-03-07 17:14       ` Stephane Eranian
  2022-03-08 12:50       ` Peter Zijlstra
  0 siblings, 2 replies; 16+ messages in thread
From: Wen Yang @ 2022-03-06 14:36 UTC (permalink / raw)
  To: Peter Zijlstra, Wen Yang
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Thomas Gleixner, Stephane Eranian, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, h. peter anvin,
	linux-perf-users, linux-kernel



On 2022/3/4 at 11:39 PM, Peter Zijlstra wrote:
> On Fri, Mar 04, 2022 at 07:03:51PM +0800, Wen Yang wrote:
>> this issue has been there for a long time, we could reproduce it as follows:
> 
> What issue? You've not described an issue. So you cannot reference one.
> 

OK, thank you for your feedback. Let me explain.


> This is still completely unreadable gibberish.
> 
>> 1, run a script that periodically collects perf data, eg:
>> while true
>> do
>>      perf stat -e cache-misses,cache-misses,cache-misses -c 1 sleep 2
>>      perf stat -e cache-misses -c 1 sleep 2
>>      sleep 1
>> done
>>
>> 2, run another one to capture the ipc, eg:
>> perf stat -e cycles:d,instructions:d  -c 1 -i 1000
> 
> <snip line noise>
> 
>> the reason is that the nmi watchdog permanently consumes one fp
>> (*cycles*). therefore, when the above shell script obtains *cycles*
>> again, only one gp can be used, and its weight is 5.
>> but other events (like *cache-misses*) have a weight of 4,
>> so the counter used by *cycles* will often be taken away, as in
>> the raw data above:
>> [1]
>>    n_events = 3
>>    assign = {33, 1, 32, ...}
>> -->
>>    n_events = 6
>>    assign = {33, 3, 32, 0, 1, 2, ...}
> 
> Again unreadable... what do any of those numbers mean?
> 

Scenario: a monitoring program watches the CPI on a specific CPU in
pinned mode for a long time, for example the command from the original
email:
     perf stat -e cycles:D,instructions:D  -C 1 -I 1000

perf here is just an example; our monitoring program continuously reads
the counter values of these perf events (cycles and instructions).

However, we found that the CPI data periodically becomes abnormal
because of PMU counter conflicts. For example, the scripts from the
e-mail cause interference:
     perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2
     perf stat -e cache-misses -C 1 sleep 2


   n_events = 3
   assign = {33, 1, 32, ...}
-->
   n_events = 6
   assign = {33, 3, 32, 0, 1, 2, ...}

These are fields of the cpu_hw_events structure:
int n_events; /* the # of events in the below arrays */
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */

32: INTEL_PMC_IDX_FIXED_INSTRUCTIONS
33: INTEL_PMC_IDX_FIXED_CPU_CYCLES
34: INTEL_PMC_IDX_FIXED_REF_CYCLES
0-3: general-purpose (GP) counters


n_events = 3
assign = {33, 1, 32, ...}
event_list = {0xffff88bf77b85000, 0xffff88b72db82000, 
0xffff88b72db85800, ...}

—>
This indicates that there are 3 perf events on CPU 1, occupying the
#fixed_cpu_cycles, #1 and #fixed_instruction counters respectively.
These 3 perf events are generated by the NMI watchdog and script A:
perf stat -e cycles:D,instructions:D  -C 1 -I 1000


n_events = 6
assign = {33, 3, 32, 0, 1, 2, ...}
event_list = {0xffff88bf77b85000, 0xffff88b72db82000, 
0xffff88b72db85800, 0xffff88bf46c34000, 0xffff88bf46c35000, 
0xffff88bf46c30000,  ...}

—>
This indicates that there are 6 perf events on CPU 1, occupying the
#fixed_cpu_cycles, #3, #fixed_instruction, #0, #1 and #2 counters
respectively. These 6 perf events are generated by the NMI watchdog and
scripts A and B:
perf stat -e cycles:D,instructions:D  -C 1 -I 1000
perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2

0xffff88bf77b85000:
The perf event generated by the NMI watchdog, which has always occupied
#fixed_cpu_cycles.

0xffff88b72db82000:
The perf event generated by the above script A (cycles:D), whose counter
changed from #1 to #3. We use the perf event in pinned mode and
continuously read its value for a long time, but its PMU counter
changes, so the counter value also jumps.

0xffff88b72db85800:
The perf event generated by the above script A (instructions:D), which
has always occupied #fixed_instruction.

0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000:
These perf events are generated by the above script B.


>>
>> so it will cause unnecessary pmu_stop/start and also cause abnormal cpi.
> 
> How?!?
> 

We may refer to the x86_pmu_enable function:
step1: save events moving to new counters
step2: reprogram moved events into new counters

especially:

static inline int match_prev_assignment(struct hw_perf_event *hwc,
                     struct cpu_hw_events *cpuc,
                     int i)
{
     return hwc->idx == cpuc->assign[i] &&
         hwc->last_cpu == smp_processor_id() &&
         hwc->last_tag == cpuc->tags[i];
}
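
(For context — a condensed, paraphrased sketch of those two steps, not
the verbatim upstream x86_pmu_enable(); state-flag handling and
unrelated details are omitted, and the helper name is made up for
illustration. It shows where a changed counter assignment forces an
x86_pmu_stop()/x86_pmu_start() pair:)

/* Abridged sketch of the two steps above; illustrative only. */
static void x86_pmu_enable_sketch(struct cpu_hw_events *cpuc)
{
	int i, n_running = cpuc->n_events - cpuc->n_added;
	struct perf_event *event;
	struct hw_perf_event *hwc;

	/* step1: stop events whose previous assignment no longer matches */
	for (i = 0; i < n_running; i++) {
		event = cpuc->event_list[i];
		hwc = &event->hw;

		if (hwc->idx == -1 || match_prev_assignment(hwc, cpuc, i))
			continue;

		x86_pmu_stop(event, PERF_EF_UPDATE);
	}

	/* step2: reprogram moved events into their new counters and restart */
	for (i = 0; i < cpuc->n_events; i++) {
		event = cpuc->event_list[i];
		hwc = &event->hw;

		if (!match_prev_assignment(hwc, cpuc, i))
			x86_assign_hw_event(event, cpuc, i);
		else if (i < n_running)
			continue;

		x86_pmu_start(event, PERF_EF_RELOAD);
	}
}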



>> Cloud servers usually continuously monitor the cpi data of some important
>> services. This issue affects performance and misleads monitoring.
>>
>> The current event scheduling algorithm is more than 10 years old:
>> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
> 
> irrelevant
> 

commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")

This commit is the basis of the perf event scheduling algorithm we
currently use, and it explains why the counter above changed from #1 to
#3: the algorithm takes into account the list of counter constraints for
each event and assigns events to counters from the most constrained
(i.e., works on only one counter) to the least constrained (i.e., works
on any counter).

The NMI watchdog permanently consumes one fixed counter (*cycles*).
Therefore, when the above shell script requests *cycles:D* again, it has
to use a GP counter, and its weight is 5. Other events (like
*cache-misses*) have a weight of 4, so the counter used by *cycles:D*
will often be taken away.
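
(Side note, illustrative only: the "weight" of an event is simply the
number of counters its constraint mask allows, which the core code
precomputes as c->weight. A minimal sketch, assuming the usual
event_constraint layout; the helper name is made up:)

/*
 * With 1 usable fixed counter plus 4 GP counters, cycles has weight 5
 * while the GP-only cache-misses has weight 4, so the scheduler places
 * cache-misses first and cycles gets whatever GP counter is left.
 */
static inline int constraint_weight(const struct event_constraint *c)
{
	return hweight64(c->idxmsk64);	/* same value as the cached c->weight */
}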

In addition, we also found that this problem may affect the NMI watchdog
in the production cluster. The NMI watchdog also uses a fixed counter in
pinned mode. Usually it is the first element of the event_list array, so
it usually takes precedence and gets a fixed counter. But if the
administrator disables the watchdog and then re-enables it, it may end
up at the end of the event_list array, so its expected fixed counter may
already be occupied by another perf event and it can only use a GP
counter. This leads to a similar issue: the PMU counter used by the NMI
watchdog may be disabled/enabled frequently and unnecessarily.


Any advice or guidance on this would be appreciated.

Best wishes,
Wen


>> we wish it could be optimized a bit.
> 
> I wish for a unicorn ...
> 
>> The fields msk_counters and msk_events are added to indicate currently
>> used counters and events so that the used ones can be skipped
>> in __perf_sched_find_counter and perf_sched_next_event functions to avoid
>> unnecessary pmu_stop/start.
> 
> Still not sure what your actual problem is, nor what the actual proposal
> is.
> 
> Why should I attempt to reverse engineer your code without basic
> understanding of what you're actually trying to achieve?
> 





* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-06 14:36     ` Wen Yang
@ 2022-03-07 17:14       ` Stephane Eranian
  2022-03-08  6:42         ` Wen Yang
  2022-03-08 12:50       ` Peter Zijlstra
  1 sibling, 1 reply; 16+ messages in thread
From: Stephane Eranian @ 2022-03-07 17:14 UTC (permalink / raw)
  To: Wen Yang
  Cc: Peter Zijlstra, Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, h. peter anvin,
	linux-perf-users, linux-kernel

On Sun, Mar 6, 2022 at 6:36 AM Wen Yang <wenyang@linux.alibaba.com> wrote:
>
>
>
> On 2022/3/4 at 11:39 PM, Peter Zijlstra wrote:
> > On Fri, Mar 04, 2022 at 07:03:51PM +0800, Wen Yang wrote:
> >> this issue has been there for a long time, we could reproduce it as follows:
> >
> > What issue? You've not described an issue. So you cannot reference one.
> >
>
> OK, thank you for your opinion. Let's explain it.
>
>
> > This is still completely unreadable gibberish.
> >
> >> 1, run a script that periodically collects perf data, eg:
> >> while true
> >> do
> >>      perf stat -e cache-misses,cache-misses,cache-misses -c 1 sleep 2
> >>      perf stat -e cache-misses -c 1 sleep 2
> >>      sleep 1
> >> done
> >>
> >> 2, run another one to capture the ipc, eg:
> >> perf stat -e cycles:d,instructions:d  -c 1 -i 1000
> >
> > <snip line noise>
> >
> >> the reason is that the nmi watchdog permanently consumes one fp
> >> (*cycles*). therefore, when the above shell script obtains *cycles*
> >> again, only one gp can be used, and its weight is 5.
> >> but other events (like *cache-misses*) have a weight of 4,
> >> so the counter used by *cycles* will often be taken away, as in
> >> the raw data above:
> >> [1]
> >>    n_events = 3
> >>    assign = {33, 1, 32, ...}
> >> -->
> >>    n_events = 6
> >>    assign = {33, 3, 32, 0, 1, 2, ...}
> >
> > Again unreadable... what do any of those numbers mean?
> >
>
> Scenario: a monitor program will monitor the CPI on a specific CPU in
> pinned mode for a long time, such as the CPI in the original email:
>      perf stat -e cycles:D,instructions:D  -C 1 -I 1000
>
> Perf here is just an example. Our monitor program will continuously read
> the counter values of these perf events (cycles and instructions).
>
> However, it is found that CPI data will be abnormal periodically because
> PMU counter conflicts. For example, scripts in e-mail will cause
> interference:
>      perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2
>      perf stat -e cache-misses -C 1 sleep 2
>
>    n_events = 3
>    assign = {33, 1, 32, ...}
> -->
>    n_events = 6
>    assign = {33, 3, 32, 0, 1, 2, ...}
>
> They are some fields of cpu_hw_events structure.
> int n_events; /* the # of events in the below arrays */
> int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
> struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
>
> 32: intel_pmc_idx_fixed_instructions
> 33: intel_pmc_idx_fixed_cpu_cycles
> 34: intel_pmc_idx_fixed_ref_cycles
> 0,1,2,3: gp
>
>
> n_events = 3
> assign = {33, 1, 32, ...}
> event_list = {0xffff88bf77b85000, 0xffff88b72db82000,
> 0xffff88b72db85800, ...}
>
> —>
> Indicates that there are 3 perf events on CPU 1, which occupy the
> #fixed_cpu_cycles, #1 and #fixed_instruction counter one by one.
> These 3 perf events are generated by the NMI watchdog and the script A:
> perf stat -e cycles:D,instructions:D  -C 1 -I 1000
>
>
> n_events = 6
> assign = {33, 3, 32, 0, 1, 2, ...}
> event_list = {0xffff88bf77b85000, 0xffff88b72db82000,
> 0xffff88b72db85800, 0xffff88bf46c34000, 0xffff88bf46c35000,
> 0xffff88bf46c30000,  ...}
>
> —>
> Indicates that there are 6 perf events on CPU 1, which occupy the
> #fixed_cpu_cycles, #3, #fixed_instruction, #0, #1 and #2 counter one by one.
> These 6 perf events are generated by the NMI watchdog and the script A
> and B:
> perf stat -e cycles:D,instructions:D  -C 1 -I 1000
> perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2
>
> 0xffff88bf77b85000:
> The perf event generated by NMI watchdog, which has always occupied
> #fixed_cpu_cycles
>
> 0xffff88b72db82000:
> The perf event generated by the above script A (cycles:D), and the
> counter it used changes from #1 to #3. We use perf event in pinned mode,
> and then continuously read its value for a long time, but its PMU
> counter changes, so the counter value will also jump.
>
What you are describing here is working as expected. Pinning an event
DOES NOT mean pinning the event to a counter for the whole measurement.
It means the event will always be measured on "a" counter, but that
counter can change over time. Moving the event to a different counter
SHOULD NOT change its value, assuming you use the official API to read
the counter, i.e., the read() syscall or rdpmc following the official
guideline for it.
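
(To make that concrete, here is a minimal, hypothetical userspace
sketch — not code from this thread — that counts cycles on CPU 1 via
perf_event_open() and the read() syscall. The kernel folds the hardware
delta into the event count whenever the event migrates, so the value
read here stays monotonic no matter which counter backs the event:)

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type   = PERF_TYPE_HARDWARE;
	attr.size   = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.pinned = 1;	/* pinned to the PMU, not to one particular counter */

	/* count cycles on CPU 1 for all tasks; needs sufficient privileges */
	fd = perf_event_open(&attr, -1, 1, -1, 0);
	if (fd < 0)
		return 1;

	for (;;) {
		/* read() returns the accumulated count, independent of
		 * which hardware counter currently backs the event */
		if (read(fd, &count, sizeof(count)) != sizeof(count))
			break;
		printf("cycles: %llu\n", (unsigned long long)count);
		sleep(1);
	}
	return 0;
}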

>
>
> 0xffff88b72db85800:
> The perf event generated by the above script A (instructions:D), which
> has always occupied #fixed_instruction.
>
> 0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000:
> Theses perf events are generated by the above script B.
>
>
> >>
> >> so it will cause unnecessary pmu_stop/start and also cause abnormal cpi.
> >
> > How?!?
> >
>
> We may refer to the x86_pmu_enable function:
> step1: save events moving to new counters
> step2: reprogram moved events into new counters
>
> especially:
>
> static inline int match_prev_assignment(struct hw_perf_event *hwc,
>                      struct cpu_hw_events *cpuc,
>                      int i)
> {
>      return hwc->idx == cpuc->assign[i] &&
>          hwc->last_cpu == smp_processor_id() &&
>          hwc->last_tag == cpuc->tags[i];
> }
>
>
>
> >> Cloud servers usually continuously monitor the cpi data of some important
> >> services. This issue affects performance and misleads monitoring.
> >>
> >> The current event scheduling algorithm is more than 10 years old:
> >> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
> >
> > irrelevant
> >
>
> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
>
> This commit is the basis of the perf event scheduling algorithm we
> currently use.
> The reason why the counter above changed from #1 to #3 can be found from it:
> The algorithm takes into account the list of counter constraints
> for each event. It assigns events to counters from the most
> constrained, i.e., works on only one counter, to the least
> constrained, i.e., works on any counter.
>
> the nmi watchdog permanently consumes one fp (*cycles*).
>   therefore, when the above shell script obtains *cycles:D*
> again, it has to use a GP, and its weight is 5.
> but other events (like *cache-misses*) have a weight of 4,
> so the counter used by *cycles:D* will often be taken away.
>
And that's normal. Measuring cycles on a generic counter or a fixed
counter does not change what is counted or its precision.

>
> In addition, we also found that this problem may affect NMI watchdog in
> the production cluster.
> The NMI watchdog also uses a fixed counter in fixed mode. Usually, it is
> The first element of the event_list array, so it usually takes
> precedence and can get a fixed counter.


Not true. The NMI watchdog does not request a fixed counter; there is
no way to do this. It just lands on the fixed counter because the
scheduling algorithm favors fixed counters whenever possible.

>
> But if the administrator closes the watchdog first and then enables it,
> it may be at the end of the event_list array, so its expected fixed
> counter may be occupied by other perf event, and it can only use one GP.
> In this way, there is a similar issue here: the PMU counter used by the
> NMI watchdog may be disabled/enabled frequently and unnecessarily.
>
Yes, and I don't think this is a problem. You are trying to get a
counter guarantee, but the interface DOES NOT give this to users, and in
my opinion it should not.

>
> Any advice or guidance on this would be appreciated.
>
> Best wishes,
> Wen
>
>
> >> we wish it could be optimized a bit.
> >
> > I wish for a unicorn ...
> >
> >> The fields msk_counters and msk_events are added to indicate currently
> >> used counters and events so that the used ones can be skipped
> >> in __perf_sched_find_counter and perf_sched_next_event functions to avoid
> >> unnecessary pmu_stop/start.
> >
> > Still not sure what your actual problem is, nor what the actual proposal
> > is.
> >
> > Why should I attempt to reverse engineer your code without basic
> > understanding of what you're actually trying to achieve?
> >
>
>
>


* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-07 17:14       ` Stephane Eranian
@ 2022-03-08  6:42         ` Wen Yang
  2022-03-08 12:53           ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Wen Yang @ 2022-03-08  6:42 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, h. peter anvin,
	linux-perf-users, linux-kernel



On 2022/3/8 at 1:14 AM, Stephane Eranian wrote:
> On Sun, Mar 6, 2022 at 6:36 AM Wen Yang <wenyang@linux.alibaba.com> wrote:
>>
>>
>>
>> On 2022/3/4 at 11:39 PM, Peter Zijlstra wrote:
>>> On Fri, Mar 04, 2022 at 07:03:51PM +0800, Wen Yang wrote:
>>>> this issue has been there for a long time, we could reproduce it as follows:
>>>
>>> What issue? You've not described an issue. So you cannot reference one.
>>>
>>
>> OK, thank you for your opinion. Let's explain it.
>>
>>
>>> This is still completely unreadable gibberish.
>>>
>>>> 1, run a script that periodically collects perf data, eg:
>>>> while true
>>>> do
>>>>       perf stat -e cache-misses,cache-misses,cache-misses -c 1 sleep 2
>>>>       perf stat -e cache-misses -c 1 sleep 2
>>>>       sleep 1
>>>> done
>>>>
>>>> 2, run another one to capture the ipc, eg:
>>>> perf stat -e cycles:d,instructions:d  -c 1 -i 1000
>>>
>>> <snip line noise>
>>>
>>>> the reason is that the nmi watchdog permanently consumes one fp
>>>> (*cycles*). therefore, when the above shell script obtains *cycles*
>>>> again, only one gp can be used, and its weight is 5.
>>>> but other events (like *cache-misses*) have a weight of 4,
>>>> so the counter used by *cycles* will often be taken away, as in
>>>> the raw data above:
>>>> [1]
>>>>     n_events = 3
>>>>     assign = {33, 1, 32, ...}
>>>> -->
>>>>     n_events = 6
>>>>     assign = {33, 3, 32, 0, 1, 2, ...}
>>>
>>> Again unreadable... what do any of those numbers mean?
>>>
>>
>> Scenario: a monitor program will monitor the CPI on a specific CPU in
>> pinned mode for a long time, such as the CPI in the original email:
>>       perf stat -e cycles:D,instructions:D  -C 1 -I 1000
>>
>> Perf here is just an example. Our monitor program will continuously read
>> the counter values of these perf events (cycles and instructions).
>>
>> However, it is found that CPI data will be abnormal periodically because
>> PMU counter conflicts. For example, scripts in e-mail will cause
>> interference:
>>       perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2
>>       perf stat -e cache-misses -C 1 sleep 2
>>
>>     n_events = 3
>>     assign = {33, 1, 32, ...}
>> -->
>>     n_events = 6
>>     assign = {33, 3, 32, 0, 1, 2, ...}
>>
>> They are some fields of cpu_hw_events structure.
>> int n_events; /* the # of events in the below arrays */
>> int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
>> struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
>>
>> 32: intel_pmc_idx_fixed_instructions
>> 33: intel_pmc_idx_fixed_cpu_cycles
>> 34: intel_pmc_idx_fixed_ref_cycles
>> 0,1,2,3: gp
>>
>>
>> n_events = 3
>> assign = {33, 1, 32, ...}
>> event_list = {0xffff88bf77b85000, 0xffff88b72db82000,
>> 0xffff88b72db85800, ...}
>>
>> —>
>> Indicates that there are 3 perf events on CPU 1, which occupy the
>> #fixed_cpu_cycles, #1 and #fixed_instruction counter one by one.
>> These 3 perf events are generated by the NMI watchdog and the script A:
>> perf stat -e cycles:D,instructions:D  -C 1 -I 1000
>>
>>
>> n_events = 6
>> assign = {33, 3, 32, 0, 1, 2, ...}
>> event_list = {0xffff88bf77b85000, 0xffff88b72db82000,
>> 0xffff88b72db85800, 0xffff88bf46c34000, 0xffff88bf46c35000,
>> 0xffff88bf46c30000,  ...}
>>
>> —>
>> Indicates that there are 6 perf events on CPU 1, which occupy the
>> #fixed_cpu_cycles, #3, #fixed_instruction, #0, #1 and #2 counter one by one.
>> These 6 perf events are generated by the NMI watchdog and the script A
>> and B:
>> perf stat -e cycles:D,instructions:D  -C 1 -I 1000
>> perf stat -e cache-misses,cache-misses,cache-misses -C 1 sleep 2
>>
>> 0xffff88bf77b85000:
>> The perf event generated by NMI watchdog, which has always occupied
>> #fixed_cpu_cycles
>>
>> 0xffff88b72db82000:
>> The perf event generated by the above script A (cycles:D), and the
>> counter it used changes from #1 to #3. We use perf event in pinned mode,
>> and then continuously read its value for a long time, but its PMU
>> counter changes, so the counter value will also jump.
>>
> What you are describing here is working as expected. Pinning an event DOES NOT
> mean pinning the event to a counter for the whole measurement. It
> means the event
> will always be measured on "a" counter. But that counter can change
> overtime. Moving
> the event to a different counter SHOULD NOT change its value assuming
> you use official
> API to read the counter, i.e., the read() syscall or rdpmc using the
> official guideline for it.
> 

Perhaps the following code could ensure that the pmu counter value is 
stable:


     /*
      * Careful: an NMI might modify the previous event value.
      *
      * Our tactic to handle this is to first atomically read and
      * exchange a new raw count - then add that new-prev delta
      * count to the generic event atomically:
      */
again:
     prev_raw_count = local64_read(&hwc->prev_count);
     rdpmcl(hwc->event_base_rdpmc, new_raw_count);

     if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                     new_raw_count) != prev_raw_count)
         goto again;


It might be better if we could reduce how often the 'goto again' path is
taken.


>>
>>
>> 0xffff88b72db85800:
>> The perf event generated by the above script A (instructions:D), which
>> has always occupied #fixed_instruction.
>>
>> 0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000:
>> Theses perf events are generated by the above script B.
>>
>>
>>>>
>>>> so it will cause unnecessary pmu_stop/start and also cause abnormal cpi.
>>>
>>> How?!?
>>>
>>
>> We may refer to the x86_pmu_enable function:
>> step1: save events moving to new counters
>> step2: reprogram moved events into new counters
>>
>> especially:
>>
>> static inline int match_prev_assignment(struct hw_perf_event *hwc,
>>                       struct cpu_hw_events *cpuc,
>>                       int i)
>> {
>>       return hwc->idx == cpuc->assign[i] &&
>>           hwc->last_cpu == smp_processor_id() &&
>>           hwc->last_tag == cpuc->tags[i];
>> }
>>
>>
>>
>>>> Cloud servers usually continuously monitor the cpi data of some important
>>>> services. This issue affects performance and misleads monitoring.
>>>>
>>>> The current event scheduling algorithm is more than 10 years old:
>>>> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
>>>
>>> irrelevant
>>>
>>
>> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
>>
>> This commit is the basis of the perf event scheduling algorithm we
>> currently use.
>> The reason why the counter above changed from #1 to #3 can be found from it:
>> The algorithm takes into account the list of counter constraints
>> for each event. It assigns events to counters from the most
>> constrained, i.e., works on only one counter, to the least
>> constrained, i.e., works on any counter.
>>
>> the nmi watchdog permanently consumes one fp (*cycles*).
>>    therefore, when the above shell script obtains *cycles:D*
>> again, it has to use a GP, and its weight is 5.
>> but other events (like *cache-misses*) have a weight of 4,
>> so the counter used by *cycles:D* will often be taken away.
>>
> And that 's normal. Measuring cycles on a generic counter or fixed
> counter does not
> change what is counted or the precision of it.
> 

Well, it is generally OK, although it wastes a valuable GP counter.
However, on our ECS clusters there are various monitoring programs and
not enough PMU counters.
A subsequent patch will optimize this further: all fixed perf events
(such as cycles) will be read from the same PMU counter unless a
sampling period is set.


>>
>> In addition, we also found that this problem may affect NMI watchdog in
>> the production cluster.
>> The NMI watchdog also uses a fixed counter in fixed mode. Usually, it is
>> The first element of the event_list array, so it usually takes
>> precedence and can get a fixed counter.
> 
> 
> Not true. The NMI watchdog does not request a fixed counter. There is
> no way to do this.
> It just lands on the fixed counter because the scheduling algorithm
> favors fixed counters whenever
> possible.
> 
>>
>> But if the administrator closes the watchdog first and then enables it,
>> it may be at the end of the event_list array, so its expected fixed
>> counter may be occupied by other perf event, and it can only use one GP.
>> In this way, there is a similar issue here: the PMU counter used by the
>> NMI watchdog may be disabled/enabled frequently and unnecessarily.
>>
> Yes, and I don't think this is a problem.
> You are trying to get counter guarantee but the interface DOES NOT
> give this to users
> and should not in my opinion.
> 

The existing code is already trying to reuse the previous PMU counters, 
such as:

     /*
      * fastpath, try to reuse previous register
      */
     for (i = 0; i < n; i++) {
         u64 mask;

         hwc = &cpuc->event_list[i]->hw;
         c = cpuc->event_constraint[i];

         /* never assigned */
         if (hwc->idx == -1)
             break;

         /* constraint still honored */
         if (!test_bit(hwc->idx, c->idxmsk))
             break;

         mask = BIT_ULL(hwc->idx);
         if (is_counter_pair(hwc))
             mask |= mask << 1;

         /* not already used */
         if (used_mask & mask)
             break;

         used_mask |= mask;

         if (assign)
             assign[i] = hwc->idx;
     }


But it only helps when the number of perf events decreases.

Now we want to extend this idea a bit: when the number of perf events
increases, we also try to reuse the previously assigned PMU counters.
If that attempt fails, we still fall back to the original algorithm and
reassign all PMU counters, e.g.:

  	/* slow path */
-	if (i != n)
-		unsched = _perf_assign_events(constraints, n,
-				wmin, wmax, gpmax, assign);
+	if (i != n) {
+		unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+				msk_counters, msk_event, assign);
+		if (unsched)
+			unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+					0, 0, assign);
+	}


Your guidance has been particularly helpful, and we look forward to
your further comments and input.

Best wishes,
Wen


>>
>> Any advice or guidance on this would be appreciated.
>>
>> Best wishes,
>> Wen
>>
>>
>>>> we wish it could be optimized a bit.
>>>
>>> I wish for a unicorn ...
>>>
>>>> The fields msk_counters and msk_events are added to indicate currently
>>>> used counters and events so that the used ones can be skipped
>>>> in __perf_sched_find_counter and perf_sched_next_event functions to avoid
>>>> unnecessary pmu_stop/start.
>>>
>>> Still not sure what your actual problem is, nor what the actual proposal
>>> is.
>>>
>>> Why should I attempt to reverse engineer your code without basic
>>> understanding of what you're actually trying to achieve?
>>>
>>
>>
>>


* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-06 14:36     ` Wen Yang
  2022-03-07 17:14       ` Stephane Eranian
@ 2022-03-08 12:50       ` Peter Zijlstra
  2022-03-10  3:50         ` Wen Yang
  1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-03-08 12:50 UTC (permalink / raw)
  To: Wen Yang
  Cc: Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, Stephane Eranian,
	mark rutland, jiri olsa, namhyung kim, borislav petkov, x86,
	h. peter anvin, linux-perf-users, linux-kernel

On Sun, Mar 06, 2022 at 10:36:38PM +0800, Wen Yang wrote:

> The perf event generated by the above script A (cycles:D), and the counter
> it used changes from #1 to #3. We use perf event in pinned mode, and then
> continuously read its value for a long time, but its PMU counter changes

Yes, so what?

>, so the counter value will also jump.

I fail to see how the counter value will jump when we reprogram the
thing. When we stop we update the value, then reprogram on another
counter and continue. So where does it go sideways?

> 
> 0xffff88b72db85800:
> The perf event generated by the above script A (instructions:D), which has
> always occupied #fixed_instruction.
> 
> 0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000:
> Theses perf events are generated by the above script B.
> 
> 
> > > 
> > > so it will cause unnecessary pmu_stop/start and also cause abnormal cpi.
> > 
> > How?!?
> > 
> 
> We may refer to the x86_pmu_enable function:
> step1: save events moving to new counters
> step2: reprogram moved events into new counters
> 
> especially:
> 
> static inline int match_prev_assignment(struct hw_perf_event *hwc,
>                     struct cpu_hw_events *cpuc,
>                     int i)
> {
>     return hwc->idx == cpuc->assign[i] &&
>         hwc->last_cpu == smp_processor_id() &&
>         hwc->last_tag == cpuc->tags[i];
> }

I'm not seeing an explanation for how a counter value is not preserved.

> > > Cloud servers usually continuously monitor the cpi data of some important
> > > services. This issue affects performance and misleads monitoring.
> > > 
> > > The current event scheduling algorithm is more than 10 years old:
> > > commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
> > 
> > irrelevant
> > 
> 
> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
> 
> This commit is the basis of the perf event scheduling algorithm we currently
> use.

Well yes. But how is the age of it relevant?

> The reason why the counter above changed from #1 to #3 can be found from it:
> The algorithm takes into account the list of counter constraints
> for each event. It assigns events to counters from the most
> constrained, i.e., works on only one counter, to the least
> constrained, i.e., works on any counter.
> 
> the nmi watchdog permanently consumes one fp (*cycles*).
>  therefore, when the above shell script obtains *cycles:D*
> again, it has to use a GP, and its weight is 5.
> but other events (like *cache-misses*) have a weight of 4,
> so the counter used by *cycles:D* will often be taken away.

So what?

I mean, it is known the algorithm isn't optimal, but at least it's
bounded. There are event sets that will fail to schedule but could, but
I don't think you're talking about that.

Event migrating to a different counter is not a problem. This is
expected and normal. Code *must* be able to deal with it.

> In addition, we also found that this problem may affect NMI watchdog in the
> production cluster.
> The NMI watchdog also uses a fixed counter in fixed mode. Usually, it is The
> first element of the event_list array, so it usually takes precedence and
> can get a fixed counter.
> But if the administrator closes the watchdog first and then enables it, it
> may be at the end of the event_list array, so its expected fixed counter may
> be occupied by other perf event, and it can only use one GP. In this way,
> there is a similar issue here: the PMU counter used by the NMI watchdog may
> be disabled/enabled frequently and unnecessarily.

Again, I'm not seeing a problem. If you create more events than we have
hardware counters we'll rotate the list and things will get scheduled in
all sorts of order. This works.

> Any advice or guidance on this would be appreciated.

I'm still not sure what your actual problem is; I suspect you're using
perf wrong.

Are you using rdpmc and not respecting the scheme described in
include/uapi/linux/perf_events.h:perf_event_mmap_page ?

Note that if you're using pinned counters you can simplify that scheme
by ignoring all the timekeeping nonsense. In that case it does become
significantly simpler/faster.

But you cannot use rdpmc without using the mmap page's self-monitoring
data.
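
(For readers unfamiliar with that scheme, below is a condensed sketch
adapted from the description in perf_event_mmap_page, with the
timekeeping fields dropped as is possible for pinned counters. The
helper names are illustrative; treat this as a sketch, not the
authoritative sequence:)

#include <linux/perf_event.h>
#include <stdint.h>

/* x86-only: read raw PMC 'counter' (pc->index - 1) */
static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t low, high;

	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
	return low | ((uint64_t)high << 32);
}

/*
 * Lock-free read of a self-monitored event.  'pc' is the mmap()ed first
 * page of the event fd.  If the seqlock changes underneath us (the event
 * was rescheduled, possibly onto another counter), simply retry.
 */
static uint64_t read_self(struct perf_event_mmap_page *pc)
{
	uint64_t count;
	uint32_t seq, idx;

	do {
		seq = pc->lock;
		__sync_synchronize();

		idx   = pc->index;	/* 0 means not currently on the PMU */
		count = pc->offset;	/* kernel-accumulated part */
		if (pc->cap_user_rdpmc && idx) {
			int64_t pmc = rdpmc(idx - 1);
			uint16_t width = pc->pmc_width;

			pmc <<= 64 - width;	/* sign-extend the raw value */
			pmc >>= 64 - width;
			count += pmc;
		}

		__sync_synchronize();
	} while (pc->lock != seq);

	return count;
}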


* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-08  6:42         ` Wen Yang
@ 2022-03-08 12:53           ` Peter Zijlstra
  0 siblings, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2022-03-08 12:53 UTC (permalink / raw)
  To: Wen Yang
  Cc: Stephane Eranian, Wen Yang, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Thomas Gleixner,
	mark rutland, jiri olsa, namhyung kim, borislav petkov, x86,
	h. peter anvin, linux-perf-users, linux-kernel

On Tue, Mar 08, 2022 at 02:42:10PM +0800, Wen Yang wrote:

> Perhaps the following code could ensure that the pmu counter value is
> stable:
> 
> 
>     /*
>      * Careful: an NMI might modify the previous event value.
>      *
>      * Our tactic to handle this is to first atomically read and
>      * exchange a new raw count - then add that new-prev delta
>      * count to the generic event atomically:
>      */
> again:
>     prev_raw_count = local64_read(&hwc->prev_count);
>     rdpmcl(hwc->event_base_rdpmc, new_raw_count);
> 
>     if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
>                     new_raw_count) != prev_raw_count)
>         goto again;
> 
> 
> It might be better if we could reduce the calls to goto again.

This is completely unrelated. And that goto is rather unlikely, unless
you're doing some truely weird things.

That case happens when the PMI for a counter lands in the middle of a
read() for that counter. In that case both will try and fold the
hardware delta into the software counter. This conflict is fundamentally
unavoidable and needs to be dealt with. The above guarantees correctness
in this case.

It is however extremely unlikely and has *NOTHING* what so ever to do
with counter scheduling.


* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-08 12:50       ` Peter Zijlstra
@ 2022-03-10  3:50         ` Wen Yang
  2022-03-14 10:55           ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Wen Yang @ 2022-03-10  3:50 UTC (permalink / raw)
  To: Peter Zijlstra, Stephane Eranian
  Cc: Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, Stephane Eranian,
	mark rutland, jiri olsa, namhyung kim, borislav petkov, x86,
	h. peter anvin, linux-perf-users, linux-kernel



On 2022/3/8 8:50 PM, Peter Zijlstra wrote:
> On Sun, Mar 06, 2022 at 10:36:38PM +0800, Wen Yang wrote:
> 
>> The perf event generated by the above script A (cycles:D), and the counter
>> it used changes from #1 to #3. We use perf event in pinned mode, and then
>> continuously read its value for a long time, but its PMU counter changes
> 
> Yes, so what?
> 
>> , so the counter value will also jump.
> 
> I fail to see how the counter value will jump when we reprogram the
> thing. When we stop we update the value, then reprogram on another
> counter and continue. So where does it go sideways?
> 
>>
>> 0xffff88b72db85800:
>> The perf event generated by the above script A (instructions:D), which has
>> always occupied #fixed_instruction.
>>
>> 0xffff88bf46c34000, 0xffff88bf46c35000, 0xffff88bf46c30000:
>> Theses perf events are generated by the above script B.
>>
>>
>>>>
>>>> so it will cause unnecessary pmu_stop/start and also cause abnormal cpi.
>>>
>>> How?!?
>>>
>>
>> We may refer to the x86_pmu_enable function:
>> step1: save events moving to new counters
>> step2: reprogram moved events into new counters
>>
>> especially:
>>
>> static inline int match_prev_assignment(struct hw_perf_event *hwc,
>>                      struct cpu_hw_events *cpuc,
>>                      int i)
>> {
>>      return hwc->idx == cpuc->assign[i] &&
>>          hwc->last_cpu == smp_processor_id() &&
>>          hwc->last_tag == cpuc->tags[i];
>> }
> 
> I'm not seeing an explanation for how a counter value is not preserved.
> 
>>>> Cloud servers usually continuously monitor the cpi data of some important
>>>> services. This issue affects performance and misleads monitoring.
>>>>
>>>> The current event scheduling algorithm is more than 10 years old:
>>>> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
>>>
>>> irrelevant
>>>
>>
>> commit 1da53e023029 ("perf_events, x86: Improve x86 event scheduling")
>>
>> This commit is the basis of the perf event scheduling algorithm we currently
>> use.
> 
> Well yes. But how is the age of it relevant?
> 
>> The reason why the counter above changed from #1 to #3 can be found from it:
>> The algorithm takes into account the list of counter constraints
>> for each event. It assigns events to counters from the most
>> constrained, i.e., works on only one counter, to the least
>> constrained, i.e., works on any counter.
>>
>> the nmi watchdog permanently consumes one fp (*cycles*).
>>   therefore, when the above shell script obtains *cycles:D*
>> again, it has to use a GP, and its weight is 5.
>> but other events (like *cache-misses*) have a weight of 4,
>> so the counter used by *cycles:D* will often be taken away.
> 
> So what?
> 
> I mean, it is known the algorithm isn't optimal, but at least it's
> bounded. There are event sets that will fail to schedule but could, but
> I don't think you're talking about that.
> 
> Event migrating to a different counter is not a problem. This is
> expected and normal. Code *must* be able to deal with it.
> 
>> In addition, we also found that this problem may affect NMI watchdog in the
>> production cluster.
>> The NMI watchdog also uses a fixed counter in fixed mode. Usually, it is The
>> first element of the event_list array, so it usually takes precedence and
>> can get a fixed counter.
>> But if the administrator closes the watchdog first and then enables it, it
>> may be at the end of the event_list array, so its expected fixed counter may
>> be occupied by other perf event, and it can only use one GP. In this way,
>> there is a similar issue here: the PMU counter used by the NMI watchdog may
>> be disabled/enabled frequently and unnecessarily.
> 
> Again, I'm not seeing a problem. If you create more events than we have
> hardware counters we'll rotate the list and things will get scheduled in
> all sorts of order. This works.
> 
>> Any advice or guidance on this would be appreciated.
> 
> I'm still not sure what your actual problem is; I suspect you're using
> perf wrong.
> 
> Are you using rdpmc and not respecting the scheme described in
> include/uapi/linux/perf_event.h:perf_event_mmap_page ?
> 
> Note that if you're using pinned counters you can simplify that scheme
> by ignoring all the timekeeping nonsense. In that case it does become
> significantly simpler/faster.
> 
> But you cannot use rdpmc without using the mmap page's self-monitoring
> data.
> 


The current algorithm is outstanding and has been running stably for so 
many years, and all existing performance work benefits from it.

We're just trying to optimize a little bit.

As you pointed out, some non-compliant rdpmc usage can cause problems. But 
you also know that Linux is the foundation of cloud servers, and many 
third-party programs run on it (we do not have their source code); we can 
only observe that the monitoring data occasionally jitters abnormally (the 
probability of this issue is not high: a few dozen machines out of tens of 
thousands).

We also see that the existing perf code already tries to avoid frequent 
changes of the PMU counters, for example:


     /*
      * fastpath, try to reuse previous register
      */
     for (i = 0; i < n; i++) {
         u64 mask;

         hwc = &cpuc->event_list[i]->hw;
         c = cpuc->event_constraint[i];

         /* never assigned */
         if (hwc->idx == -1)
             break;

         /* constraint still honored */
         if (!test_bit(hwc->idx, c->idxmsk))
             break;

         mask = BIT_ULL(hwc->idx);
         if (is_counter_pair(hwc))
             mask |= mask << 1;

         /* not already used */
         if (used_mask & mask)
             break;

         used_mask |= mask;

         if (assign)
             assign[i] = hwc->idx;
     }


But that fastpath only helps when no new events are added; as soon as a new 
event appears (hwc->idx == -1), it falls through to the slow path.

    And another example (via the PERF_HES_ARCH flag):

         /*
          * step1: save events moving to new counters
          */
         for (i = 0; i < n_running; i++) {
...
             /*
              * Ensure we don't accidentally enable a stopped
              * counter simply because we rescheduled.
              */
             if (hwc->state & PERF_HES_STOPPED)
                 hwc->state |= PERF_HES_ARCH;

             x86_pmu_stop(event, PERF_EF_UPDATE);
         }

         /*
          * step2: reprogram moved events into new counters
          */
         for (i = 0; i < cpuc->n_events; i++) {
...

             if (hwc->state & PERF_HES_ARCH)
                 continue;

             x86_pmu_start(event, PERF_EF_RELOAD);
         }


Now we simply follow this idea.

When perf events are added, we also try to reuse the previous PMU counters 
first. If this attempt fails, we still fall back to the original algorithm 
and reassign all PMU counters, e.g.:

      /* slow path */
-    if (i != n)
-        unsched = _perf_assign_events(constraints, n,
-                wmin, wmax, gpmax, assign);
+    if (i != n) {
+        unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+                msk_counters, msk_event, assign);
+        if (unsched)
+            unsched = _perf_assign_events(constraints, n, wmin, wmax, gpmax,
+                    0, 0, assign);
+    }


We have applied the above optimization in some small-scale clusters and 
verified that the abnormal CPI data has indeed disappeared. It may also 
improve performance a bit.


Your guidance has been particularly helpful.

Best wishes,
Wen






^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-10  3:50         ` Wen Yang
@ 2022-03-14 10:55           ` Peter Zijlstra
  2022-03-17 17:54             ` Wen Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-03-14 10:55 UTC (permalink / raw)
  To: Wen Yang
  Cc: Stephane Eranian, Wen Yang, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Thomas Gleixner,
	mark rutland, jiri olsa, namhyung kim, borislav petkov, x86,
	h. peter anvin, linux-perf-users, linux-kernel

On Thu, Mar 10, 2022 at 11:50:33AM +0800, Wen Yang wrote:

> As you pointed out, some non-compliant rdpmc can cause problems. But you
> also know that linux is the foundation of cloud servers, and many
> third-party programs run on it (we don't have any code for it), and we can
> only observe that the monitoring data will jitter abnormally (the
> probability of this issue is not high, about dozens of tens of thousands of
> machines).

This might be a novel insight, but I *really* don't give a crap about
any of that. If they're not using it right, they get to keep the pieces.

I'd almost make it reschedule more to force them to fix their stuff.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-14 10:55           ` Peter Zijlstra
@ 2022-03-17 17:54             ` Wen Yang
  2022-04-17 15:06               ` Wen Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Wen Yang @ 2022-03-17 17:54 UTC (permalink / raw)
  To: Peter Zijlstra, Stephane Eranian
  Cc: Stephane Eranian, Wen Yang, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Thomas Gleixner,
	mark rutland, jiri olsa, namhyung kim, borislav petkov, x86,
	h. peter anvin, linux-perf-users, linux-kernel



On 2022/3/14 6:55 PM, Peter Zijlstra wrote:
> On Thu, Mar 10, 2022 at 11:50:33AM +0800, Wen Yang wrote:
> 
>> As you pointed out, some non-compliant rdpmc can cause problems. But you
>> also know that linux is the foundation of cloud servers, and many
>> third-party programs run on it (we don't have any code for it), and we can
>> only observe that the monitoring data will jitter abnormally (the
>> probability of this issue is not high, about dozens of tens of thousands of
>> machines).
> 
> This might be a novel insight, but I *really* don't give a crap about
> any of that. If they're not using it right, they get to keep the pieces.
> 
> I'd almost make it reschedule more to force them to fix their stuff.
> 


Thank you for your guidance.

We also found a case, among thousands of servers, where the PMU counter is 
no longer updated due to frequent x86_pmu_stop/x86_pmu_start.

We added logs in the kernel and found that a third-party program would 
cause the PMU counter to start/stop several times in just a few seconds, 
as follows:


[8993460.537776] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
attr.pinned=1, hw.idx=3, hw.prev_count=0x802a877ef302, 
hw.period_left=0x7fd578810cfe, event.count=0x14db802a877ecab4, 
event.prev_count=0x14db802a877ecab4
[8993460.915873] XXX x86_pmu_start line=1312 [cpu1] 
active_mask=200000008 event=ffff880a53411000, state=1, attr.type=0, 
attr.config=0x0, attr.pinned=1, hw.idx=3, 
hw.prev_count=0xffff802a9cf6a166, hw.period_left=0x7fd563095e9a, 
event.count=0x14db802a9cf67918, event.prev_count=0x14db802a9cf67918
[8993461.104643] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
attr.pinned=1, hw.idx=3, hw.prev_count=0xffff802a9cf6a166, 
hw.period_left=0x7fd563095e9a, event.count=0x14db802a9cf67918, 
event.prev_count=0x14db802a9cf67918
[8993461.442508] XXX x86_pmu_start line=1312 [cpu1] 
active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
attr.config=0x0, attr.pinned=1, hw.idx=2, 
hw.prev_count=0xffff802a9cf8492e, hw.period_left=0x7fd56307b6d2, 
event.count=0x14db802a9cf820e0, event.prev_count=0x14db802a9cf820e0
[8993461.736927] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
attr.pinned=1, hw.idx=2, hw.prev_count=0xffff802a9cf8492e, 
hw.period_left=0x7fd56307b6d2, event.count=0x14db802a9cf820e0, 
event.prev_count=0x14db802a9cf820e0
[8993461.983135] XXX x86_pmu_start line=1312 [cpu1] 
active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
attr.config=0x0, attr.pinned=1, hw.idx=2, 
hw.prev_count=0xffff802a9cfc29ed, hw.period_left=0x7fd56303d613, 
event.count=0x14db802a9cfc019f, event.prev_count=0x14db802a9cfc019f
[8993462.274599] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
attr.pinned=1, hw.idx=2, hw.prev_count=0x802a9d24040e, 
hw.period_left=0x7fd562dbfbf2, event.count=0x14db802a9d23dbc0, 
event.prev_count=0x14db802a9d23dbc0
[8993462.519488] XXX x86_pmu_start line=1312 [cpu1] 
active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
attr.config=0x0, attr.pinned=1, hw.idx=2, 
hw.prev_count=0xffff802ab0bb4719, hw.period_left=0x7fd54f44b8e7, 
event.count=0x14db802ab0bb1ecb, event.prev_count=0x14db802ab0bb1ecb
[8993462.726929] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000003 
event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
attr.pinned=1, hw.idx=2, hw.prev_count=0xffff802ab0bb4719, 
hw.period_left=0x7fd54f44b8e7, event.count=0x14db802ab0bb1ecb, 
event.prev_count=0x14db802ab0bb1ecb
[8993463.035674] XXX x86_pmu_start line=1312 [cpu1] 
active_mask=200000008 event=ffff880a53411000, state=1, attr.type=0, 
attr.config=0x0, attr.pinned=1, hw.idx=3, 
hw.prev_count=0xffff802ab0bcd328, hw.period_left=0x7fd54f432cd8, 
event.count=0x14db802ab0bcaada, event.prev_count=0x14db802ab0bcaada


Then, the PMU counter will not be updated:

[8993463.333622] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802abea31354
[8993463.359905] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802abea31354, hw.period_left=0x7fd5415cecac, 
event.count=0x14db802abea2eb06,
[8993463.504783] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802ad8760160
[8993463.521138] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802ad8760160, hw.period_left=0x7fd52789fea0, 
event.count=0x14db802ad875d912,
[8993463.638337] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802aecb4747b
[8993463.654441] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
event.count=0x14db802aecb44c2d,
[8993463.837321] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802aecb4747b
[8993463.861625] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
event.count=0x14db802aecb44c2d,
[8993464.012398] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802aecb4747b
[8993464.012402] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
event.count=0x14db802aecb44c2d,
[8993464.013676] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802aecb4747b
[8993464.013678] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
event.count=0x14db802aecb44c2d,
[8993464.016123] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802aecb4747b
[8993464.016125] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
event.count=0x14db802aecb44c2d,
[8993464.016196] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802aecb4747b
[8993464.016199] x86_perf_event_update [cpu1] active_mask=30000000f 
event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
event.count=0x14db802aecb44c2d,

......


It is not until 6 seconds later that the counter is stopped/started again:


[8993470.243959] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
attr.pinned=1, hw.idx=3, hw.prev_count=0x802aecb4747b, 
hw.period_left=0x7fd5134b8b85, event.count=0x14db802aecb44c2d, 
event.prev_count=0x14db802aecb44c2d
[8993470.243998] XXX x86_pmu_start line=1305 [cpu1] 
active_mask=200000000 event=ffff880a53411000, state=1, attr.type=0, 
attr.config=0x0, attr.pinned=1, hw.idx=3, 
hw.prev_count=0xffff802aecb4747b, hw.period_left=0x7fd5134b8b85, 
event.count=0x14db802aecb44c2d, event.prev_count=0x14db802aecb44c2d

[8993470.245285] x86_perf_event_update, event=ffff880a53411000, 
new_raw_count=802aece1e6f6

...

Such problems can be solved by avoiding unnecessary x86_pmu_{stop|start}.

Please have a look again. Thanks.

--
Best wishes,
Wen



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-03-17 17:54             ` Wen Yang
@ 2022-04-17 15:06               ` Wen Yang
  2022-04-19 14:16                 ` Wen Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Wen Yang @ 2022-04-17 15:06 UTC (permalink / raw)
  To: Peter Zijlstra, Stephane Eranian
  Cc: Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, h. peter anvin,
	linux-perf-users, linux-kernel



On 2022/3/18 1:54 AM, Wen Yang wrote:
> 
> 
> On 2022/3/14 6:55 PM, Peter Zijlstra wrote:
>> On Thu, Mar 10, 2022 at 11:50:33AM +0800, Wen Yang wrote:
>>
>>> As you pointed out, some non-compliant rdpmc can cause problems. But you
>>> also know that linux is the foundation of cloud servers, and many
>>> third-party programs run on it (we don't have any code for it), and 
>>> we can
>>> only observe that the monitoring data will jitter abnormally (the
>>> probability of this issue is not high, about dozens of tens of 
>>> thousands of
>>> machines).
>>
>> This might be a novel insight, but I *really* don't give a crap about
>> any of that. If they're not using it right, they get to keep the pieces.
>>
>> I'd almost make it reschedule more to force them to fix their stuff.
>>
> 
> 
> Thank you for your guidance.
> 
> We also found a case in thousands of servers where the PMU counter is no 
> longer updated due to frequent x86_pmu_stop/x86_pmu_start.
> 
> We added logs in the kernel and found that a third-party program would 
> cause the PMU counter to start/stop several times in just a few seconds, 
> as follows:
> 
> 
> [8993460.537776] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
> event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
> attr.pinned=1, hw.idx=3, hw.prev_count=0x802a877ef302, 
> hw.period_left=0x7fd578810cfe, event.count=0x14db802a877ecab4, 
> event.prev_count=0x14db802a877ecab4
> [8993460.915873] XXX x86_pmu_start line=1312 [cpu1] 
> active_mask=200000008 event=ffff880a53411000, state=1, attr.type=0, 
> attr.config=0x0, attr.pinned=1, hw.idx=3, 
> hw.prev_count=0xffff802a9cf6a166, hw.period_left=0x7fd563095e9a, 
> event.count=0x14db802a9cf67918, event.prev_count=0x14db802a9cf67918
> [8993461.104643] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
> event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
> attr.pinned=1, hw.idx=3, hw.prev_count=0xffff802a9cf6a166, 
> hw.period_left=0x7fd563095e9a, event.count=0x14db802a9cf67918, 
> event.prev_count=0x14db802a9cf67918
> [8993461.442508] XXX x86_pmu_start line=1312 [cpu1] 
> active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
> attr.config=0x0, attr.pinned=1, hw.idx=2, 
> hw.prev_count=0xffff802a9cf8492e, hw.period_left=0x7fd56307b6d2, 
> event.count=0x14db802a9cf820e0, event.prev_count=0x14db802a9cf820e0
> [8993461.736927] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
> event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
> attr.pinned=1, hw.idx=2, hw.prev_count=0xffff802a9cf8492e, 
> hw.period_left=0x7fd56307b6d2, event.count=0x14db802a9cf820e0, 
> event.prev_count=0x14db802a9cf820e0
> [8993461.983135] XXX x86_pmu_start line=1312 [cpu1] 
> active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
> attr.config=0x0, attr.pinned=1, hw.idx=2, 
> hw.prev_count=0xffff802a9cfc29ed, hw.period_left=0x7fd56303d613, 
> event.count=0x14db802a9cfc019f, event.prev_count=0x14db802a9cfc019f
> [8993462.274599] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
> event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
> attr.pinned=1, hw.idx=2, hw.prev_count=0x802a9d24040e, 
> hw.period_left=0x7fd562dbfbf2, event.count=0x14db802a9d23dbc0, 
> event.prev_count=0x14db802a9d23dbc0
> [8993462.519488] XXX x86_pmu_start line=1312 [cpu1] 
> active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
> attr.config=0x0, attr.pinned=1, hw.idx=2, 
> hw.prev_count=0xffff802ab0bb4719, hw.period_left=0x7fd54f44b8e7, 
> event.count=0x14db802ab0bb1ecb, event.prev_count=0x14db802ab0bb1ecb
> [8993462.726929] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000003 
> event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
> attr.pinned=1, hw.idx=2, hw.prev_count=0xffff802ab0bb4719, 
> hw.period_left=0x7fd54f44b8e7, event.count=0x14db802ab0bb1ecb, 
> event.prev_count=0x14db802ab0bb1ecb
> [8993463.035674] XXX x86_pmu_start line=1312 [cpu1] 
> active_mask=200000008 event=ffff880a53411000, state=1, attr.type=0, 
> attr.config=0x0, attr.pinned=1, hw.idx=3, 
> hw.prev_count=0xffff802ab0bcd328, hw.period_left=0x7fd54f432cd8, 
> event.count=0x14db802ab0bcaada, event.prev_count=0x14db802ab0bcaada
> 
> 
> Then, the PMU counter will not be updated:
> 
> [8993463.333622] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802abea31354
> [8993463.359905] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802abea31354, hw.period_left=0x7fd5415cecac, 
> event.count=0x14db802abea2eb06,
> [8993463.504783] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802ad8760160
> [8993463.521138] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802ad8760160, hw.period_left=0x7fd52789fea0, 
> event.count=0x14db802ad875d912,
> [8993463.638337] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802aecb4747b
> [8993463.654441] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
> event.count=0x14db802aecb44c2d,
> [8993463.837321] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802aecb4747b
> [8993463.861625] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
> event.count=0x14db802aecb44c2d,
> [8993464.012398] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802aecb4747b
> [8993464.012402] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
> event.count=0x14db802aecb44c2d,
> [8993464.013676] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802aecb4747b
> [8993464.013678] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
> event.count=0x14db802aecb44c2d,
> [8993464.016123] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802aecb4747b
> [8993464.016125] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
> event.count=0x14db802aecb44c2d,
> [8993464.016196] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802aecb4747b
> [8993464.016199] x86_perf_event_update [cpu1] active_mask=30000000f 
> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
> event.count=0x14db802aecb44c2d,
> 
> ......
> 
> 
> Until 6 seconds later, the counter is stopped/started again:
> 
> 
> [8993470.243959] XXX x86_pmu_stop line=1388 [cpu1] active_mask=100000001 
> event=ffff880a53411000, state=1, attr.type=0, attr.config=0x0, 
> attr.pinned=1, hw.idx=3, hw.prev_count=0x802aecb4747b, 
> hw.period_left=0x7fd5134b8b85, event.count=0x14db802aecb44c2d, 
> event.prev_count=0x14db802aecb44c2d
> [8993470.243998] XXX x86_pmu_start line=1305 [cpu1] 
> active_mask=200000000 event=ffff880a53411000, state=1, attr.type=0, 
> attr.config=0x0, attr.pinned=1, hw.idx=3, 
> hw.prev_count=0xffff802aecb4747b, hw.period_left=0x7fd5134b8b85, 
> event.count=0x14db802aecb44c2d, event.prev_count=0x14db802aecb44c2d
> 
> [8993470.245285] x86_perf_event_update, event=ffff880a53411000, 
> new_raw_count=802aece1e6f6
> 
> ...
> 
> Such problems can be solved by avoiding unnecessary x86_pmu_{stop|start}.
> 
> Please have a look again. Thanks.
> 

We recently tracked this issue again and found that it may be related to 
the behavior of general-purpose counter 3 (PMC3) on the Intel(R) Xeon(R) 
Platinum 8163 CPU:


[54511836.022997] CPU#1: ctrl:       000000070000000f
[54511836.022997] CPU#1: status:     0000000000000000
[54511836.022998] CPU#1: overflow:   0000000000000000
[54511836.022998] CPU#1: fixed:      00000000000000bb
[54511836.022998] CPU#1: pebs:       0000000000000000
[54511836.022999] CPU#1: debugctl:   0000000000000000
[54511836.022999] CPU#1: active:     000000030000000f
[54511836.023000] CPU#1:   gen-PMC0 ctrl:  000000000053412e
[54511836.023000] CPU#1:   gen-PMC0 count: 0000985b7d1a15e7
[54511836.023000] CPU#1:   gen-PMC0 left:  000067a483643939
[54511836.023001] CPU#1:   gen-PMC1 ctrl:  00000000005310d1
[54511836.023002] CPU#1:   gen-PMC1 count: 000080000016448e
[54511836.023002] CPU#1:   gen-PMC1 left:  00007ffffffffd37
[54511836.023003] CPU#1:   gen-PMC2 ctrl:  00000000005301d1
[54511836.023003] CPU#1:   gen-PMC2 count: 00008000e615b9ab
[54511836.023004] CPU#1:   gen-PMC2 left:  00007fffffffffff
[54511836.023005] CPU#1:   gen-PMC3 ctrl:  000000000053003c
[54511836.023005] CPU#1:   gen-PMC3 count: 0000801f6139b1e1
[54511836.023005] CPU#1:   gen-PMC3 left:  00007fe2a2dc14b7
[54511836.023006] CPU#1: fixed-PMC0 count: 00008e0fa307b34e
[54511836.023006] CPU#1: fixed-PMC1 count: 0000ffff3d01adb8
[54511836.023007] CPU#1: fixed-PMC2 count: 0000cf10d01b651e


The gen-PMC3 ctrl then changes suddenly:

[54511836.023085] CPU#1: ctrl:       000000070000000f
[54511836.023085] CPU#1: status:     0000000000000000
[54511836.023085] CPU#1: overflow:   0000000000000000
[54511836.023086] CPU#1: fixed:      00000000000000bb
[54511836.023086] CPU#1: pebs:       0000000000000000
[54511836.023086] CPU#1: debugctl:   0000000000000000
[54511836.023087] CPU#1: active:     000000030000000f
[54511836.023087] CPU#1:   gen-PMC0 ctrl:  000000000053412e
[54511836.023088] CPU#1:   gen-PMC0 count: 0000985b7d1a183b
[54511836.023088] CPU#1:   gen-PMC0 left:  000067a483643939
[54511836.023089] CPU#1:   gen-PMC1 ctrl:  00000000005310d1
[54511836.023089] CPU#1:   gen-PMC1 count: 0000800000164ca8
[54511836.023090] CPU#1:   gen-PMC1 left:  00007ffffffffd37
[54511836.023091] CPU#1:   gen-PMC2 ctrl:  00000000005301d1
[54511836.023091] CPU#1:   gen-PMC2 count: 00008000e61634fd
[54511836.023092] CPU#1:   gen-PMC2 left:  00007fffffffffff
[54511836.023092] CPU#1:   gen-PMC3 ctrl:  000000010043003c
[54511836.023093] CPU#1:   gen-PMC3 count: 0000801f613b87d0
[54511836.023093] CPU#1:   gen-PMC3 left:  00007fe2a2dc14b7
[54511836.023094] CPU#1: fixed-PMC0 count: 00008e0fa309e091
[54511836.023095] CPU#1: fixed-PMC1 count: 0000ffff3d050901
[54511836.023095] CPU#1: fixed-PMC2 count: 0000cf10d01b651e


The gen-PMC3 ctrl changed,
000000000053003c -> 000000010043003c

After that, the gen-PMC3 count stays at 0000801f613b87d0 and is no longer 
updated. This then causes a series of subsequent issues, such as abnormal 
CPI data.

However, the special value (000000010043003c) of the gen-PMC3 ctrl is 
not set by any application. We suspect that some special operation caused 
the PMC3 ctrl to be changed; this is still under discussion with Intel's 
FAE.
We are also looking at the following code:
https://github.com/torvalds/linux/blob/94710cac0ef4ee177a63b5227664b38c95bbf703/arch/x86/events/intel/core.c#L1931 


At present, only the above phenomenon has been observed, but the exact 
cause has not yet been found.

However, this patch attempts to avoid switching the PMU counters used by 
the various perf_events, so that the special behavior of a single PMU 
counter will not propagate to other events.

--
Best wishes,
Wen


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-04-17 15:06               ` Wen Yang
@ 2022-04-19 14:16                 ` Wen Yang
  2022-04-19 20:57                   ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Wen Yang @ 2022-04-19 14:16 UTC (permalink / raw)
  To: Peter Zijlstra, Stephane Eranian
  Cc: Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, h. peter anvin,
	linux-perf-users, linux-kernel



On 2022/4/17 11:06 PM, Wen Yang wrote:
> 
> 
> On 2022/3/18 1:54 AM, Wen Yang wrote:
>>
>>
>> On 2022/3/14 6:55 PM, Peter Zijlstra wrote:
>>> On Thu, Mar 10, 2022 at 11:50:33AM +0800, Wen Yang wrote:
>>>
>>>> As you pointed out, some non-compliant rdpmc can cause problems. But 
>>>> you
>>>> also know that linux is the foundation of cloud servers, and many
>>>> third-party programs run on it (we don't have any code for it), and 
>>>> we can
>>>> only observe that the monitoring data will jitter abnormally (the
>>>> probability of this issue is not high, about dozens of tens of 
>>>> thousands of
>>>> machines).
>>>
>>> This might be a novel insight, but I *really* don't give a crap about
>>> any of that. If they're not using it right, they get to keep the pieces.
>>>
>>> I'd almost make it reschedule more to force them to fix their stuff.
>>>
>>
>>
>> Thank you for your guidance.
>>
>> We also found a case in thousands of servers where the PMU counter is 
>> no longer updated due to frequent x86_pmu_stop/x86_pmu_start.
>>
>> We added logs in the kernel and found that a third-party program would 
>> cause the PMU counter to start/stop several times in just a few 
>> seconds, as follows:
>>
>>
>> [8993460.537776] XXX x86_pmu_stop line=1388 [cpu1] 
>> active_mask=100000001 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=3, 
>> hw.prev_count=0x802a877ef302, hw.period_left=0x7fd578810cfe, 
>> event.count=0x14db802a877ecab4, event.prev_count=0x14db802a877ecab4
>> [8993460.915873] XXX x86_pmu_start line=1312 [cpu1] 
>> active_mask=200000008 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=3, 
>> hw.prev_count=0xffff802a9cf6a166, hw.period_left=0x7fd563095e9a, 
>> event.count=0x14db802a9cf67918, event.prev_count=0x14db802a9cf67918
>> [8993461.104643] XXX x86_pmu_stop line=1388 [cpu1] 
>> active_mask=100000001 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=3, 
>> hw.prev_count=0xffff802a9cf6a166, hw.period_left=0x7fd563095e9a, 
>> event.count=0x14db802a9cf67918, event.prev_count=0x14db802a9cf67918
>> [8993461.442508] XXX x86_pmu_start line=1312 [cpu1] 
>> active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=2, 
>> hw.prev_count=0xffff802a9cf8492e, hw.period_left=0x7fd56307b6d2, 
>> event.count=0x14db802a9cf820e0, event.prev_count=0x14db802a9cf820e0
>> [8993461.736927] XXX x86_pmu_stop line=1388 [cpu1] 
>> active_mask=100000001 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=2, 
>> hw.prev_count=0xffff802a9cf8492e, hw.period_left=0x7fd56307b6d2, 
>> event.count=0x14db802a9cf820e0, event.prev_count=0x14db802a9cf820e0
>> [8993461.983135] XXX x86_pmu_start line=1312 [cpu1] 
>> active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=2, 
>> hw.prev_count=0xffff802a9cfc29ed, hw.period_left=0x7fd56303d613, 
>> event.count=0x14db802a9cfc019f, event.prev_count=0x14db802a9cfc019f
>> [8993462.274599] XXX x86_pmu_stop line=1388 [cpu1] 
>> active_mask=100000001 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=2, 
>> hw.prev_count=0x802a9d24040e, hw.period_left=0x7fd562dbfbf2, 
>> event.count=0x14db802a9d23dbc0, event.prev_count=0x14db802a9d23dbc0
>> [8993462.519488] XXX x86_pmu_start line=1312 [cpu1] 
>> active_mask=200000004 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=2, 
>> hw.prev_count=0xffff802ab0bb4719, hw.period_left=0x7fd54f44b8e7, 
>> event.count=0x14db802ab0bb1ecb, event.prev_count=0x14db802ab0bb1ecb
>> [8993462.726929] XXX x86_pmu_stop line=1388 [cpu1] 
>> active_mask=100000003 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=2, 
>> hw.prev_count=0xffff802ab0bb4719, hw.period_left=0x7fd54f44b8e7, 
>> event.count=0x14db802ab0bb1ecb, event.prev_count=0x14db802ab0bb1ecb
>> [8993463.035674] XXX x86_pmu_start line=1312 [cpu1] 
>> active_mask=200000008 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=3, 
>> hw.prev_count=0xffff802ab0bcd328, hw.period_left=0x7fd54f432cd8, 
>> event.count=0x14db802ab0bcaada, event.prev_count=0x14db802ab0bcaada
>>
>>
>> Then, the PMU counter will not be updated:
>>
>> [8993463.333622] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802abea31354
>> [8993463.359905] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802abea31354, hw.period_left=0x7fd5415cecac, 
>> event.count=0x14db802abea2eb06,
>> [8993463.504783] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802ad8760160
>> [8993463.521138] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802ad8760160, hw.period_left=0x7fd52789fea0, 
>> event.count=0x14db802ad875d912,
>> [8993463.638337] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802aecb4747b
>> [8993463.654441] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d,
>> [8993463.837321] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802aecb4747b
>> [8993463.861625] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d,
>> [8993464.012398] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802aecb4747b
>> [8993464.012402] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d,
>> [8993464.013676] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802aecb4747b
>> [8993464.013678] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d,
>> [8993464.016123] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802aecb4747b
>> [8993464.016125] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d,
>> [8993464.016196] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802aecb4747b
>> [8993464.016199] x86_perf_event_update [cpu1] active_mask=30000000f 
>> event=ffff880a53411000, state=1, attr.config=0x0, attr.pinned=1, 
>> hw.idx=3, hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d,
>>
>> ......
>>
>>
>> Until 6 seconds later, the counter is stopped/started again:
>>
>>
>> [8993470.243959] XXX x86_pmu_stop line=1388 [cpu1] 
>> active_mask=100000001 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=3, 
>> hw.prev_count=0x802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d, event.prev_count=0x14db802aecb44c2d
>> [8993470.243998] XXX x86_pmu_start line=1305 [cpu1] 
>> active_mask=200000000 event=ffff880a53411000, state=1, attr.type=0, 
>> attr.config=0x0, attr.pinned=1, hw.idx=3, 
>> hw.prev_count=0xffff802aecb4747b, hw.period_left=0x7fd5134b8b85, 
>> event.count=0x14db802aecb44c2d, event.prev_count=0x14db802aecb44c2d
>>
>> [8993470.245285] x86_perf_event_update, event=ffff880a53411000, 
>> new_raw_count=802aece1e6f6
>>
>> ...
>>
>> Such problems can be solved by avoiding unnecessary x86_pmu_{stop|start}.
>>
>> Please have a look again. Thanks.
>>
> 
> We recently tracked this issue again found that it may be related to the 
> behavior of the third GP of  the Intel(R) Xeon(R) Platinum 8163 CPU:
> 
> 
> [54511836.022997] CPU#1: ctrl:       000000070000000f
> [54511836.022997] CPU#1: status:     0000000000000000
> [54511836.022998] CPU#1: overflow:   0000000000000000
> [54511836.022998] CPU#1: fixed:      00000000000000bb
> [54511836.022998] CPU#1: pebs:       0000000000000000
> [54511836.022999] CPU#1: debugctl:   0000000000000000
> [54511836.022999] CPU#1: active:     000000030000000f
> [54511836.023000] CPU#1:   gen-PMC0 ctrl:  000000000053412e
> [54511836.023000] CPU#1:   gen-PMC0 count: 0000985b7d1a15e7
> [54511836.023000] CPU#1:   gen-PMC0 left:  000067a483643939
> [54511836.023001] CPU#1:   gen-PMC1 ctrl:  00000000005310d1
> [54511836.023002] CPU#1:   gen-PMC1 count: 000080000016448e
> [54511836.023002] CPU#1:   gen-PMC1 left:  00007ffffffffd37
> [54511836.023003] CPU#1:   gen-PMC2 ctrl:  00000000005301d1
> [54511836.023003] CPU#1:   gen-PMC2 count: 00008000e615b9ab
> [54511836.023004] CPU#1:   gen-PMC2 left:  00007fffffffffff
> [54511836.023005] CPU#1:   gen-PMC3 ctrl:  000000000053003c
> [54511836.023005] CPU#1:   gen-PMC3 count: 0000801f6139b1e1
> [54511836.023005] CPU#1:   gen-PMC3 left:  00007fe2a2dc14b7
> [54511836.023006] CPU#1: fixed-PMC0 count: 00008e0fa307b34e
> [54511836.023006] CPU#1: fixed-PMC1 count: 0000ffff3d01adb8
> [54511836.023007] CPU#1: fixed-PMC2 count: 0000cf10d01b651e
> 
> 
> The Gen-pmc3 Ctrl will be changed suddenly:
> 
> [54511836.023085] CPU#1: ctrl:       000000070000000f
> [54511836.023085] CPU#1: status:     0000000000000000
> [54511836.023085] CPU#1: overflow:   0000000000000000
> [54511836.023086] CPU#1: fixed:      00000000000000bb
> [54511836.023086] CPU#1: pebs:       0000000000000000
> [54511836.023086] CPU#1: debugctl:   0000000000000000
> [54511836.023087] CPU#1: active:     000000030000000f
> [54511836.023087] CPU#1:   gen-PMC0 ctrl:  000000000053412e
> [54511836.023088] CPU#1:   gen-PMC0 count: 0000985b7d1a183b
> [54511836.023088] CPU#1:   gen-PMC0 left:  000067a483643939
> [54511836.023089] CPU#1:   gen-PMC1 ctrl:  00000000005310d1
> [54511836.023089] CPU#1:   gen-PMC1 count: 0000800000164ca8
> [54511836.023090] CPU#1:   gen-PMC1 left:  00007ffffffffd37
> [54511836.023091] CPU#1:   gen-PMC2 ctrl:  00000000005301d1
> [54511836.023091] CPU#1:   gen-PMC2 count: 00008000e61634fd
> [54511836.023092] CPU#1:   gen-PMC2 left:  00007fffffffffff
> [54511836.023092] CPU#1:   gen-PMC3 ctrl:  000000010043003c
> [54511836.023093] CPU#1:   gen-PMC3 count: 0000801f613b87d0
> [54511836.023093] CPU#1:   gen-PMC3 left:  00007fe2a2dc14b7
> [54511836.023094] CPU#1: fixed-PMC0 count: 00008e0fa309e091
> [54511836.023095] CPU#1: fixed-PMC1 count: 0000ffff3d050901
> [54511836.023095] CPU#1: fixed-PMC2 count: 0000cf10d01b651e
> 
> 
> The gen-PMC3 ctrl changed,
> 000000000053003c -> 000000010043003c
> 
> After that, the gen-PMC3 count remains 0000801f613b87d0 and will not be 
> updated. A series of subsequent issues, such as abnormal CPI data, are 
> generated.
> 
> However, the special value (000000010043003c) of the gen-pmc3 Ctrl is 
> not actively set by the application. It is suspected that some special 
> operation has caused the GP3 Ctrl to be changed, and it is still under 
> discussion with Intel’s FAE.
> > At present, only the above phenomenon has been observed, but the exact
> cause has not yet been found.


We finally found that TFA (TSX Force Abort) may affect PMC3's behavior; 
see the following commit:

400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort")

When the MSR gets set; the microcode will no longer use PMC3 but will
Force Abort every TSX transaction (upon executing COMMIT).

When TSX Force Abort (TFA) is allowed (default); the MSR gets set when
PMC3 gets scheduled and cleared when, after scheduling, PMC3 is
unused.

When TFA is not allowed; clear PMC3 from all constraints such that it
will not get used.
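
For context, the heart of that commit is a constraint hook along the lines
of the sketch below. This is recalled from the upstream fix, so treat the
names and details as approximate and see arch/x86/events/intel/core.c for
the authoritative version; the idea is that when TFA is not allowed, PMC3
is removed from every event's constraint mask so the scheduler never hands
it out:

    static struct event_constraint *
    tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                              struct perf_event *event)
    {
        struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event);

        /* Without TFA we must not use PMC3. */
        if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) {
            /* work on a dynamic per-CPU copy, keeping the static table intact */
            c = dyn_constraint(cpuc, c, idx);
            __clear_bit(3, c->idxmsk);
            c->weight--;
        }

        return c;
    }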


> 
> However, this patch attempts to avoid the switching of the pmu counters 
> in various perf_events, so the special behavior of a single pmu counter 
> will not be propagated to other events.
> 

Since PMC3 may have special behaviors, the continuous switching of PMU 
counters may not only affect performance but also lead to abnormal data. 
Please consider this patch again.

Thanks.

> -- 
> Best wishes,
> Wen
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-04-19 14:16                 ` Wen Yang
@ 2022-04-19 20:57                   ` Peter Zijlstra
  2022-04-19 21:18                     ` Stephane Eranian
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-04-19 20:57 UTC (permalink / raw)
  To: Wen Yang
  Cc: Stephane Eranian, Wen Yang, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Thomas Gleixner,
	mark rutland, jiri olsa, namhyung kim, borislav petkov, x86,
	h. peter anvin, linux-perf-users, linux-kernel

On Tue, Apr 19, 2022 at 10:16:12PM +0800, Wen Yang wrote:
> We finally found that TFA (TSX Force Abort) may affect PMC3's behavior,
> refer to the following patch:
> 
> 400816f60c54 perf/x86/intel:  ("Implement support for TSX Force Abort")
> 
> When the MSR gets set; the microcode will no longer use PMC3 but will
> Force Abort every TSX transaction (upon executing COMMIT).
> 
> When TSX Force Abort (TFA) is allowed (default); the MSR gets set when
> PMC3 gets scheduled and cleared when, after scheduling, PMC3 is
> unused.
> 
> When TFA is not allowed; clear PMC3 from all constraints such that it
> will not get used.
> 
> 
> > 
> > However, this patch attempts to avoid the switching of the pmu counters
> > in various perf_events, so the special behavior of a single pmu counter
> > will not be propagated to other events.
> > 
> 
> Since PMC3 may have special behaviors, the continuous switching of PMU
> counters may not only affects the performance, but also may lead to abnormal
> data, please consider this patch again.

I'm not following. How do you get abnormal data?

Are you using RDPMC from userspace? If so, are you following the
prescribed logic using the self-monitoring interface?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-04-19 20:57                   ` Peter Zijlstra
@ 2022-04-19 21:18                     ` Stephane Eranian
  2022-04-20 14:44                       ` Wen Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Stephane Eranian @ 2022-04-19 21:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Wen Yang, Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, h. peter anvin,
	linux-perf-users, linux-kernel

Hi,

Going back to the original description of this patch 2/2, it seems the
problem was that you expected PINNED events to always remain in
the same counters. This is NOT what the interface guarantees. A pinned
event is guaranteed to either be on a counter or in error state if active.
But while active the event can change counters because of event scheduling
and this is fine. The kernel only computes deltas of the raw counter. If
you are using the read() syscall to extract a value, then this is totally
transparent and you will see no jumps. If you are instead using RDPMC,
then you cannot
assume the counter index of a pinned event remains the same. If you do, then
yes, you will see discrepancies in the count returned by RDPMC.  You cannot
just use RDPMC to read a counter from user space. You need kernel help.
The info you need is in the page you must mmap on the fd of the event. It
shows the current counter index of the event along with sequence number and
timing to help scale the count if necessary. The proper RDPMC loop is
documented in include/uapi/linux/perf_event.h inside the
perf_event_mmap_page definition.
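
As a minimal sketch of the read() path (illustrative only; the event choice
and error handling are assumptions, not part of this thread): open a pinned
event with perf_event_open() and read the count through the fd, which stays
correct even while the kernel moves the event between hardware counters.

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
        struct perf_event_attr attr;
        uint64_t count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.pinned = 1;        /* pinned, as in the events discussed here */

        /* count cycles of the current task on any CPU */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0)
            return 1;

        /* ... run the workload to be measured ... */

        if (read(fd, &count, sizeof(count)) == sizeof(count))
            printf("cycles: %llu\n", (unsigned long long)count);

        close(fd);
        return 0;
    }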

As for TFA, it is not clear to me why this is a problem unless you
have the RDPMC problem
I described above.


On Tue, Apr 19, 2022 at 1:57 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Apr 19, 2022 at 10:16:12PM +0800, Wen Yang wrote:
> > We finally found that TFA (TSX Force Abort) may affect PMC3's behavior,
> > refer to the following patch:
> >
> > 400816f60c54 perf/x86/intel:  ("Implement support for TSX Force Abort")
> >
> > When the MSR gets set; the microcode will no longer use PMC3 but will
> > Force Abort every TSX transaction (upon executing COMMIT).
> >
> > When TSX Force Abort (TFA) is allowed (default); the MSR gets set when
> > PMC3 gets scheduled and cleared when, after scheduling, PMC3 is
> > unused.
> >
> > When TFA is not allowed; clear PMC3 from all constraints such that it
> > will not get used.
> >
> >
> > >
> > > However, this patch attempts to avoid the switching of the pmu counters
> > > in various perf_events, so the special behavior of a single pmu counter
> > > will not be propagated to other events.
> > >
> >
> > Since PMC3 may have special behaviors, the continuous switching of PMU
> > counters may not only affects the performance, but also may lead to abnormal
> > data, please consider this patch again.
>
> I'm not following. How do you get abnormal data?
>
> Are you using RDPMC from userspace? If so, are you following the
> prescribed logic using the self-monitoring interface?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start
  2022-04-19 21:18                     ` Stephane Eranian
@ 2022-04-20 14:44                       ` Wen Yang
  0 siblings, 0 replies; 16+ messages in thread
From: Wen Yang @ 2022-04-20 14:44 UTC (permalink / raw)
  To: Stephane Eranian, Peter Zijlstra
  Cc: Wen Yang, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Thomas Gleixner, mark rutland, jiri olsa,
	namhyung kim, borislav petkov, x86, h. peter anvin,
	linux-perf-users, linux-kernel



On 2022/4/20 5:18 AM, Stephane Eranian wrote:
> Hi,
> 
> Going back to the original description of this patch 2/2, it seems the
> problem was that you expected PINNED events to always remain in
> the same counters. This is NOT what the interface guarantees. A pinned
> event is guaranteed to either be on a counter or in error state if active.
> But while active the event can change counters because of event scheduling
> and this is fine. The kernel only computes deltas of the raw counter. If you
> are using the read() syscall to extract a value, then this is totally
> transparent
> and you will see no jumps. If you are instead using RDPMC, then you cannot
> assume the counter index of a pinned event remains the same. If you do, then
> yes, you will see discrepancies in the count returned by RDPMC.  You cannot
> just use RDPMC to read a counter from user space. You need kernel help.
> The info you need is in the page you must mmap on the fd of the event. It
> shows the current counter index of the event along with sequence number and
> timing to help scale the count if necessary. This proper loop for
> RDPMC is documented
> in include/uapi/linux/perf_event.h inside the perf_event_mmap_page definition.
> 
> As for TFA, it is not clear to me why this is a problem unless you
> have the RDPMC problem
> I described above.
> 

Thank you for your comments.

Our scenario is: all four general-purpose counters are used up, and the 
abnormal PMC3 counter is observed on several machines. In addition, the 
kernel version is 4.9/4.19.

After we encountered the problem of abnormal CPI data a few months ago, 
we checked all kinds of applications according to your suggestions here 
and finally determined that they all comply with the specifications in 
include/uapi/linux/perf_event.h.

After long experimentation, we found that this problem was caused by TFA:

When Restricted Transactional Memory (RTM) is supported 
(CPUID.07H.EBX.RTM [bit 11] = 1) and CPUID.07H.EDX[bit 13]=1 and 
TSX_FORCE_ABORT[RTM_FORCE_ABORT]=0 (described later in this document), 
then Performance Monitor Unit (PMU) general purpose counter 3 
(IA32_PMC3, MSR C4H and IA32_A_PMC3, MSR 4C4H) may contain unexpected 
values. Specifically, IA32_PMC3 (MSR C4H), IA32_PERF_GLOBAL_CTRL[3] (MSR 
38FH) and IA32_PERFEVTSEL3 (MSR 189H) may contain unexpected values, 
which also affects IA32_A_PMC3 (MSR 4C4H) and IA32_PERF_GLOBAL_INUSE[3] 
(MSR 392H).
--> from 
https://www.intel.com/content/dam/support/us/en/documents/processors/Performance-Monitoring-Impact-of-TSX-Memory-Ordering-Issue-604224.pdf

We also submitted an IPS to Intel:
https://premiersupport.intel.com/IPS/5003b00001fqdhaAAA

For the latest kernel, this issue could be handled by the following commit:
400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort")

However, many production environments run 4.9, 4.19, or even 3.10 
kernels, which do not contain the above commit, and it is difficult to 
backport it as a hotfix, so these kernels will be affected by this 
problem.

This patch 2/2 attempts to avoid switching the PMU counters used by the 
various perf_events, so that the special behavior of a single PMU counter 
(e.g., PMC3 here) will not propagate to other events. We also made a 
hotfix based on it and verified it on some machines.

Please have another look.
Thanks

--
Best wishes,
Wen


> 
> On Tue, Apr 19, 2022 at 1:57 PM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> On Tue, Apr 19, 2022 at 10:16:12PM +0800, Wen Yang wrote:
>>> We finally found that TFA (TSX Force Abort) may affect PMC3's behavior,
>>> refer to the following patch:
>>>
>>> 400816f60c54 perf/x86/intel:  ("Implement support for TSX Force Abort")
>>>
>>> When the MSR gets set; the microcode will no longer use PMC3 but will
>>> Force Abort every TSX transaction (upon executing COMMIT).
>>>
>>> When TSX Force Abort (TFA) is allowed (default); the MSR gets set when
>>> PMC3 gets scheduled and cleared when, after scheduling, PMC3 is
>>> unused.
>>>
>>> When TFA is not allowed; clear PMC3 from all constraints such that it
>>> will not get used.
>>>
>>>
>>>>
>>>> However, this patch attempts to avoid the switching of the pmu counters
>>>> in various perf_events, so the special behavior of a single pmu counter
>>>> will not be propagated to other events.
>>>>
>>>
>>> Since PMC3 may have special behaviors, the continuous switching of PMU
>>> counters may not only affects the performance, but also may lead to abnormal
>>> data, please consider this patch again.
>>
>> I'm not following. How do you get abnormal data?
>>
>> Are you using RDPMC from userspace? If so, are you following the
>> prescribed logic using the self-monitoring interface?

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-04-20 14:44 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-04 11:03 [RESEND PATCH 1/2] perf/x86: extract code to assign perf events for both core and uncore Wen Yang
2022-03-04 11:03 ` [RESEND PATCH 2/2] perf/x86: improve the event scheduling to avoid unnecessary pmu_stop/start Wen Yang
2022-03-04 15:39   ` Peter Zijlstra
2022-03-06 14:36     ` Wen Yang
2022-03-07 17:14       ` Stephane Eranian
2022-03-08  6:42         ` Wen Yang
2022-03-08 12:53           ` Peter Zijlstra
2022-03-08 12:50       ` Peter Zijlstra
2022-03-10  3:50         ` Wen Yang
2022-03-14 10:55           ` Peter Zijlstra
2022-03-17 17:54             ` Wen Yang
2022-04-17 15:06               ` Wen Yang
2022-04-19 14:16                 ` Wen Yang
2022-04-19 20:57                   ` Peter Zijlstra
2022-04-19 21:18                     ` Stephane Eranian
2022-04-20 14:44                       ` Wen Yang
