* [PATCH V2 1/3] perf/x86/intel: Factor out common code of PMI handler
@ 2018-08-08  7:12 kan.liang
  2018-08-08  7:12 ` [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 " kan.liang
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: kan.liang @ 2018-08-08  7:12 UTC (permalink / raw)
  To: peterz, tglx, mingo, acme, linux-kernel
  Cc: eranian, ak, alexander.shishkin, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

The Arch Perfmon v4 PMI handler is substantially different from
the older PMI handler. Instead of adding more and more ifs, cleanly
fork the new handler into a new function, with the main common
code factored out into a common function.

Fix a complaint from checkpatch.pl by removing "false" from "static
bool warned".

No functional change.

Based-on-code-from: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

No changes since V1

 arch/x86/events/intel/core.c | 109 ++++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 49 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e2d36491..a7d7759 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2200,59 +2200,15 @@ static void intel_pmu_reset(void)
 	local_irq_restore(flags);
 }
 
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
+static int handle_pmi_common(struct pt_regs *regs, u64 status)
 {
 	struct perf_sample_data data;
-	struct cpu_hw_events *cpuc;
-	int bit, loops;
-	u64 status;
-	int handled;
-	int pmu_enabled;
-
-	cpuc = this_cpu_ptr(&cpu_hw_events);
-
-	/*
-	 * Save the PMU state.
-	 * It needs to be restored when leaving the handler.
-	 */
-	pmu_enabled = cpuc->enabled;
-	/*
-	 * No known reason to not always do late ACK,
-	 * but just in case do it opt-in.
-	 */
-	if (!x86_pmu.late_ack)
-		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	intel_bts_disable_local();
-	cpuc->enabled = 0;
-	__intel_pmu_disable_all();
-	handled = intel_pmu_drain_bts_buffer();
-	handled += intel_bts_interrupt();
-	status = intel_pmu_get_status();
-	if (!status)
-		goto done;
-
-	loops = 0;
-again:
-	intel_pmu_lbr_read();
-	intel_pmu_ack_status(status);
-	if (++loops > 100) {
-		static bool warned = false;
-		if (!warned) {
-			WARN(1, "perfevents: irq loop stuck!\n");
-			perf_event_print_debug();
-			warned = true;
-		}
-		intel_pmu_reset();
-		goto done;
-	}
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int bit;
+	int handled = 0;
 
 	inc_irq_stat(apic_perf_irqs);
 
-
 	/*
 	 * Ignore a range of extra bits in status that do not indicate
 	 * overflow by themselves.
@@ -2261,7 +2217,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 		    GLOBAL_STATUS_ASIF |
 		    GLOBAL_STATUS_LBRS_FROZEN);
 	if (!status)
-		goto done;
+		return 0;
 	/*
 	 * In case multiple PEBS events are sampled at the same time,
 	 * it is possible to have GLOBAL_STATUS bit 62 set indicating
@@ -2331,6 +2287,61 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 			x86_pmu_stop(event, 0);
 	}
 
+	return handled;
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc;
+	int loops;
+	u64 status;
+	int handled;
+	int pmu_enabled;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	/*
+	 * Save the PMU state.
+	 * It needs to be restored when leaving the handler.
+	 */
+	pmu_enabled = cpuc->enabled;
+	/*
+	 * No known reason to not always do late ACK,
+	 * but just in case do it opt-in.
+	 */
+	if (!x86_pmu.late_ack)
+		apic_write(APIC_LVTPC, APIC_DM_NMI);
+	intel_bts_disable_local();
+	cpuc->enabled = 0;
+	__intel_pmu_disable_all();
+	handled = intel_pmu_drain_bts_buffer();
+	handled += intel_bts_interrupt();
+	status = intel_pmu_get_status();
+	if (!status)
+		goto done;
+
+	loops = 0;
+again:
+	intel_pmu_lbr_read();
+	intel_pmu_ack_status(status);
+	if (++loops > 100) {
+		static bool warned;
+
+		if (!warned) {
+			WARN(1, "perfevents: irq loop stuck!\n");
+			perf_event_print_debug();
+			warned = true;
+		}
+		intel_pmu_reset();
+		goto done;
+	}
+
+	handled += handle_pmi_common(regs, status);
+
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-- 
2.7.4



* [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 PMI handler
  2018-08-08  7:12 [PATCH V2 1/3] perf/x86/intel: Factor out common code of PMI handler kan.liang
@ 2018-08-08  7:12 ` kan.liang
  2018-09-12 13:33   ` Liang, Kan
                     ` (2 more replies)
  2018-08-08  7:12 ` [PATCH V2 3/3] perf/x86/intel: Add quirk for Goldmont Plus kan.liang
  2018-10-02 10:08 ` [tip:perf/core] perf/x86/intel: Factor out common code of PMI handler tip-bot for Kan Liang
  2 siblings, 3 replies; 10+ messages in thread
From: kan.liang @ 2018-08-08  7:12 UTC (permalink / raw)
  To: peterz, tglx, mingo, acme, linux-kernel
  Cc: eranian, ak, alexander.shishkin, Kan Liang

From: Andi Kleen <ak@linux.intel.com>

Implements counter freezing for Arch Perfmon v4 (Skylake and
newer). This allows the PMI handler to be sped up by avoiding
unnecessary MSR writes, and makes it more accurate.
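
For background, a minimal sketch of the mechanism this relies on (an
illustration, not part of the patch): counter freezing is armed by
setting IA32_DEBUGCTL bit 12 (Freeze_PerfMon_On_PMI); the hardware then
freezes the counters when a PMI is raised and lets them run again once
the handler acks IA32_PERF_GLOBAL_STATUS.

/*
 * Illustrative sketch only -- the patch below does this through
 * update_debugctlmsr() in enable_counter_freeze(): arm counter
 * freezing on the local CPU by setting
 * IA32_DEBUGCTL.Freeze_PerfMon_On_PMI (bit 12).
 */
#include <asm/msr.h>

#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI	(1UL << 12)

static void arm_counter_freezing(void)
{
	u64 debugctl;

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	debugctl |= DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI;
	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}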

The Arch Perfmon v4 PMI handler is substantially different from
the older PMI handler.

Differences to the old handler:
- It relies on counter freezing, which eliminates several MSR
writes from the PMI handler and lowers the overhead significantly.

It makes the PMI handler more accurate, as all counters get
frozen atomically as soon as any counter overflows, so there is
much less counting of the PMI handler itself.

With the freezing we don't need to disable or enable counters or
PEBS. Only BTS, which does not support auto-freezing, still needs to
be explicitly managed.

- The PMU acking is done at the end, not the beginning.
This makes it possible to avoid manual enabling/disabling
of the PMU; instead we just rely on the freezing/acking.

- The APIC is acked before reenabling the PMU, which avoids
problems with LBRs occasionally not getting unfrozen on Skylake.

- Looping is only needed to work around a corner case in which several
PMIs arrive very close to each other. In the common case, the counters
are frozen during the PMI handler, so no re-check is needed.

This patch:
- Adds code to enable v4 counter freezing.
- Forks the <=v3 and >=v4 PMI handlers into separate functions.
- Adds a kernel parameter to disable counter freezing. It took some time to
  debug counter freezing, so in case there are new problems we added an
  option to turn it off. We would not expect this to be used unless there
  are new bugs.
- Covers big core only. The patch for small core will be posted
  separately later.

Performance:

When profiling a kernel build on Kabylake with different perf options,
measuring the length of all NMI handlers using the nmi handler
trace point:

V3 is without counter freezing.
V4 is with counter freezing.
The value is the average cost of the PMI handler.
(lower is better)

perf options                V3(ns) V4(ns)  delta
-c 100000                   1088   894     -18%
-g -c 100000                1862   1646    -12%
--call-graph lbr -c 100000  3649   3367    -8%
--c.g. dwarf -c 100000      2248   1982    -12%

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V1:
 - Move enable_counter_freeze() to intel_pmu_cpu_starting().
 - Remove frozen_enabled. The state of the counter-freeze feature doesn't
   change after initialization.
 - Use __setup() instead of module_param.
 - Don't print "counter freezing" to the log.
 - Use bit fields instead of bool for all PMI handler knobs.
 - Update comments and documentation.

 Documentation/admin-guide/kernel-parameters.txt |   5 ++
 arch/x86/events/intel/core.c                    | 112 ++++++++++++++++++++++++
 arch/x86/events/perf_event.h                    |   4 +-
 arch/x86/include/asm/msr-index.h                |   1 +
 4 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 533ff5c..cb2a6f68 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -828,6 +828,11 @@
 			causing system reset or hang due to sending
 			INIT from AP to BSP.
 
+	disable_counter_freezing [HW]
+			Disable Intel PMU counter freezing feature.
+			The feature only exists starting from
+			Arch Perfmon v4 (Skylake and newer).
+
 	disable_ddw	[PPC/PSERIES]
 			Disable Dynamic DMA Window support. Use this if
 			to workaround buggy firmware.
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a7d7759..fdd2f99 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1995,6 +1995,18 @@ static void intel_pmu_nhm_enable_all(int added)
 	intel_pmu_enable_all(added);
 }
 
+static void enable_counter_freeze(void)
+{
+	update_debugctlmsr(get_debugctlmsr() |
+			DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
+static void disable_counter_freeze(void)
+{
+	update_debugctlmsr(get_debugctlmsr() &
+			~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
 static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
@@ -2290,6 +2302,91 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 	return handled;
 }
 
+static bool disable_counter_freezing;
+static int __init intel_perf_counter_freezing_setup(char *s)
+{
+	disable_counter_freezing = true;
+	pr_info("Intel PMU Counter freezing feature disabled\n");
+	return 1;
+}
+__setup("disable_counter_freezing", intel_perf_counter_freezing_setup);
+
+/*
+ * Simplified handler for Arch Perfmon v4:
+ * - We rely on counter freezing/unfreezing to enable/disable the PMU.
+ * This is done automatically on PMU ack.
+ * - Ack the PMU only after the APIC.
+ */
+
+static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int handled = 0;
+	bool bts = false;
+	u64 status;
+	int pmu_enabled = cpuc->enabled;
+	int loops = 0;
+
+	/* PMU has been disabled because of counter freezing */
+	cpuc->enabled = 0;
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+		bts = true;
+		intel_bts_disable_local();
+		handled = intel_pmu_drain_bts_buffer();
+		handled += intel_bts_interrupt();
+	}
+	status = intel_pmu_get_status();
+	if (!status)
+		goto done;
+again:
+	intel_pmu_lbr_read();
+	if (++loops > 100) {
+		static bool warned;
+
+		if (!warned) {
+			WARN(1, "perfevents: irq loop stuck!\n");
+			perf_event_print_debug();
+			warned = true;
+		}
+		intel_pmu_reset();
+		goto done;
+	}
+
+
+	handled += handle_pmi_common(regs, status);
+done:
+	/* Ack the PMI in the APIC */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+	/*
+	 * The counters start counting immediately while ack the status.
+	 * Make it as close as possible to IRET. This avoids bogus
+	 * freezing on Skylake CPUs.
+	 */
+	if (status) {
+		intel_pmu_ack_status(status);
+	} else {
+		/*
+		 * CPU may issues two PMIs very close to each other.
+		 * When the PMI handler services the first one, the
+		 * GLOBAL_STATUS is already updated to reflect both.
+		 * When it IRETs, the second PMI is immediately
+		 * handled and it sees clear status. At the meantime,
+		 * there may be a third PMI, because the freezing bit
+		 * isn't set since the ack in first PMI handlers.
+		 * Double check if there is more work to be done.
+		 */
+		status = intel_pmu_get_status();
+		if (status)
+			goto again;
+	}
+
+	if (bts)
+		intel_bts_enable_local();
+	cpuc->enabled = pmu_enabled;
+	return handled;
+}
+
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -3361,6 +3458,9 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (x86_pmu.version > 1)
 		flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
 
+	if (x86_pmu.counter_freezing)
+		enable_counter_freeze();
+
 	if (!cpuc->shared_regs)
 		return;
 
@@ -3432,6 +3532,9 @@ static void intel_pmu_cpu_dying(int cpu)
 	free_excl_cntrs(cpu);
 
 	fini_debug_store_on_cpu(cpu);
+
+	if (x86_pmu.counter_freezing)
+		disable_counter_freeze();
 }
 
 static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -4325,6 +4428,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_skl_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
 		x86_pmu.pebs_prec_dist = true;
+		x86_pmu.counter_freezing = disable_counter_freezing ?
+					   false : true;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -4442,6 +4547,13 @@ __init int intel_pmu_init(void)
 		pr_cont("full-width counters, ");
 	}
 
+	/*
+	 * For arch perfmon 4 use counter freezing to avoid
+	 * several MSR accesses in the PMI.
+	 */
+	if (x86_pmu.counter_freezing)
+		x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
+
 	kfree(to_free);
 	return 0;
 }
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 1562863..adae087 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -560,9 +560,11 @@ struct x86_pmu {
 	struct event_constraint *event_constraints;
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
-	bool		late_ack;
 	u64		(*limit_period)(struct perf_event *event, u64 l);
 
+	/* PMI handler bits */
+	unsigned int	late_ack		:1,
+			counter_freezing	:1;
 	/*
 	 * sysfs attrs
 	 */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 68b2c31..4ae4a59 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -157,6 +157,7 @@
 #define DEBUGCTLMSR_BTS_OFF_OS		(1UL <<  9)
 #define DEBUGCTLMSR_BTS_OFF_USR		(1UL << 10)
 #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1UL << 11)
+#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI	(1UL << 12)
 #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT	14
 #define DEBUGCTLMSR_FREEZE_IN_SMM	(1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
 
-- 
2.7.4



* [PATCH V2 3/3] perf/x86/intel: Add quirk for Goldmont Plus
  2018-08-08  7:12 [PATCH V2 1/3] perf/x86/intel: Factor out common code of PMI handler kan.liang
  2018-08-08  7:12 ` [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 " kan.liang
@ 2018-08-08  7:12 ` kan.liang
  2018-10-02 10:10   ` [tip:perf/core] " tip-bot for Kan Liang
  2018-10-02 10:08 ` [tip:perf/core] perf/x86/intel: Factor out common code of PMI handler tip-bot for Kan Liang
  2 siblings, 1 reply; 10+ messages in thread
From: kan.liang @ 2018-08-08  7:12 UTC (permalink / raw)
  To: peterz, tglx, mingo, acme, linux-kernel
  Cc: eranian, ak, alexander.shishkin, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

A microcode (ucode) patch is needed for Goldmont Plus when the counter
freezing feature is enabled. Otherwise, there will be issues, e.g. a PMI
flood with some events.

Add a quirk to check microcode version. If the system starts with the
wrong ucode, leave the counter-freezing feature permanently disabled.
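
As a side note (an illustration, not part of the patch): the revision
the quirk compares against is the loaded per-CPU microcode revision,
which is the same value the "microcode" field of /proc/cpuinfo reports,
so a quick user-space check of the running revision might look like:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/cpuinfo", "r");
	char line[256];

	if (!f)
		return 1;

	/* Print the microcode revision reported for the first CPU. */
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "microcode", 9)) {
			fputs(line, stdout);
			break;
		}
	}

	fclose(f);
	return 0;
}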

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V1:
 - Only check the microcode version on CPU 0 at initialization.

 arch/x86/events/intel/core.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fdd2f99..928a393 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3839,6 +3839,42 @@ static __init void intel_nehalem_quirk(void)
 	}
 }
 
+static bool intel_glp_counter_freezing_broken(int cpu)
+{
+	u32 rev = UINT_MAX; /* default to broken for unknown stepping */
+
+	switch (cpu_data(cpu).x86_stepping) {
+	case 1:
+		rev = 0x28;
+		break;
+	case 8:
+		rev = 0x6;
+		break;
+	}
+
+	return (cpu_data(cpu).microcode < rev);
+}
+
+static __init void intel_glp_counter_freezing_quirk(void)
+{
+	/* Check if it's already disabled */
+	if (disable_counter_freezing)
+		return;
+
+	/*
+	 * If the system starts with the wrong ucode, leave the
+	 * counter-freezing feature permanently disabled.
+	 * Only checking one CPU is enough. When perf first initializes,
+	 * only CPU 0 is online.
+	 */
+	if (intel_glp_counter_freezing_broken(0)) {
+		pr_info("PMU counter freezing disabled due to CPU errata,"
+			"please upgrade microcode\n");
+		x86_pmu.counter_freezing = false;
+		x86_pmu.handle_irq = intel_pmu_handle_irq;
+	}
+}
+
 /*
  * enable software workaround for errata:
  * SNB: BJ122
@@ -4183,6 +4219,7 @@ __init int intel_pmu_init(void)
 		break;
 
 	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+		x86_add_quirk(intel_glp_counter_freezing_quirk);
 		memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -4199,6 +4236,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.pebs_aliases = NULL;
 		x86_pmu.pebs_prec_dist = true;
 		x86_pmu.lbr_pt_coexist = true;
+		x86_pmu.counter_freezing = disable_counter_freezing ?
+					   false : true;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_PEBS_ALL;
 		x86_pmu.get_event_constraints = glp_get_event_constraints;
-- 
2.7.4



* Re: [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 PMI handler
  2018-08-08  7:12 ` [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 " kan.liang
@ 2018-09-12 13:33   ` Liang, Kan
  2018-09-27 12:51   ` Peter Zijlstra
  2018-10-02 10:09   ` [tip:perf/core] perf/x86/intel: " tip-bot for Andi Kleen
  2 siblings, 0 replies; 10+ messages in thread
From: Liang, Kan @ 2018-09-12 13:33 UTC (permalink / raw)
  To: peterz, tglx, mingo, acme, linux-kernel; +Cc: eranian, ak, alexander.shishkin

Hi Peter,

Any comments on the patch series regarding the v4 PMI handler?

Thanks,
Kan

On 8/8/2018 3:12 AM, kan.liang@linux.intel.com wrote:
> [ full quote of PATCH V2 2/3 snipped; identical to the posting above ]


* Re: [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 PMI handler
  2018-08-08  7:12 ` [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 " kan.liang
  2018-09-12 13:33   ` Liang, Kan
@ 2018-09-27 12:51   ` Peter Zijlstra
  2018-09-27 13:53     ` Liang, Kan
  2018-10-02 10:09   ` [tip:perf/core] perf/x86/intel: " tip-bot for Andi Kleen
  2 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2018-09-27 12:51 UTC (permalink / raw)
  To: kan.liang
  Cc: tglx, mingo, acme, linux-kernel, eranian, ak, alexander.shishkin

On Wed, Aug 08, 2018 at 12:12:07AM -0700, kan.liang@linux.intel.com wrote:
> @@ -4325,6 +4428,8 @@ __init int intel_pmu_init(void)
>  		x86_pmu.extra_regs = intel_skl_extra_regs;
>  		x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
>  		x86_pmu.pebs_prec_dist = true;
> +		x86_pmu.counter_freezing = disable_counter_freezing ?
> +					   false : true;
>  		/* all extra regs are per-cpu when HT is on */
>  		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
>  		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;


How about so instead? It is very much tied to the perfmon version, not
the FMS.
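
For background (an illustration, not part of the original mail): the
perfmon version being gated on here is the architectural perfmon
version ID from CPUID leaf 0xA, EAX bits 7:0, which is where
intel_pmu_init() gets "version" from. A minimal user-space sketch of
reading it:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 0xA: architectural performance monitoring. */
	if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
		return 1;

	/* EAX[7:0] is the architectural perfmon version ID. */
	printf("arch perfmon version: %u\n", eax & 0xff);
	return 0;
}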

--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4049,6 +4049,9 @@ __init int intel_pmu_init(void)
 			max((int)edx.split.num_counters_fixed, assume);
 	}
 
+	if (version >= 4)
+		x86_pmu.counter_freezing = !disable_counter_freezing;
+
 	if (boot_cpu_has(X86_FEATURE_PDCM)) {
 		u64 capabilities;
 
@@ -4428,8 +4431,6 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_skl_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
 		x86_pmu.pebs_prec_dist = true;
-		x86_pmu.counter_freezing = disable_counter_freezing ?
-					   false : true;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;


* Re: [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 PMI handler
  2018-09-27 12:51   ` Peter Zijlstra
@ 2018-09-27 13:53     ` Liang, Kan
  2018-09-27 14:39       ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Liang, Kan @ 2018-09-27 13:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, acme, linux-kernel, eranian, ak, alexander.shishkin



On 9/27/2018 8:51 AM, Peter Zijlstra wrote:
> On Wed, Aug 08, 2018 at 12:12:07AM -0700, kan.liang@linux.intel.com wrote:
>> @@ -4325,6 +4428,8 @@ __init int intel_pmu_init(void)
>>   		x86_pmu.extra_regs = intel_skl_extra_regs;
>>   		x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
>>   		x86_pmu.pebs_prec_dist = true;
>> +		x86_pmu.counter_freezing = disable_counter_freezing ?
>> +					   false : true;
>>   		/* all extra regs are per-cpu when HT is on */
>>   		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
>>   		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
> 
> 
> How about so instead? It is very much tied to the perfmon version, not
> the FMS.
>

Yes, it matches the name of the handler.
But the change below is not sufficient. We have to temporarily
disable counter_freezing in this patch for small core.
The 3/3 patch will also be impacted.

I will send a V3 patch for these changes.

Thanks,
Kan

> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -4049,6 +4049,9 @@ __init int intel_pmu_init(void)
>   			max((int)edx.split.num_counters_fixed, assume);
>   	}
>   
> +	if (version >= 4)
> +		x86_pmu.counter_freezing = !disable_counter_freezing;
> +
>   	if (boot_cpu_has(X86_FEATURE_PDCM)) {
>   		u64 capabilities;
>   
> @@ -4428,8 +4431,6 @@ __init int intel_pmu_init(void)
>   		x86_pmu.extra_regs = intel_skl_extra_regs;
>   		x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
>   		x86_pmu.pebs_prec_dist = true;
> -		x86_pmu.counter_freezing = disable_counter_freezing ?
> -					   false : true;
>   		/* all extra regs are per-cpu when HT is on */
>   		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
>   		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
> 


* Re: [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 PMI handler
  2018-09-27 13:53     ` Liang, Kan
@ 2018-09-27 14:39       ` Peter Zijlstra
  0 siblings, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2018-09-27 14:39 UTC (permalink / raw)
  To: Liang, Kan
  Cc: tglx, mingo, acme, linux-kernel, eranian, ak, alexander.shishkin

On Thu, Sep 27, 2018 at 09:53:28AM -0400, Liang, Kan wrote:
> 
> 
> On 9/27/2018 8:51 AM, Peter Zijlstra wrote:
> > On Wed, Aug 08, 2018 at 12:12:07AM -0700, kan.liang@linux.intel.com wrote:
> > > @@ -4325,6 +4428,8 @@ __init int intel_pmu_init(void)
> > >   		x86_pmu.extra_regs = intel_skl_extra_regs;
> > >   		x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
> > >   		x86_pmu.pebs_prec_dist = true;
> > > +		x86_pmu.counter_freezing = disable_counter_freezing ?
> > > +					   false : true;
> > >   		/* all extra regs are per-cpu when HT is on */
> > >   		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
> > >   		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
> > 
> > 
> > How about so instead? It is very much tied to the perfmon version, not
> > the FMS.
> > 
> 
> Yes, it matches the name of the handler.
> But the change below is not sufficient. We have to temporarily disable
> counter_freezing in this patch for small core.
> The 3/3 patch will also be impacted.
> 
> I will send a V3 patch for these changes.

Hold on, I just munched the third patch on top as well.

Let me push out the lot so you can have a peek..

  https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=perf/core


* [tip:perf/core] perf/x86/intel: Factor out common code of PMI handler
  2018-08-08  7:12 [PATCH V2 1/3] perf/x86/intel: Factor out common code of PMI handler kan.liang
  2018-08-08  7:12 ` [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 " kan.liang
  2018-08-08  7:12 ` [PATCH V2 3/3] perf/x86/intel: Add quirk for Goldmont Plus kan.liang
@ 2018-10-02 10:08 ` tip-bot for Kan Liang
  2 siblings, 0 replies; 10+ messages in thread
From: tip-bot for Kan Liang @ 2018-10-02 10:08 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: tglx, torvalds, peterz, jolsa, kan.liang, hpa, linux-kernel,
	alexander.shishkin, acme, eranian, mingo, vincent.weaver

Commit-ID:  ba12d20edc5caf9835006d8f3efd4ed18465c75b
Gitweb:     https://git.kernel.org/tip/ba12d20edc5caf9835006d8f3efd4ed18465c75b
Author:     Kan Liang <kan.liang@linux.intel.com>
AuthorDate: Wed, 8 Aug 2018 00:12:06 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 2 Oct 2018 10:14:30 +0200

perf/x86/intel: Factor out common code of PMI handler

The Arch Perfmon v4 PMI handler is substantially different from
the older PMI handler. Instead of adding more and more ifs, cleanly
fork the new handler into a new function, with the main common
code factored out into a common function.

Fix a complaint from checkpatch.pl by removing "false" from "static
bool warned".

No functional change.

Based-on-code-from: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Link: http://lkml.kernel.org/r/1533712328-2834-1-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/core.c | 109 ++++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 49 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 035c37481f57..9b320a51f82f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2200,59 +2200,15 @@ static void intel_pmu_reset(void)
 	local_irq_restore(flags);
 }
 
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
+static int handle_pmi_common(struct pt_regs *regs, u64 status)
 {
 	struct perf_sample_data data;
-	struct cpu_hw_events *cpuc;
-	int bit, loops;
-	u64 status;
-	int handled;
-	int pmu_enabled;
-
-	cpuc = this_cpu_ptr(&cpu_hw_events);
-
-	/*
-	 * Save the PMU state.
-	 * It needs to be restored when leaving the handler.
-	 */
-	pmu_enabled = cpuc->enabled;
-	/*
-	 * No known reason to not always do late ACK,
-	 * but just in case do it opt-in.
-	 */
-	if (!x86_pmu.late_ack)
-		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	intel_bts_disable_local();
-	cpuc->enabled = 0;
-	__intel_pmu_disable_all();
-	handled = intel_pmu_drain_bts_buffer();
-	handled += intel_bts_interrupt();
-	status = intel_pmu_get_status();
-	if (!status)
-		goto done;
-
-	loops = 0;
-again:
-	intel_pmu_lbr_read();
-	intel_pmu_ack_status(status);
-	if (++loops > 100) {
-		static bool warned = false;
-		if (!warned) {
-			WARN(1, "perfevents: irq loop stuck!\n");
-			perf_event_print_debug();
-			warned = true;
-		}
-		intel_pmu_reset();
-		goto done;
-	}
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int bit;
+	int handled = 0;
 
 	inc_irq_stat(apic_perf_irqs);
 
-
 	/*
 	 * Ignore a range of extra bits in status that do not indicate
 	 * overflow by themselves.
@@ -2261,7 +2217,7 @@ again:
 		    GLOBAL_STATUS_ASIF |
 		    GLOBAL_STATUS_LBRS_FROZEN);
 	if (!status)
-		goto done;
+		return 0;
 	/*
 	 * In case multiple PEBS events are sampled at the same time,
 	 * it is possible to have GLOBAL_STATUS bit 62 set indicating
@@ -2331,6 +2287,61 @@ again:
 			x86_pmu_stop(event, 0);
 	}
 
+	return handled;
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc;
+	int loops;
+	u64 status;
+	int handled;
+	int pmu_enabled;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	/*
+	 * Save the PMU state.
+	 * It needs to be restored when leaving the handler.
+	 */
+	pmu_enabled = cpuc->enabled;
+	/*
+	 * No known reason to not always do late ACK,
+	 * but just in case do it opt-in.
+	 */
+	if (!x86_pmu.late_ack)
+		apic_write(APIC_LVTPC, APIC_DM_NMI);
+	intel_bts_disable_local();
+	cpuc->enabled = 0;
+	__intel_pmu_disable_all();
+	handled = intel_pmu_drain_bts_buffer();
+	handled += intel_bts_interrupt();
+	status = intel_pmu_get_status();
+	if (!status)
+		goto done;
+
+	loops = 0;
+again:
+	intel_pmu_lbr_read();
+	intel_pmu_ack_status(status);
+	if (++loops > 100) {
+		static bool warned;
+
+		if (!warned) {
+			WARN(1, "perfevents: irq loop stuck!\n");
+			perf_event_print_debug();
+			warned = true;
+		}
+		intel_pmu_reset();
+		goto done;
+	}
+
+	handled += handle_pmi_common(regs, status);
+
 	/*
 	 * Repeat if there is more work to be done:
 	 */


* [tip:perf/core] perf/x86/intel: Add a separate Arch Perfmon v4 PMI handler
  2018-08-08  7:12 ` [PATCH V2 2/3] x86, perf: Add a separate Arch Perfmon v4 " kan.liang
  2018-09-12 13:33   ` Liang, Kan
  2018-09-27 12:51   ` Peter Zijlstra
@ 2018-10-02 10:09   ` tip-bot for Andi Kleen
  2 siblings, 0 replies; 10+ messages in thread
From: tip-bot for Andi Kleen @ 2018-10-02 10:09 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: alexander.shishkin, peterz, jolsa, linux-kernel, eranian, tglx,
	ak, torvalds, vincent.weaver, acme, hpa, kan.liang, mingo

Commit-ID:  af3bdb991a5cb57c189d34aadbd3aa88995e0d9f
Gitweb:     https://git.kernel.org/tip/af3bdb991a5cb57c189d34aadbd3aa88995e0d9f
Author:     Andi Kleen <ak@linux.intel.com>
AuthorDate: Wed, 8 Aug 2018 00:12:07 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 2 Oct 2018 10:14:31 +0200

perf/x86/intel: Add a separate Arch Perfmon v4 PMI handler

Implements counter freezing for Arch Perfmon v4 (Skylake and
newer). This allows the PMI handler to be sped up by avoiding
unnecessary MSR writes, and makes it more accurate.

The Arch Perfmon v4 PMI handler is substantially different from
the older PMI handler.

Differences to the old handler:

- It relies on counter freezing, which eliminates several MSR
  writes from the PMI handler and lowers the overhead significantly.

  It makes the PMI handler more accurate, as all counters get
  frozen atomically as soon as any counter overflows, so there is
  much less counting of the PMI handler itself.

  With the freezing we don't need to disable or enable counters or
  PEBS. Only BTS, which does not support auto-freezing, still needs to
  be explicitly managed.

- The PMU acking is done at the end, not the beginning.
  This makes it possible to avoid manual enabling/disabling
  of the PMU; instead we just rely on the freezing/acking.

- The APIC is acked before reenabling the PMU, which avoids
  problems with LBRs occasionally not getting unfrozen on Skylake.

- Looping is only needed to work around a corner case in which several
  PMIs arrive very close to each other. In the common case, the counters
  are frozen during the PMI handler, so no re-check is needed.

This patch:

- Adds code to enable v4 counter freezing.
- Forks the <=v3 and >=v4 PMI handlers into separate functions.
- Adds a kernel parameter to disable counter freezing. It took some time to
  debug counter freezing, so in case there are new problems we added an
  option to turn it off. We would not expect this to be used unless there
  are new bugs.
- Covers big core only. The patch for small core will be posted
  separately later.

Performance:

When profiling a kernel build on Kabylake with different perf options,
measuring the length of all NMI handlers using the nmi handler
trace point:

V3 is without counter freezing.
V4 is with counter freezing.
The value is the average cost of the PMI handler.
(lower is better)

perf options                V3(ns) V4(ns)  delta
-c 100000                   1088   894     -18%
-g -c 100000                1862   1646    -12%
--call-graph lbr -c 100000  3649   3367    -8%
--c.g. dwarf -c 100000      2248   1982    -12%

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Link: http://lkml.kernel.org/r/1533712328-2834-2-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |   5 ++
 arch/x86/events/intel/core.c                    | 113 ++++++++++++++++++++++++
 arch/x86/events/perf_event.h                    |   4 +-
 arch/x86/include/asm/msr-index.h                |   1 +
 4 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 92eb1f42240d..6795dedcbd1e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -856,6 +856,11 @@
 			causing system reset or hang due to sending
 			INIT from AP to BSP.
 
+	disable_counter_freezing [HW]
+			Disable Intel PMU counter freezing feature.
+			The feature only exists starting from
+			Arch Perfmon v4 (Skylake and newer).
+
 	disable_ddw	[PPC/PSERIES]
 			Disable Dynamic DMA Window support. Use this if
 			to workaround buggy firmware.
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9b320a51f82f..bd3b8f3600b2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1995,6 +1995,18 @@ static void intel_pmu_nhm_enable_all(int added)
 	intel_pmu_enable_all(added);
 }
 
+static void enable_counter_freeze(void)
+{
+	update_debugctlmsr(get_debugctlmsr() |
+			DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
+static void disable_counter_freeze(void)
+{
+	update_debugctlmsr(get_debugctlmsr() &
+			~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
 static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
@@ -2290,6 +2302,91 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 	return handled;
 }
 
+static bool disable_counter_freezing;
+static int __init intel_perf_counter_freezing_setup(char *s)
+{
+	disable_counter_freezing = true;
+	pr_info("Intel PMU Counter freezing feature disabled\n");
+	return 1;
+}
+__setup("disable_counter_freezing", intel_perf_counter_freezing_setup);
+
+/*
+ * Simplified handler for Arch Perfmon v4:
+ * - We rely on counter freezing/unfreezing to enable/disable the PMU.
+ * This is done automatically on PMU ack.
+ * - Ack the PMU only after the APIC.
+ */
+
+static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int handled = 0;
+	bool bts = false;
+	u64 status;
+	int pmu_enabled = cpuc->enabled;
+	int loops = 0;
+
+	/* PMU has been disabled because of counter freezing */
+	cpuc->enabled = 0;
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+		bts = true;
+		intel_bts_disable_local();
+		handled = intel_pmu_drain_bts_buffer();
+		handled += intel_bts_interrupt();
+	}
+	status = intel_pmu_get_status();
+	if (!status)
+		goto done;
+again:
+	intel_pmu_lbr_read();
+	if (++loops > 100) {
+		static bool warned;
+
+		if (!warned) {
+			WARN(1, "perfevents: irq loop stuck!\n");
+			perf_event_print_debug();
+			warned = true;
+		}
+		intel_pmu_reset();
+		goto done;
+	}
+
+
+	handled += handle_pmi_common(regs, status);
+done:
+	/* Ack the PMI in the APIC */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+	/*
+	 * The counters start counting immediately while ack the status.
+	 * Make it as close as possible to IRET. This avoids bogus
+	 * freezing on Skylake CPUs.
+	 */
+	if (status) {
+		intel_pmu_ack_status(status);
+	} else {
+		/*
+		 * CPU may issues two PMIs very close to each other.
+		 * When the PMI handler services the first one, the
+		 * GLOBAL_STATUS is already updated to reflect both.
+		 * When it IRETs, the second PMI is immediately
+		 * handled and it sees clear status. At the meantime,
+		 * there may be a third PMI, because the freezing bit
+		 * isn't set since the ack in first PMI handlers.
+		 * Double check if there is more work to be done.
+		 */
+		status = intel_pmu_get_status();
+		if (status)
+			goto again;
+	}
+
+	if (bts)
+		intel_bts_enable_local();
+	cpuc->enabled = pmu_enabled;
+	return handled;
+}
+
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -3361,6 +3458,9 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (x86_pmu.version > 1)
 		flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
 
+	if (x86_pmu.counter_freezing)
+		enable_counter_freeze();
+
 	if (!cpuc->shared_regs)
 		return;
 
@@ -3432,6 +3532,9 @@ static void intel_pmu_cpu_dying(int cpu)
 	free_excl_cntrs(cpu);
 
 	fini_debug_store_on_cpu(cpu);
+
+	if (x86_pmu.counter_freezing)
+		disable_counter_freeze();
 }
 
 static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3946,6 +4049,9 @@ __init int intel_pmu_init(void)
 			max((int)edx.split.num_counters_fixed, assume);
 	}
 
+	if (version >= 4)
+		x86_pmu.counter_freezing = !disable_counter_freezing;
+
 	if (boot_cpu_has(X86_FEATURE_PDCM)) {
 		u64 capabilities;
 
@@ -4442,6 +4548,13 @@ __init int intel_pmu_init(void)
 		pr_cont("full-width counters, ");
 	}
 
+	/*
+	 * For arch perfmon 4 use counter freezing to avoid
+	 * several MSR accesses in the PMI.
+	 */
+	if (x86_pmu.counter_freezing)
+		x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
+
 	kfree(to_free);
 	return 0;
 }
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 156286335351..adae087cecdd 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -560,9 +560,11 @@ struct x86_pmu {
 	struct event_constraint *event_constraints;
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
-	bool		late_ack;
 	u64		(*limit_period)(struct perf_event *event, u64 l);
 
+	/* PMI handler bits */
+	unsigned int	late_ack		:1,
+			counter_freezing	:1;
 	/*
 	 * sysfs attrs
 	 */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4731f0cf97c5..80f4a4f38c79 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -164,6 +164,7 @@
 #define DEBUGCTLMSR_BTS_OFF_OS		(1UL <<  9)
 #define DEBUGCTLMSR_BTS_OFF_USR		(1UL << 10)
 #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1UL << 11)
+#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI	(1UL << 12)
 #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT	14
 #define DEBUGCTLMSR_FREEZE_IN_SMM	(1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
 


* [tip:perf/core] perf/x86/intel: Add quirk for Goldmont Plus
  2018-08-08  7:12 ` [PATCH V2 3/3] perf/x86/intel: Add quirk for Goldmont Plus kan.liang
@ 2018-10-02 10:10   ` tip-bot for Kan Liang
  0 siblings, 0 replies; 10+ messages in thread
From: tip-bot for Kan Liang @ 2018-10-02 10:10 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: peterz, linux-kernel, jolsa, tglx, vincent.weaver, eranian, hpa,
	mingo, torvalds, alexander.shishkin, acme, kan.liang

Commit-ID:  7c5314b88da6d5af98239786772a1c44cc5eb67d
Gitweb:     https://git.kernel.org/tip/7c5314b88da6d5af98239786772a1c44cc5eb67d
Author:     Kan Liang <kan.liang@linux.intel.com>
AuthorDate: Wed, 8 Aug 2018 00:12:08 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 2 Oct 2018 10:14:33 +0200

perf/x86/intel: Add quirk for Goldmont Plus

A microcode (ucode) patch is needed for Goldmont Plus when the counter
freezing feature is enabled. Otherwise, there will be issues, e.g. a PMI
flood with some events.

Add a quirk to check microcode version. If the system starts with the
wrong ucode, leave the counter-freezing feature permanently disabled.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Link: http://lkml.kernel.org/r/1533712328-2834-3-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/core.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f17cf6c3ec6f..ab01ef9ddd77 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3839,6 +3839,40 @@ static __init void intel_nehalem_quirk(void)
 	}
 }
 
+static bool intel_glp_counter_freezing_broken(int cpu)
+{
+	u32 rev = UINT_MAX; /* default to broken for unknown stepping */
+
+	switch (cpu_data(cpu).x86_stepping) {
+	case 1:
+		rev = 0x28;
+		break;
+	case 8:
+		rev = 0x6;
+		break;
+	}
+
+	return (cpu_data(cpu).microcode < rev);
+}
+
+static __init void intel_glp_counter_freezing_quirk(void)
+{
+	/* Check if it's already disabled */
+	if (disable_counter_freezing)
+		return;
+
+	/*
+	 * If the system starts with the wrong ucode, leave the
+	 * counter-freezing feature permanently disabled.
+	 */
+	if (intel_glp_counter_freezing_broken(raw_smp_processor_id())) {
+		pr_info("PMU counter freezing disabled due to CPU errata,"
+			"please upgrade microcode\n");
+		x86_pmu.counter_freezing = false;
+		x86_pmu.handle_irq = intel_pmu_handle_irq;
+	}
+}
+
 /*
  * enable software workaround for errata:
  * SNB: BJ122
@@ -4188,6 +4222,7 @@ __init int intel_pmu_init(void)
 		break;
 
 	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+		x86_add_quirk(intel_glp_counter_freezing_quirk);
 		memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,

