All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andi Kleen <andi@firstfloor.org>
To: peterz@infradead.org
Cc: x86@kernel.org, eranian@google.com, kan.liang@intel.com,
	linux-kernel@vger.kernel.org, Andi Kleen <ak@linux.intel.com>
Subject: [PATCH v2 2/2] perf/x86/kvm: Avoid unnecessary work in guest filtering
Date: Wed, 10 Oct 2018 09:26:08 -0700	[thread overview]
Message-ID: <20181010162608.23899-2-andi@firstfloor.org> (raw)
In-Reply-To: <20181010162608.23899-1-andi@firstfloor.org>

From: Andi Kleen <ak@linux.intel.com>

KVM added a workaround for PEBS events leaking into guests
with commit 26a4f3c08de4 ("perf/x86: disable PEBS on a guest entry.").
That workaround uses the VT MSR entry/exit list to add an extra
disable of the PEBS_ENABLE MSR.

Intel also added a fix for this issue to microcode updates on
Haswell/Broadwell/Skylake.

It turns out using the MSR entry/exit list makes VM exits
significantly slower. The list is only needed for disabling
PEBS, because the GLOBAL_CTRL change gets optimized by
KVM into changing the VMCS.

Check for the microcode revisions that contain the fix for
leaking PEBS, and when one is present skip the extra entry/exit
list entry for PEBS_ENABLE. In addition, we always clear the
GLOBAL_CTRL bits of the PEBS counters while running in the guest,
which is enough to make them never fire on the wrong
side of the host/guest transition.

With the patch, we see the overhead of VM exits with the
filtering active drop significantly, from 8% to 4%.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
v2:
Use match_ucode, not match_ucode_all
Remove cpu lock
Use INTEL_MIN_UCODE and move to header
Update Table to include skylake clients.
---
 arch/x86/events/intel/core.c | 80 ++++++++++++++++++++++++++++++++----
 arch/x86/events/perf_event.h |  3 +-
 2 files changed, 73 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ab01ef9ddd77..5e8e76753eea 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -18,6 +18,7 @@
 #include <asm/hardirq.h>
 #include <asm/intel-family.h>
 #include <asm/apic.h>
+#include <asm/cpu_device_id.h>
 
 #include "../perf_event.h"
 
@@ -3166,16 +3167,27 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
 	arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
 	arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
 	arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
-	/*
-	 * If PMU counter has PEBS enabled it is not enough to disable counter
-	 * on a guest entry since PEBS memory write can overshoot guest entry
-	 * and corrupt guest memory. Disabling PEBS solves the problem.
-	 */
-	arr[1].msr = MSR_IA32_PEBS_ENABLE;
-	arr[1].host = cpuc->pebs_enabled;
-	arr[1].guest = 0;
+	if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+		arr[0].guest &= ~cpuc->pebs_enabled;
+	else
+		arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+	*nr = 1;
+
+	if (!x86_pmu.pebs_isolated) {
+		/*
+		 * If PMU counter has PEBS enabled it is not enough to
+		 * disable counter on a guest entry since PEBS memory
+		 * write can overshoot guest entry and corrupt guest
+		 * memory. Disabling PEBS solves the problem.
+		 *
+		 * Don't do this if the CPU already enforces it.
+		 */
+		arr[1].msr = MSR_IA32_PEBS_ENABLE;
+		arr[1].host = cpuc->pebs_enabled;
+		arr[1].guest = 0;
+		*nr = 2;
+	}
 
-	*nr = 2;
 	return arr;
 }
 
@@ -3693,6 +3705,45 @@ static __init void intel_clovertown_quirk(void)
 	x86_pmu.pebs_constraints = NULL;
 }
 
+static const struct x86_ucode_id isolation_ucodes[] = {
+	INTEL_MIN_UCODE(INTEL_FAM6_HASWELL_CORE,	 3, 0x0000001f),
+	INTEL_MIN_UCODE(INTEL_FAM6_HASWELL_ULT,		 1, 0x0000001e),
+	INTEL_MIN_UCODE(INTEL_FAM6_HASWELL_GT3E,	 1, 0x00000015),
+	INTEL_MIN_UCODE(INTEL_FAM6_HASWELL_X,		 2, 0x00000037),
+	INTEL_MIN_UCODE(INTEL_FAM6_HASWELL_X,		 4, 0x0000000a),
+	INTEL_MIN_UCODE(INTEL_FAM6_BROADWELL_CORE,	 4, 0x00000023),
+	INTEL_MIN_UCODE(INTEL_FAM6_BROADWELL_GT3E,	 1, 0x00000014),
+	INTEL_MIN_UCODE(INTEL_FAM6_BROADWELL_XEON_D,	 2, 0x00000010),
+	INTEL_MIN_UCODE(INTEL_FAM6_BROADWELL_XEON_D,	 3, 0x07000009),
+	INTEL_MIN_UCODE(INTEL_FAM6_BROADWELL_XEON_D,	 4, 0x0f000009),
+	INTEL_MIN_UCODE(INTEL_FAM6_BROADWELL_XEON_D,	 5, 0x0e000002),
+	INTEL_MIN_UCODE(INTEL_FAM6_BROADWELL_X,		 2, 0x0b000014),
+	INTEL_MIN_UCODE(INTEL_FAM6_SKYLAKE_X,		 3, 0x00000021),
+	INTEL_MIN_UCODE(INTEL_FAM6_SKYLAKE_X,		 4, 0x00000000),
+	INTEL_MIN_UCODE(INTEL_FAM6_SKYLAKE_MOBILE,	 3, 0x0000007c),
+	INTEL_MIN_UCODE(INTEL_FAM6_SKYLAKE_DESKTOP,	 3, 0x0000007c),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_DESKTOP,	 9, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_MOBILE,	 9, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_MOBILE,     10, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_MOBILE,     11, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_MOBILE,     12, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_DESKTOP,    10, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_DESKTOP,    11, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_DESKTOP,    12, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_KABYLAKE_DESKTOP,    13, 0x0000004e),
+	INTEL_MIN_UCODE(INTEL_FAM6_CANNONLAKE_MOBILE,    3, 0x00000000),
+	{}
+};
+
+static void intel_check_isolation(void)
+{
+	if (!x86_match_ucode(isolation_ucodes)) {
+		x86_pmu.pebs_isolated = 0;
+		return;
+	}
+	x86_pmu.pebs_isolated = 1;
+}
+
 static int intel_snb_pebs_broken(int cpu)
 {
 	u32 rev = UINT_MAX; /* default to broken for unknown models */
@@ -3717,6 +3768,8 @@ static void intel_snb_check_microcode(void)
 	int pebs_broken = 0;
 	int cpu;
 
+	intel_check_isolation();
+
 	for_each_online_cpu(cpu) {
 		if ((pebs_broken = intel_snb_pebs_broken(cpu)))
 			break;
@@ -3798,6 +3851,12 @@ static __init void intel_sandybridge_quirk(void)
 	cpus_read_unlock();
 }
 
+static __init void intel_isolation_quirk(void)
+{
+	x86_pmu.check_microcode = intel_check_isolation;
+	intel_check_isolation();
+}
+
 static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
 	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
 	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
@@ -4362,6 +4421,7 @@ __init int intel_pmu_init(void)
 	case INTEL_FAM6_HASWELL_X:
 	case INTEL_FAM6_HASWELL_ULT:
 	case INTEL_FAM6_HASWELL_GT3E:
+		x86_add_quirk(intel_isolation_quirk);
 		x86_add_quirk(intel_ht_bug);
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -4392,6 +4452,7 @@ __init int intel_pmu_init(void)
 	case INTEL_FAM6_BROADWELL_XEON_D:
 	case INTEL_FAM6_BROADWELL_GT3E:
 	case INTEL_FAM6_BROADWELL_X:
+		x86_add_quirk(intel_isolation_quirk);
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -4452,6 +4513,7 @@ __init int intel_pmu_init(void)
 	case INTEL_FAM6_SKYLAKE_X:
 	case INTEL_FAM6_KABYLAKE_MOBILE:
 	case INTEL_FAM6_KABYLAKE_DESKTOP:
+		x86_add_quirk(intel_isolation_quirk);
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index adae087cecdd..d5745ed62622 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -607,7 +607,8 @@ struct x86_pmu {
 			pebs_active	:1,
 			pebs_broken	:1,
 			pebs_prec_dist	:1,
-			pebs_no_tlb	:1;
+			pebs_no_tlb	:1,
+			pebs_isolated   :1;
 	int		pebs_record_size;
 	int		pebs_buffer_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
-- 
2.17.1


  reply	other threads:[~2018-10-10 16:26 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-10-10 16:26 [PATCH v2 1/2] x86/cpufeature: Add facility to match microcode revisions Andi Kleen
2018-10-10 16:26 ` Andi Kleen [this message]
2018-10-10 16:37 ` Borislav Petkov
2018-10-11 11:43 ` Henrique de Moraes Holschuh
2018-10-17  9:59 ` Thomas Gleixner
2018-10-19 23:47   ` Andi Kleen
2018-10-20  8:19     ` Thomas Gleixner
2018-10-20 14:38       ` Andi Kleen
2018-10-21 10:20         ` Thomas Gleixner
2018-10-21 15:13           ` Borislav Petkov
2018-10-25 23:23           ` Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181010162608.23899-2-andi@firstfloor.org \
    --to=andi@firstfloor.org \
    --cc=ak@linux.intel.com \
    --cc=eranian@google.com \
    --cc=kan.liang@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.