linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions
@ 2020-03-05 17:47 Dave Hansen
  2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Dave Hansen @ 2020-03-05 17:47 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dave Hansen, x86, peterz


There are two definitions for the TSC deadline MSR in msr-index.h,
one with an underscore and one without.  Axe one of them and move
all the references over to the other one.

Cc: x86@kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>

---

 b/arch/x86/include/asm/msr-index.h                       |    2 --
 b/arch/x86/kvm/x86.c                                     |    6 +++---
 b/tools/arch/x86/include/asm/msr-index.h                 |    2 --
 b/tools/perf/trace/beauty/tracepoints/x86_msr.sh         |    2 +-
 b/tools/testing/selftests/kvm/include/x86_64/processor.h |    2 --
 5 files changed, 4 insertions(+), 10 deletions(-)

diff -puN arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/include/asm/msr-index.h
--- a/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.049901042 -0800
+++ b/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.062901042 -0800
@@ -576,8 +576,6 @@
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
 #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
 
-#define MSR_IA32_TSCDEADLINE		0x000006e0
-
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
diff -puN arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/kvm/x86.c
--- a/arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.051901042 -0800
+++ b/arch/x86/kvm/x86.c	2020-03-05 09:42:37.065901042 -0800
@@ -1200,7 +1200,7 @@ static const u32 emulated_msrs_all[] = {
 	MSR_KVM_PV_EOI_EN,
 
 	MSR_IA32_TSC_ADJUST,
-	MSR_IA32_TSCDEADLINE,
+	MSR_IA32_TSC_DEADLINE,
 	MSR_IA32_ARCH_CAPABILITIES,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
@@ -2688,7 +2688,7 @@ int kvm_set_msr_common(struct kvm_vcpu *
 		return kvm_set_apic_base(vcpu, msr_info);
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_write(vcpu, msr, data);
-	case MSR_IA32_TSCDEADLINE:
+	case MSR_IA32_TSC_DEADLINE:
 		kvm_set_lapic_tscdeadline_msr(vcpu, data);
 		break;
 	case MSR_IA32_TSC_ADJUST:
@@ -3009,7 +3009,7 @@ int kvm_get_msr_common(struct kvm_vcpu *
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
 		break;
-	case MSR_IA32_TSCDEADLINE:
+	case MSR_IA32_TSC_DEADLINE:
 		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
 		break;
 	case MSR_IA32_TSC_ADJUST:
diff -puN tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE tools/arch/x86/include/asm/msr-index.h
--- a/tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.055901042 -0800
+++ b/tools/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.066901042 -0800
@@ -576,8 +576,6 @@
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
 #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
 
-#define MSR_IA32_TSCDEADLINE		0x000006e0
-
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
diff -puN tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE tools/perf/trace/beauty/tracepoints/x86_msr.sh
--- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.057901042 -0800
+++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh	2020-03-05 09:42:37.066901042 -0800
@@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr
 
 printf "static const char *x86_MSRs[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*'
-egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSCDEADLINE|IDT_FCR4)' | \
+egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSC_DEADLINE|IDT_FCR4)' | \
 	sed -r "s/$regex/\2 \1/g" | sort -n | \
 	xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n\n"
diff -puN tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE tools/testing/selftests/kvm/include/x86_64/processor.h
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.058901042 -0800
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h	2020-03-05 09:42:37.067901042 -0800
@@ -813,8 +813,6 @@ void kvm_get_cpu_address_width(unsigned
 #define		APIC_VECTOR_MASK	0x000FF
 #define	APIC_ICR2	0x310
 
-#define MSR_IA32_TSCDEADLINE		0x000006e0
-
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
_

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs
  2020-03-05 17:47 [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Dave Hansen
@ 2020-03-05 17:47 ` Dave Hansen
  2021-02-04 16:37   ` Dave Hansen
                     ` (3 more replies)
  2020-03-09 23:50 ` [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Sean Christopherson
                   ` (3 subsequent siblings)
  4 siblings, 4 replies; 15+ messages in thread
From: Dave Hansen @ 2020-03-05 17:47 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dave Hansen, jan.kiszka, x86, peterz


Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a
plain "mfence" while the Intel SDM (10.12.3 MSR Access in x2APIC
Mode) calls for "mfence;lfence".

Short summary: we have special MSRs that have weaker ordering
than all the rest.  Add fencing consistent with current SDM
recommendatrions.

This is not known to cause any issues in practice, only in
theory.

Longer story below:

The reason the kernel uses a different semantic is that the SDM
changed (roughly in late 2017).  The SDM changed because folks at
Intel were auditing all of the recommended fences in the SDM and
realized that the x2apic fences were insufficient.

Why was the pain "mfence" judged insufficient?

WRMSR itself is normally a serializing instruction.  No fences
are needed because because the instruction itself serializes
everything.

But, there are explicit exceptions for this serializing behavior
written into the WRMSR instruction documentation for two classes
of MSRs: IA32_TSC_DEADLINE and the X2APIC MSRs.

Back to x2apic: WRMSR is *not* serializing in this specific case.
But why is MFENCE insufficient?  MFENCE makes writes visible, but
only affects load/store instructions.  WRMSR is unfortunately not
a load/store instruction and is unaffected by MEFNCE.  This means
that a non-serializing WRMSR could be reordered by the CPU to
execute before the writes made visible by the MFENCE have even
occurred in the first place.

This mean that an x2apic IPI could theoretically be triggered
before there is any (visible) data to process.

Does this affect anything in practice?  I honestly don't know.
It seems quite possible that by the time an interrupt gets to
consume the (not yet) MFENCE'd data, it has become visible,
mostly by accident.

To be safe, add the SDM-recommended fences for all x2apic WRMSRs.

This also leaves open the question of the _other_ weakly-ordered
WRMSR: MSR_IA32_TSC_DEADLINE.  While it has the same ordering
architecture as the x2APIC MSRs, it seems substantially less
likely to be a problem in practice.  While writes to the
in-memory Local Vector Table (LVT) might theoretically be
reordered with respect to a weakly-ordered WRMSR like
TSC_DEADLINE, the SDM has this to say:

	In x2APIC mode, the WRMSR instruction is used to write to
	the LVT entry. The processor ensures the ordering of this
	write and any subsequent WRMSR to the deadline; no
	fencing is required.

But, that might still leave xAPIC exposed.  The safest thing to
do for now is to add the extra, recommended LFENCE.

Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
Cc: x86@kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>

---

 b/arch/x86/include/asm/apic.h           |   10 ----------
 b/arch/x86/include/asm/barrier.h        |   18 ++++++++++++++++++
 b/arch/x86/kernel/apic/apic.c           |    4 ++++
 b/arch/x86/kernel/apic/x2apic_cluster.c |    6 ++++--
 b/arch/x86/kernel/apic/x2apic_phys.c    |    9 ++++++---
 b/tools/arch/x86/include/asm/barrier.h  |    1 +
 6 files changed, 33 insertions(+), 15 deletions(-)

diff -puN arch/x86/include/asm/apic.h~x2apic-wrmsr-serialization arch/x86/include/asm/apic.h
--- a/arch/x86/include/asm/apic.h~x2apic-wrmsr-serialization	2020-03-05 09:42:38.876901038 -0800
+++ b/arch/x86/include/asm/apic.h	2020-03-05 09:42:38.891901038 -0800
@@ -195,16 +195,6 @@ static inline bool apic_needs_pit(void)
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_X2APIC
-/*
- * Make previous memory operations globally visible before
- * sending the IPI through x2apic wrmsr. We need a serializing instruction or
- * mfence for this.
- */
-static inline void x2apic_wrmsr_fence(void)
-{
-	asm volatile("mfence" : : : "memory");
-}
-
 static inline void native_apic_msr_write(u32 reg, u32 v)
 {
 	if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
diff -puN arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization arch/x86/include/asm/barrier.h
--- a/arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization	2020-03-05 09:42:38.878901038 -0800
+++ b/arch/x86/include/asm/barrier.h	2020-03-05 09:42:38.893901038 -0800
@@ -84,4 +84,22 @@ do {									\
 
 #include <asm-generic/barrier.h>
 
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions.  WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MEFNCE.  The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier.  This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+	asm volatile("mfence; lfence" : : : "memory");
+}
+
 #endif /* _ASM_X86_BARRIER_H */
diff -puN arch/x86/kernel/apic/apic.c~x2apic-wrmsr-serialization arch/x86/kernel/apic/apic.c
--- a/arch/x86/kernel/apic/apic.c~x2apic-wrmsr-serialization	2020-03-05 09:42:38.880901038 -0800
+++ b/arch/x86/kernel/apic/apic.c	2020-03-05 09:42:38.892901038 -0800
@@ -42,6 +42,7 @@
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
 #include <linux/atomic.h>
+#include <asm/barrier.h>
 #include <asm/mpspec.h>
 #include <asm/i8259.h>
 #include <asm/proto.h>
@@ -474,6 +475,9 @@ static int lapic_next_deadline(unsigned
 {
 	u64 tsc;
 
+	/* This MSR is special and need a special fence: */
+	weak_wrmsr_fence();
+
 	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
diff -puN arch/x86/kernel/apic/x2apic_cluster.c~x2apic-wrmsr-serialization arch/x86/kernel/apic/x2apic_cluster.c
--- a/arch/x86/kernel/apic/x2apic_cluster.c~x2apic-wrmsr-serialization	2020-03-05 09:42:38.882901038 -0800
+++ b/arch/x86/kernel/apic/x2apic_cluster.c	2020-03-05 09:42:38.892901038 -0800
@@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int
 {
 	u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
 }
 
@@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpum
 	unsigned long flags;
 	u32 dest;
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	local_irq_save(flags);
 
 	tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
diff -puN arch/x86/kernel/apic/x2apic_phys.c~x2apic-wrmsr-serialization arch/x86/kernel/apic/x2apic_phys.c
--- a/arch/x86/kernel/apic/x2apic_phys.c~x2apic-wrmsr-serialization	2020-03-05 09:42:38.885901038 -0800
+++ b/arch/x86/kernel/apic/x2apic_phys.c	2020-03-05 09:42:38.892901038 -0800
@@ -37,7 +37,8 @@ static void x2apic_send_IPI(int cpu, int
 {
 	u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
 }
 
@@ -48,7 +49,8 @@ __x2apic_send_IPI_mask(const struct cpum
 	unsigned long this_cpu;
 	unsigned long flags;
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 
 	local_irq_save(flags);
 
@@ -116,7 +118,8 @@ void __x2apic_send_IPI_shorthand(int vec
 {
 	unsigned long cfg = __prepare_ICR(which, vector, 0);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	native_x2apic_icr_write(cfg, 0);
 }
 
diff -puN tools/arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization tools/arch/x86/include/asm/barrier.h
--- a/tools/arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization	2020-03-05 09:42:38.887901038 -0800
+++ b/tools/arch/x86/include/asm/barrier.h	2020-03-05 09:42:38.892901038 -0800
@@ -43,4 +43,5 @@ do {						\
 	___p1;					\
 })
 #endif /* defined(__x86_64__) */
+
 #endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */
_

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions
  2020-03-05 17:47 [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Dave Hansen
  2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
@ 2020-03-09 23:50 ` Sean Christopherson
  2021-02-05  9:31 ` Borislav Petkov
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 15+ messages in thread
From: Sean Christopherson @ 2020-03-09 23:50 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-kernel, x86, peterz

On Thu, Mar 05, 2020 at 09:47:06AM -0800, Dave Hansen wrote:
> 
> There are two definitions for the TSC deadline MSR in msr-index.h,
> one with an underscore and one without.  Axe one of them and move
> all the references over to the other one.

It's worth calling out that you're keeping the one that matches Intel's
SDM.

I was going to suggest renaming KVM's functions/variables to add the
underscore, but after looking at the code it's probably best to leave
that churn for a different time.

> Cc: x86@kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>
> 
> ---
> 
>  b/arch/x86/include/asm/msr-index.h                       |    2 --
>  b/arch/x86/kvm/x86.c                                     |    6 +++---
>  b/tools/arch/x86/include/asm/msr-index.h                 |    2 --
>  b/tools/perf/trace/beauty/tracepoints/x86_msr.sh         |    2 +-
>  b/tools/testing/selftests/kvm/include/x86_64/processor.h |    2 --
>  5 files changed, 4 insertions(+), 10 deletions(-)
> 
> diff -puN arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/include/asm/msr-index.h
> --- a/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.049901042 -0800
> +++ b/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.062901042 -0800
> @@ -576,8 +576,6 @@
>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>  
> -#define MSR_IA32_TSCDEADLINE		0x000006e0
> -
>  #define MSR_IA32_UCODE_WRITE		0x00000079
>  #define MSR_IA32_UCODE_REV		0x0000008b
>  
> diff -puN arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/kvm/x86.c
> --- a/arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.051901042 -0800
> +++ b/arch/x86/kvm/x86.c	2020-03-05 09:42:37.065901042 -0800
> @@ -1200,7 +1200,7 @@ static const u32 emulated_msrs_all[] = {
>  	MSR_KVM_PV_EOI_EN,
>  
>  	MSR_IA32_TSC_ADJUST,
> -	MSR_IA32_TSCDEADLINE,
> +	MSR_IA32_TSC_DEADLINE,
>  	MSR_IA32_ARCH_CAPABILITIES,
>  	MSR_IA32_MISC_ENABLE,
>  	MSR_IA32_MCG_STATUS,
> @@ -2688,7 +2688,7 @@ int kvm_set_msr_common(struct kvm_vcpu *
>  		return kvm_set_apic_base(vcpu, msr_info);
>  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
>  		return kvm_x2apic_msr_write(vcpu, msr, data);
> -	case MSR_IA32_TSCDEADLINE:
> +	case MSR_IA32_TSC_DEADLINE:
>  		kvm_set_lapic_tscdeadline_msr(vcpu, data);
>  		break;
>  	case MSR_IA32_TSC_ADJUST:
> @@ -3009,7 +3009,7 @@ int kvm_get_msr_common(struct kvm_vcpu *
>  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
>  		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
>  		break;
> -	case MSR_IA32_TSCDEADLINE:
> +	case MSR_IA32_TSC_DEADLINE:
>  		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
>  		break;
>  	case MSR_IA32_TSC_ADJUST:
> diff -puN tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE tools/arch/x86/include/asm/msr-index.h
> --- a/tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.055901042 -0800
> +++ b/tools/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.066901042 -0800
> @@ -576,8 +576,6 @@
>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>  
> -#define MSR_IA32_TSCDEADLINE		0x000006e0
> -
>  #define MSR_IA32_UCODE_WRITE		0x00000079
>  #define MSR_IA32_UCODE_REV		0x0000008b
>  
> diff -puN tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE tools/perf/trace/beauty/tracepoints/x86_msr.sh
> --- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.057901042 -0800
> +++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh	2020-03-05 09:42:37.066901042 -0800
> @@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr
>  
>  printf "static const char *x86_MSRs[] = {\n"
>  regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*'
> -egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSCDEADLINE|IDT_FCR4)' | \
> +egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSC_DEADLINE|IDT_FCR4)' | \
>  	sed -r "s/$regex/\2 \1/g" | sort -n | \
>  	xargs printf "\t[%s] = \"%s\",\n"
>  printf "};\n\n"
> diff -puN tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE tools/testing/selftests/kvm/include/x86_64/processor.h
> --- a/tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.058901042 -0800
> +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h	2020-03-05 09:42:37.067901042 -0800
> @@ -813,8 +813,6 @@ void kvm_get_cpu_address_width(unsigned
>  #define		APIC_VECTOR_MASK	0x000FF
>  #define	APIC_ICR2	0x310
>  
> -#define MSR_IA32_TSCDEADLINE		0x000006e0
> -
>  #define MSR_IA32_UCODE_WRITE		0x00000079
>  #define MSR_IA32_UCODE_REV		0x0000008b
>  
> _

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs
  2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
@ 2021-02-04 16:37   ` Dave Hansen
  2021-02-04 19:37   ` [tip: x86/urgent] x86/apic: Add " tip-bot2 for Dave Hansen
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 15+ messages in thread
From: Dave Hansen @ 2021-02-04 16:37 UTC (permalink / raw)
  To: Dave Hansen, linux-kernel; +Cc: jan.kiszka, x86, peterz

...
> Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
> Cc: x86@kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>

Don't know how I managed to miss it in the first place, but:

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [tip: x86/urgent] x86/apic: Add extra serialization for non-serializing MSRs
  2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
  2021-02-04 16:37   ` Dave Hansen
@ 2021-02-04 19:37   ` tip-bot2 for Dave Hansen
  2021-02-04 23:37   ` [RFC][PATCH 2/2] x86: add " Andrew Cooper
  2021-09-01 13:07   ` x86/apic: Add " Corey Minyard
  3 siblings, 0 replies; 15+ messages in thread
From: tip-bot2 for Dave Hansen @ 2021-02-04 19:37 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Jan Kiszka, Dave Hansen, Borislav Petkov, Peter Zijlstra (Intel),
	Thomas Gleixner, stable, x86, linux-kernel

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID:     25a068b8e9a4eb193d755d58efcb3c98928636e0
Gitweb:        https://git.kernel.org/tip/25a068b8e9a4eb193d755d58efcb3c98928636e0
Author:        Dave Hansen <dave.hansen@linux.intel.com>
AuthorDate:    Thu, 05 Mar 2020 09:47:08 -08:00
Committer:     Borislav Petkov <bp@suse.de>
CommitterDate: Thu, 04 Feb 2021 19:36:31 +01:00

x86/apic: Add extra serialization for non-serializing MSRs

Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a plain
MFENCE while the Intel SDM (10.12.3 MSR Access in x2APIC Mode) calls for
MFENCE; LFENCE.

Short summary: we have special MSRs that have weaker ordering than all
the rest. Add fencing consistent with current SDM recommendations.

This is not known to cause any issues in practice, only in theory.

Longer story below:

The reason the kernel uses a different semantic is that the SDM changed
(roughly in late 2017). The SDM changed because folks at Intel were
auditing all of the recommended fences in the SDM and realized that the
x2apic fences were insufficient.

Why was the pain MFENCE judged insufficient?

WRMSR itself is normally a serializing instruction. No fences are needed
because the instruction itself serializes everything.

But, there are explicit exceptions for this serializing behavior written
into the WRMSR instruction documentation for two classes of MSRs:
IA32_TSC_DEADLINE and the X2APIC MSRs.

Back to x2apic: WRMSR is *not* serializing in this specific case.
But why is MFENCE insufficient? MFENCE makes writes visible, but
only affects load/store instructions. WRMSR is unfortunately not a
load/store instruction and is unaffected by MFENCE. This means that a
non-serializing WRMSR could be reordered by the CPU to execute before
the writes made visible by the MFENCE have even occurred in the first
place.

This means that an x2apic IPI could theoretically be triggered before
there is any (visible) data to process.

Does this affect anything in practice? I honestly don't know. It seems
quite possible that by the time an interrupt gets to consume the (not
yet) MFENCE'd data, it has become visible, mostly by accident.

To be safe, add the SDM-recommended fences for all x2apic WRMSRs.

This also leaves open the question of the _other_ weakly-ordered WRMSR:
MSR_IA32_TSC_DEADLINE. While it has the same ordering architecture as
the x2APIC MSRs, it seems substantially less likely to be a problem in
practice. While writes to the in-memory Local Vector Table (LVT) might
theoretically be reordered with respect to a weakly-ordered WRMSR like
TSC_DEADLINE, the SDM has this to say:

  In x2APIC mode, the WRMSR instruction is used to write to the LVT
  entry. The processor ensures the ordering of this write and any
  subsequent WRMSR to the deadline; no fencing is required.

But, that might still leave xAPIC exposed. The safest thing to do for
now is to add the extra, recommended LFENCE.

 [ bp: Massage commit message, fix typos, drop accidentally added
   newline to tools/arch/x86/include/asm/barrier.h. ]

Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/20200305174708.F77040DD@viggo.jf.intel.com
---
 arch/x86/include/asm/apic.h           | 10 ----------
 arch/x86/include/asm/barrier.h        | 18 ++++++++++++++++++
 arch/x86/kernel/apic/apic.c           |  4 ++++
 arch/x86/kernel/apic/x2apic_cluster.c |  6 ++++--
 arch/x86/kernel/apic/x2apic_phys.c    |  9 ++++++---
 5 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 34cb3c1..412b51e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -197,16 +197,6 @@ static inline bool apic_needs_pit(void) { return true; }
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_X2APIC
-/*
- * Make previous memory operations globally visible before
- * sending the IPI through x2apic wrmsr. We need a serializing instruction or
- * mfence for this.
- */
-static inline void x2apic_wrmsr_fence(void)
-{
-	asm volatile("mfence" : : : "memory");
-}
-
 static inline void native_apic_msr_write(u32 reg, u32 v)
 {
 	if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 7f828fe..4819d5e 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -84,4 +84,22 @@ do {									\
 
 #include <asm-generic/barrier.h>
 
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions.  WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE.  The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier.  This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+	asm volatile("mfence; lfence" : : : "memory");
+}
+
 #endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6bd20c0..7f4c081 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -41,6 +41,7 @@
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <linux/atomic.h>
+#include <asm/barrier.h>
 #include <asm/mpspec.h>
 #include <asm/i8259.h>
 #include <asm/proto.h>
@@ -477,6 +478,9 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
+	/* This MSR is special and need a special fence: */
+	weak_wrmsr_fence();
+
 	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index df6adc5..f4da9bb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector)
 {
 	u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
 }
 
@@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 	unsigned long flags;
 	u32 dest;
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	local_irq_save(flags);
 
 	tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 0e4e819..6bde05a 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -43,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector)
 {
 	u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
 }
 
@@ -54,7 +55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 	unsigned long this_cpu;
 	unsigned long flags;
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 
 	local_irq_save(flags);
 
@@ -125,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which)
 {
 	unsigned long cfg = __prepare_ICR(which, vector, 0);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	native_x2apic_icr_write(cfg, 0);
 }
 

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs
  2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
  2021-02-04 16:37   ` Dave Hansen
  2021-02-04 19:37   ` [tip: x86/urgent] x86/apic: Add " tip-bot2 for Dave Hansen
@ 2021-02-04 23:37   ` Andrew Cooper
  2021-02-05  0:11     ` Andy Lutomirski
  2021-09-01 13:07   ` x86/apic: Add " Corey Minyard
  3 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2021-02-04 23:37 UTC (permalink / raw)
  To: Dave Hansen, linux-kernel; +Cc: jan.kiszka, x86, peterz

On 05/03/2020 17:47, Dave Hansen wrote:
> Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a
> plain "mfence" while the Intel SDM (10.12.3 MSR Access in x2APIC
> Mode) calls for "mfence;lfence".
>
> Short summary: we have special MSRs that have weaker ordering
> than all the rest.  Add fencing consistent with current SDM
> recommendatrions.
>
> This is not known to cause any issues in practice, only in
> theory.

So, I accept that Intel have their own reasons for what is written in
the SDM, but "not ordered with stores" is at best misleading.

The x2APIC (and other) MSRs, aren't serialising.  That's fine, as is the
fact that the WRMSR to trigger them doesn't have memory operands, and is
therefore not explicitly ordered with other loads and stores.

Consider:
    xor %edi, %edi
    movb (%rdi), %dl
    wrmsr

It is fine for a non-serialising wrmsr here to execute speculative in
terms of internal calculations, but nothing it does can escape the local
core until the movb has fully retired, and is therefore globally visible.

Otherwise, I can send IPIs from non-architectural paths (in this case,
behind a page fault), and causality is broken.

IPIs are (at minimum) a write-like-thing leaving the core, even if they
don't interact with the regular memory path, and their effects cannot
become visible until the effects of older instructions are visible.

What the SDM is trying to say is that this potentially matters for
writes queued in the WC buffers.

If some code is using WC memory, and wants to send an IPI, and wants to
have the remote IPI handler read said data, then yes - there is a
problem - but the problem is the lack of SFENCE required to make the WC
buffer visible in the first place.

WC code is already responsible for its own memory ordering, and the
x2APIC IPIs can't execute early even in the absence of architectural
ordering guarantees.

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs
  2021-02-04 23:37   ` [RFC][PATCH 2/2] x86: add " Andrew Cooper
@ 2021-02-05  0:11     ` Andy Lutomirski
  2021-02-05 10:02       ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Andy Lutomirski @ 2021-02-05  0:11 UTC (permalink / raw)
  To: Andrew Cooper, H. Peter Anvin
  Cc: Dave Hansen, LKML, Jan Kiszka, X86 ML, Peter Zijlstra

On Thu, Feb 4, 2021 at 3:37 PM Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> On 05/03/2020 17:47, Dave Hansen wrote:
> > Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a
> > plain "mfence" while the Intel SDM (10.12.3 MSR Access in x2APIC
> > Mode) calls for "mfence;lfence".
> >
> > Short summary: we have special MSRs that have weaker ordering
> > than all the rest.  Add fencing consistent with current SDM
> > recommendatrions.
> >
> > This is not known to cause any issues in practice, only in
> > theory.
>
> So, I accept that Intel have their own reasons for what is written in
> the SDM, but "not ordered with stores" is at best misleading.
>
> The x2APIC (and other) MSRs, aren't serialising.  That's fine, as is the
> fact that the WRMSR to trigger them doesn't have memory operands, and is
> therefore not explicitly ordered with other loads and stores.
>
> Consider:
>     xor %edi, %edi
>     movb (%rdi), %dl
>     wrmsr
>
> It is fine for a non-serialising wrmsr here to execute speculative in
> terms of internal calculations, but nothing it does can escape the local
> core until the movb has fully retired, and is therefore globally visible.
>
> Otherwise, I can send IPIs from non-architectural paths (in this case,
> behind a page fault), and causality is broken.

I'm wondering if a more mild violation is possible:

Initialize *addr = 0.

mov $1, (addr)
wrmsr

remote cpu's IDT vector:

mov (addr), %rax
%rax == 0!

There's no speculative-execution-becoming-visible-even-if-it-doesn't-retire
here -- there's just an ordering violation.  For Linux, this would
presumably only manifest as a potential deadlock or confusion if the
IPI vector code looks at the list of pending work and doesn't find the
expected work in it.

Dave?  hpa?  What is the SDM trying to tell us?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions
  2020-03-05 17:47 [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Dave Hansen
  2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
  2020-03-09 23:50 ` [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Sean Christopherson
@ 2021-02-05  9:31 ` Borislav Petkov
  2021-02-12 21:37   ` Arnaldo Carvalho de Melo
  2021-03-07  1:29 ` Dave Hansen
  2021-03-08 11:46 ` [tip: x86/cleanups] x86: Remove " tip-bot2 for Dave Hansen
  4 siblings, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2021-02-05  9:31 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo; +Cc: Dave Hansen, linux-kernel, x86, peterz

On Thu, Mar 05, 2020 at 09:47:06AM -0800, Dave Hansen wrote:
> 
> There are two definitions for the TSC deadline MSR in msr-index.h,
> one with an underscore and one without.  Axe one of them and move
> all the references over to the other one.
> 
> Cc: x86@kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>
> 
> ---
> 
>  b/arch/x86/include/asm/msr-index.h                       |    2 --
>  b/arch/x86/kvm/x86.c                                     |    6 +++---
>  b/tools/arch/x86/include/asm/msr-index.h                 |    2 --
>  b/tools/perf/trace/beauty/tracepoints/x86_msr.sh         |    2 +-
>  b/tools/testing/selftests/kvm/include/x86_64/processor.h |    2 --

acme, ACK to take the perf bits through tip?

Leaving in the rest.

>  5 files changed, 4 insertions(+), 10 deletions(-)
> 
> diff -puN arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/include/asm/msr-index.h
> --- a/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.049901042 -0800
> +++ b/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.062901042 -0800
> @@ -576,8 +576,6 @@
>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>  
> -#define MSR_IA32_TSCDEADLINE		0x000006e0
> -
>  #define MSR_IA32_UCODE_WRITE		0x00000079
>  #define MSR_IA32_UCODE_REV		0x0000008b
>  
> diff -puN arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/kvm/x86.c
> --- a/arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.051901042 -0800
> +++ b/arch/x86/kvm/x86.c	2020-03-05 09:42:37.065901042 -0800
> @@ -1200,7 +1200,7 @@ static const u32 emulated_msrs_all[] = {
>  	MSR_KVM_PV_EOI_EN,
>  
>  	MSR_IA32_TSC_ADJUST,
> -	MSR_IA32_TSCDEADLINE,
> +	MSR_IA32_TSC_DEADLINE,
>  	MSR_IA32_ARCH_CAPABILITIES,
>  	MSR_IA32_MISC_ENABLE,
>  	MSR_IA32_MCG_STATUS,
> @@ -2688,7 +2688,7 @@ int kvm_set_msr_common(struct kvm_vcpu *
>  		return kvm_set_apic_base(vcpu, msr_info);
>  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
>  		return kvm_x2apic_msr_write(vcpu, msr, data);
> -	case MSR_IA32_TSCDEADLINE:
> +	case MSR_IA32_TSC_DEADLINE:
>  		kvm_set_lapic_tscdeadline_msr(vcpu, data);
>  		break;
>  	case MSR_IA32_TSC_ADJUST:
> @@ -3009,7 +3009,7 @@ int kvm_get_msr_common(struct kvm_vcpu *
>  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
>  		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
>  		break;
> -	case MSR_IA32_TSCDEADLINE:
> +	case MSR_IA32_TSC_DEADLINE:
>  		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
>  		break;
>  	case MSR_IA32_TSC_ADJUST:
> diff -puN tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE tools/arch/x86/include/asm/msr-index.h
> --- a/tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.055901042 -0800
> +++ b/tools/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.066901042 -0800
> @@ -576,8 +576,6 @@
>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>  
> -#define MSR_IA32_TSCDEADLINE		0x000006e0
> -
>  #define MSR_IA32_UCODE_WRITE		0x00000079
>  #define MSR_IA32_UCODE_REV		0x0000008b
>  
> diff -puN tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE tools/perf/trace/beauty/tracepoints/x86_msr.sh
> --- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.057901042 -0800
> +++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh	2020-03-05 09:42:37.066901042 -0800
> @@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr
>  
>  printf "static const char *x86_MSRs[] = {\n"
>  regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*'
> -egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSCDEADLINE|IDT_FCR4)' | \
> +egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSC_DEADLINE|IDT_FCR4)' | \
>  	sed -r "s/$regex/\2 \1/g" | sort -n | \
>  	xargs printf "\t[%s] = \"%s\",\n"
>  printf "};\n\n"
> diff -puN tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE tools/testing/selftests/kvm/include/x86_64/processor.h
> --- a/tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.058901042 -0800
> +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h	2020-03-05 09:42:37.067901042 -0800
> @@ -813,8 +813,6 @@ void kvm_get_cpu_address_width(unsigned
>  #define		APIC_VECTOR_MASK	0x000FF
>  #define	APIC_ICR2	0x310
>  
> -#define MSR_IA32_TSCDEADLINE		0x000006e0
> -
>  #define MSR_IA32_UCODE_WRITE		0x00000079
>  #define MSR_IA32_UCODE_REV		0x0000008b
>  
> _

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs
  2021-02-05  0:11     ` Andy Lutomirski
@ 2021-02-05 10:02       ` Peter Zijlstra
  2021-02-05 12:08         ` Andrew Cooper
  2021-02-05 12:21         ` Peter Zijlstra
  0 siblings, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2021-02-05 10:02 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrew Cooper, H. Peter Anvin, Dave Hansen, LKML, Jan Kiszka, X86 ML

On Thu, Feb 04, 2021 at 04:11:12PM -0800, Andy Lutomirski wrote:
> I'm wondering if a more mild violation is possible:
> 
> Initialize *addr = 0.
> 
> mov $1, (addr)
> wrmsr
> 
> remote cpu's IDT vector:
> 
> mov (addr), %rax
> %rax == 0!
> 
> There's no speculative-execution-becoming-visible-even-if-it-doesn't-retire
> here -- there's just an ordering violation.  For Linux, this would
> presumably only manifest as a potential deadlock or confusion if the
> IPI vector code looks at the list of pending work and doesn't find the
> expected work in it.
> 
> Dave?  hpa?  What is the SDM trying to tell us?

[ Big caveat, I've not spoken to any hardware people about this. The
below is purely my own understanding. ]

This is my interpretation as well. Without the MFENCE+LFENCE there is no
guarantee the store is out of the store-buffer and the remote load isn't
guaranteed to observe it.

What I think the SDM is trying to tell us, is that the IPI, even if it
goes on the same regular coherency fabric as memory transfers, is not
subject to the regular memory ordering rules.

Normal TSO rules tells us that when:

P1() {
	x = 1;
	y = 1;
}

P2() {
	r1 = y;
	r2 = x;
}

r2 must not be 0 when r1 is 1. Because if we see store to y, we must
also see store to x. But the IPI thing doesn't behave like a store. The
(fast) wrmsr isn't even considered a memop.

The thing is, the above ordering does not guarantee we have r2 != 0.
r2==0 is allowed when r1==0. And that's an entirely sane outcome even if
we run the instructions like:

		CPU1		CPU2

cycle-1		mov $1, ([x])
cycle-2		mov $1, ([y])
cycle-3				mov ([y]), rax
cycle-4				mov ([x]), rbx

There is no guarantee _any_ of the stores will have made it out. And
that's exactly the issue. The IPI might make it out of the core before
any of the stores will.

Furthermore, since there is no dependency between:

	mov	$1, ([x])
	wrmsr

The CPU is allowed to reorder the execution and retire the wrmsr before
the store. Very much like it would for normal non-dependent
instructions.

And presumably it is still allowed to do that when we write it like:

	mov	$1, ([x])
	mfence
	wrmsr

because, mfence only has dependencies to memops and (fast) wrmsr is not
a memop.

Which then brings us to:

	mov	$1, ([x])
	mfence
	lfence
	wrmsr

In this case, the lfence acts like the newly minted ifence (see
spectre), and will block execution of (any) later instructions until
completion of all prior instructions. This, and only this ensures the
wrmsr happens after the mfence, which in turn ensures the store to x is
globally visible.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs
  2021-02-05 10:02       ` Peter Zijlstra
@ 2021-02-05 12:08         ` Andrew Cooper
  2021-02-05 12:21         ` Peter Zijlstra
  1 sibling, 0 replies; 15+ messages in thread
From: Andrew Cooper @ 2021-02-05 12:08 UTC (permalink / raw)
  To: Peter Zijlstra, Andy Lutomirski
  Cc: H. Peter Anvin, Dave Hansen, LKML, Jan Kiszka, X86 ML

On 05/02/2021 10:02, Peter Zijlstra wrote:
> On Thu, Feb 04, 2021 at 04:11:12PM -0800, Andy Lutomirski wrote:
>> I'm wondering if a more mild violation is possible:
>>
>> Initialize *addr = 0.
>>
>> mov $1, (addr)
>> wrmsr
>>
>> remote cpu's IDT vector:
>>
>> mov (addr), %rax
>> %rax == 0!
>>
>> There's no speculative-execution-becoming-visible-even-if-it-doesn't-retire
>> here -- there's just an ordering violation.  For Linux, this would
>> presumably only manifest as a potential deadlock or confusion if the
>> IPI vector code looks at the list of pending work and doesn't find the
>> expected work in it.
>>
>> Dave?  hpa?  What is the SDM trying to tell us?
> [ Big caveat, I've not spoken to any hardware people about this. The
> below is purely my own understanding. ]
>
> This is my interpretation as well. Without the MFENCE+LFENCE there is no
> guarantee the store is out of the store-buffer and the remote load isn't
> guaranteed to observe it.
>
> What I think the SDM is trying to tell us, is that the IPI, even if it
> goes on the same regular coherency fabric as memory transfers, is not
> subject to the regular memory ordering rules.
>
> Normal TSO rules tells us that when:
>
> P1() {
> 	x = 1;
> 	y = 1;
> }
>
> P2() {
> 	r1 = y;
> 	r2 = x;
> }
>
> r2 must not be 0 when r1 is 1. Because if we see store to y, we must
> also see store to x. But the IPI thing doesn't behave like a store. The
> (fast) wrmsr isn't even considered a memop.
>
> The thing is, the above ordering does not guarantee we have r2 != 0.
> r2==0 is allowed when r1==0. And that's an entirely sane outcome even if
> we run the instructions like:
>
> 		CPU1		CPU2
>
> cycle-1		mov $1, ([x])
> cycle-2		mov $1, ([y])
> cycle-3				mov ([y]), rax
> cycle-4				mov ([x]), rbx
>
> There is no guarantee _any_ of the stores will have made it out. And
> that's exactly the issue. The IPI might make it out of the core before
> any of the stores will.
>
> Furthermore, since there is no dependency between:
>
> 	mov	$1, ([x])
> 	wrmsr
>
> The CPU is allowed to reorder the execution and retire the wrmsr before
> the store. Very much like it would for normal non-dependent
> instructions.

Execution, sure (for details which don't escape the core, just like any
other speculative execution).  Retirement, surely not - it is inherently
tied to the program order of things.

Causality would also be broken if the WRMSR retired ahead of the MOV,
and an interrupt were to hit the boundary between them.

> And presumably it is still allowed to do that when we write it like:
>
> 	mov	$1, ([x])
> 	mfence
> 	wrmsr
>
> because, mfence only has dependencies to memops and (fast) wrmsr is not
> a memop.
>
> Which then brings us to:
>
> 	mov	$1, ([x])
> 	mfence
> 	lfence
> 	wrmsr
>
> In this case, the lfence acts like the newly minted ifence (see
> spectre), and will block execution of (any) later instructions until
> completion of all prior instructions. This, and only this ensures the
> wrmsr happens after the mfence, which in turn ensures the store to x is
> globally visible.

I understand that "what the architecture guarantees" differs from "how
parts behave in practice".

And I also understand the reasoning behind declaring that MFENCE;LFENCE
the only architecturally guaranteed way of ensuring all stores are
globally visible before the WRMSR starts.


However, what is missing is a explanation of how it is possible to build
a causality-preserving part where those fences (for plain stores) are
necessary in practice.

That sequence is a large set of pipeline stalls in practice for what
appears to a problem in theory only.

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs
  2021-02-05 10:02       ` Peter Zijlstra
  2021-02-05 12:08         ` Andrew Cooper
@ 2021-02-05 12:21         ` Peter Zijlstra
  1 sibling, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2021-02-05 12:21 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrew Cooper, H. Peter Anvin, Dave Hansen, LKML, Jan Kiszka, X86 ML

On Fri, Feb 05, 2021 at 11:02:10AM +0100, Peter Zijlstra wrote:

> And presumably it is still allowed to do that when we write it like:
> 
> 	mov	$1, ([x])
> 	mfence
> 	wrmsr
> 
> because, mfence only has dependencies to memops and (fast) wrmsr is not
> a memop.
> 
> Which then brings us to:
> 
> 	mov	$1, ([x])
> 	mfence
> 	lfence
> 	wrmsr
> 
> In this case, the lfence acts like the newly minted ifence (see
> spectre), and will block execution of (any) later instructions until
> completion of all prior instructions. This, and only this ensures the
> wrmsr happens after the mfence, which in turn ensures the store to x is
> globally visible.

Note that I too do have a few questions.

Supposedly MFENCE is our LOAD/STORE completion fence of choice, and this
obviously works with MMIO, since that's memops. The MMIO write of the
buffer address to the DMA device must happen after completion of the
previous data writes etc..

But what about the legacy IN/OUT ports? Are those memops? If not, we
might need additional LFENCEs there too.

Also, would SFENCE+LFENCE be sufficient for the WRMSR case? AFAIU SFENCE
is the store completion barrier and should be strong enough to flush all
store buffers. If not, why not?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions
  2021-02-05  9:31 ` Borislav Petkov
@ 2021-02-12 21:37   ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 15+ messages in thread
From: Arnaldo Carvalho de Melo @ 2021-02-12 21:37 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Dave Hansen, linux-kernel, x86, peterz

Em Fri, Feb 05, 2021 at 10:31:05AM +0100, Borislav Petkov escreveu:
> On Thu, Mar 05, 2020 at 09:47:06AM -0800, Dave Hansen wrote:
> > 
> > There are two definitions for the TSC deadline MSR in msr-index.h,
> > one with an underscore and one without.  Axe one of them and move
> > all the references over to the other one.
> > 
> > Cc: x86@kernel.org
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > 
> > ---
> > 
> >  b/arch/x86/include/asm/msr-index.h                       |    2 --
> >  b/arch/x86/kvm/x86.c                                     |    6 +++---
> >  b/tools/arch/x86/include/asm/msr-index.h                 |    2 --
> >  b/tools/perf/trace/beauty/tracepoints/x86_msr.sh         |    2 +-
> >  b/tools/testing/selftests/kvm/include/x86_64/processor.h |    2 --
> 
> acme, ACK to take the perf bits through tip?

Sure
 
> Leaving in the rest.
> 
> >  5 files changed, 4 insertions(+), 10 deletions(-)
> > 
> > diff -puN arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/include/asm/msr-index.h
> > --- a/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.049901042 -0800
> > +++ b/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.062901042 -0800
> > @@ -576,8 +576,6 @@
> >  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
> >  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
> >  
> > -#define MSR_IA32_TSCDEADLINE		0x000006e0
> > -
> >  #define MSR_IA32_UCODE_WRITE		0x00000079
> >  #define MSR_IA32_UCODE_REV		0x0000008b
> >  
> > diff -puN arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE arch/x86/kvm/x86.c
> > --- a/arch/x86/kvm/x86.c~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.051901042 -0800
> > +++ b/arch/x86/kvm/x86.c	2020-03-05 09:42:37.065901042 -0800
> > @@ -1200,7 +1200,7 @@ static const u32 emulated_msrs_all[] = {
> >  	MSR_KVM_PV_EOI_EN,
> >  
> >  	MSR_IA32_TSC_ADJUST,
> > -	MSR_IA32_TSCDEADLINE,
> > +	MSR_IA32_TSC_DEADLINE,
> >  	MSR_IA32_ARCH_CAPABILITIES,
> >  	MSR_IA32_MISC_ENABLE,
> >  	MSR_IA32_MCG_STATUS,
> > @@ -2688,7 +2688,7 @@ int kvm_set_msr_common(struct kvm_vcpu *
> >  		return kvm_set_apic_base(vcpu, msr_info);
> >  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
> >  		return kvm_x2apic_msr_write(vcpu, msr, data);
> > -	case MSR_IA32_TSCDEADLINE:
> > +	case MSR_IA32_TSC_DEADLINE:
> >  		kvm_set_lapic_tscdeadline_msr(vcpu, data);
> >  		break;
> >  	case MSR_IA32_TSC_ADJUST:
> > @@ -3009,7 +3009,7 @@ int kvm_get_msr_common(struct kvm_vcpu *
> >  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
> >  		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
> >  		break;
> > -	case MSR_IA32_TSCDEADLINE:
> > +	case MSR_IA32_TSC_DEADLINE:
> >  		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
> >  		break;
> >  	case MSR_IA32_TSC_ADJUST:
> > diff -puN tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE tools/arch/x86/include/asm/msr-index.h
> > --- a/tools/arch/x86/include/asm/msr-index.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.055901042 -0800
> > +++ b/tools/arch/x86/include/asm/msr-index.h	2020-03-05 09:42:37.066901042 -0800
> > @@ -576,8 +576,6 @@
> >  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
> >  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
> >  
> > -#define MSR_IA32_TSCDEADLINE		0x000006e0
> > -
> >  #define MSR_IA32_UCODE_WRITE		0x00000079
> >  #define MSR_IA32_UCODE_REV		0x0000008b
> >  
> > diff -puN tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE tools/perf/trace/beauty/tracepoints/x86_msr.sh
> > --- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.057901042 -0800
> > +++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh	2020-03-05 09:42:37.066901042 -0800
> > @@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr
> >  
> >  printf "static const char *x86_MSRs[] = {\n"
> >  regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*'
> > -egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSCDEADLINE|IDT_FCR4)' | \
> > +egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSC_DEADLINE|IDT_FCR4)' | \
> >  	sed -r "s/$regex/\2 \1/g" | sort -n | \
> >  	xargs printf "\t[%s] = \"%s\",\n"
> >  printf "};\n\n"
> > diff -puN tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE tools/testing/selftests/kvm/include/x86_64/processor.h
> > --- a/tools/testing/selftests/kvm/include/x86_64/processor.h~kill-dup-MSR_IA32_TSCDEADLINE	2020-03-05 09:42:37.058901042 -0800
> > +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h	2020-03-05 09:42:37.067901042 -0800
> > @@ -813,8 +813,6 @@ void kvm_get_cpu_address_width(unsigned
> >  #define		APIC_VECTOR_MASK	0x000FF
> >  #define	APIC_ICR2	0x310
> >  
> > -#define MSR_IA32_TSCDEADLINE		0x000006e0
> > -
> >  #define MSR_IA32_UCODE_WRITE		0x00000079
> >  #define MSR_IA32_UCODE_REV		0x0000008b
> >  
> > _
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://people.kernel.org/tglx/notes-about-netiquette

-- 

- Arnaldo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions
  2020-03-05 17:47 [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Dave Hansen
                   ` (2 preceding siblings ...)
  2021-02-05  9:31 ` Borislav Petkov
@ 2021-03-07  1:29 ` Dave Hansen
  2021-03-08 11:46 ` [tip: x86/cleanups] x86: Remove " tip-bot2 for Dave Hansen
  4 siblings, 0 replies; 15+ messages in thread
From: Dave Hansen @ 2021-03-07  1:29 UTC (permalink / raw)
  To: Dave Hansen, linux-kernel; +Cc: x86, peterz

On 3/5/20 9:47 AM, Dave Hansen wrote:
> There are two definitions for the TSC deadline MSR in msr-index.h,
> one with an underscore and one without.  Axe one of them and move
> all the references over to the other one.
> 
> Cc: x86@kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>

Better late than never:

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [tip: x86/cleanups] x86: Remove duplicate TSC DEADLINE MSR definitions
  2020-03-05 17:47 [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Dave Hansen
                   ` (3 preceding siblings ...)
  2021-03-07  1:29 ` Dave Hansen
@ 2021-03-08 11:46 ` tip-bot2 for Dave Hansen
  4 siblings, 0 replies; 15+ messages in thread
From: tip-bot2 for Dave Hansen @ 2021-03-08 11:46 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Dave Hansen, Borislav Petkov, x86, linux-kernel

The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID:     09141ec0e4efede4fb5e2aa68cb819fba974325c
Gitweb:        https://git.kernel.org/tip/09141ec0e4efede4fb5e2aa68cb819fba974325c
Author:        Dave Hansen <dave.hansen@linux.intel.com>
AuthorDate:    Thu, 05 Mar 2020 09:47:06 -08:00
Committer:     Borislav Petkov <bp@suse.de>
CommitterDate: Mon, 08 Mar 2021 11:05:20 +01:00

x86: Remove duplicate TSC DEADLINE MSR definitions

There are two definitions for the TSC deadline MSR in msr-index.h,
one with an underscore and one without.  Axe one of them and move
all the references over to the other one.

 [ bp: Fixup the MSR define in handle_fastpath_set_msr_irqoff() too. ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20200305174706.0D6B8EE4@viggo.jf.intel.com
---
 arch/x86/include/asm/msr-index.h               | 2 --
 arch/x86/kvm/x86.c                             | 8 ++++----
 tools/arch/x86/include/asm/msr-index.h         | 2 --
 tools/perf/trace/beauty/tracepoints/x86_msr.sh | 2 +-
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 546d6ec..4502935 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -628,8 +628,6 @@
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
 #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
 
-#define MSR_IA32_TSCDEADLINE		0x000006e0
-
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a20ce6..c020499 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1288,7 +1288,7 @@ static const u32 emulated_msrs_all[] = {
 	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
 
 	MSR_IA32_TSC_ADJUST,
-	MSR_IA32_TSCDEADLINE,
+	MSR_IA32_TSC_DEADLINE,
 	MSR_IA32_ARCH_CAPABILITIES,
 	MSR_IA32_PERF_CAPABILITIES,
 	MSR_IA32_MISC_ENABLE,
@@ -1841,7 +1841,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 			ret = EXIT_FASTPATH_EXIT_HANDLED;
 		}
 		break;
-	case MSR_IA32_TSCDEADLINE:
+	case MSR_IA32_TSC_DEADLINE:
 		data = kvm_read_edx_eax(vcpu);
 		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
 			kvm_skip_emulated_instruction(vcpu);
@@ -3075,7 +3075,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		return kvm_set_apic_base(vcpu, msr_info);
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
 		return kvm_x2apic_msr_write(vcpu, msr, data);
-	case MSR_IA32_TSCDEADLINE:
+	case MSR_IA32_TSC_DEADLINE:
 		kvm_set_lapic_tscdeadline_msr(vcpu, data);
 		break;
 	case MSR_IA32_TSC_ADJUST:
@@ -3437,7 +3437,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
 		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
-	case MSR_IA32_TSCDEADLINE:
+	case MSR_IA32_TSC_DEADLINE:
 		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
 		break;
 	case MSR_IA32_TSC_ADJUST:
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 546d6ec..4502935 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -628,8 +628,6 @@
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
 #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
 
-#define MSR_IA32_TSCDEADLINE		0x000006e0
-
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
diff --git a/tools/perf/trace/beauty/tracepoints/x86_msr.sh b/tools/perf/trace/beauty/tracepoints/x86_msr.sh
index 27ee1ea..9b0614a 100755
--- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh
+++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh
@@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr-index.h
 
 printf "static const char *x86_MSRs[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*'
-egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSCDEADLINE|UCODE_REV)|IDT_FCR4)' | \
+egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSC_DEADLINE|UCODE_REV)|IDT_FCR4)' | \
 	sed -r "s/$regex/\2 \1/g" | sort -n | \
 	xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n\n"

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: x86/apic: Add extra serialization for non-serializing MSRs
  2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
                     ` (2 preceding siblings ...)
  2021-02-04 23:37   ` [RFC][PATCH 2/2] x86: add " Andrew Cooper
@ 2021-09-01 13:07   ` Corey Minyard
  3 siblings, 0 replies; 15+ messages in thread
From: Corey Minyard @ 2021-09-01 13:07 UTC (permalink / raw)
  To: Linux Kernel, Dave Hansen
  Cc: Jan Kiszka, Borislav Petkov, Peter Zijlstra, Thomas Gleixner

> Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a
> plain "mfence" while the Intel SDM (10.12.3 MSR Access in x2APIC
> Mode) calls for "mfence;lfence".
> 
> Short summary: we have special MSRs that have weaker ordering
> than all the rest.  Add fencing consistent with current SDM
> recommendatrions.
> 
> This is not known to cause any issues in practice, only in
> theory.

This appears to not just be theoretical.  I was working a problem for a
customer running 52 and 104 core Intel-based Dell machine.  After
looking at the coredumps a while, I realized the only common thread was
it seemed IPIs were very occasionally not working.  This was on a 3.10
based kernel.

I stumbled upon this patch, we tried it and the problems appear to have
gone away.

It appears to only happen on machines with a specific microcode version,
though we haven't tested enough to be 100% sure about that.  We had two
labs with different microcode versions.  One lab never experienced the
problem, the other had the issue occurring regularly.

Anyway, I thought I would add a report and thank you for the patch,
since it saved me a world of trouble.

-corey

> 
> Longer story below:
> 
> The reason the kernel uses a different semantic is that the SDM
> changed (roughly in late 2017).  The SDM changed because folks at
> Intel were auditing all of the recommended fences in the SDM and
> realized that the x2apic fences were insufficient.
> 
> Why was the pain "mfence" judged insufficient?
> 
> WRMSR itself is normally a serializing instruction.  No fences
> are needed because because the instruction itself serializes
> everything.
> 
> But, there are explicit exceptions for this serializing behavior
> written into the WRMSR instruction documentation for two classes
> of MSRs: IA32_TSC_DEADLINE and the X2APIC MSRs.
> 
> Back to x2apic: WRMSR is *not* serializing in this specific case.
> But why is MFENCE insufficient?  MFENCE makes writes visible, but
> only affects load/store instructions.  WRMSR is unfortunately not
> a load/store instruction and is unaffected by MEFNCE.  This means
> that a non-serializing WRMSR could be reordered by the CPU to
> execute before the writes made visible by the MFENCE have even
> occurred in the first place.
> 
> This mean that an x2apic IPI could theoretically be triggered
> before there is any (visible) data to process.
> 
> Does this affect anything in practice?  I honestly don't know.
> It seems quite possible that by the time an interrupt gets to
> consume the (not yet) MFENCE'd data, it has become visible,
> mostly by accident.
> 
> To be safe, add the SDM-recommended fences for all x2apic WRMSRs.
> 
> This also leaves open the question of the _other_ weakly-ordered
> WRMSR: MSR_IA32_TSC_DEADLINE.  While it has the same ordering
> architecture as the x2APIC MSRs, it seems substantially less
> likely to be a problem in practice.  While writes to the
> in-memory Local Vector Table (LVT) might theoretically be
> reordered with respect to a weakly-ordered WRMSR like
> TSC_DEADLINE, the SDM has this to say:
> 
> 	In x2APIC mode, the WRMSR instruction is used to write to
> 	the LVT entry. The processor ensures the ordering of this
> 	write and any subsequent WRMSR to the deadline; no
> 	fencing is required.
> 
> But, that might still leave xAPIC exposed.  The safest thing to
> do for now is to add the extra, recommended LFENCE.
> 
> Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
> Cc: x86@kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>
> 
> ---
> 
>  b/arch/x86/include/asm/apic.h           |   10 ----------
>  b/arch/x86/include/asm/barrier.h        |   18 ++++++++++++++++++
>  b/arch/x86/kernel/apic/apic.c           |    4 ++++
>  b/arch/x86/kernel/apic/x2apic_cluster.c |    6 ++++--
>  b/arch/x86/kernel/apic/x2apic_phys.c    |    9 ++++++---
>  b/tools/arch/x86/include/asm/barrier.h  |    1 +
>  6 files changed, 33 insertions(+), 15 deletions(-)
> 
> diff -puN arch/x86/include/asm/apic.h~x2apic-wrmsr-serialization \
>                 arch/x86/include/asm/apic.h
> --- a/arch/x86/include/asm/apic.h~x2apic-wrmsr-serialization	2020-03-05 \
>                 09:42:38.876901038 -0800
> +++ b/arch/x86/include/asm/apic.h	2020-03-05 09:42:38.891901038 -0800
> @@ -195,16 +195,6 @@ static inline bool apic_needs_pit(void)
>  #endif /* !CONFIG_X86_LOCAL_APIC */
> 
>  #ifdef CONFIG_X86_X2APIC
> -/*
> - * Make previous memory operations globally visible before
> - * sending the IPI through x2apic wrmsr. We need a serializing instruction or
> - * mfence for this.
> - */
> -static inline void x2apic_wrmsr_fence(void)
> -{
> -	asm volatile("mfence" : : : "memory");
> -}
> -
>  static inline void native_apic_msr_write(u32 reg, u32 v)
>  {
>  	if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
> diff -puN arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization \
>                 arch/x86/include/asm/barrier.h
> --- a/arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization	2020-03-05 \
>                 09:42:38.878901038 -0800
> +++ b/arch/x86/include/asm/barrier.h	2020-03-05 09:42:38.893901038 -0800
> @@ -84,4 +84,22 @@ do {									\
> 
>  #include <asm-generic/barrier.h>
> 
> +/*
> + * Make previous memory operations globally visible before
> + * a WRMSR.
> + *
> + * MFENCE makes writes visible, but only affects load/store
> + * instructions.  WRMSR is unfortunately not a load/store
> + * instruction and is unaffected by MEFNCE.  The LFENCE ensures
> + * that the WRMSR is not reordered.
> + *
> + * Most WRMSRs are full serializing instructions themselves and
> + * do not require this barrier.  This is only required for the
> + * IA32_TSC_DEADLINE and X2APIC MSRs.
> + */
> +static inline void weak_wrmsr_fence(void)
> +{
> +	asm volatile("mfence; lfence" : : : "memory");
> +}
> +
>  #endif /* _ASM_X86_BARRIER_H */
> diff -puN arch/x86/kernel/apic/apic.c~x2apic-wrmsr-serialization \
>                 arch/x86/kernel/apic/apic.c
> --- a/arch/x86/kernel/apic/apic.c~x2apic-wrmsr-serialization	2020-03-05 \
>                 09:42:38.880901038 -0800
> +++ b/arch/x86/kernel/apic/apic.c	2020-03-05 09:42:38.892901038 -0800
> @@ -42,6 +42,7 @@
>  #include <asm/x86_init.h>
>  #include <asm/pgalloc.h>
>  #include <linux/atomic.h>
> +#include <asm/barrier.h>
>  #include <asm/mpspec.h>
>  #include <asm/i8259.h>
>  #include <asm/proto.h>
> @@ -474,6 +475,9 @@ static int lapic_next_deadline(unsigned
>  {
>  	u64 tsc;
> 
> +	/* This MSR is special and need a special fence: */
> +	weak_wrmsr_fence();
> +
>  	tsc = rdtsc();
>  	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
>  	return 0;
> diff -puN arch/x86/kernel/apic/x2apic_cluster.c~x2apic-wrmsr-serialization \
>                 arch/x86/kernel/apic/x2apic_cluster.c
> --- a/arch/x86/kernel/apic/x2apic_cluster.c~x2apic-wrmsr-serialization	2020-03-05 \
>                 09:42:38.882901038 -0800
> +++ b/arch/x86/kernel/apic/x2apic_cluster.c	2020-03-05 09:42:38.892901038 -0800
> @@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int
>  {
>  	u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
> 
> -	x2apic_wrmsr_fence();
> +	/* x2apic MSRs are special and need a special fence: */
> +	weak_wrmsr_fence();
>  	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
>  }
> 
> @@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpum
>  	unsigned long flags;
>  	u32 dest;
> 
> -	x2apic_wrmsr_fence();
> +	/* x2apic MSRs are special and need a special fence: */
> +	weak_wrmsr_fence();
>  	local_irq_save(flags);
> 
>  	tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
> diff -puN arch/x86/kernel/apic/x2apic_phys.c~x2apic-wrmsr-serialization \
>                 arch/x86/kernel/apic/x2apic_phys.c
> --- a/arch/x86/kernel/apic/x2apic_phys.c~x2apic-wrmsr-serialization	2020-03-05 \
>                 09:42:38.885901038 -0800
> +++ b/arch/x86/kernel/apic/x2apic_phys.c	2020-03-05 09:42:38.892901038 -0800
> @@ -37,7 +37,8 @@ static void x2apic_send_IPI(int cpu, int
>  {
>  	u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
> 
> -	x2apic_wrmsr_fence();
> +	/* x2apic MSRs are special and need a special fence: */
> +	weak_wrmsr_fence();
>  	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
>  }
> 
> @@ -48,7 +49,8 @@ __x2apic_send_IPI_mask(const struct cpum
>  	unsigned long this_cpu;
>  	unsigned long flags;
> 
> -	x2apic_wrmsr_fence();
> +	/* x2apic MSRs are special and need a special fence: */
> +	weak_wrmsr_fence();
> 
>  	local_irq_save(flags);
> 
> @@ -116,7 +118,8 @@ void __x2apic_send_IPI_shorthand(int vec
>  {
>  	unsigned long cfg = __prepare_ICR(which, vector, 0);
> 
> -	x2apic_wrmsr_fence();
> +	/* x2apic MSRs are special and need a special fence: */
> +	weak_wrmsr_fence();
>  	native_x2apic_icr_write(cfg, 0);
>  }
> 
> diff -puN tools/arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization \
>                 tools/arch/x86/include/asm/barrier.h
> --- a/tools/arch/x86/include/asm/barrier.h~x2apic-wrmsr-serialization	2020-03-05 \
>                 09:42:38.887901038 -0800
> +++ b/tools/arch/x86/include/asm/barrier.h	2020-03-05 09:42:38.892901038 -0800
> @@ -43,4 +43,5 @@ do {						\
>  	___p1;					\
>  })
>  #endif /* defined(__x86_64__) */
> +
>  #endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2021-09-01 13:08 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-05 17:47 [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Dave Hansen
2020-03-05 17:47 ` [RFC][PATCH 2/2] x86: add extra serialization for non-serializing MSRs Dave Hansen
2021-02-04 16:37   ` Dave Hansen
2021-02-04 19:37   ` [tip: x86/urgent] x86/apic: Add " tip-bot2 for Dave Hansen
2021-02-04 23:37   ` [RFC][PATCH 2/2] x86: add " Andrew Cooper
2021-02-05  0:11     ` Andy Lutomirski
2021-02-05 10:02       ` Peter Zijlstra
2021-02-05 12:08         ` Andrew Cooper
2021-02-05 12:21         ` Peter Zijlstra
2021-09-01 13:07   ` x86/apic: Add " Corey Minyard
2020-03-09 23:50 ` [RFC][PATCH 1/2] x86: remove duplicate TSC DEADLINE MSR definitions Sean Christopherson
2021-02-05  9:31 ` Borislav Petkov
2021-02-12 21:37   ` Arnaldo Carvalho de Melo
2021-03-07  1:29 ` Dave Hansen
2021-03-08 11:46 ` [tip: x86/cleanups] x86: Remove " tip-bot2 for Dave Hansen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).