On 10/12/2018 17:53, speck for Andi Kleen wrote:
> diff --git a/arch/x86/lib/clear_cpu.S b/arch/x86/lib/clear_cpu.S
> new file mode 100644
> index 000000000000..5af33baf5427
> --- /dev/null
> +++ b/arch/x86/lib/clear_cpu.S
> @@ -0,0 +1,107 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/linkage.h>
> +#include <asm/alternative-asm.h>
> +#include <asm/cpufeatures.h>
> +
> +/*
> + * Clear internal CPU buffers on kernel boundaries.
> + *
> + * These sequences are somewhat fragile, please don't add
> + * or change instructions in the middle of the areas marked with
> + * start/end.
> + *
> + * Interrupts and NMIs we deal with by reclearing. We clear parts
> + * of the kernel stack, which has other advantages too.
> + *
> + * Save all registers to make it easier to use for callers.
> + *
> + * This sequence is for Nehalem-IvyBridge. For Haswell we jump
> + * to hsw_clear_cpu.
> + *
> + * These functions need to be called on a full stack, as they may
> + * use up to 1.5k of stack. They should also be called with
> + * interrupts disabled. NMIs etc. are handled by letting every
> + * NMI do its own clear sequence.
> + */
> +ENTRY(ivb_clear_cpu)
> +GLOBAL(do_clear_cpu)
> +	/*
> +	 * obj[tf]ool complains about unreachable code here,
> +	 * which appears to be spurious?!?
> +	 */
> +	ALTERNATIVE "", "jmp hsw_clear_cpu", X86_BUG_MDS_CLEAR_CPU_HSW
> +	push %__ASM_REG(si)
> +	push %__ASM_REG(di)
> +	push %__ASM_REG(cx)
> +	mov %_ASM_SP, %__ASM_REG(si)
> +	sub $2*16, %_ASM_SP
> +	and $-16,%_ASM_SP
> +	movdqa %xmm0, (%_ASM_SP)
> +	movdqa %xmm1, 1*16(%_ASM_SP)

You don't need to preserve %xmm1 here.  It is unmodified by the
sequence, because the orpd pulls zero out of its memory operand.
Similarly...

> +	sub $672, %_ASM_SP
> +	xorpd %xmm0,%xmm0
> +	movdqa %xmm0, (%_ASM_SP)
> +	movdqa %xmm0, 16(%_ASM_SP)

... this store doesn't appear to do anything useful, as that stack
slot isn't read again, and...

> +	mov %_ASM_SP, %__ASM_REG(di)
> +	/* Clear sequence start */
> +	movdqu %xmm0,(%__ASM_REG(di))
> +	lfence
> +	orpd (%__ASM_REG(di)), %xmm0
> +	orpd (%__ASM_REG(di)), %xmm1
> +	mfence
> +	movl $40, %ecx
> +	add $32, %__ASM_REG(di)

... I know this was in the recommended sequence, but bytes 16-31
aren't used at all, and it feels fishy.  Either this wants to be $16,
or the second orpd wants a displacement of 16 (and we do need to
retain the second zeroing write) so that all 32 bytes are used.
Untested sketches of both options are at the end of this mail.

> +1:	movntdq %xmm0, (%__ASM_REG(di))
> +	add $16, %__ASM_REG(di)
> +	decl %ecx
> +	jnz 1b
> +	mfence
> +	/* Clear sequence end */
> +	add $672, %_ASM_SP
> +	movdqu (%_ASM_SP), %xmm0
> +	movdqu 1*16(%_ASM_SP), %xmm1
> +	mov %__ASM_REG(si),%_ASM_SP
> +	pop %__ASM_REG(cx)
> +	pop %__ASM_REG(di)
> +	pop %__ASM_REG(si)
> +	ret
> +END(ivb_clear_cpu)
> +
> +/*
> + * Version for Haswell/Broadwell.
> + */
> +ENTRY(hsw_clear_cpu)
> +	push %__ASM_REG(si)
> +	push %__ASM_REG(di)
> +	push %__ASM_REG(cx)
> +	push %__ASM_REG(ax)
> +	mov %_ASM_SP, %__ASM_REG(ax)
> +	sub $16, %_ASM_SP
> +	and $-16,%_ASM_SP
> +	movdqa %xmm0, (%_ASM_SP)
> +	/* Clear sequence start */
> +	xorpd %xmm0,%xmm0
> +	sub $1536,%_ASM_SP
> +	mov %_ASM_SP, %__ASM_REG(si)
> +	mov %__ASM_REG(si), %__ASM_REG(di)

Strictly speaking, these 3 instructions aren't part of the clear
sequence.  They are just for setting up the buffer.  In particular,
the sub $1536, %rsp is unbalanced WRT its matching add.
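Something like this (untested; just hoisting the buffer setup above
the marker and reordering it against the register-only xorpd) would
keep the start/end markers honest:

	movdqa %xmm0, (%_ASM_SP)
	/* Buffer setup - not part of the fragile sequence */
	sub $1536,%_ASM_SP
	mov %_ASM_SP, %__ASM_REG(si)
	mov %__ASM_REG(si), %__ASM_REG(di)
	/* Clear sequence start */
	xorpd %xmm0,%xmm0
	movl $40,%ecx

That way both stack adjustments sit outside the fragile region, and
the sub $1536 visibly pairs with the add $1536 after "Clear sequence
end".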
~Andrew

> +	movl $40,%ecx
> +1:	movntdq %xmm0, (%__ASM_REG(di))
> +	add $16, %__ASM_REG(di)
> +	decl %ecx
> +	jnz 1b
> +	mfence
> +	mov %__ASM_REG(si), %__ASM_REG(di)
> +	mov $1536, %ecx
> +	rep movsb
> +	lfence
> +	/* Clear sequence end */
> +	add $1536,%_ASM_SP
> +	movdqa (%_ASM_SP), %xmm0
> +	mov %__ASM_REG(ax),%_ASM_SP
> +	pop %__ASM_REG(ax)
> +	pop %__ASM_REG(cx)
> +	pop %__ASM_REG(di)
> +	pop %__ASM_REG(si)
> +	ret
> +END(hsw_clear_cpu)
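P.S. For reference, untested sketches of the two options for the
IvyBridge read-back, everything else unchanged:

Option 1 - only bytes 0-15 are used, so only step past 16 bytes:

	movdqu %xmm0,(%__ASM_REG(di))
	lfence
	orpd (%__ASM_REG(di)), %xmm0
	orpd (%__ASM_REG(di)), %xmm1	/* ORs in zero: %xmm1 unmodified */
	mfence
	movl $40, %ecx
	add $16, %__ASM_REG(di)		/* was $32 */

Option 2 - keep the second zeroing write and read both 16-byte slots
back, so that all 32 bytes are used:

	movdqa %xmm0, (%_ASM_SP)
	movdqa %xmm0, 16(%_ASM_SP)	/* retained: read back below */
	mov %_ASM_SP, %__ASM_REG(di)
	/* Clear sequence start */
	movdqu %xmm0,(%__ASM_REG(di))
	lfence
	orpd (%__ASM_REG(di)), %xmm0
	orpd 16(%__ASM_REG(di)), %xmm1	/* was (%__ASM_REG(di)) */
	mfence
	movl $40, %ecx
	add $32, %__ASM_REG(di)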