* [RFC] x86/kvm/emulate: Avoid RET for fastops
From: Peter Zijlstra @ 2023-11-12 20:12 UTC
  To: Sean Christopherson, Paolo Bonzini, Josh Poimboeuf, Thomas Gleixner
  Cc: linux-kernel, x86

Hi,

Inspired by the likes of ba5ca5e5e6a1 ("x86/retpoline: Don't clobber
RFLAGS during srso_safe_ret()") I had it on my TODO to look at this,
because the call-depth-tracking rethunk definitely also clobbers flags
and that's a ton harder to fix.

Looking at this recently I noticed that there's really only one callsite
(twice, the testcc thing is basically separate from the rest of the
fastop stuff) and thus CALL+RET is totally silly, we can JMP+JMP.
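
As a rough illustration of the shape (a userspace-only sketch, not the
patch below; the names do_add and run_op are made up, and it assumes a
plain x86-64 build with no retpolines or IBT), the handler is entered
with a JMP and ends with a JMP back to a label inside the caller's asm,
relying on the same jump-out-and-back-into-the-asm trick as the real
thing:

#include <stdio.h>

/* A tiny "fastop"-style handler: add %rdx into %rax, then jump back. */
asm(".pushsection .text\n"
    ".globl do_add\n"
    "do_add:\n\t"
    "add %rdx, %rax\n\t"
    "jmp fastop_return\n"
    ".popsection\n");

extern void do_add(void);

static __attribute__((noinline)) long run_op(void (*op)(void), long a, long b)
{
	/* JMP into the handler; it JMPs back to fastop_return below. */
	asm("jmp *%[op]\n\t"
	    "fastop_return:\n"
	    : "+a"(a), "+d"(b)
	    : [op] "r" (op)
	    : "cc");
	return a;
}

int main(void)
{
	printf("%ld\n", run_op(do_add, 2, 3));	/* prints 5 */
	return 0;
}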

The below implements this, and aside from objtool going apeshit (it
fails to recognise the fastop JMP_NOSPEC as a jump-table and instead
classifies it as a tail-call), it actually builds and the asm looks
sensible enough.

I've not yet figured out how to test this stuff, but does something like
this look sane to you guys?

Given that rethunks are quite fat and slow, this could be sold as a
performance optimization I suppose.

---

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f93e9b96927a..2cd3b5a46e7a 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -412,6 +412,17 @@ static inline void call_depth_return_thunk(void) {}
 	"call *%[thunk_target]\n",				\
 	X86_FEATURE_RETPOLINE_LFENCE)
 
+# define JMP_NOSPEC						\
+	ALTERNATIVE_2(						\
+	ANNOTATE_RETPOLINE_SAFE					\
+	"jmp *%[thunk_target]\n",				\
+	"jmp __x86_indirect_thunk_%V[thunk_target]\n",		\
+	X86_FEATURE_RETPOLINE,					\
+	"lfence;\n"						\
+	ANNOTATE_RETPOLINE_SAFE					\
+	"jmp *%[thunk_target]\n",				\
+	X86_FEATURE_RETPOLINE_LFENCE)
+
 # define THUNK_TARGET(addr) [thunk_target] "r" (addr)
 
 #else /* CONFIG_X86_32 */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2673cd5c46cb..9aae15d223a8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -294,7 +294,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
 
 #define __FOP_FUNC(name) \
 	".align " __stringify(FASTOP_SIZE) " \n\t" \
-	".type " name ", @function \n\t" \
 	name ":\n\t" \
 	ASM_ENDBR \
 	IBT_NOSEAL(name)
@@ -302,12 +301,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
 #define FOP_FUNC(name) \
 	__FOP_FUNC(#name)
 
-#define __FOP_RET(name) \
-	"11: " ASM_RET \
+#define __FOP_JMP(name, label) \
+	"11: jmp " label " ; int3 \n\t" \
 	".size " name ", .-" name "\n\t"
 
-#define FOP_RET(name) \
-	__FOP_RET(#name)
+#define FOP_JMP(name, label) \
+	__FOP_JMP(#name, #label)
+
+#define __FOP_RET(name) \
+	__FOP_JMP(name, "fastop_return")
 
 #define __FOP_START(op, align) \
 	extern void em_##op(struct fastop *fake); \
@@ -420,7 +422,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
 #define FOP_SETCC(op) \
 	FOP_FUNC(op) \
 	#op " %al \n\t" \
-	FOP_RET(op)
+	FOP_JMP(op, setcc_return)
 
 FOP_START(setcc)
 FOP_SETCC(seto)
@@ -444,7 +446,7 @@ FOP_END;
 FOP_START(salc)
 FOP_FUNC(salc)
 "pushf; sbb %al, %al; popf \n\t"
-FOP_RET(salc)
+FOP_JMP(salc, fastop_return)
 FOP_END;
 
 /*
@@ -1061,13 +1063,13 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
 	return fastop(ctxt, em_bsr);
 }
 
-static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
+static noinline u8 test_cc(unsigned int condition, unsigned long flags)
 {
 	u8 rc;
 	void (*fop)(void) = (void *)em_setcc + FASTOP_SIZE * (condition & 0xf);
 
 	flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
-	asm("push %[flags]; popf; " CALL_NOSPEC
+	asm("push %[flags]; popf; " JMP_NOSPEC "; setcc_return:"
 	    : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
 	return rc;
 }
@@ -5101,14 +5103,14 @@ static void fetch_possible_mmx_operand(struct operand *op)
 		kvm_read_mmx_reg(op->addr.mm, &op->mm_val);
 }
 
-static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
+static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
 {
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
 
 	if (!(ctxt->d & ByteOp))
 		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
 
-	asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
+	asm("push %[flags]; popf; " JMP_NOSPEC " ; fastop_return: ; pushf; pop %[flags]\n"
 	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
 	      [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
 	    : "c"(ctxt->src2.val));


* Re: [RFC] x86/kvm/emulate: Avoid RET for fastops
From: Sean Christopherson @ 2023-11-29  1:37 UTC
  To: Peter Zijlstra
  Cc: Paolo Bonzini, Josh Poimboeuf, Thomas Gleixner, linux-kernel, x86

On Sun, Nov 12, 2023, Peter Zijlstra wrote:
> Hi,
> 
> Inspired by the likes of ba5ca5e5e6a1 ("x86/retpoline: Don't clobber
> RFLAGS during srso_safe_ret()") I had it on my TODO to look at this,
> because the call-depth-tracking rethunk definitely also clobbers flags
> and that's a ton harder to fix.
> 
> Looking at this recently I noticed that there's really only one callsite
> (twice, the testcc thing is basically separate from the rest of the
> fastop stuff) and thus CALL+RET is totally silly, we can JMP+JMP.
> 
> The below implements this, and aside from objtool going apeshit (it
> fails to recognise the fastop JMP_NOSPEC as a jump-table and instead
> classifies it as a tail-call), it actually builds and the asm looks
> sensible enough.
> 
> I've not yet figured out how to test this stuff, but does something like
> this look sane to you guys?

Yes?  The idea seems sound, but I haven't thought _that_ hard about whether or not
there are any possible gotchas.  I did a quick test and nothing exploded (and
usually when this code breaks, it breaks spectacularly).

> Given that rethunks are quite fat and slow, this could be sold as a
> performance optimization I suppose.
> 
> ---
> 
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index f93e9b96927a..2cd3b5a46e7a 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -412,6 +412,17 @@ static inline void call_depth_return_thunk(void) {}
>  	"call *%[thunk_target]\n",				\
>  	X86_FEATURE_RETPOLINE_LFENCE)
>  
> +# define JMP_NOSPEC						\
> +	ALTERNATIVE_2(						\
> +	ANNOTATE_RETPOLINE_SAFE					\
> +	"jmp *%[thunk_target]\n",				\
> +	"jmp __x86_indirect_thunk_%V[thunk_target]\n",		\
> +	X86_FEATURE_RETPOLINE,					\
> +	"lfence;\n"						\
> +	ANNOTATE_RETPOLINE_SAFE					\
> +	"jmp *%[thunk_target]\n",				\
> +	X86_FEATURE_RETPOLINE_LFENCE)

There needs to be a 32-bit version (eww) and a CONFIG_RETPOLINE=n version. :-/
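
Purely as a guess (not from the patch or this thread): when retpolines
are compiled out there is no __x86_indirect_thunk_* symbol to reference,
so the fallback could presumably just be a plain indirect branch, e.g.
something along the lines of:

/* hypothetical !CONFIG_RETPOLINE fallback, untested */
# define JMP_NOSPEC	"jmp *%[thunk_target]\n"

with the 32-bit side following the existing CONFIG_X86_32 half of the
header.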


* Re: [RFC] x86/kvm/emulate: Avoid RET for fastops
From: Peter Zijlstra @ 2023-11-29  7:20 UTC
  To: Sean Christopherson
  Cc: Paolo Bonzini, Josh Poimboeuf, Thomas Gleixner, linux-kernel, x86

On Tue, Nov 28, 2023 at 05:37:52PM -0800, Sean Christopherson wrote:
> On Sun, Nov 12, 2023, Peter Zijlstra wrote:
> > Hi,
> > 
> > Inspired by the likes of ba5ca5e5e6a1 ("x86/retpoline: Don't clobber
> > RFLAGS during srso_safe_ret()") I had it on my TODO to look at this,
> > because the call-depth-tracking rethunk definitely also clobbers flags
> > and that's a ton harder to fix.
> > 
> > Looking at this recently I noticed that there's really only one callsite
> > (twice, the testcc thing is basically separate from the rest of the
> > fastop stuff) and thus CALL+RET is totally silly, we can JMP+JMP.
> > 
> > The below implements this, and aside from objtool going apeshit (it
> > fails to recognise the fastop JMP_NOSPEC as a jump-table and instead
> > classifies it as a tail-call), it actually builds and the asm looks
> > sensible enough.
> > 
> > I've not yet figured out how to test this stuff, but does something like
> > this look sane to you guys?
> 
> Yes?  The idea seems sound, but I haven't thought _that_ hard about whether or not
> there are any possible gotchas.  I did a quick test and nothing exploded (and
> usually when this code breaks, it breaks spectacularly).

That's encouraging..

> > Given that rethunks are quite fat and slow, this could be sold as a
> > performance optimization I suppose.
> > 
> > ---
> > 
> > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > index f93e9b96927a..2cd3b5a46e7a 100644
> > --- a/arch/x86/include/asm/nospec-branch.h
> > +++ b/arch/x86/include/asm/nospec-branch.h
> > @@ -412,6 +412,17 @@ static inline void call_depth_return_thunk(void) {}
> >  	"call *%[thunk_target]\n",				\
> >  	X86_FEATURE_RETPOLINE_LFENCE)
> >  
> > +# define JMP_NOSPEC						\
> > +	ALTERNATIVE_2(						\
> > +	ANNOTATE_RETPOLINE_SAFE					\
> > +	"jmp *%[thunk_target]\n",				\
> > +	"jmp __x86_indirect_thunk_%V[thunk_target]\n",		\
> > +	X86_FEATURE_RETPOLINE,					\
> > +	"lfence;\n"						\
> > +	ANNOTATE_RETPOLINE_SAFE					\
> > +	"jmp *%[thunk_target]\n",				\
> > +	X86_FEATURE_RETPOLINE_LFENCE)
> 
> There needs to be a 32-bit version (eww) and a CONFIG_RETPOLINE=n version. :-/

I'll go make that happen. Thanks!


* Re: [RFC] x86/kvm/emulate: Avoid RET for fastops
From: Peter Zijlstra @ 2023-11-29 15:04 UTC
  To: Sean Christopherson
  Cc: Paolo Bonzini, Josh Poimboeuf, Thomas Gleixner, linux-kernel, x86

On Tue, Nov 28, 2023 at 05:37:52PM -0800, Sean Christopherson wrote:
> On Sun, Nov 12, 2023, Peter Zijlstra wrote:
> > Hi,
> > 
> > Inspired by the likes of ba5ca5e5e6a1 ("x86/retpoline: Don't clobber
> > RFLAGS during srso_safe_ret()") I had it on my TODO to look at this,
> > because the call-depth-tracking rethunk definitely also clobbers flags
> > and that's a ton harder to fix.
> > 
> > Looking at this recently I noticed that there's really only one callsite
> > (twice, the testcc thing is basically separate from the rest of the
> > fastop stuff) and thus CALL+RET is totally silly, we can JMP+JMP.
> > 
> > The below implements this, and aside from objtool going apeshit (it
> > fails to recognise the fastop JMP_NOSPEC as a jump-table and instead
> > classifies it as a tail-call), it actually builds and the asm looks
> > sensible enough.
> > 
> > I've not yet figured out how to test this stuff, but does something like
> > this look sane to you guys?
> 
> Yes?  The idea seems sound, but I haven't thought _that_ hard about whether or not
> there are any possible gotchas.  I did a quick test and nothing exploded (and
> usually when this code breaks, it breaks spectacularly).

Looking at this more, I was wondering if there is something magical
about test_cc(). Both the naming and the purpose seem to be testing the
flags, but it has the side effect of actually setting the flags too;
does anything rely on that?

That is, we already have code that emulates the condition codes, so we
might as well use it here and avoid a bunch of dodgy asm, no?
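
As a rough standalone illustration of the condition-code evaluation
being reused here (plain userspace C with made-up names, not the kernel
code itself):

#include <stdbool.h>
#include <stdio.h>

#define FL_CF	0x0001
#define FL_PF	0x0004
#define FL_ZF	0x0040
#define FL_SF	0x0080
#define FL_OF	0x0800

/* Low 4 bits of a Jcc/SETcc opcode select the condition; bit 0 inverts. */
static bool emulate_cc(unsigned long flags, unsigned char cc)
{
	static const unsigned long cc_mask[6] = {
		FL_OF, FL_CF, FL_ZF, FL_CF | FL_ZF, FL_SF, FL_PF,
	};
	bool invert = cc & 1, match;

	if (cc < 0xc) {
		match = flags & cc_mask[cc >> 1];
	} else {
		/* l/nl: SF != OF; le/nle additionally ORs in ZF */
		match = !!(flags & FL_SF) ^ !!(flags & FL_OF);
		if (cc >= 0xe)
			match = match || (flags & FL_ZF);
	}
	return match ^ invert;
}

int main(void)
{
	/* cc 0x4 is "e/z": taken exactly when ZF is set */
	printf("%d %d\n", emulate_cc(FL_ZF, 0x4), emulate_cc(0, 0x4)); /* 1 0 */
	return 0;
}

The patch below lifts the same logic out of int3_emulate_jcc() as
__emulate_cc() and points test_cc() at it.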

---
 arch/x86/include/asm/text-patching.h | 20 +++++++++++++-------
 arch/x86/kvm/emulate.c               | 34 ++--------------------------------
 2 files changed, 15 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 29832c338cdc..7b9d7fe0ab64 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -186,9 +186,9 @@ void int3_emulate_ret(struct pt_regs *regs)
 }
 
 static __always_inline
-void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+bool __emulate_cc(unsigned long flags, u8 cc)
 {
-	static const unsigned long jcc_mask[6] = {
+	static const unsigned long cc_mask[6] = {
 		[0] = X86_EFLAGS_OF,
 		[1] = X86_EFLAGS_CF,
 		[2] = X86_EFLAGS_ZF,
@@ -201,15 +201,21 @@ void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned lo
 	bool match;
 
 	if (cc < 0xc) {
-		match = regs->flags & jcc_mask[cc >> 1];
+		match = flags & cc_mask[cc >> 1];
 	} else {
-		match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
-			((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+		match = ((flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+			((flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
 		if (cc >= 0xe)
-			match = match || (regs->flags & X86_EFLAGS_ZF);
+			match = match || (flags & X86_EFLAGS_ZF);
 	}
 
-	if ((match && !invert) || (!match && invert))
+	return (match && !invert) || (!match && invert);
+}
+
+static __always_inline
+void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+{
+	if (__emulate_cc(regs->flags, cc))
 		ip += disp;
 
 	int3_emulate_jmp(regs, ip);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2673cd5c46cb..0e971222c1f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -26,6 +26,7 @@
 #include <asm/debugreg.h>
 #include <asm/nospec-branch.h>
 #include <asm/ibt.h>
+#include <asm/text-patching.h>
 
 #include "x86.h"
 #include "tss.h"
@@ -416,31 +417,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
 	ON64(FOP3E(op##q, rax, rdx, cl)) \
 	FOP_END
 
-/* Special case for SETcc - 1 instruction per cc */
-#define FOP_SETCC(op) \
-	FOP_FUNC(op) \
-	#op " %al \n\t" \
-	FOP_RET(op)
-
-FOP_START(setcc)
-FOP_SETCC(seto)
-FOP_SETCC(setno)
-FOP_SETCC(setc)
-FOP_SETCC(setnc)
-FOP_SETCC(setz)
-FOP_SETCC(setnz)
-FOP_SETCC(setbe)
-FOP_SETCC(setnbe)
-FOP_SETCC(sets)
-FOP_SETCC(setns)
-FOP_SETCC(setp)
-FOP_SETCC(setnp)
-FOP_SETCC(setl)
-FOP_SETCC(setnl)
-FOP_SETCC(setle)
-FOP_SETCC(setnle)
-FOP_END;
-
 FOP_START(salc)
 FOP_FUNC(salc)
 "pushf; sbb %al, %al; popf \n\t"
@@ -1063,13 +1039,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
 
 static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
 {
-	u8 rc;
-	void (*fop)(void) = (void *)em_setcc + FASTOP_SIZE * (condition & 0xf);
-
-	flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
-	asm("push %[flags]; popf; " CALL_NOSPEC
-	    : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
-	return rc;
+	return __emulate_cc(flags, condition & 0xf);
 }
 
 static void fetch_register_operand(struct operand *op)
