* [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls
@ 2023-01-23 20:59 Peter Zijlstra
  2023-01-23 20:59 ` [PATCH 1/3] x86/alternative: Introduce int3_emulate_jcc() Peter Zijlstra
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:59 UTC (permalink / raw)
  To: mingo
  Cc: tglx, bp, dave.hansen, x86, hpa, peterz, jpoimboe, jbaron,
	rostedt, ardb, linux-kernel, erhard_f, ndesaulniers, mhiramat,
	sandipan.das

Erhard reported boot failures on his AMD machine when using clang and
bisected them to a commit introducing a few static_call()s. It turns
out that clang with -Os is very likely to generate conditional tail
calls like:

  0000000000000350 <amd_pmu_add_event>:
  350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
  355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
  35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
  363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4

Our inline static_call() patching code can't deal with those, and a
BUG happens -- really early in boot.

These patches borrow the kprobe Jcc emulation to implement text_poke_bp() Jcc
support, which is then used to teach inline static_call() about this form.
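
For reference, the C involved is roughly of this shape (a
reconstruction from the disassembly above, not necessarily the exact
kernel source):

  static void amd_pmu_add_event(struct perf_event *event)
  {
          if (needs_branch_stack(event))
                  static_call(amd_pmu_branch_add)(event);
  }

With -Os clang folds the conditional and the tail-call into a single
'jne __SCT__amd_pmu_branch_add' instead of branching around an
unconditional 'jmp'.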

---
 arch/x86/include/asm/text-patching.h | 31 ++++++++++++++++++
 arch/x86/kernel/alternative.c        | 62 +++++++++++++++++++++++++++---------
 arch/x86/kernel/kprobes/core.c       | 38 +++++-----------------
 arch/x86/kernel/static_call.c        | 50 +++++++++++++++++++++++++++--
 4 files changed, 133 insertions(+), 48 deletions(-)



* [PATCH 1/3] x86/alternative: Introduce int3_emulate_jcc()
  2023-01-23 20:59 [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls Peter Zijlstra
@ 2023-01-23 20:59 ` Peter Zijlstra
  2023-01-31 14:22   ` [tip: x86/alternatives] x86/alternatives: " tip-bot2 for Peter Zijlstra
  2023-01-23 20:59 ` [PATCH 2/3] x86/alternative: Teach text_poke_bp() to patch Jcc.d32 instructions Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:59 UTC (permalink / raw)
  To: mingo
  Cc: tglx, bp, dave.hansen, x86, hpa, peterz, jpoimboe, jbaron,
	rostedt, ardb, linux-kernel, erhard_f, ndesaulniers, mhiramat,
	sandipan.das

Move the kprobe Jcc emulation into int3_emulate_jcc() so it can be
used by more code -- specifically static_call() will need this.
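
The cc argument is the low nibble of the Jcc opcode: bit 0 inverts the
condition, and values >= 0xc encode the signed compares built from
SF ^ OF (plus ZF for 'le'/'g'). As a hypothetical usage sketch --
next_ip and disp stand in for values the caller computes -- emulating
'jne' (cc == 0x5) looks like:

  /* jne: branch taken when ZF is clear */
  int3_emulate_jcc(regs, 0x5, next_ip, disp);

where next_ip is the address of the instruction after the Jcc, which
the kprobe caller below computes as:

  regs->ip - INT3_INSN_SIZE + p->ainsn.size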

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 arch/x86/include/asm/text-patching.h |   31 ++++++++++++++++++++++++++++
 arch/x86/kernel/kprobes/core.c       |   38 +++++++----------------------------
 2 files changed, 39 insertions(+), 30 deletions(-)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -184,6 +184,37 @@ void int3_emulate_ret(struct pt_regs *re
 	unsigned long ip = int3_emulate_pop(regs);
 	int3_emulate_jmp(regs, ip);
 }
+
+static __always_inline
+void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+{
+	static const unsigned long jcc_mask[6] = {
+		[0] = X86_EFLAGS_OF,
+		[1] = X86_EFLAGS_CF,
+		[2] = X86_EFLAGS_ZF,
+		[3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
+		[4] = X86_EFLAGS_SF,
+		[5] = X86_EFLAGS_PF,
+	};
+
+	bool invert = cc & 1;
+	bool match;
+
+	if (cc < 0xc) {
+		match = regs->flags & jcc_mask[cc >> 1];
+	} else {
+		match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+			((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+		if (cc >= 0xe)
+			match = match || (regs->flags & X86_EFLAGS_ZF);
+	}
+
+	if ((match && !invert) || (!match && invert))
+		ip += disp;
+
+	int3_emulate_jmp(regs, ip);
+}
+
 #endif /* !CONFIG_UML_X86 */
 
 #endif /* _ASM_X86_TEXT_PATCHING_H */
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -464,50 +464,26 @@ static void kprobe_emulate_call(struct k
 }
 NOKPROBE_SYMBOL(kprobe_emulate_call);
 
-static nokprobe_inline
-void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
 {
 	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 
-	if (cond)
-		ip += p->ainsn.rel32;
+	ip += p->ainsn.rel32;
 	int3_emulate_jmp(regs, ip);
 }
-
-static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
-{
-	__kprobe_emulate_jmp(p, regs, true);
-}
 NOKPROBE_SYMBOL(kprobe_emulate_jmp);
 
-static const unsigned long jcc_mask[6] = {
-	[0] = X86_EFLAGS_OF,
-	[1] = X86_EFLAGS_CF,
-	[2] = X86_EFLAGS_ZF,
-	[3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
-	[4] = X86_EFLAGS_SF,
-	[5] = X86_EFLAGS_PF,
-};
-
 static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
 {
-	bool invert = p->ainsn.jcc.type & 1;
-	bool match;
+	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 
-	if (p->ainsn.jcc.type < 0xc) {
-		match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
-	} else {
-		match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
-			((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
-		if (p->ainsn.jcc.type >= 0xe)
-			match = match || (regs->flags & X86_EFLAGS_ZF);
-	}
-	__kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
+	int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32);
 }
 NOKPROBE_SYMBOL(kprobe_emulate_jcc);
 
 static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
 {
+	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 	bool match;
 
 	if (p->ainsn.loop.type != 3) {	/* LOOP* */
@@ -535,7 +511,9 @@ static void kprobe_emulate_loop(struct k
 	else if (p->ainsn.loop.type == 1)	/* LOOPE */
 		match = match && (regs->flags & X86_EFLAGS_ZF);
 
-	__kprobe_emulate_jmp(p, regs, match);
+	if (match)
+		ip += p->ainsn.rel32;
+	int3_emulate_jmp(regs, ip);
 }
 NOKPROBE_SYMBOL(kprobe_emulate_loop);
 




* [PATCH 2/3] x86/alternative: Teach text_poke_bp() to patch Jcc.d32 instructions
  2023-01-23 20:59 [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls Peter Zijlstra
  2023-01-23 20:59 ` [PATCH 1/3] x86/alternative: Introduce int3_emulate_jcc() Peter Zijlstra
@ 2023-01-23 20:59 ` Peter Zijlstra
  2023-01-31 14:22   ` [tip: x86/alternatives] x86/alternatives: " tip-bot2 for Peter Zijlstra
  2023-01-23 20:59 ` [PATCH 3/3] x86/static_call: Add support for Jcc tail-calls Peter Zijlstra
  2023-02-08 22:36 ` [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls Nathan Chancellor
  3 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:59 UTC (permalink / raw)
  To: mingo
  Cc: tglx, bp, dave.hansen, x86, hpa, peterz, jpoimboe, jbaron,
	rostedt, ardb, linux-kernel, erhard_f, ndesaulniers, mhiramat,
	sandipan.das

In order to rewrite Jcc.d32 instructions, text_poke_bp() needs to be
taught about them.

The biggest hurdle is that the whole machinery is currently built for
5-byte instructions; extending this would grow struct text_poke_loc,
which is currently a nice 16 bytes and used in an array.

However, since text_poke_loc contains a full copy of the (s32)
displacement, it is possible to map the Jcc.d32 2-byte opcodes to the
Jcc.d8 1-byte opcodes for the int3 emulation.

This then leaves the replacement bytes; fudge that by only storing the
last 5 bytes and adding the rule that a 'length == 6' instruction will
be prefixed with a 0x0f byte.
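
Worked example of the mapping, taking jne:

  Jcc.d32:  0f 85 <disp32>     /* 6 bytes; opcode bytes 0x0f 0x85 */
  Jcc.d8:   75 <disp8>         /* 1-byte opcode: 0x85 - 0x10 = 0x75 */

text_poke_loc then stores opcode 0x75 plus the full s32 displacement,
and len == 6 signals that the replacement starts with the 0x0f prefix.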

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 arch/x86/kernel/alternative.c |   62 +++++++++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 15 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -338,6 +338,12 @@ void __init_or_module noinline apply_alt
 	}
 }
 
+static inline bool is_jcc32(struct insn *insn)
+{
+	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
 
 /*
@@ -376,12 +382,6 @@ static int emit_indirect(int op, int reg
 	return i;
 }
 
-static inline bool is_jcc32(struct insn *insn)
-{
-	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
-	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
-}
-
 static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
 {
 	u8 op = insn->opcode.bytes[0];
@@ -1770,6 +1770,11 @@ void text_poke_sync(void)
 	on_each_cpu(do_sync_core, NULL, 1);
 }
 
+/*
+ * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
+ * this thing. When len == 6 everything is prefixed with 0x0f and we map
+ * opcode to Jcc.d8, using len to distinguish.
+ */
 struct text_poke_loc {
 	/* addr := _stext + rel_addr */
 	s32 rel_addr;
@@ -1891,6 +1896,10 @@ noinstr int poke_int3_handler(struct pt_
 		int3_emulate_jmp(regs, (long)ip + tp->disp);
 		break;
 
+	case 0x70 ... 0x7f: /* Jcc */
+		int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+		break;
+
 	default:
 		BUG();
 	}
@@ -1964,16 +1973,26 @@ static void text_poke_bp_batch(struct te
 	 * Second step: update all but the first byte of the patched range.
 	 */
 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
-		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
+		u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
+		u8 _new[POKE_MAX_OPCODE_SIZE+1];
+		const u8 *new = tp[i].text;
 		int len = tp[i].len;
 
 		if (len - INT3_INSN_SIZE > 0) {
 			memcpy(old + INT3_INSN_SIZE,
 			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
 			       len - INT3_INSN_SIZE);
+
+			if (len == 6) {
+				_new[0] = 0x0f;
+				memcpy(_new + 1, new, 5);
+				new = _new;
+			}
+
 			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
-				  (const char *)tp[i].text + INT3_INSN_SIZE,
+				  new + INT3_INSN_SIZE,
 				  len - INT3_INSN_SIZE);
+
 			do_sync++;
 		}
 
@@ -2001,8 +2020,7 @@ static void text_poke_bp_batch(struct te
 		 * The old instruction is recorded so that the event can be
 		 * processed forwards or backwards.
 		 */
-		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
-				     tp[i].text, len);
+		perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
 	}
 
 	if (do_sync) {
@@ -2019,10 +2037,15 @@ static void text_poke_bp_batch(struct te
 	 * replacing opcode.
 	 */
 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
-		if (tp[i].text[0] == INT3_INSN_OPCODE)
+		u8 byte = tp[i].text[0];
+
+		if (tp[i].len == 6)
+			byte = 0x0f;
+
+		if (byte == INT3_INSN_OPCODE)
 			continue;
 
-		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
+		text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
 		do_sync++;
 	}
 
@@ -2040,9 +2063,11 @@ static void text_poke_loc_init(struct te
 			       const void *opcode, size_t len, const void *emulate)
 {
 	struct insn insn;
-	int ret, i;
+	int ret, i = 0;
 
-	memcpy((void *)tp->text, opcode, len);
+	if (len == 6)
+		i = 1;
+	memcpy((void *)tp->text, opcode+i, len-i);
 	if (!emulate)
 		emulate = opcode;
 
@@ -2053,6 +2078,13 @@ static void text_poke_loc_init(struct te
 	tp->len = len;
 	tp->opcode = insn.opcode.bytes[0];
 
+	if (is_jcc32(&insn)) {
+		/*
+		 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
+		 */
+		tp->opcode = insn.opcode.bytes[1] - 0x10;
+	}
+
 	switch (tp->opcode) {
 	case RET_INSN_OPCODE:
 	case JMP32_INSN_OPCODE:
@@ -2069,7 +2101,6 @@ static void text_poke_loc_init(struct te
 		BUG_ON(len != insn.length);
 	}
 
-
 	switch (tp->opcode) {
 	case INT3_INSN_OPCODE:
 	case RET_INSN_OPCODE:
@@ -2078,6 +2109,7 @@ static void text_poke_loc_init(struct te
 	case CALL_INSN_OPCODE:
 	case JMP32_INSN_OPCODE:
 	case JMP8_INSN_OPCODE:
+	case 0x70 ... 0x7f: /* Jcc */
 		tp->disp = insn.immediate.value;
 		break;
 




* [PATCH 3/3] x86/static_call: Add support for Jcc tail-calls
  2023-01-23 20:59 [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls Peter Zijlstra
  2023-01-23 20:59 ` [PATCH 1/3] x86/alternative: Introduce int3_emulate_jcc() Peter Zijlstra
  2023-01-23 20:59 ` [PATCH 2/3] x86/alternative: Teach text_poke_bp() to patch Jcc.d32 instructions Peter Zijlstra
@ 2023-01-23 20:59 ` Peter Zijlstra
  2023-01-23 22:44   ` Steven Rostedt
  2023-02-08 22:36 ` [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls Nathan Chancellor
  3 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:59 UTC (permalink / raw)
  To: mingo
  Cc: tglx, bp, dave.hansen, x86, hpa, peterz, jpoimboe, jbaron,
	rostedt, ardb, linux-kernel, erhard_f, ndesaulniers, mhiramat,
	sandipan.das

Clang likes to create conditional tail calls like:

0000000000000350 <amd_pmu_add_event>:
350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4

Teach the in-line static call text patching about this.

Notably, since there is no conditional-ret, in that caes patch the Jcc
to point at an empty stub function that does the ret -- or the return
thunk when needed.
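
As an illustrative sketch (displacement bytes are placeholders),
updating such a site to a NULL target turns:

  0f 85 xx xx xx xx       jne    __SCT__amd_pmu_branch_add

into:

  0f 85 yy yy yy yy       jne    __static_call_return   /* or the return thunk */

i.e. the conditional tail-call becomes a conditional return.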

Reported-by: "Erhard F." <erhard_f@mailbox.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 arch/x86/kernel/static_call.c |   50 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -9,6 +9,7 @@ enum insn_type {
 	NOP = 1,  /* site cond-call */
 	JMP = 2,  /* tramp / site tail-call */
 	RET = 3,  /* tramp / site cond-tail-call */
+	JCC = 4,
 };
 
 /*
@@ -25,12 +26,40 @@ static const u8 xor5rax[] = { 0x2e, 0x2e
 
 static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
 
+static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
+{
+	u8 ret = 0;
+
+	if (insn[0] == 0x0f) {
+		u8 tmp = insn[1];
+		if ((tmp & 0xf0) == 0x80)
+			ret = tmp;
+	}
+
+	return ret;
+}
+
+extern void __static_call_return(void);
+
+asm (".global __static_call_return\n\t"
+     ".type __static_call_return, @function\n\t"
+     ASM_FUNC_ALIGN "\n\t"
+     "__static_call_return:\n\t"
+     ANNOTATE_NOENDBR
+     ANNOTATE_RETPOLINE_SAFE
+     "ret; int3\n\t"
+     ".size __static_call_return, . - __static_call_return \n\t");
+
 static void __ref __static_call_transform(void *insn, enum insn_type type,
 					  void *func, bool modinit)
 {
 	const void *emulate = NULL;
 	int size = CALL_INSN_SIZE;
 	const void *code;
+	u8 op, buf[6];
+
+	if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
+		type = JCC;
 
 	switch (type) {
 	case CALL:
@@ -57,6 +86,20 @@ static void __ref __static_call_transfor
 		else
 			code = &retinsn;
 		break;
+
+	case JCC:
+		if (!func) {
+			func = __static_call_return;
+			if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+				func = x86_return_thunk;
+		}
+
+		buf[0] = 0x0f;
+		__text_gen_insn(buf+1, op, insn+1, func, 5);
+		code = buf;
+		size = 6;
+
+		break;
 	}
 
 	if (memcmp(insn, code, size) == 0)
@@ -68,9 +111,9 @@ static void __ref __static_call_transfor
 	text_poke_bp(insn, code, size, emulate);
 }
 
-static void __static_call_validate(void *insn, bool tail, bool tramp)
+static void __static_call_validate(u8 *insn, bool tail, bool tramp)
 {
-	u8 opcode = *(u8 *)insn;
+	u8 opcode = insn[0];
 
 	if (tramp && memcmp(insn+5, tramp_ud, 3)) {
 		pr_err("trampoline signature fail");
@@ -79,7 +122,8 @@ static void __static_call_validate(void
 
 	if (tail) {
 		if (opcode == JMP32_INSN_OPCODE ||
-		    opcode == RET_INSN_OPCODE)
+		    opcode == RET_INSN_OPCODE ||
+		    __is_Jcc(insn))
 			return;
 	} else {
 		if (opcode == CALL_INSN_OPCODE ||




* Re: [PATCH 3/3] x86/static_call: Add support for Jcc tail-calls
  2023-01-23 20:59 ` [PATCH 3/3] x86/static_call: Add support for Jcc tail-calls Peter Zijlstra
@ 2023-01-23 22:44   ` Steven Rostedt
  2023-01-24 13:06     ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Steven Rostedt @ 2023-01-23 22:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, tglx, bp, dave.hansen, x86, hpa, jpoimboe, jbaron, ardb,
	linux-kernel, erhard_f, ndesaulniers, mhiramat, sandipan.das

On Mon, 23 Jan 2023 21:59:18 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> Clang likes to create conditional tail calls like:
> 
> 0000000000000350 <amd_pmu_add_event>:
> 350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
> 355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
> 35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
> 363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4
> 

Just to confirm, as it's not clear if this is the static call site or one
of the functions that is being called.

I'm guessing that this is an issue because clang optimizes the static call
site, right?

> Teach the in-line static call text patching about this.
> 
> Notably, since there is no conditional-ret, in that caes patch the Jcc

					      "in that case"

-- Steve

> to point at an empty stub function that does the ret -- or the return
> thunk when needed.
> 
> Reported-by: "Erhard F." <erhard_f@mailbox.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  arch/x86/kernel/static_call.c |   50 +++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 47 insertions(+), 3 deletions(-)
> 
> --- a/arch/x86/kernel/static_call.c
> +++ b/arch/x86/kernel/static_call.c
> @@ -9,6 +9,7 @@ enum insn_type {
>  	NOP = 1,  /* site cond-call */
>  	JMP = 2,  /* tramp / site tail-call */
>  	RET = 3,  /* tramp / site cond-tail-call */
> +	JCC = 4,
>  };
>  
>  /*
> @@ -25,12 +26,40 @@ static const u8 xor5rax[] = { 0x2e, 0x2e
>  
>  static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
>  
> +static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
> +{
> +	u8 ret = 0;
> +
> +	if (insn[0] == 0x0f) {
> +		u8 tmp = insn[1];
> +		if ((tmp & 0xf0) == 0x80)
> +			ret = tmp;
> +	}
> +
> +	return ret;
> +}
> +
> +extern void __static_call_return(void);
> +
> +asm (".global __static_call_return\n\t"
> +     ".type __static_call_return, @function\n\t"
> +     ASM_FUNC_ALIGN "\n\t"
> +     "__static_call_return:\n\t"
> +     ANNOTATE_NOENDBR
> +     ANNOTATE_RETPOLINE_SAFE
> +     "ret; int3\n\t"
> +     ".size __static_call_return, . - __static_call_return \n\t");
> +
>  static void __ref __static_call_transform(void *insn, enum insn_type type,
>  					  void *func, bool modinit)
>  {
>  	const void *emulate = NULL;
>  	int size = CALL_INSN_SIZE;
>  	const void *code;
> +	u8 op, buf[6];
> +
> +	if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
> +		type = JCC;
>  
>  	switch (type) {
>  	case CALL:
> @@ -57,6 +86,20 @@ static void __ref __static_call_transfor
>  		else
>  			code = &retinsn;
>  		break;
> +
> +	case JCC:
> +		if (!func) {
> +			func = __static_call_return;
> +			if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
> +				func = x86_return_thunk;
> +		}
> +
> +		buf[0] = 0x0f;
> +		__text_gen_insn(buf+1, op, insn+1, func, 5);
> +		code = buf;
> +		size = 6;
> +
> +		break;
>  	}
>  
>  	if (memcmp(insn, code, size) == 0)
> @@ -68,9 +111,9 @@ static void __ref __static_call_transfor
>  	text_poke_bp(insn, code, size, emulate);
>  }
>  
> -static void __static_call_validate(void *insn, bool tail, bool tramp)
> +static void __static_call_validate(u8 *insn, bool tail, bool tramp)
>  {
> -	u8 opcode = *(u8 *)insn;
> +	u8 opcode = insn[0];
>  
>  	if (tramp && memcmp(insn+5, tramp_ud, 3)) {
>  		pr_err("trampoline signature fail");
> @@ -79,7 +122,8 @@ static void __static_call_validate(void
>  
>  	if (tail) {
>  		if (opcode == JMP32_INSN_OPCODE ||
> -		    opcode == RET_INSN_OPCODE)
> +		    opcode == RET_INSN_OPCODE ||
> +		    __is_Jcc(insn))
>  			return;
>  	} else {
>  		if (opcode == CALL_INSN_OPCODE ||
> 



* Re: [PATCH 3/3] x86/static_call: Add support for Jcc tail-calls
  2023-01-23 22:44   ` Steven Rostedt
@ 2023-01-24 13:06     ` Peter Zijlstra
  2023-01-24 15:07       ` Steven Rostedt
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2023-01-24 13:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: mingo, tglx, bp, dave.hansen, x86, hpa, jpoimboe, jbaron, ardb,
	linux-kernel, erhard_f, ndesaulniers, mhiramat, sandipan.das

On Mon, Jan 23, 2023 at 05:44:31PM -0500, Steven Rostedt wrote:
> On Mon, 23 Jan 2023 21:59:18 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > Clang likes to create conditional tail calls like:
> > 
> > 0000000000000350 <amd_pmu_add_event>:
> > 350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
> > 355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
> > 35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
> > 363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4
> > 
> 
> Just to confirm, as it's not clear if this is the static call site or one
> of the functions that is being called.

Ah, you've not looked at enough asm then? ;-) Yes this is the static
call site, see the __SCT_ target (instruction at 0x35d).

> I'm guessing that this is an issue because clang optimizes the static call
> site, right?

Specifically using Jcc (jne in this case) to tail-call the trampoline.

> > Teach the in-line static call text patching about this.
> > 
> > Notably, since there is no conditional-ret, in that caes patch the Jcc
> 
> 					      "in that case"

typing so hard.. :-)


* Re: [PATCH 3/3] x86/static_call: Add support for Jcc tail-calls
  2023-01-24 13:06     ` Peter Zijlstra
@ 2023-01-24 15:07       ` Steven Rostedt
  2023-01-26 15:34         ` [PATCH v1.1 " Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Steven Rostedt @ 2023-01-24 15:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, tglx, bp, dave.hansen, x86, hpa, jpoimboe, jbaron, ardb,
	linux-kernel, erhard_f, ndesaulniers, mhiramat, sandipan.das

On Tue, 24 Jan 2023 14:06:49 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> > Just to confirm, as it's not clear if this is the static call site or one
> > of the functions that is being called.  
> 
> Ah, you've not looked at enough asm then? ;-) Yes this is the static
> call site, see the __SCT_ target (instruction at 0x35d).

Yeah, could you specify it a bit more in the change log such that those
looking back at this don't have to have that requirement of staring at
enough asm ;-)

It's actually been some time since I stared at compiler output (although
now that I'm starting to play with rust, that's going to start back up
soon).

-- Steve


* [PATCH v1.1 3/3] x86/static_call: Add support for Jcc tail-calls
  2023-01-24 15:07       ` Steven Rostedt
@ 2023-01-26 15:34         ` Peter Zijlstra
  2023-01-26 18:14           ` Nick Desaulniers
  2023-01-31 14:22           ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  0 siblings, 2 replies; 14+ messages in thread
From: Peter Zijlstra @ 2023-01-26 15:34 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: mingo, tglx, bp, dave.hansen, x86, hpa, jpoimboe, jbaron, ardb,
	linux-kernel, erhard_f, ndesaulniers, mhiramat, sandipan.das

On Tue, Jan 24, 2023 at 10:07:53AM -0500, Steven Rostedt wrote:
> On Tue, 24 Jan 2023 14:06:49 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > Just to confirm, as it's not clear if this is the static call site or one
> > > of the functions that is being called.  
> > 
> > Ah, you've not looked at enough asm then? ;-) Yes this is the static
> > call site, see the __SCT_ target (instruction at 0x35d).
> 
> Yeah, could you specify it a bit more in the change log such that those
> looking back at this don't have to have that requirement of staring at
> enough asm ;-)

How's this then?

---
Subject: x86/static_call: Add support for Jcc tail-calls
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri Jan 20 16:40:33 CET 2023

Clang likes to create conditional tail calls like:

0000000000000350 <amd_pmu_add_event>:
350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4

Where 0x35d is a static call site that's turned into a conditional
tail-call using the Jcc class of instructions.

Teach the in-line static call text patching about this.

Notably, since there is no conditional-ret, in that case patch the Jcc
to point at an empty stub function that does the ret -- or the return
thunk when needed.

Reported-by: "Erhard F." <erhard_f@mailbox.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 arch/x86/kernel/static_call.c |   50 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -9,6 +9,7 @@ enum insn_type {
 	NOP = 1,  /* site cond-call */
 	JMP = 2,  /* tramp / site tail-call */
 	RET = 3,  /* tramp / site cond-tail-call */
+	JCC = 4,
 };
 
 /*
@@ -25,12 +26,40 @@ static const u8 xor5rax[] = { 0x2e, 0x2e
 
 static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
 
+static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
+{
+	u8 ret = 0;
+
+	if (insn[0] == 0x0f) {
+		u8 tmp = insn[1];
+		if ((tmp & 0xf0) == 0x80)
+			ret = tmp;
+	}
+
+	return ret;
+}
+
+extern void __static_call_return(void);
+
+asm (".global __static_call_return\n\t"
+     ".type __static_call_return, @function\n\t"
+     ASM_FUNC_ALIGN "\n\t"
+     "__static_call_return:\n\t"
+     ANNOTATE_NOENDBR
+     ANNOTATE_RETPOLINE_SAFE
+     "ret; int3\n\t"
+     ".size __static_call_return, . - __static_call_return \n\t");
+
 static void __ref __static_call_transform(void *insn, enum insn_type type,
 					  void *func, bool modinit)
 {
 	const void *emulate = NULL;
 	int size = CALL_INSN_SIZE;
 	const void *code;
+	u8 op, buf[6];
+
+	if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
+		type = JCC;
 
 	switch (type) {
 	case CALL:
@@ -57,6 +86,20 @@ static void __ref __static_call_transfor
 		else
 			code = &retinsn;
 		break;
+
+	case JCC:
+		if (!func) {
+			func = __static_call_return;
+			if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+				func = x86_return_thunk;
+		}
+
+		buf[0] = 0x0f;
+		__text_gen_insn(buf+1, op, insn+1, func, 5);
+		code = buf;
+		size = 6;
+
+		break;
 	}
 
 	if (memcmp(insn, code, size) == 0)
@@ -68,9 +111,9 @@ static void __ref __static_call_transfor
 	text_poke_bp(insn, code, size, emulate);
 }
 
-static void __static_call_validate(void *insn, bool tail, bool tramp)
+static void __static_call_validate(u8 *insn, bool tail, bool tramp)
 {
-	u8 opcode = *(u8 *)insn;
+	u8 opcode = insn[0];
 
 	if (tramp && memcmp(insn+5, tramp_ud, 3)) {
 		pr_err("trampoline signature fail");
@@ -79,7 +122,8 @@ static void __static_call_validate(void
 
 	if (tail) {
 		if (opcode == JMP32_INSN_OPCODE ||
-		    opcode == RET_INSN_OPCODE)
+		    opcode == RET_INSN_OPCODE ||
+		    __is_Jcc(insn))
 			return;
 	} else {
 		if (opcode == CALL_INSN_OPCODE ||


* Re: [PATCH v1.1 3/3] x86/static_call: Add support for Jcc tail-calls
  2023-01-26 15:34         ` [PATCH v1.1 " Peter Zijlstra
@ 2023-01-26 18:14           ` Nick Desaulniers
  2023-02-06 16:07             ` Steven Rostedt
  2023-01-31 14:22           ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  1 sibling, 1 reply; 14+ messages in thread
From: Nick Desaulniers @ 2023-01-26 18:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, mingo, tglx, bp, dave.hansen, x86, hpa, jpoimboe,
	jbaron, ardb, linux-kernel, erhard_f, mhiramat, sandipan.das

On Thu, Jan 26, 2023 at 7:34 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jan 24, 2023 at 10:07:53AM -0500, Steven Rostedt wrote:
> > On Tue, 24 Jan 2023 14:06:49 +0100
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > > Just to confirm, as it's not clear if this is the static call site or one
> > > > of the functions that is being called.
> > >
> > > Ah, you've not looked at enough asm then? ;-) Yes this is the static
> > > call site, see the __SCT_ target (instruction at 0x35d).
> >
> > Yeah, could you specify it a bit more in the change log such that those
> > looking back at this don't have to have that requirement of staring at
> > enough asm ;-)
>
> How's this then?
>
> ---
> Subject: x86/static_call: Add support for Jcc tail-calls
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Fri Jan 20 16:40:33 CET 2023
>
> Clang likes to create conditional tail calls like:
>
> 0000000000000350 <amd_pmu_add_event>:
> 350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
> 355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
> 35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
> 363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4

If it helps reduce the line width in the commit message to focus on
just the instructions, lately I've been using the
`--no-addresses --no-show-raw-insn` flags to llvm-objdump.

In my ~/.zshrc, I have this function:
dis () {
  file=$1
  func=$2
  llvm-objdump -Dr --no-addresses --no-show-raw-insn \
    --disassemble-symbols=$func $file
}
which lets me run `dis vmlinux amd_pmu_add_event` or `dis foo.o func`.
Sometimes I don't want `-r`, `--no-show-raw-insn`, or
`--no-addresses`, but most of the time I do.

-- 
Thanks,
~Nick Desaulniers


* [tip: x86/alternatives] x86/static_call: Add support for Jcc tail-calls
  2023-01-26 15:34         ` [PATCH v1.1 " Peter Zijlstra
  2023-01-26 18:14           ` Nick Desaulniers
@ 2023-01-31 14:22           ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-01-31 14:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Erhard F., Peter Zijlstra (Intel),
	Ingo Molnar, Masami Hiramatsu (Google),
	x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     923510c88d2b7d947c4217835fd9ca6bd65cc56c
Gitweb:        https://git.kernel.org/tip/923510c88d2b7d947c4217835fd9ca6bd65cc56c
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 26 Jan 2023 16:34:27 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 31 Jan 2023 15:05:31 +01:00

x86/static_call: Add support for Jcc tail-calls

Clang likes to create conditional tail calls like:

  0000000000000350 <amd_pmu_add_event>:
  350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
  355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
  35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
  363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4

Where 0x35d is a static call site that's turned into a conditional
tail-call using the Jcc class of instructions.

Teach the in-line static call text patching about this.

Notably, since there is no conditional-ret, in that case patch the Jcc
to point at an empty stub function that does the ret -- or the return
thunk when needed.

Reported-by: "Erhard F." <erhard_f@mailbox.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/Y9Kdg9QjHkr9G5b5@hirez.programming.kicks-ass.net
---
 arch/x86/kernel/static_call.c | 50 +++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 2ebc338..b70670a 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -9,6 +9,7 @@ enum insn_type {
 	NOP = 1,  /* site cond-call */
 	JMP = 2,  /* tramp / site tail-call */
 	RET = 3,  /* tramp / site cond-tail-call */
+	JCC = 4,
 };
 
 /*
@@ -25,12 +26,40 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
 
 static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
 
+static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
+{
+	u8 ret = 0;
+
+	if (insn[0] == 0x0f) {
+		u8 tmp = insn[1];
+		if ((tmp & 0xf0) == 0x80)
+			ret = tmp;
+	}
+
+	return ret;
+}
+
+extern void __static_call_return(void);
+
+asm (".global __static_call_return\n\t"
+     ".type __static_call_return, @function\n\t"
+     ASM_FUNC_ALIGN "\n\t"
+     "__static_call_return:\n\t"
+     ANNOTATE_NOENDBR
+     ANNOTATE_RETPOLINE_SAFE
+     "ret; int3\n\t"
+     ".size __static_call_return, . - __static_call_return \n\t");
+
 static void __ref __static_call_transform(void *insn, enum insn_type type,
 					  void *func, bool modinit)
 {
 	const void *emulate = NULL;
 	int size = CALL_INSN_SIZE;
 	const void *code;
+	u8 op, buf[6];
+
+	if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
+		type = JCC;
 
 	switch (type) {
 	case CALL:
@@ -57,6 +86,20 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
 		else
 			code = &retinsn;
 		break;
+
+	case JCC:
+		if (!func) {
+			func = __static_call_return;
+			if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+				func = x86_return_thunk;
+		}
+
+		buf[0] = 0x0f;
+		__text_gen_insn(buf+1, op, insn+1, func, 5);
+		code = buf;
+		size = 6;
+
+		break;
 	}
 
 	if (memcmp(insn, code, size) == 0)
@@ -68,9 +111,9 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
 	text_poke_bp(insn, code, size, emulate);
 }
 
-static void __static_call_validate(void *insn, bool tail, bool tramp)
+static void __static_call_validate(u8 *insn, bool tail, bool tramp)
 {
-	u8 opcode = *(u8 *)insn;
+	u8 opcode = insn[0];
 
 	if (tramp && memcmp(insn+5, tramp_ud, 3)) {
 		pr_err("trampoline signature fail");
@@ -79,7 +122,8 @@ static void __static_call_validate(void *insn, bool tail, bool tramp)
 
 	if (tail) {
 		if (opcode == JMP32_INSN_OPCODE ||
-		    opcode == RET_INSN_OPCODE)
+		    opcode == RET_INSN_OPCODE ||
+		    __is_Jcc(insn))
 			return;
 	} else {
 		if (opcode == CALL_INSN_OPCODE ||


* [tip: x86/alternatives] x86/alternatives: Teach text_poke_bp() to patch Jcc.d32 instructions
  2023-01-23 20:59 ` [PATCH 2/3] x86/alternative: Teach text_poke_bp() to patch Jcc.d32 instructions Peter Zijlstra
@ 2023-01-31 14:22   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-01-31 14:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Ingo Molnar, Masami Hiramatsu (Google),
	x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     ac0ee0a9560c97fa5fe1409e450c2425d4ebd17a
Gitweb:        https://git.kernel.org/tip/ac0ee0a9560c97fa5fe1409e450c2425d4ebd17a
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Mon, 23 Jan 2023 21:59:17 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 31 Jan 2023 15:05:31 +01:00

x86/alternatives: Teach text_poke_bp() to patch Jcc.d32 instructions

In order to rewrite Jcc.d32 instructions, text_poke_bp() needs to be
taught about them.

The biggest hurdle is that the whole machinery is currently built for
5-byte instructions; extending this would grow struct text_poke_loc,
which is currently a nice 16 bytes and used in an array.

However, since text_poke_loc contains a full copy of the (s32)
displacement, it is possible to map the Jcc.d32 2-byte opcodes to the
Jcc.d8 1-byte opcodes for the int3 emulation.

This then leaves the replacement bytes; fudge that by only storing the
last 5 bytes and adding the rule that a 'length == 6' instruction will
be prefixed with a 0x0f byte.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20230123210607.115718513@infradead.org
---
 arch/x86/kernel/alternative.c | 62 +++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9a45aa1..f615e0c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -338,6 +338,12 @@ next:
 	}
 }
 
+static inline bool is_jcc32(struct insn *insn)
+{
+	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
 
 /*
@@ -376,12 +382,6 @@ static int emit_indirect(int op, int reg, u8 *bytes)
 	return i;
 }
 
-static inline bool is_jcc32(struct insn *insn)
-{
-	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
-	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
-}
-
 static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
 {
 	u8 op = insn->opcode.bytes[0];
@@ -1770,6 +1770,11 @@ void text_poke_sync(void)
 	on_each_cpu(do_sync_core, NULL, 1);
 }
 
+/*
+ * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
+ * this thing. When len == 6 everything is prefixed with 0x0f and we map
+ * opcode to Jcc.d8, using len to distinguish.
+ */
 struct text_poke_loc {
 	/* addr := _stext + rel_addr */
 	s32 rel_addr;
@@ -1891,6 +1896,10 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 		int3_emulate_jmp(regs, (long)ip + tp->disp);
 		break;
 
+	case 0x70 ... 0x7f: /* Jcc */
+		int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+		break;
+
 	default:
 		BUG();
 	}
@@ -1964,16 +1973,26 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * Second step: update all but the first byte of the patched range.
 	 */
 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
-		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
+		u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
+		u8 _new[POKE_MAX_OPCODE_SIZE+1];
+		const u8 *new = tp[i].text;
 		int len = tp[i].len;
 
 		if (len - INT3_INSN_SIZE > 0) {
 			memcpy(old + INT3_INSN_SIZE,
 			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
 			       len - INT3_INSN_SIZE);
+
+			if (len == 6) {
+				_new[0] = 0x0f;
+				memcpy(_new + 1, new, 5);
+				new = _new;
+			}
+
 			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
-				  (const char *)tp[i].text + INT3_INSN_SIZE,
+				  new + INT3_INSN_SIZE,
 				  len - INT3_INSN_SIZE);
+
 			do_sync++;
 		}
 
@@ -2001,8 +2020,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 		 * The old instruction is recorded so that the event can be
 		 * processed forwards or backwards.
 		 */
-		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
-				     tp[i].text, len);
+		perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
 	}
 
 	if (do_sync) {
@@ -2019,10 +2037,15 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * replacing opcode.
 	 */
 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
-		if (tp[i].text[0] == INT3_INSN_OPCODE)
+		u8 byte = tp[i].text[0];
+
+		if (tp[i].len == 6)
+			byte = 0x0f;
+
+		if (byte == INT3_INSN_OPCODE)
 			continue;
 
-		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
+		text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
 		do_sync++;
 	}
 
@@ -2040,9 +2063,11 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
 			       const void *opcode, size_t len, const void *emulate)
 {
 	struct insn insn;
-	int ret, i;
+	int ret, i = 0;
 
-	memcpy((void *)tp->text, opcode, len);
+	if (len == 6)
+		i = 1;
+	memcpy((void *)tp->text, opcode+i, len-i);
 	if (!emulate)
 		emulate = opcode;
 
@@ -2053,6 +2078,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
 	tp->len = len;
 	tp->opcode = insn.opcode.bytes[0];
 
+	if (is_jcc32(&insn)) {
+		/*
+		 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
+		 */
+		tp->opcode = insn.opcode.bytes[1] - 0x10;
+	}
+
 	switch (tp->opcode) {
 	case RET_INSN_OPCODE:
 	case JMP32_INSN_OPCODE:
@@ -2069,7 +2101,6 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
 		BUG_ON(len != insn.length);
 	}
 
-
 	switch (tp->opcode) {
 	case INT3_INSN_OPCODE:
 	case RET_INSN_OPCODE:
@@ -2078,6 +2109,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
 	case CALL_INSN_OPCODE:
 	case JMP32_INSN_OPCODE:
 	case JMP8_INSN_OPCODE:
+	case 0x70 ... 0x7f: /* Jcc */
 		tp->disp = insn.immediate.value;
 		break;
 


* [tip: x86/alternatives] x86/alternatives: Introduce int3_emulate_jcc()
  2023-01-23 20:59 ` [PATCH 1/3] x86/alternative: Introduce int3_emulate_jcc() Peter Zijlstra
@ 2023-01-31 14:22   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-01-31 14:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Ingo Molnar, Masami Hiramatsu (Google),
	x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     db7adcfd1cec4e95155e37bc066fddab302c6340
Gitweb:        https://git.kernel.org/tip/db7adcfd1cec4e95155e37bc066fddab302c6340
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Mon, 23 Jan 2023 21:59:16 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 31 Jan 2023 15:05:30 +01:00

x86/alternatives: Introduce int3_emulate_jcc()

Move the kprobe Jcc emulation into int3_emulate_jcc() so it can be
used by more code -- specifically static_call() will need this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20230123210607.057678245@infradead.org
---
 arch/x86/include/asm/text-patching.h | 31 ++++++++++++++++++++++-
 arch/x86/kernel/kprobes/core.c       | 38 +++++----------------------
 2 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index f4b87f0..29832c3 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -184,6 +184,37 @@ void int3_emulate_ret(struct pt_regs *regs)
 	unsigned long ip = int3_emulate_pop(regs);
 	int3_emulate_jmp(regs, ip);
 }
+
+static __always_inline
+void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+{
+	static const unsigned long jcc_mask[6] = {
+		[0] = X86_EFLAGS_OF,
+		[1] = X86_EFLAGS_CF,
+		[2] = X86_EFLAGS_ZF,
+		[3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
+		[4] = X86_EFLAGS_SF,
+		[5] = X86_EFLAGS_PF,
+	};
+
+	bool invert = cc & 1;
+	bool match;
+
+	if (cc < 0xc) {
+		match = regs->flags & jcc_mask[cc >> 1];
+	} else {
+		match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+			((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+		if (cc >= 0xe)
+			match = match || (regs->flags & X86_EFLAGS_ZF);
+	}
+
+	if ((match && !invert) || (!match && invert))
+		ip += disp;
+
+	int3_emulate_jmp(regs, ip);
+}
+
 #endif /* !CONFIG_UML_X86 */
 
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6629968..d48638c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -460,50 +460,26 @@ static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(kprobe_emulate_call);
 
-static nokprobe_inline
-void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
 {
 	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 
-	if (cond)
-		ip += p->ainsn.rel32;
+	ip += p->ainsn.rel32;
 	int3_emulate_jmp(regs, ip);
 }
-
-static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
-{
-	__kprobe_emulate_jmp(p, regs, true);
-}
 NOKPROBE_SYMBOL(kprobe_emulate_jmp);
 
-static const unsigned long jcc_mask[6] = {
-	[0] = X86_EFLAGS_OF,
-	[1] = X86_EFLAGS_CF,
-	[2] = X86_EFLAGS_ZF,
-	[3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
-	[4] = X86_EFLAGS_SF,
-	[5] = X86_EFLAGS_PF,
-};
-
 static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
 {
-	bool invert = p->ainsn.jcc.type & 1;
-	bool match;
+	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 
-	if (p->ainsn.jcc.type < 0xc) {
-		match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
-	} else {
-		match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
-			((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
-		if (p->ainsn.jcc.type >= 0xe)
-			match = match || (regs->flags & X86_EFLAGS_ZF);
-	}
-	__kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
+	int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32);
 }
 NOKPROBE_SYMBOL(kprobe_emulate_jcc);
 
 static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
 {
+	unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 	bool match;
 
 	if (p->ainsn.loop.type != 3) {	/* LOOP* */
@@ -531,7 +507,9 @@ static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
 	else if (p->ainsn.loop.type == 1)	/* LOOPE */
 		match = match && (regs->flags & X86_EFLAGS_ZF);
 
-	__kprobe_emulate_jmp(p, regs, match);
+	if (match)
+		ip += p->ainsn.rel32;
+	int3_emulate_jmp(regs, ip);
 }
 NOKPROBE_SYMBOL(kprobe_emulate_loop);
 


* Re: [PATCH v1.1 3/3] x86/static_call: Add support for Jcc tail-calls
  2023-01-26 18:14           ` Nick Desaulniers
@ 2023-02-06 16:07             ` Steven Rostedt
  0 siblings, 0 replies; 14+ messages in thread
From: Steven Rostedt @ 2023-02-06 16:07 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Peter Zijlstra, mingo, tglx, bp, dave.hansen, x86, hpa, jpoimboe,
	jbaron, ardb, linux-kernel, erhard_f, mhiramat, sandipan.das

On Thu, 26 Jan 2023 10:14:49 -0800
Nick Desaulniers <ndesaulniers@google.com> wrote:

> On Thu, Jan 26, 2023 at 7:34 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Jan 24, 2023 at 10:07:53AM -0500, Steven Rostedt wrote:  
> > > On Tue, 24 Jan 2023 14:06:49 +0100
> > > Peter Zijlstra <peterz@infradead.org> wrote:
> > >  
> > > > > Just to confirm, as it's not clear if this is the static call site or one
> > > > > of the functions that is being called.  
> > > >
> > > > Ah, you've not looked at enough asm then? ;-) Yes this is the static
> > > > call site, see the __SCT_ target (instruction at 0x35d).  
> > >
> > > Yeah, could you specify it a bit more in the change log such that those
> > > looking back at this don't have to have that requirement of staring at
> > > enough asm ;-)  
> >
> > How's this then?

Better :-)

Anyway... Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

> >
> > ---
> > Subject: x86/static_call: Add support for Jcc tail-calls
> > From: Peter Zijlstra <peterz@infradead.org>
> > Date: Fri Jan 20 16:40:33 CET 2023
> >
> > Clang likes to create conditional tail calls like:
> >
> > 0000000000000350 <amd_pmu_add_event>:
> > 350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
> > 355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
> > 35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
> > 363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4  
> 
> If it helps reduce the line width in the commit message to focus on
> just the instructions, lately I've been using the
> `--no-addresses --no-show-raw-insn` flags to llvm-objdump.

Actually, I like the full output. Even if we do reduce it, it will
still be over 80.

-- Steve


* Re: [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls
  2023-01-23 20:59 [PATCH 0/3] static_call/x86: Handle clang's conditional tail calls Peter Zijlstra
                   ` (2 preceding siblings ...)
  2023-01-23 20:59 ` [PATCH 3/3] x86/static_call: Add support for Jcc tail-calls Peter Zijlstra
@ 2023-02-08 22:36 ` Nathan Chancellor
  3 siblings, 0 replies; 14+ messages in thread
From: Nathan Chancellor @ 2023-02-08 22:36 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: tglx, bp, dave.hansen, x86, hpa, jpoimboe, jbaron, rostedt, ardb,
	linux-kernel, erhard_f, ndesaulniers, mhiramat, sandipan.das,
	llvm

Hi Peter and Ingo,

On Mon, Jan 23, 2023 at 09:59:15PM +0100, Peter Zijlstra wrote:
> Erhard reported boot failures on his AMD machine when using clang and
> bisected them to a commit introducing a few static_call()s. It turns
> out that clang with -Os is very likely to generate conditional tail
> calls like:
> 
>   0000000000000350 <amd_pmu_add_event>:
>   350:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1) 351: R_X86_64_NONE      __fentry__-0x4
>   355:       48 83 bf 20 01 00 00 00         cmpq   $0x0,0x120(%rdi)
>   35d:       0f 85 00 00 00 00       jne    363 <amd_pmu_add_event+0x13>     35f: R_X86_64_PLT32     __SCT__amd_pmu_branch_add-0x4
>   363:       e9 00 00 00 00          jmp    368 <amd_pmu_add_event+0x18>     364: R_X86_64_PLT32     __x86_return_thunk-0x4
> 
> Our inline static_call() patching code can't deal with those, and a
> BUG happens -- really early in boot.
> 
> These patches borrow the kprobe Jcc emulation to implement text_poke_bp() Jcc
> support, which is then used to teach inline static_call() about this form.
> 
> ---
>  arch/x86/include/asm/text-patching.h | 31 ++++++++++++++++++
>  arch/x86/kernel/alternative.c        | 62 +++++++++++++++++++++++++++---------
>  arch/x86/kernel/kprobes/core.c       | 38 +++++-----------------
>  arch/x86/kernel/static_call.c        | 50 +++++++++++++++++++++++++++--
>  4 files changed, 133 insertions(+), 48 deletions(-)

I noticed this series was applied to x86/alternatives versus
x86/urgent, even though this appears to be a regression since 6.1, as
Erhard hit this issue in that tree.

Additionally, a new change in LLVM main [1] causes conditional tail
calls to be emitted even at -O2, so this breakage will become more
noticeable over time. Is it possible to expedite this to mainline so
that it can be backported to 6.1? If not, no worries, but I figured I
would ask :)

I have a backport of this series to 6.1 prepared already [2], where it
appears to work for me but I will get wider testing before sending it
after this is in Linus' tree (regardless of when that is). I figured it
would not hurt to have other eyes on it ahead of time though.

[1]: https://github.com/llvm/llvm-project/commit/ee5585ed09aff2e54cb540fad4c33f0c93626b1b
[2]: https://git.kernel.org/nathan/l/cbl-1800-1774-6.1

Cheers,
Nathan

