* [PATCH v3 0/4] x86: Fully relocatable alternatives and some NOPs
@ 2023-02-08 17:10 Peter Zijlstra
  2023-02-08 17:10 ` [PATCH v3 1/4] x86/alternative: Make debug-alternative selective Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 17:10 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, peterz, mhiramat, kirill.shutemov, Andrew.Cooper3,
	jpoimboe

Hi,

This version is fully tested and includes a few NOP twiddles inspired by the
objtool patches I'll post separately.

Also available at:

  https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=x86/core




* [PATCH v3 1/4] x86/alternative: Make debug-alternative selective
  2023-02-08 17:10 [PATCH v3 0/4] x86: Fully relocatable alternatives and some NOPs Peter Zijlstra
@ 2023-02-08 17:10 ` Peter Zijlstra
  2023-02-14 11:48   ` Borislav Petkov
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  2023-02-08 17:10 ` [PATCH v3 2/4] x86/alternative: Support relocations in alternatives Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 17:10 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, peterz, mhiramat, kirill.shutemov, Andrew.Cooper3,
	jpoimboe

Using debug-alternative generates a *LOT* of output; extend it a bit
to allow selecting which of the many rewrites it reports on.
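
For example, with the DA_* flags this patch introduces, one might boot
with:

  debug-alternative=0x01	# DA_ALT: regular alternative rewrites only
  debug-alternative=0x06	# DA_RET|DA_RETPOLINE: return/retpoline rewrites
  debug-alternative		# no mask given: DA_ALL, everything, as before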

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/alternative.c |   62 +++++++++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 25 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -37,11 +37,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched);
 
 #define MAX_PATCH_LEN (255-1)
 
-static int __initdata_or_module debug_alternative;
+#define DA_ALL		(~0)
+#define DA_ALT		0x01
+#define DA_RET		0x02
+#define DA_RETPOLINE	0x04
+#define DA_ENDBR	0x08
+#define DA_SMP		0x10
+
+static unsigned int __initdata_or_module debug_alternative;
 
 static int __init debug_alt(char *str)
 {
-	debug_alternative = 1;
+	if (str && *str == '=')
+		str++;
+
+	if (!str || kstrtouint(str, 0, &debug_alternative))
+		debug_alternative = DA_ALL;
+
 	return 1;
 }
 __setup("debug-alternative", debug_alt);
@@ -55,15 +67,15 @@ static int __init setup_noreplace_smp(ch
 }
 __setup("noreplace-smp", setup_noreplace_smp);
 
-#define DPRINTK(fmt, args...)						\
+#define DPRINTK(type, fmt, args...)					\
 do {									\
-	if (debug_alternative)						\
+	if (debug_alternative & DA_##type)				\
 		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
 } while (0)
 
-#define DUMP_BYTES(buf, len, fmt, args...)				\
+#define DUMP_BYTES(type, buf, len, fmt, args...)			\
 do {									\
-	if (unlikely(debug_alternative)) {				\
+	if (unlikely(debug_alternative & DA_##type)) {			\
 		int j;							\
 									\
 		if (!(len))						\
@@ -148,7 +160,7 @@ recompute_jump(struct alt_instr *a, u8 *
 	tgt_rip  = next_rip + o_dspl;
 	n_dspl = tgt_rip - orig_insn;
 
-	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
+	DPRINTK(ALT, "target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
 
 	if (tgt_rip - orig_insn >= 0) {
 		if (n_dspl - 2 <= 127)
@@ -183,7 +195,7 @@ recompute_jump(struct alt_instr *a, u8 *
 
 done:
 
-	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+	DPRINTK(ALT, "final displ: 0x%08x, JMP 0x%lx",
 		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
 }
 
@@ -217,7 +229,7 @@ static __always_inline int optimize_nops
 	add_nops(instr + off, nnops);
 	local_irq_restore(flags);
 
-	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+	DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
 
 	return nnops;
 }
@@ -270,7 +282,7 @@ void __init_or_module noinline apply_alt
 	u8 *instr, *replacement;
 	u8 insn_buff[MAX_PATCH_LEN];
 
-	DPRINTK("alt table %px, -> %px", start, end);
+	DPRINTK(ALT, "alt table %px, -> %px", start, end);
 	/*
 	 * The scan order should be from start to end. A later scanned
 	 * alternative code can overwrite previously scanned alternative code.
@@ -299,15 +311,15 @@ void __init_or_module noinline apply_alt
 		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
 			goto next;
 
-		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
+		DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
 			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
 			feature >> 5,
 			feature & 0x1f,
 			instr, instr, a->instrlen,
 			replacement, a->replacementlen);
 
-		DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
-		DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
+		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
+		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
 
 		memcpy(insn_buff, replacement, a->replacementlen);
 		insn_buff_sz = a->replacementlen;
@@ -320,7 +332,7 @@ void __init_or_module noinline apply_alt
 		 */
 		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
 			*(s32 *)(insn_buff + 1) += replacement - instr;
-			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+			DPRINTK(ALT, "Fix CALL offset: 0x%x, CALL 0x%lx",
 				*(s32 *)(insn_buff + 1),
 				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
 		}
@@ -331,7 +343,7 @@ void __init_or_module noinline apply_alt
 		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
 			insn_buff[insn_buff_sz] = 0x90;
 
-		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
+		DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 
 		text_poke_early(instr, insn_buff, insn_buff_sz);
 
@@ -557,15 +569,15 @@ void __init_or_module noinline apply_ret
 			continue;
 		}
 
-		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
+		DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
 			addr, addr, insn.length,
 			addr + insn.length + insn.immediate.value);
 
 		len = patch_retpoline(addr, &insn, bytes);
 		if (len == insn.length) {
 			optimize_nops(bytes, len);
-			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
-			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+			DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
+			DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
 			text_poke_early(addr, bytes, len);
 		}
 	}
@@ -632,14 +644,14 @@ void __init_or_module noinline apply_ret
 			      addr, dest, 5, addr))
 			continue;
 
-		DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
+		DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
 			addr, addr, insn.length,
 			addr + insn.length + insn.immediate.value);
 
 		len = patch_return(addr, &insn, bytes);
 		if (len == insn.length) {
-			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
-			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+			DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
+			DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
 			text_poke_early(addr, bytes, len);
 		}
 	}
@@ -669,13 +681,13 @@ static void poison_endbr(void *addr, boo
 		return;
 	}
 
-	DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+	DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
 
 	/*
 	 * When we have IBT, the lack of ENDBR will trigger #CP
 	 */
-	DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
-	DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
+	DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
+	DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
 	text_poke_early(addr, &poison, 4);
 }
 
@@ -1150,7 +1162,7 @@ void __init_or_module alternatives_smp_m
 	smp->locks_end	= locks_end;
 	smp->text	= text;
 	smp->text_end	= text_end;
-	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+	DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
 		smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
 




* [PATCH v3 2/4] x86/alternative: Support relocations in alternatives
  2023-02-08 17:10 [PATCH v3 0/4] x86: Fully relocatable alternatives and some NOPs Peter Zijlstra
  2023-02-08 17:10 ` [PATCH v3 1/4] x86/alternative: Make debug-alternative selective Peter Zijlstra
@ 2023-02-08 17:10 ` Peter Zijlstra
  2023-02-17 20:28   ` Borislav Petkov
                     ` (2 more replies)
  2023-02-08 17:10 ` [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 17:10 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, peterz, mhiramat, kirill.shutemov, Andrew.Cooper3,
	jpoimboe

A little while ago someone (Kirill) ran into the whole 'alternatives
don't do relocations' nonsense again and I got annoyed enough to actually
look at the code.

Since the whole alternative machinery already fully decodes the
instructions, it is simple enough to adjust immediates and displacements
where needed. Specifically, the immediates of IP-modifying instructions
(JMP, CALL, Jcc) and the displacements of RIP-relative instructions.
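
As a worked example of the resulting arithmetic (addresses made up for
illustration): a 5-byte CALL at src = 0x1000 has src_next_ip = 0x1005;
with an immediate of 0x100 it targets 0x1105. Copied to dst = 0x2000,
the immediate must become 0x1105 - 0x2005 = -0xf00, which is exactly
src_imm + (src - dst) = 0x100 + (0x1000 - 0x2000); only the relative
displacement of the two code blocks matters.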

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/alternative.c    |  255 +++++++++++++++++++++++++--------------
 tools/objtool/arch/x86/special.c |    8 -
 2 files changed, 167 insertions(+), 96 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -135,71 +135,6 @@ extern s32 __smp_locks[], __smp_locks_en
 void text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
- * Are we looking at a near JMP with a 1 or 4-byte displacement.
- */
-static inline bool is_jmp(const u8 opcode)
-{
-	return opcode == 0xeb || opcode == 0xe9;
-}
-
-static void __init_or_module
-recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
-{
-	u8 *next_rip, *tgt_rip;
-	s32 n_dspl, o_dspl;
-	int repl_len;
-
-	if (a->replacementlen != 5)
-		return;
-
-	o_dspl = *(s32 *)(insn_buff + 1);
-
-	/* next_rip of the replacement JMP */
-	next_rip = repl_insn + a->replacementlen;
-	/* target rip of the replacement JMP */
-	tgt_rip  = next_rip + o_dspl;
-	n_dspl = tgt_rip - orig_insn;
-
-	DPRINTK(ALT, "target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
-
-	if (tgt_rip - orig_insn >= 0) {
-		if (n_dspl - 2 <= 127)
-			goto two_byte_jmp;
-		else
-			goto five_byte_jmp;
-	/* negative offset */
-	} else {
-		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
-			goto two_byte_jmp;
-		else
-			goto five_byte_jmp;
-	}
-
-two_byte_jmp:
-	n_dspl -= 2;
-
-	insn_buff[0] = 0xeb;
-	insn_buff[1] = (s8)n_dspl;
-	add_nops(insn_buff + 2, 3);
-
-	repl_len = 2;
-	goto done;
-
-five_byte_jmp:
-	n_dspl -= 5;
-
-	insn_buff[0] = 0xe9;
-	*(s32 *)&insn_buff[1] = n_dspl;
-
-	repl_len = 5;
-
-done:
-
-	DPRINTK(ALT, "final displ: 0x%08x, JMP 0x%lx",
-		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
-}
-
-/*
  * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
  *
  * @instr: instruction byte stream
@@ -266,6 +201,133 @@ static void __init_or_module noinline op
 }
 
 /*
+ * What we start with is:
+ *
+ *   src_imm = target - src_next_ip                  (1)
+ *
+ * what we want is:
+ *
+ *   dst_imm = target - dst_next_ip                  (2)
+ *
+ * so what we do is rework (1) as an expression for target like:
+ *
+ *   target = src_imm + src_next_ip                  (1a)
+ *
+ * and substitute in (2) to get:
+ *
+ *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ *
+ * Now, since the instruction stream is 'identical' at src and dst (we copy
+ * after all) we can state that:
+ *
+ *   src_next_ip = src + ip_offset
+ *   dst_next_ip = dst + ip_offset                   (4)
+ *
+ * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ * obtain:
+ *
+ *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ *           = src_imm + src - dst + ip_offset - ip_offset
+ *           = src_imm + src - dst                   (5)
+ *
+ * IOW, only the relative displacement of the code block matters.
+ */
+
+#define apply_reloc_n(n_, p_, d_)				\
+	do {							\
+		s32 v = *(s##n_ *)(p_);				\
+		v += (d_);					\
+		BUG_ON((v >> 31) != (v >> (n_-1)));		\
+		*(s##n_ *)(p_) = (s##n_)v;			\
+	} while (0)
+
+
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
+{
+	switch (n) {
+	case 1: apply_reloc_n(8, ptr, diff); break;
+	case 2: apply_reloc_n(16, ptr, diff); break;
+	case 4: apply_reloc_n(32, ptr, diff); break;
+	default: BUG();
+	}
+}
+
+static __always_inline
+bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+{
+	u8 *target = src + offset;
+	/*
+	 * If the target is inside the patched block, it's relative to the
+	 * block itself and does not need relocation.
+	 */
+	return (target < src || target > src + src_len);
+}
+
+static void __init_or_module noinline
+apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+{
+	for (int next, i = 0; i < len; i = next) {
+		struct insn insn;
+
+		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+			return;
+
+		next = i + insn.length;
+
+		switch (insn.opcode.bytes[0]) {
+		case 0x0f:
+			if (insn.opcode.bytes[1] < 0x80 ||
+			    insn.opcode.bytes[1] > 0x8f)
+				break;
+
+			fallthrough;	/* Jcc.d32 */
+		case 0x70 ... 0x7f:	/* Jcc.d8 */
+		case JMP8_INSN_OPCODE:
+		case JMP32_INSN_OPCODE:
+		case CALL_INSN_OPCODE:
+			if (need_reloc(next + insn.immediate.value, src, src_len)) {
+				apply_reloc(insn.immediate.nbytes,
+					    buf + i + insn_offset_immediate(&insn),
+					    src - dest);
+			}
+
+			/*
+			 * Where possible, convert JMP.d32 into JMP.d8.
+			 */
+			if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+				s32 imm = insn.immediate.value;
+				imm += src - dest;
+				imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
+				if ((imm >> 31) == (imm >> 7)) {
+					buf[i+0] = JMP8_INSN_OPCODE;
+					buf[i+1] = (s8)imm;
+					for (int j = 2; j < insn.length; j++)
+						buf[i+j] = INT3_INSN_OPCODE;
+				}
+			}
+			break;
+		}
+
+		if (insn_rip_relative(&insn)) {
+			if (need_reloc(next + insn.displacement.value, src, src_len)) {
+				apply_reloc(insn.displacement.nbytes,
+					    buf + i + insn_offset_displacement(&insn),
+					    src - dest);
+			}
+		}
+
+
+		/*
+		 * See if this and any potentially following NOPs can be
+		 * optimized.
+		 */
+		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
+			next = i + optimize_nops_range(buf, len, i);
+	}
+}
+
+/*
  * Replace instructions with better alternatives for this CPU type. This runs
  * before SMP is initialized to avoid SMP problems with self modifying code.
  * This implies that asymmetric systems where APs have less capabilities than
@@ -308,8 +370,10 @@ void __init_or_module noinline apply_alt
 		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
 		 *   patch if feature is *NOT* present.
 		 */
-		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
-			goto next;
+		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) {
+			optimize_nops(instr, a->instrlen);
+			continue;
+		}
 
 		DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
 			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
@@ -318,37 +382,19 @@ void __init_or_module noinline apply_alt
 			instr, instr, a->instrlen,
 			replacement, a->replacementlen);
 
-		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
-		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
-
 		memcpy(insn_buff, replacement, a->replacementlen);
 		insn_buff_sz = a->replacementlen;
 
-		/*
-		 * 0xe8 is a relative jump; fix the offset.
-		 *
-		 * Instruction length is checked before the opcode to avoid
-		 * accessing uninitialized bytes for zero-length replacements.
-		 */
-		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
-			*(s32 *)(insn_buff + 1) += replacement - instr;
-			DPRINTK(ALT, "Fix CALL offset: 0x%x, CALL 0x%lx",
-				*(s32 *)(insn_buff + 1),
-				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
-		}
-
-		if (a->replacementlen && is_jmp(replacement[0]))
-			recompute_jump(a, instr, replacement, insn_buff);
-
 		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
 			insn_buff[insn_buff_sz] = 0x90;
 
+		apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+
+		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
+		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
 		DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 
 		text_poke_early(instr, insn_buff, insn_buff_sz);
-
-next:
-		optimize_nops(instr, a->instrlen);
 	}
 }
 
@@ -1346,6 +1392,35 @@ static noinline void __init int3_selftes
 	unregister_die_notifier(&int3_exception_nb);
 }
 
+static __initdata int __alt_reloc_selftest_addr;
+
+__visible noinline void __init __alt_reloc_selftest(void *arg)
+{
+	WARN_ON(arg != &__alt_reloc_selftest_addr);
+}
+
+static noinline void __init alt_reloc_selftest(void)
+{
+	/*
+	 * Tests apply_relocation().
+	 *
+	 * This has a relative immediate (CALL) in a place other than the first
+	 * instruction and additionally on x86_64 we get a RIP-relative LEA:
+	 *
+	 *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
+	 *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
+	 *
+	 * Getting this wrong will either crash and burn or tickle the WARN
+	 * above.
+	 */
+	asm_inline volatile (
+		ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+		: /* output */
+		: [mem] "m" (__alt_reloc_selftest_addr)
+		: _ASM_ARG1
+	);
+}
+
 void __init alternative_instructions(void)
 {
 	int3_selftest();
@@ -1433,6 +1508,8 @@ void __init alternative_instructions(voi
 
 	restart_nmi();
 	alternatives_patched = 1;
+
+	alt_reloc_selftest();
 }
 
 /**
--- a/tools/objtool/arch/x86/special.c
+++ b/tools/objtool/arch/x86/special.c
@@ -42,13 +42,7 @@ bool arch_support_alt_relocation(struct
 				 struct instruction *insn,
 				 struct reloc *reloc)
 {
-	/*
-	 * The x86 alternatives code adjusts the offsets only when it
-	 * encounters a branch instruction at the very beginning of the
-	 * replacement group.
-	 */
-	return insn->offset == special_alt->new_off &&
-	       (insn->type == INSN_CALL || is_jump(insn));
+	return true;
 }
 
 /*




* [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 17:10 [PATCH v3 0/4] x86: Fully relocatable alternatives and some NOPs Peter Zijlstra
  2023-02-08 17:10 ` [PATCH v3 1/4] x86/alternative: Make debug-alternative selective Peter Zijlstra
  2023-02-08 17:10 ` [PATCH v3 2/4] x86/alternative: Support relocations in alternatives Peter Zijlstra
@ 2023-02-08 17:10 ` Peter Zijlstra
  2023-02-08 19:52   ` Andrew.Cooper3
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  2023-02-08 17:10 ` [PATCH v3 4/4] x86/alternative: Complicate optimize_nops() some more Peter Zijlstra
  2023-02-27 10:49 ` [PATCH] x86/lib/memmove: Decouple ERMS from FSRM Borislav Petkov
  4 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 17:10 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, peterz, mhiramat, kirill.shutemov, Andrew.Cooper3,
	jpoimboe

This rewrite addresses two issues:

 - it no longer hard-requires runs of single-byte NOPs; it now accepts
   any NOP- or NOPL-encoded instruction (but not the more complicated
   32-bit NOPs).

 - it writes a single 'instruction' replacement.

Specifically, the ORC unwinder relies on the tail NOP of an alternative
being a single instruction; in particular it relies on the inner bytes
never being executed.

Once the max supported NOP length is reached (currently 8, could easily
be extended to 11 on x86_64), it switches to JMP.d8 plus INT3 padding to
achieve the same result.

The ORC unwinder uses this guarantee in the analysis of
alternative/overlapping CFI state.
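
For instance, a 20-byte 'NOP' emitted this way looks like (eb 12 being
JMP.d8 +18 over the INT3 padding):

  eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc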

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/alternative.c |  103 ++++++++++++++++++++++--------------------
 1 file changed, 54 insertions(+), 49 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -126,6 +126,30 @@ static void __init_or_module add_nops(vo
 	}
 }
 
+static void __init_or_module add_nop(u8 *instr, unsigned int len)
+{
+	u8 *target = instr + len;
+
+	if (!len)
+		return;
+
+	if (len <= ASM_NOP_MAX) {
+		memcpy(instr, x86_nops[len], len);
+		return;
+	}
+
+	if (len < 128) {
+		__text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
+		instr += JMP8_INSN_SIZE;
+	} else {
+		__text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
+		instr += JMP32_INSN_SIZE;
+	}
+
+	for (;instr < target; instr++)
+		*instr = INT3_INSN_OPCODE;
+}
+
 extern s32 __retpoline_sites[], __retpoline_sites_end[];
 extern s32 __return_sites[], __return_sites_end[];
 extern s32 __cfi_sites[], __cfi_sites_end[];
@@ -134,39 +158,32 @@ extern struct alt_instr __alt_instructio
 extern s32 __smp_locks[], __smp_locks_end[];
 void text_poke_early(void *addr, const void *opcode, size_t len);
 
-/*
- * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
- *
- * @instr: instruction byte stream
- * @instrlen: length of the above
- * @off: offset within @instr where the first NOP has been detected
- *
- * Return: number of NOPs found (and replaced).
- */
-static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
+static bool insn_is_nop(struct insn *insn)
 {
-	unsigned long flags;
-	int i = off, nnops;
+	if (insn->opcode.bytes[0] == 0x90)
+		return true;
 
-	while (i < instrlen) {
-		if (instr[i] != 0x90)
-			break;
+	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
+		return true;
 
-		i++;
-	}
+	/* TODO: more nops */
 
-	nnops = i - off;
+	return false;
+}
 
-	if (nnops <= 1)
-		return nnops;
+static int skip_nops(u8 *instr, int offset, int len)
+{
+	struct insn insn;
 
-	local_irq_save(flags);
-	add_nops(instr + off, nnops);
-	local_irq_restore(flags);
+	for (; offset < len; offset += insn.length) {
+		if (insn_decode_kernel(&insn, &instr[offset]))
+			break;
 
-	DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+		if (!insn_is_nop(&insn))
+			break;
+	}
 
-	return nnops;
+	return offset;
 }
 
 /*
@@ -175,28 +192,19 @@ static __always_inline int optimize_nops
  */
 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 {
-	struct insn insn;
-	int i = 0;
+	for (int next, i = 0; i < len; i = next) {
+		struct insn insn;
 
-	/*
-	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
-	 * ones.
-	 */
-	for (;;) {
 		if (insn_decode_kernel(&insn, &instr[i]))
 			return;
 
-		/*
-		 * See if this and any potentially following NOPs can be
-		 * optimized.
-		 */
-		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
-			i += optimize_nops_range(instr, len, i);
-		else
-			i += insn.length;
+		next = i + insn.length;
 
-		if (i >= len)
-			return;
+		if (insn_is_nop(&insn)) {
+			next = skip_nops(instr, next, len);
+			add_nop(instr + i, next - i);
+			DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next);
+		}
 	}
 }
 
@@ -317,13 +325,10 @@ apply_relocation(u8 *buf, size_t len, u8
 			}
 		}
 
-
-		/*
-		 * See if this and any potentially following NOPs can be
-		 * optimized.
-		 */
-		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
-			next = i + optimize_nops_range(buf, len, i);
+		if (insn_is_nop(&insn)) {
+			next = skip_nops(buf, next, len);
+			add_nop(buf + i, next - i);
+		}
 	}
 }
 




* [PATCH v3 4/4] x86/alternative: Complicate optimize_nops() some more
  2023-02-08 17:10 [PATCH v3 0/4] x86: Fully relocatable alternatives and some NOPs Peter Zijlstra
                   ` (2 preceding siblings ...)
  2023-02-08 17:10 ` [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some Peter Zijlstra
@ 2023-02-08 17:10 ` Peter Zijlstra
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  2023-02-27 10:49 ` [PATCH] x86/lib/memmove: Decouple ERMS from FSRM Borislav Petkov
  4 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 17:10 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, peterz, mhiramat, kirill.shutemov, Andrew.Cooper3,
	jpoimboe

Because:

  SMP alternatives: ffffffff810026dc: [2:44) optimized NOPs: eb 2a eb 28 cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc

is quite daft; make things more complicated and have the NOP run-length
detection eat the preceding JMP if they both end at the same target.

  SMP alternatives: ffffffff810026dc: [0:44) optimized NOPs: eb 2a cc cc cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc
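
(Decoding the dumps: in the first, offset 0 holds a pre-existing JMP.d8
to offset 44 (eb 2a), immediately followed at offset 2 by the JMP.d8 the
NOP rewrite emitted (eb 28), which also lands on offset 44 -- two
back-to-back jumps to the same place. Eating the preceding JMP makes the
rewrite start at offset 0, and the second jump disappears.)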

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/alternative.c |   59 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 10 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -126,6 +126,7 @@ static void __init_or_module add_nops(vo
 	}
 }
 
+/* Fill the buffer with a single effective instruction of size @len */
 static void __init_or_module add_nop(u8 *instr, unsigned int len)
 {
 	u8 *target = instr + len;
@@ -158,6 +159,9 @@ extern struct alt_instr __alt_instructio
 extern s32 __smp_locks[], __smp_locks_end[];
 void text_poke_early(void *addr, const void *opcode, size_t len);
 
+/*
+ * Matches NOP and NOPL, not any of the other possible NOPs.
+ */
 static bool insn_is_nop(struct insn *insn)
 {
 	if (insn->opcode.bytes[0] == 0x90)
@@ -171,6 +175,10 @@ static bool insn_is_nop(struct insn *ins
 	return false;
 }
 
+/*
+ * Find the offset of the first non-nop instruction starting at @offset
+ * but no further than @len.
+ */
 static int skip_nops(u8 *instr, int offset, int len)
 {
 	struct insn insn;
@@ -187,11 +195,46 @@ static int skip_nops(u8 *instr, int offs
 }
 
 /*
+ * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
+ * to the end of the NOP sequence into a single 'NOP'.
+ */
+static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn,
+			    int *next, int *prev, int *target)
+{
+	int i = *next - insn->length;
+
+	switch (insn->opcode.bytes[0]) {
+	case JMP8_INSN_OPCODE:
+	case JMP32_INSN_OPCODE:
+		*prev = i;
+		*target = *next + insn->immediate.value;
+		return false;
+	}
+
+	if (insn_is_nop(insn)) {
+		int nop = i;
+
+		*next = skip_nops(instr, *next, len);
+		if (*target && *next == *target)
+			nop = *prev;
+
+		add_nop(instr + nop, *next - nop);
+		DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
+		return true;
+	}
+
+	*target = 0;
+	return false;
+}
+
+/*
  * "noinline" to cause control flow change and thus invalidate I$ and
  * cause refetch after modification.
  */
 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 {
+	int prev, target = 0;
+
 	for (int next, i = 0; i < len; i = next) {
 		struct insn insn;
 
@@ -200,11 +243,7 @@ static void __init_or_module noinline op
 
 		next = i + insn.length;
 
-		if (insn_is_nop(&insn)) {
-			next = skip_nops(instr, next, len);
-			add_nop(instr + i, next - i);
-			DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next);
-		}
+		__optimize_nops(instr, len, &insn, &next, &prev, &target);
 	}
 }
 
@@ -275,6 +314,8 @@ bool need_reloc(unsigned long offset, u8
 static void __init_or_module noinline
 apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 {
+	int prev, target = 0;
+
 	for (int next, i = 0; i < len; i = next) {
 		struct insn insn;
 
@@ -283,6 +324,9 @@ apply_relocation(u8 *buf, size_t len, u8
 
 		next = i + insn.length;
 
+		if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
+			continue;
+
 		switch (insn.opcode.bytes[0]) {
 		case 0x0f:
 			if (insn.opcode.bytes[1] < 0x80 ||
@@ -324,11 +368,6 @@ apply_relocation(u8 *buf, size_t len, u8
 					    src - dest);
 			}
 		}
-
-		if (insn_is_nop(&insn)) {
-			next = skip_nops(buf, next, len);
-			add_nop(buf + i, next - i);
-		}
 	}
 }
 




* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 17:10 ` [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some Peter Zijlstra
@ 2023-02-08 19:52   ` Andrew.Cooper3
  2023-02-08 20:29     ` Peter Zijlstra
  2023-02-08 23:04     ` David Laight
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  1 sibling, 2 replies; 28+ messages in thread
From: Andrew.Cooper3 @ 2023-02-08 19:52 UTC (permalink / raw)
  To: Peter Zijlstra, x86; +Cc: linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On 08/02/2023 5:10 pm, Peter Zijlstra wrote:
> This rewrite addresses two issues:
>
>  - it no longer hard-requires runs of single-byte NOPs; it now accepts
>    any NOP- or NOPL-encoded instruction (but not the more complicated
>    32-bit NOPs).
>
>  - it writes a single 'instruction' replacement.
>
> Specifically, the ORC unwinder relies on the tail NOP of an alternative
> being a single instruction; in particular it relies on the inner bytes
> never being executed.
>
> Once the max supported NOP length is reached (currently 8, could easily
> be extended to 11 on x86_64), it switches to JMP.d8 plus INT3 padding to
> achieve the same result.
>
> The ORC unwinder uses this guarantee in the analysis of
> alternative/overlapping CFI state.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

How lucky are you feeling for your game of performance roulette?

Unconditional jmps cost branch prediction these days, and won't be
successfully predicted until taken.

There is a point after which a jmp is more efficient than brute-forcing
through a line of nops, and where this point lies is very uarch-specific,
but it's not a single nop...

Whether you care or not is a different matter, but at least be aware that
doing a jmp like this, instead of e.g. 2 or 3 nops, is contrary to the
prior advice given by the architects.

~Andrew


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 19:52   ` Andrew.Cooper3
@ 2023-02-08 20:29     ` Peter Zijlstra
  2023-02-08 20:36       ` Peter Zijlstra
  2023-02-09  1:33       ` Andrew.Cooper3
  2023-02-08 23:04     ` David Laight
  1 sibling, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 20:29 UTC (permalink / raw)
  To: Andrew.Cooper3; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On Wed, Feb 08, 2023 at 07:52:04PM +0000, Andrew.Cooper3@citrix.com wrote:
> On 08/02/2023 5:10 pm, Peter Zijlstra wrote:
> > This rewrite addresses two issues:
> >
> >  - it no longer hard-requires runs of single-byte NOPs; it now accepts
> >    any NOP- or NOPL-encoded instruction (but not the more complicated
> >    32-bit NOPs).
> >
> >  - it writes a single 'instruction' replacement.
> >
> > Specifically, the ORC unwinder relies on the tail NOP of an alternative
> > being a single instruction; in particular it relies on the inner bytes
> > never being executed.
> >
> > Once the max supported NOP length is reached (currently 8, could easily
> > be extended to 11 on x86_64), it switches to JMP.d8 plus INT3 padding to
> > achieve the same result.
> >
> > The ORC unwinder uses this guarantee in the analysis of
> > alternative/overlapping CFI state.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> How lucky are you feeling for your game of performance roulette?

Yeah, not very lucky.. I've been talking about this with Boris for a bit
already.

> Unconditional jmps cost branch prediction these days, and won't be
> successfully predicted until taken.

IKR, insane, but that's what it is.

> There is a point after which a jmp is more efficient than brute-forcing
> through a line of nops, and where this point lies is very uarch-specific,
> but it's not a single nop...

Yeah, I know, even for very big nops. $I prefetch is good at straight
lines etc.

> Whether you care or not is a different matter, but at least be aware that
> doing a jmp like this, instead of e.g. 2 or 3 nops, is contrary to the
> prior advice given by the architects.

So, it is either this, or make the CFI conflict analysis done by objtool
a lot more 'interesting'. I figured I'd try this first.

As is, this is actually a correctness issue, not a performance issue.
Goes hand in hand with the patches:

  https://lkml.kernel.org/r/20230208172245.711471461@infradead.org
  https://lkml.kernel.org/r/20230208172245.783099843@infradead.org

Specifically, for short alternatives objtool adds a single NOP of the
size difference at the end. The advantage is that you only get one
CFI entry for that thing and so don't have a potential conflict for any of
the inner bytes.

The alternative is making it multiple NOPs and ensuring objtool and the
kernel both agree on the length of them.

As is, there were only a handful of NOPs that turned into this jmp.d8
thing on the defconfig+kvm_guest.config I built to test this -- it is by
no means a common thing. And about half of them would be gone by
extending the max nop length to at least 10 or so.

In fact, I did that patch once, lemme see if I still have it...


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 20:29     ` Peter Zijlstra
@ 2023-02-08 20:36       ` Peter Zijlstra
  2023-02-08 20:44         ` Peter Zijlstra
  2023-02-09  1:33       ` Andrew.Cooper3
  1 sibling, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 20:36 UTC (permalink / raw)
  To: Andrew.Cooper3; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On Wed, Feb 08, 2023 at 09:29:23PM +0100, Peter Zijlstra wrote:

> As is, there were only a handful of NOPs that turned into this jmp.d8
> thing on the defconfig+kvm_guest.config I built to test this -- it is by
> no means a common thing. And about half of them would be gone by
> extending the max nop length to at least 10 or so.
> 
> In fact, I did that patch once, lemme see if I still have it...

Even still applies too, lemme go test that.

Also, there's a bunch of alternatives() where we explicitly put in that
short jmp instead of taking the nops, see for example:

  $ git grep -i "alternative.*jmp" arch/x86/

---
Subject: x86_64: Longer NOPs
From: Peter Zijlstra <peterz@infradead.org>


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/nops.h   |   13 +++++++++++--
 arch/x86/kernel/alternative.c |    8 ++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -34,6 +34,8 @@
 #define BYTES_NOP7	0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
 #define BYTES_NOP8	0x3e,BYTES_NOP7
 
+#define ASM_NOP_MAX 8
+
 #else
 
 /*
@@ -47,6 +49,8 @@
  * 6: osp nopl 0x00(%eax,%eax,1)
  * 7: nopl 0x00000000(%eax)
  * 8: nopl 0x00000000(%eax,%eax,1)
+ * 9: cs nopl 0x00000000(%eax,%eax,1)
+ * 10: osp cs nopl 0x00000000(%eax,%eax,1)
  */
 #define BYTES_NOP1	0x90
 #define BYTES_NOP2	0x66,BYTES_NOP1
@@ -56,6 +60,13 @@
 #define BYTES_NOP6	0x66,BYTES_NOP5
 #define BYTES_NOP7	0x0f,0x1f,0x80,0x00,0x00,0x00,0x00
 #define BYTES_NOP8	0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+#define BYTES_NOP9	0x2e,BYTES_NOP8
+#define BYTES_NOP10	0x66,BYTES_NOP9
+
+#define ASM_NOP9  _ASM_BYTES(BYTES_NOP9)
+#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10)
+
+#define ASM_NOP_MAX 10
 
 #endif /* CONFIG_64BIT */
 
@@ -68,8 +79,6 @@
 #define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
 #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)
 
-#define ASM_NOP_MAX 8
-
 #ifndef __ASSEMBLY__
 extern const unsigned char * const x86_nops[];
 #endif
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -86,6 +86,10 @@ static const unsigned char x86nops[] =
 	BYTES_NOP6,
 	BYTES_NOP7,
 	BYTES_NOP8,
+#ifdef CONFIG_64BIT
+	BYTES_NOP9,
+	BYTES_NOP10,
+#endif
 };
 
 const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
@@ -99,6 +103,10 @@ const unsigned char * const x86_nops[ASM
 	x86nops + 1 + 2 + 3 + 4 + 5,
 	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
 	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+#ifdef CONFIG_64BIT
+	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+#endif
 };
 
 /* Use this to add nops to a buffer, then text_poke the whole buffer. */


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 20:36       ` Peter Zijlstra
@ 2023-02-08 20:44         ` Peter Zijlstra
  2023-02-08 20:45           ` Peter Zijlstra
                             ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 20:44 UTC (permalink / raw)
  To: Andrew.Cooper3; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On Wed, Feb 08, 2023 at 09:36:24PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 09:29:23PM +0100, Peter Zijlstra wrote:
> 
> > As is, there were only a handful of NOPs that turned into this jmp.d8
> > thing on the defconfig+kvm_guest.config I built to test this -- it is by
> > no means a common thing. And about half of them would be gone by
> > extending the max nop length to at least 10 or so.
> > 
> > In fact, I did that patch once, lemme see if I still have it...
> 
> Even still applies too, lemme go test that.

Works as advertised. Next up are 8 12-byte nops and 8 13-byte nops;
after that we're left with:

[   11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.588621] SMP alternatives: ffffffff810026dc: [0:44) optimized NOPs: eb 2a cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
[   11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc

Which is all pretty big...

GNU as seems to go up to .nops 12; 13 gives me a single-byte nop and a 12.
That's also exactly the 3-prefix limit, after which some uarchs start
taking heavy decode penalties.

Let me go extend that patch of mine to cover 12. Also, instead of
printing hex addresses, lemme go print symbol+off things.


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 20:44         ` Peter Zijlstra
@ 2023-02-08 20:45           ` Peter Zijlstra
  2023-02-08 21:01           ` Peter Zijlstra
  2023-02-08 21:08           ` Peter Zijlstra
  2 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 20:45 UTC (permalink / raw)
  To: Andrew.Cooper3; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:
> GNU as seems to go up to .nops 12; 13 gives me a single-byte nop and a 12.
> That's also exactly the 3-prefix limit, after which some uarchs start
> taking heavy decode penalties.

Oh noes, counting hard. It maxes out at 11, not 12. Sadness.

Instead, lemme go figure out what those alternatives actually are.


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 20:44         ` Peter Zijlstra
  2023-02-08 20:45           ` Peter Zijlstra
@ 2023-02-08 21:01           ` Peter Zijlstra
  2023-02-08 21:08           ` Peter Zijlstra
  2 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 21:01 UTC (permalink / raw)
  To: Andrew.Cooper3; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:

> Works as advertised. Next up are 8 12-byte nops and 8 13-byte nops;
> after that we're left with:

Yeah, we can ignore those; those are the retpoline thunks being patched.
They should be totally unused. These NOPs are after an indirect jmp and
as such pure speculation food.

Arguably I should just merge that patch that turns them into UD2 --
funneling and all that.

But there was some i386 issues... oh well.


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 20:44         ` Peter Zijlstra
  2023-02-08 20:45           ` Peter Zijlstra
  2023-02-08 21:01           ` Peter Zijlstra
@ 2023-02-08 21:08           ` Peter Zijlstra
  2023-02-08 21:21             ` Peter Zijlstra
  2 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 21:08 UTC (permalink / raw)
  To: Andrew.Cooper3; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:

> [   11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> [   11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc

UNTRAIN_RET -- specifically RESET_CALL_DEPTH

> [   11.588621] SMP alternatives: ffffffff810026dc: [0:44) optimized NOPs: eb 2a cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc

FILL_RETURN_BUFFER, and I'm thinking for a 44-byte nop we ought to just jmp.


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 21:08           ` Peter Zijlstra
@ 2023-02-08 21:21             ` Peter Zijlstra
  2023-02-09  1:11               ` Andrew.Cooper3
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2023-02-08 21:21 UTC (permalink / raw)
  To: Andrew.Cooper3; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On Wed, Feb 08, 2023 at 10:08:12PM +0100, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:
> 
> > [   11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> > [   11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
> 
> UNTRAIN_RET -- specifically RESET_CALL_DEPTH

19:       48 c7 c0 80 00 00 00    mov    $0x80,%rax
20:       48 c1 e0 38             shl    $0x38,%rax
24:       65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0     29: R_X86_64_32S        pcpu_hot+0x10

Is ofc an atrocity.

We can easily trim that by 5 bytes to:

0:   b0 80                   mov    $0x80,%al
2:   48 c1 e0 38             shl    $0x38,%rax
6:   65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0

Who cares about the top bytes, we're explicitly shifting them out
anyway. But that's still 15 bytes or so.

If it weren't for those pesky prefix penalties that would make exactly
one instruction :-)


diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e04313e89f4f..be792f9407b5 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -84,7 +84,7 @@
 	movq	$-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
 
 #define RESET_CALL_DEPTH					\
-	mov	$0x80, %rax;					\
+	movb	$0x80, %al;					\
 	shl	$56, %rax;					\
 	movq	%rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
 


* RE: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 19:52   ` Andrew.Cooper3
  2023-02-08 20:29     ` Peter Zijlstra
@ 2023-02-08 23:04     ` David Laight
  1 sibling, 0 replies; 28+ messages in thread
From: David Laight @ 2023-02-08 23:04 UTC (permalink / raw)
  To: 'Andrew.Cooper3@citrix.com', Peter Zijlstra, x86
  Cc: linux-kernel, mhiramat, kirill.shutemov, jpoimboe

From: Andrew.Cooper3@citrix.com
> Sent: 08 February 2023 19:52
...
> Unconditional jmps cost branch prediction these days, and won't be
> successfully predicted until taken.

That is sad :-(

Don't they also use the BTB for the target address?
So even if predicted taken the cpu can speculate from
the wrong address??

	David



* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 21:21             ` Peter Zijlstra
@ 2023-02-09  1:11               ` Andrew.Cooper3
  2023-02-09 22:27                 ` David Laight
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew.Cooper3 @ 2023-02-09  1:11 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On 08/02/2023 9:21 pm, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 10:08:12PM +0100, Peter Zijlstra wrote:
>> On Wed, Feb 08, 2023 at 09:44:04PM +0100, Peter Zijlstra wrote:
>>
>>> [   11.584069] SMP alternatives: ffffffff82000095: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.590068] SMP alternatives: ffffffff820001f3: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.720069] SMP alternatives: ffffffff8200189f: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.731069] SMP alternatives: ffffffff820019ae: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.738069] SMP alternatives: ffffffff82001a4a: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.746069] SMP alternatives: ffffffff82001b2d: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.766069] SMP alternatives: ffffffff82001d14: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.770069] SMP alternatives: ffffffff82001dd5: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>>> [   11.779069] SMP alternatives: ffffffff82001f35: [0:20) optimized NOPs: eb 12 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
>> UNTRAIN_RET -- specifically RESET_CALL_DEPTH
> 19:       48 c7 c0 80 00 00 00    mov    $0x80,%rax
> 20:       48 c1 e0 38             shl    $0x38,%rax
> 24:       65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0     29: R_X86_64_32S        pcpu_hot+0x10
>
> Is ofc an atrocity.
>
> We can easily trim that by 5 bytes to:
>
> 0:   b0 80                   mov    $0x80,%al
> 2:   48 c1 e0 38             shl    $0x38,%rax
> 6:   65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0
>
> Who cares about the top bytes, we're explicitly shifting them out
> anyway. But that's still 15 bytes or so.
>
> If it weren't for those pesky prefix penalties that would make exactly
> one instruction :-)

Yeah, but then you're taking a merge penalty instead.

Given that you can't reduce enough anyway, and it is only a 4-byte
reduction rather than 5, you're probably better off with:

0:   31 c0                   xor    %eax,%eax
2:   48 0f ba e8 3f          bts    $0x3f,%rax
7:   65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0

because of the zeroing idiom splitting these 3 instructions away from
the previous operation on rax.

It's a shame that x86 doesn't have a mov $imm8, %d32 form, because
loading 1 into a register is an incredibly common operation to perform.

~Andrew


* Re: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 20:29     ` Peter Zijlstra
  2023-02-08 20:36       ` Peter Zijlstra
@ 2023-02-09  1:33       ` Andrew.Cooper3
  1 sibling, 0 replies; 28+ messages in thread
From: Andrew.Cooper3 @ 2023-02-09  1:33 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

On 08/02/2023 8:29 pm, Peter Zijlstra wrote:
> On Wed, Feb 08, 2023 at 07:52:04PM +0000, Andrew.Cooper3@citrix.com wrote:
>> On 08/02/2023 5:10 pm, Peter Zijlstra wrote:
>>> This rewrite addresses two issues:
>>>
>>>  - it no longer hard-requires runs of single-byte NOPs; it now accepts
>>>    any NOP- or NOPL-encoded instruction (but not the more complicated
>>>    32-bit NOPs).
>>>
>>>  - it writes a single 'instruction' replacement.
>>>
>>> Specifically, the ORC unwinder relies on the tail NOP of an alternative
>>> being a single instruction; in particular it relies on the inner bytes
>>> never being executed.
>>>
>>> Once the max supported NOP length is reached (currently 8, could easily
>>> be extended to 11 on x86_64), it switches to JMP.d8 plus INT3 padding to
>>> achieve the same result.
>>>
>>> The ORC unwinder uses this guarantee in the analysis of
>>> alternative/overlapping CFI state.
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> How lucky are you feeling for your game of performance roulette?
> Yeah, not very lucky.. I've been talking about this with Boris for a bit
> already.
>
>> Unconditional jmps cost branch prediction these days, and won't be
>> successfully predicted until taken.
> IKR, insane, but that's what it is.

In terms of rationalising how things work, sure, but the resulting perf
numbers speak for themselves.

For the benefit of others reading this and not following what's going
on, modern x86 processors have branch prediction occurring pre-decode,
not post-decode, to reduce the misprediction latency.

Branch prediction operates using the current %rip and past history, and
selects the $I lines to send for decode.  The "decoded bytes disagree
with prediction metadata" feedback cycle is fast, but missing this
disagreement is the root cause of the Branch Type Confusion speculation
issue (a.k.a. AMD Retbleed).

~Andrew


* RE: [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some
  2023-02-09  1:11               ` Andrew.Cooper3
@ 2023-02-09 22:27                 ` David Laight
  0 siblings, 0 replies; 28+ messages in thread
From: David Laight @ 2023-02-09 22:27 UTC (permalink / raw)
  To: 'Andrew.Cooper3@citrix.com', Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, kirill.shutemov, jpoimboe

From: Andrew.Cooper3@citrix.com
> Sent: 09 February 2023 01:11
...
> >> UNTRAIN_RET -- specifically RESET_CALL_DEPTH
> > 19:       48 c7 c0 80 00 00 00    mov    $0x80,%rax
> > 20:       48 c1 e0 38             shl    $0x38,%rax
> > 24:       65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0     29: R_X86_64_32S        pcpu_hot+0x10
> >
> > Is ofc an atrocity.
> >
> > We can easily trim that by 5 bytes to:
> >
> > 0:   b0 80                   mov    $0x80,%al
> > 2:   48 c1 e0 38             shl    $0x38,%rax
> > 6:   65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0
> >
> > Who cares about the top bytes, we're explicitly shifting them out
> > anyway. But that's still 15 bytes or so.
> >
> > If it weren't for those pesky prefix penalties that would make exactly
> > one instruction :-)
> 
> Yeah, but then you're taking a merge penalty instead.
> 
> Given that you can't reduce enough anyway, while only a 4 byte reduction
> rather than 5, you're probably better off with:
> 
> 0:   31 c0                   xor    %eax,%eax
> 2:   48 0f ba e8 3f          bts    $0x3f,%rax
> 7:   65 48 89 04 25 00 00 00 00      mov    %rax,%gs:0x0
> 
> because of the zeroing idiom splitting these 3 instructions away from
> the previous operation on rax.

How about:
		31 c0     xor %eax,%eax
		f9        stc
		48 d1 d8  rcr $1,%rax
So 6 bytes total.
But that might be a partial dependency on flags.
(Although that isn't any worse than the xor.)
It is also a longer dependency chain - so the execution time
rather depends on what else is going on.

	David



* Re: [PATCH v3 1/4] x86/alternative: Make debug-alternative selective
  2023-02-08 17:10 ` [PATCH v3 1/4] x86/alternative: Make debug-alternative selective Peter Zijlstra
@ 2023-02-14 11:48   ` Borislav Petkov
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 28+ messages in thread
From: Borislav Petkov @ 2023-02-14 11:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, kirill.shutemov, Andrew.Cooper3, jpoimboe

On Wed, Feb 08, 2023 at 06:10:51PM +0100, Peter Zijlstra wrote:
> Using debug-alternative generates a *LOT* of output, extend it a bit
> to select which of the many rewrites it reports on.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/kernel/alternative.c |   62 +++++++++++++++++++++++++-----------------
>  1 file changed, 37 insertions(+), 25 deletions(-)

Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette


* Re: [PATCH v3 2/4] x86/alternative: Support relocations in alternatives
  2023-02-08 17:10 ` [PATCH v3 2/4] x86/alternative: Support relocations in alternatives Peter Zijlstra
@ 2023-02-17 20:28   ` Borislav Petkov
  2023-02-17 22:21   ` Borislav Petkov
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  2 siblings, 0 replies; 28+ messages in thread
From: Borislav Petkov @ 2023-02-17 20:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, kirill.shutemov, Andrew.Cooper3, jpoimboe

On Wed, Feb 08, 2023 at 06:10:52PM +0100, Peter Zijlstra wrote:
>  /*
> + * What we start with is:

Some more clarification and de-personalization of this comment. Diff
on top:

---
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 625f05c2b255..e14bc15bf646 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -201,15 +201,21 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 }
 
 /*
- * What we start with is:
+ * In this context, "source" is where the instructions are placed in the
+ * section .altinstr_replacement, for example during kernel build by the
+ * toolchain.
+ * "Destination" is where the instructions are being patched in by this
+ * machinery.
+ *
+ * The source offset is:
  *
  *   src_imm = target - src_next_ip                  (1)
  *
- * what we want is:
+ * and the target offset is:
  *
  *   dst_imm = target - dst_next_ip                  (2)
  *
- * so what we do is rework (1) as an expression for target like:
+ * so rework (1) as an expression for target like:
  *
  *   target = src_imm + src_next_ip                  (1a)
  *
@@ -217,8 +223,8 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
  *
  *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
  *
- * Now, since the instruction stream is 'identical' at src and dst (we copy
- * after all) we can state that:
+ * Now, since the instruction stream is 'identical' at src and dst (it
+ * is being copied after all) it can be stated that:
  *
  *   src_next_ip = src + ip_offset
  *   dst_next_ip = dst + ip_offset                   (4)


-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 2/4] x86/alternative: Support relocations in alternatives
  2023-02-08 17:10 ` [PATCH v3 2/4] x86/alternative: Support relocations in alternatives Peter Zijlstra
  2023-02-17 20:28   ` Borislav Petkov
@ 2023-02-17 22:21   ` Borislav Petkov
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
  2 siblings, 0 replies; 28+ messages in thread
From: Borislav Petkov @ 2023-02-17 22:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, kirill.shutemov, Andrew.Cooper3, jpoimboe

On Wed, Feb 08, 2023 at 06:10:52PM +0100, Peter Zijlstra wrote:
> +			if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
> +				s32 imm = insn.immediate.value;
> +				imm += src - dest;
> +				imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
> +				if ((imm >> 31) == (imm >> 7)) {
> +					buf[i+0] = JMP8_INSN_OPCODE;
> +					buf[i+1] = (s8)imm;
> +					for (int j = 2; j < insn.length; j++)
> +						buf[i+j] = INT3_INSN_OPCODE;

Let's get rid of that third nested loop:

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e14bc15bf646..28eb1d0bc4a0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -308,8 +308,8 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 				if ((imm >> 31) == (imm >> 7)) {
 					buf[i+0] = JMP8_INSN_OPCODE;
 					buf[i+1] = (s8)imm;
-					for (int j = 2; j < insn.length; j++)
-						buf[i+j] = INT3_INSN_OPCODE;
+
+					memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
 				}
 			}
 			break;

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH] x86/lib/memmove: Decouple ERMS from FSRM
  2023-02-08 17:10 [PATCH v3 0/4] x86: Fully relocatable alternatives and some NOPs Peter Zijlstra
                   ` (3 preceding siblings ...)
  2023-02-08 17:10 ` [PATCH v3 4/4] x86/alternative: Complicate optimize_nops() some more Peter Zijlstra
@ 2023-02-27 10:49 ` Borislav Petkov
  2023-04-27  9:22   ` [PATCH TEST] " Yahu Gao
  4 siblings, 1 reply; 28+ messages in thread
From: Borislav Petkov @ 2023-02-27 10:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, kirill.shutemov, Andrew.Cooper3,
	jpoimboe, Daniel Verkamp, Jiri Slaby, Tony Luck

On Wed, Feb 08, 2023 at 06:10:50PM +0100, Peter Zijlstra wrote:
> Hi,
> 
> This version is fully tested and includes a few NOP twiddles inspired by the
> objtool patches I'll post separately.
> 
> Also available at:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=x86/core

On top of the first two:

From: "Borislav Petkov (AMD)" <bp@alien8.de>
Date: Sun, 26 Feb 2023 21:04:26 +0100
Subject: [PATCH] x86/lib/memmove: Decouple ERMS from FSRM

Up until now it was perceived that FSRM is an improvement to ERMS and
thus it was made dependent on the latter.

However, there are AMD BIOSes out there which allow for disabling either
feature, and this prevents kernels from booting due to the CMP
disappearing and thus breaking the logic in the memmove() function.

A similar observation happens in some VM migration scenarios.

Patch in the proper sequences depending on which feature is enabled.

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reported-by: Daniel Verkamp <dverkamp@chromium.org>
Reported-by: Jiri Slaby <jirislaby@kernel.org>
---
 arch/x86/lib/memmove_64.S | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 02661861e5dd..0559b206fb11 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -38,10 +38,12 @@ SYM_FUNC_START(__memmove)
 	cmp %rdi, %r8
 	jg 2f
 
-	/* FSRM implies ERMS => no length checks, do the copy directly */
+#define CHECK_LEN	cmp $0x20, %rdx; jb 1f
+#define MEMMOVE_BYTES	movq %rdx, %rcx; rep movsb; RET
 .Lmemmove_begin_forward:
-	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
-	ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS
+	ALTERNATIVE_2 __stringify(CHECK_LEN), \
+		      __stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \
+		      __stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM
 
 	/*
 	 * movsq instruction have many startup latency
@@ -207,11 +209,6 @@ SYM_FUNC_START(__memmove)
 	movb %r11b, (%rdi)
 13:
 	RET
-
-.Lmemmove_erms:
-	movq %rdx, %rcx
-	rep movsb
-	RET
 SYM_FUNC_END(__memmove)
 EXPORT_SYMBOL(__memmove)
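
For clarity, a sketch of what .Lmemmove_begin_forward ends up looking
like per feature after patching (derived from the ALTERNATIVE_2 above,
not from a disassembled kernel; FSRM wins when both are set):

	/* Neither ERMS nor FSRM: length check, fall through to movsq path */
	cmp	$0x20, %rdx
	jb	1f

	/* ERMS: length check, then rep movsb for sizes >= 0x20 */
	cmp	$0x20, %rdx
	jb	1f
	movq	%rdx, %rcx
	rep movsb
	RET

	/* FSRM: rep movsb unconditionally, no length check */
	movq	%rdx, %rcx
	rep movsb
	RET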
 
-- 
2.35.1


-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH TEST] x86/lib/memmove: Decouple ERMS from FSRM
  2023-02-27 10:49 ` [PATCH] x86/lib/memmove: Decouple ERMS from FSRM Borislav Petkov
@ 2023-04-27  9:22   ` Yahu Gao
  0 siblings, 0 replies; 28+ messages in thread
From: Yahu Gao @ 2023-04-27  9:22 UTC (permalink / raw)
  To: bp
  Cc: Andrew.Cooper3, dverkamp, jirislaby, jpoimboe, kirill.shutemov,
	linux-kernel, mhiramat, peterz, tony.luck, x86

Tested-by: Yahu Gao <gaoyh12@lenove.com>

Now [PATCH v3 2/4] has a conflict with commit
5d1dd961e74334a2178264193ea813d44ce5e725.

Apart from that conflict, it fixed the issue of ERMS being disabled in
AMD BIOSes.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip: x86/alternatives] x86/alternative: Complicate optimize_nops() some more
  2023-02-08 17:10 ` [PATCH v3 4/4] x86/alternative: Complicate optimize_nops() some more Peter Zijlstra
@ 2023-05-13 13:03   ` tip-bot2 for Peter Zijlstra
  2023-05-13 16:01     ` [PATCH] x86/alternatives: Fix section mismatch warnings Borislav Petkov
  0 siblings, 1 reply; 28+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-05-13 13:03 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Borislav Petkov (AMD), x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     b6c881b248ef9d629ec2365808cb4894991c0837
Gitweb:        https://git.kernel.org/tip/b6c881b248ef9d629ec2365808cb4894991c0837
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 08 Feb 2023 18:10:54 +01:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Thu, 11 May 2023 17:34:20 +02:00

x86/alternative: Complicate optimize_nops() some more

Because:

  SMP alternatives: ffffffff810026dc: [2:44) optimized NOPs: eb 2a eb 28 cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc

is quite daft; make things more complicated and have the NOP run-length
detection eat the preceding JMP if they both end at the same target.

  SMP alternatives: ffffffff810026dc: [0:44) optimized NOPs: eb 2a cc cc cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc
    cc cc cc cc cc cc cc cc cc cc cc cc cc

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230208171431.433132442@infradead.org
---
 arch/x86/kernel/alternative.c | 60 ++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 839bc6d..b78d55f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -114,6 +114,8 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
 };
 
 /*
+ * Fill the buffer with a single effective instruction of size @len.
+ *
  * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
  * for every single-byte NOP, try to generate the maximally available NOP of
  * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
@@ -152,6 +154,9 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
 void text_poke_early(void *addr, const void *opcode, size_t len);
 
+/*
+ * Matches NOP and NOPL, not any of the other possible NOPs.
+ */
 static bool insn_is_nop(struct insn *insn)
 {
 	if (insn->opcode.bytes[0] == 0x90)
@@ -165,6 +170,10 @@ static bool insn_is_nop(struct insn *insn)
 	return false;
 }
 
+/*
+ * Find the offset of the first non-NOP instruction starting at @offset
+ * but no further than @len.
+ */
 static int skip_nops(u8 *instr, int offset, int len)
 {
 	struct insn insn;
@@ -181,11 +190,46 @@ static int skip_nops(u8 *instr, int offset, int len)
 }
 
 /*
+ * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
+ * to the end of the NOP sequence into a single NOP.
+ */
+static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn,
+			    int *next, int *prev, int *target)
+{
+	int i = *next - insn->length;
+
+	switch (insn->opcode.bytes[0]) {
+	case JMP8_INSN_OPCODE:
+	case JMP32_INSN_OPCODE:
+		*prev = i;
+		*target = *next + insn->immediate.value;
+		return false;
+	}
+
+	if (insn_is_nop(insn)) {
+		int nop = i;
+
+		*next = skip_nops(instr, *next, len);
+		if (*target && *next == *target)
+			nop = *prev;
+
+		add_nop(instr + nop, *next - nop);
+		DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
+		return true;
+	}
+
+	*target = 0;
+	return false;
+}
+
+/*
  * "noinline" to cause control flow change and thus invalidate I$ and
  * cause refetch after modification.
  */
 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 {
+	int prev, target = 0;
+
 	for (int next, i = 0; i < len; i = next) {
 		struct insn insn;
 
@@ -194,11 +238,7 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 
 		next = i + insn.length;
 
-		if (insn_is_nop(&insn)) {
-			next = skip_nops(instr, next, len);
-			add_nop(instr + i, next - i);
-			DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next);
-		}
+		__optimize_nops(instr, len, &insn, &next, &prev, &target);
 	}
 }
 
@@ -275,6 +315,8 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
 static void __init_or_module noinline
 apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 {
+	int prev, target = 0;
+
 	for (int next, i = 0; i < len; i = next) {
 		struct insn insn;
 
@@ -283,6 +325,9 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 
 		next = i + insn.length;
 
+		if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
+			continue;
+
 		switch (insn.opcode.bytes[0]) {
 		case 0x0f:
 			if (insn.opcode.bytes[1] < 0x80 ||
@@ -324,11 +369,6 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 					    src - dest);
 			}
 		}
-
-		if (insn_is_nop(&insn)) {
-			next = skip_nops(buf, next, len);
-			add_nop(buf + i, next - i);
-		}
 	}
 }
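
Spelled out on the changelog example above: the buffer is a JMP8 at
offset 0 to offset 44 (0x2c), followed by NOPs in [2,44). Previously
only the NOP run was collapsed, producing a second, redundant JMP8:

	0:  eb 2a	jmp 0x2c	/* original JMP over the fill */
	2:  eb 28	jmp 0x2c	/* [2:44) collapsed on its own */
	4:  cc cc ...			/* INT3 padding */

Now the preceding JMP records *prev and *target, and since skip_nops()
makes *next == *target, the rewrite starts at *prev instead:

	0:  eb 2a	jmp 0x2c	/* single JMP over the whole run */
	2:  cc cc ...			/* INT3 padding */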
 

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [tip: x86/alternatives] x86/alternative: Rewrite optimize_nops() some
  2023-02-08 17:10 ` [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some Peter Zijlstra
  2023-02-08 19:52   ` Andrew.Cooper3
@ 2023-05-13 13:03   ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 28+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-05-13 13:03 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Borislav Petkov (AMD), x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     6c480f22212826425b57932f09b1f0abbec85485
Gitweb:        https://git.kernel.org/tip/6c480f22212826425b57932f09b1f0abbec85485
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 08 Feb 2023 18:10:53 +01:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Thu, 11 May 2023 17:33:36 +02:00

x86/alternative: Rewrite optimize_nops() some

Address two issues:

 - it no longer hard-requires single-byte NOP runs; now it accepts any
   NOP and NOPL encoded instruction (but not the more complicated 32-bit
   NOPs).

 - it writes a single 'instruction' replacement.

Specifically, the ORC unwinder relies on the tail NOP of an alternative to
be a single instruction. In particular, it relies on the inner bytes not
being executed.

Once the max supported NOP length has been reached (currently 8, could easily
be extended to 11 on x86_64), switch to JMP.d8 and INT3 padding to
achieve the same result.

Objtool uses this guarantee in the analysis of alternative/overlapping
CFI state for the ORC unwinder data. Every instruction edge gets a CFI
state and the more instructions the larger the chance of conflicts.

  [ bp:
  - Add a comment over add_nop() to explain why it does it this way
  - Make add_nops() PARAVIRT only as it is used solely there now ]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230208171431.373412974@infradead.org
---
 arch/x86/kernel/alternative.c | 129 ++++++++++++++++++---------------
 1 file changed, 71 insertions(+), 58 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 28eb1d0..839bc6d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -113,17 +113,35 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
 	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 };
 
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
+/*
+ * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
+ * for every single-byte NOP, try to generate the maximally available NOP of
+ * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
+ * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
+ * *jump* over instead of executing long and daft NOPs.
+ */
+static void __init_or_module add_nop(u8 *instr, unsigned int len)
 {
-	while (len > 0) {
-		unsigned int noplen = len;
-		if (noplen > ASM_NOP_MAX)
-			noplen = ASM_NOP_MAX;
-		memcpy(insns, x86_nops[noplen], noplen);
-		insns += noplen;
-		len -= noplen;
+	u8 *target = instr + len;
+
+	if (!len)
+		return;
+
+	if (len <= ASM_NOP_MAX) {
+		memcpy(instr, x86_nops[len], len);
+		return;
 	}
+
+	if (len < 128) {
+		__text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
+		instr += JMP8_INSN_SIZE;
+	} else {
+		__text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
+		instr += JMP32_INSN_SIZE;
+	}
+
+	for (;instr < target; instr++)
+		*instr = INT3_INSN_OPCODE;
 }
 
 extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@ -134,39 +152,32 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
 void text_poke_early(void *addr, const void *opcode, size_t len);
 
-/*
- * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
- *
- * @instr: instruction byte stream
- * @instrlen: length of the above
- * @off: offset within @instr where the first NOP has been detected
- *
- * Return: number of NOPs found (and replaced).
- */
-static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
+static bool insn_is_nop(struct insn *insn)
 {
-	unsigned long flags;
-	int i = off, nnops;
+	if (insn->opcode.bytes[0] == 0x90)
+		return true;
 
-	while (i < instrlen) {
-		if (instr[i] != 0x90)
-			break;
+	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
+		return true;
 
-		i++;
-	}
+	/* TODO: more nops */
 
-	nnops = i - off;
+	return false;
+}
 
-	if (nnops <= 1)
-		return nnops;
+static int skip_nops(u8 *instr, int offset, int len)
+{
+	struct insn insn;
 
-	local_irq_save(flags);
-	add_nops(instr + off, nnops);
-	local_irq_restore(flags);
+	for (; offset < len; offset += insn.length) {
+		if (insn_decode_kernel(&insn, &instr[offset]))
+			break;
 
-	DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+		if (!insn_is_nop(&insn))
+			break;
+	}
 
-	return nnops;
+	return offset;
 }
 
 /*
@@ -175,28 +186,19 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
  */
 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 {
-	struct insn insn;
-	int i = 0;
+	for (int next, i = 0; i < len; i = next) {
+		struct insn insn;
 
-	/*
-	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
-	 * ones.
-	 */
-	for (;;) {
 		if (insn_decode_kernel(&insn, &instr[i]))
 			return;
 
-		/*
-		 * See if this and any potentially following NOPs can be
-		 * optimized.
-		 */
-		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
-			i += optimize_nops_range(instr, len, i);
-		else
-			i += insn.length;
+		next = i + insn.length;
 
-		if (i >= len)
-			return;
+		if (insn_is_nop(&insn)) {
+			next = skip_nops(instr, next, len);
+			add_nop(instr + i, next - i);
+			DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next);
+		}
 	}
 }
 
@@ -323,13 +325,10 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 			}
 		}
 
-
-		/*
-		 * See if this and any potentially following NOPs can be
-		 * optimized.
-		 */
-		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
-			next = i + optimize_nops_range(buf, len, i);
+		if (insn_is_nop(&insn)) {
+			next = skip_nops(buf, next, len);
+			add_nop(buf + i, next - i);
+		}
 	}
 }
 
@@ -1289,6 +1288,20 @@ int alternatives_text_reserved(void *start, void *end)
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
+
+/* Use this to add nops to a buffer, then text_poke the whole buffer. */
+static void __init_or_module add_nops(void *insns, unsigned int len)
+{
+	while (len > 0) {
+		unsigned int noplen = len;
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+		memcpy(insns, x86_nops[noplen], noplen);
+		insns += noplen;
+		len -= noplen;
+	}
+}
+
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
 				     struct paravirt_patch_site *end)
 {
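
For illustration, the byte patterns add_nop() produces for a few fill
lengths (a sketch; the len <= ASM_NOP_MAX case comes from x86_nops[]):

	len == 5:	0f 1f 44 00 00			/* NOPL, single insn */
	len == 20:	eb 12 cc cc .. cc		/* JMP8 +0x12, 18x INT3 */
	len == 200:	e9 c3 00 00 00 cc .. cc		/* JMP32 +0xc3, 195x INT3 */

In each case the unwinder sees exactly one instruction at the start of
the fill, and the INT3 bytes are jumped over rather than executed.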

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [tip: x86/alternatives] x86/alternative: Support relocations in alternatives
  2023-02-08 17:10 ` [PATCH v3 2/4] x86/alternative: Support relocations in alternatives Peter Zijlstra
  2023-02-17 20:28   ` Borislav Petkov
  2023-02-17 22:21   ` Borislav Petkov
@ 2023-05-13 13:03   ` tip-bot2 for Peter Zijlstra
  2 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-05-13 13:03 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Borislav Petkov (AMD), x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     270a69c4485d7d07516d058bcc0473c90ee22185
Gitweb:        https://git.kernel.org/tip/270a69c4485d7d07516d058bcc0473c90ee22185
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 08 Feb 2023 18:10:52 +01:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Wed, 10 May 2023 14:47:08 +02:00

x86/alternative: Support relocations in alternatives

A little while ago someone (Kirill) ran into the whole 'alternatives don't
do relocations' nonsense again and I got annoyed enough to actually look
at the code.

Since the whole alternative machinery already fully decodes the
instructions it is simple enough to adjust immediates and displacement
when needed. Specifically, the immediates for IP modifying instructions
(JMP, CALL, Jcc) and the displacement for RIP-relative instructions.

  [ bp: Massage comment some more and get rid of third loop in
    apply_relocation(). ]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230208171431.313857925@infradead.org
---
 arch/x86/kernel/alternative.c    | 261 +++++++++++++++++++-----------
 tools/objtool/arch/x86/special.c |   8 +-
 2 files changed, 173 insertions(+), 96 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index b3ae6cf..28eb1d0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -135,71 +135,6 @@ extern s32 __smp_locks[], __smp_locks_end[];
 void text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
- * Are we looking at a near JMP with a 1 or 4-byte displacement.
- */
-static inline bool is_jmp(const u8 opcode)
-{
-	return opcode == 0xeb || opcode == 0xe9;
-}
-
-static void __init_or_module
-recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
-{
-	u8 *next_rip, *tgt_rip;
-	s32 n_dspl, o_dspl;
-	int repl_len;
-
-	if (a->replacementlen != 5)
-		return;
-
-	o_dspl = *(s32 *)(insn_buff + 1);
-
-	/* next_rip of the replacement JMP */
-	next_rip = repl_insn + a->replacementlen;
-	/* target rip of the replacement JMP */
-	tgt_rip  = next_rip + o_dspl;
-	n_dspl = tgt_rip - orig_insn;
-
-	DPRINTK(ALT, "target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
-
-	if (tgt_rip - orig_insn >= 0) {
-		if (n_dspl - 2 <= 127)
-			goto two_byte_jmp;
-		else
-			goto five_byte_jmp;
-	/* negative offset */
-	} else {
-		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
-			goto two_byte_jmp;
-		else
-			goto five_byte_jmp;
-	}
-
-two_byte_jmp:
-	n_dspl -= 2;
-
-	insn_buff[0] = 0xeb;
-	insn_buff[1] = (s8)n_dspl;
-	add_nops(insn_buff + 2, 3);
-
-	repl_len = 2;
-	goto done;
-
-five_byte_jmp:
-	n_dspl -= 5;
-
-	insn_buff[0] = 0xe9;
-	*(s32 *)&insn_buff[1] = n_dspl;
-
-	repl_len = 5;
-
-done:
-
-	DPRINTK(ALT, "final displ: 0x%08x, JMP 0x%lx",
-		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
-}
-
-/*
  * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
  *
  * @instr: instruction byte stream
@@ -266,6 +201,139 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 }
 
 /*
+ * In this context, "source" is where the instructions are placed in the
+ * section .altinstr_replacement, for example during kernel build by the
+ * toolchain.
+ * "Destination" is where the instructions are being patched in by this
+ * machinery.
+ *
+ * The source offset is:
+ *
+ *   src_imm = target - src_next_ip                  (1)
+ *
+ * and the target offset is:
+ *
+ *   dst_imm = target - dst_next_ip                  (2)
+ *
+ * so rework (1) as an expression for target like:
+ *
+ *   target = src_imm + src_next_ip                  (1a)
+ *
+ * and substitute in (2) to get:
+ *
+ *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ *
+ * Now, since the instruction stream is 'identical' at src and dst (it
+ * is being copied after all) it can be stated that:
+ *
+ *   src_next_ip = src + ip_offset
+ *   dst_next_ip = dst + ip_offset                   (4)
+ *
+ * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ * obtain:
+ *
+ *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ *           = src_imm + src - dst + ip_offset - ip_offset
+ *           = src_imm + src - dst                   (5)
+ *
+ * IOW, only the relative displacement of the code block matters.
+ */
+
+#define apply_reloc_n(n_, p_, d_)				\
+	do {							\
+		s32 v = *(s##n_ *)(p_);				\
+		v += (d_);					\
+		BUG_ON((v >> 31) != (v >> (n_-1)));		\
+		*(s##n_ *)(p_) = (s##n_)v;			\
+	} while (0)
+
+
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
+{
+	switch (n) {
+	case 1: apply_reloc_n(8, ptr, diff); break;
+	case 2: apply_reloc_n(16, ptr, diff); break;
+	case 4: apply_reloc_n(32, ptr, diff); break;
+	default: BUG();
+	}
+}
+
+static __always_inline
+bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+{
+	u8 *target = src + offset;
+	/*
+	 * If the target is inside the patched block, it's relative to the
+	 * block itself and does not need relocation.
+	 */
+	return (target < src || target > src + src_len);
+}
+
+static void __init_or_module noinline
+apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+{
+	for (int next, i = 0; i < len; i = next) {
+		struct insn insn;
+
+		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+			return;
+
+		next = i + insn.length;
+
+		switch (insn.opcode.bytes[0]) {
+		case 0x0f:
+			if (insn.opcode.bytes[1] < 0x80 ||
+			    insn.opcode.bytes[1] > 0x8f)
+				break;
+
+			fallthrough;	/* Jcc.d32 */
+		case 0x70 ... 0x7f:	/* Jcc.d8 */
+		case JMP8_INSN_OPCODE:
+		case JMP32_INSN_OPCODE:
+		case CALL_INSN_OPCODE:
+			if (need_reloc(next + insn.immediate.value, src, src_len)) {
+				apply_reloc(insn.immediate.nbytes,
+					    buf + i + insn_offset_immediate(&insn),
+					    src - dest);
+			}
+
+			/*
+			 * Where possible, convert JMP.d32 into JMP.d8.
+			 */
+			if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+				s32 imm = insn.immediate.value;
+				imm += src - dest;
+				imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
+				if ((imm >> 31) == (imm >> 7)) {
+					buf[i+0] = JMP8_INSN_OPCODE;
+					buf[i+1] = (s8)imm;
+
+					memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
+				}
+			}
+			break;
+		}
+
+		if (insn_rip_relative(&insn)) {
+			if (need_reloc(next + insn.displacement.value, src, src_len)) {
+				apply_reloc(insn.displacement.nbytes,
+					    buf + i + insn_offset_displacement(&insn),
+					    src - dest);
+			}
+		}
+
+
+		/*
+		 * See if this and any potentially following NOPs can be
+		 * optimized.
+		 */
+		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
+			next = i + optimize_nops_range(buf, len, i);
+	}
+}
+
+/*
  * Replace instructions with better alternatives for this CPU type. This runs
  * before SMP is initialized to avoid SMP problems with self modifying code.
  * This implies that asymmetric systems where APs have less capabilities than
@@ -306,8 +374,10 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 		 * - feature not present but ALT_FLAG_NOT is set to mean,
 		 *   patch if feature is *NOT* present.
 		 */
-		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
-			goto next;
+		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
+			optimize_nops(instr, a->instrlen);
+			continue;
+		}
 
 		DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
 			(a->flags & ALT_FLAG_NOT) ? "!" : "",
@@ -316,37 +386,19 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 			instr, instr, a->instrlen,
 			replacement, a->replacementlen);
 
-		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
-		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
-
 		memcpy(insn_buff, replacement, a->replacementlen);
 		insn_buff_sz = a->replacementlen;
 
-		/*
-		 * 0xe8 is a relative jump; fix the offset.
-		 *
-		 * Instruction length is checked before the opcode to avoid
-		 * accessing uninitialized bytes for zero-length replacements.
-		 */
-		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
-			*(s32 *)(insn_buff + 1) += replacement - instr;
-			DPRINTK(ALT, "Fix CALL offset: 0x%x, CALL 0x%lx",
-				*(s32 *)(insn_buff + 1),
-				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
-		}
-
-		if (a->replacementlen && is_jmp(replacement[0]))
-			recompute_jump(a, instr, replacement, insn_buff);
-
 		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
 			insn_buff[insn_buff_sz] = 0x90;
 
+		apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+
+		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
+		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
 		DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 
 		text_poke_early(instr, insn_buff, insn_buff_sz);
-
-next:
-		optimize_nops(instr, a->instrlen);
 	}
 }
 
@@ -1344,6 +1396,35 @@ static noinline void __init int3_selftest(void)
 	unregister_die_notifier(&int3_exception_nb);
 }
 
+static __initdata int __alt_reloc_selftest_addr;
+
+__visible noinline void __init __alt_reloc_selftest(void *arg)
+{
+	WARN_ON(arg != &__alt_reloc_selftest_addr);
+}
+
+static noinline void __init alt_reloc_selftest(void)
+{
+	/*
+	 * Tests apply_relocation().
+	 *
+	 * This has a relative immediate (CALL) in a place other than the first
+	 * instruction and additionally on x86_64 we get a RIP-relative LEA:
+	 *
+	 *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
+	 *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
+	 *
+	 * Getting this wrong will either crash and burn or tickle the WARN
+	 * above.
+	 */
+	asm_inline volatile (
+		ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+		: /* output */
+		: [mem] "m" (__alt_reloc_selftest_addr)
+		: _ASM_ARG1
+	);
+}
+
 void __init alternative_instructions(void)
 {
 	int3_selftest();
@@ -1431,6 +1512,8 @@ void __init alternative_instructions(void)
 
 	restart_nmi();
 	alternatives_patched = 1;
+
+	alt_reloc_selftest();
 }
 
 /**
diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c
index 7c97b73..799ad6b 100644
--- a/tools/objtool/arch/x86/special.c
+++ b/tools/objtool/arch/x86/special.c
@@ -42,13 +42,7 @@ bool arch_support_alt_relocation(struct special_alt *special_alt,
 				 struct instruction *insn,
 				 struct reloc *reloc)
 {
-	/*
-	 * The x86 alternatives code adjusts the offsets only when it
-	 * encounters a branch instruction at the very beginning of the
-	 * replacement group.
-	 */
-	return insn->offset == special_alt->new_off &&
-	       (insn->type == INSN_CALL || is_jump(insn));
+	return true;
 }
 
 /*

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [tip: x86/alternatives] x86/alternative: Make debug-alternative selective
  2023-02-08 17:10 ` [PATCH v3 1/4] x86/alternative: Make debug-alternative selective Peter Zijlstra
  2023-02-14 11:48   ` Borislav Petkov
@ 2023-05-13 13:03   ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 28+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-05-13 13:03 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Borislav Petkov (AMD), x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     6becb5026b8192e0ed6619d6e7793c2f1288244f
Gitweb:        https://git.kernel.org/tip/6becb5026b8192e0ed6619d6e7793c2f1288244f
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 08 Feb 2023 18:10:51 +01:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Wed, 10 May 2023 13:48:46 +02:00

x86/alternative: Make debug-alternative selective

Using debug-alternative generates a *LOT* of output, extend it a bit
to select which of the many rewrites it reports on.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230208171431.253636689@infradead.org
---
 arch/x86/kernel/alternative.c | 62 ++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f615e0c..b3ae6cf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -37,11 +37,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched);
 
 #define MAX_PATCH_LEN (255-1)
 
-static int __initdata_or_module debug_alternative;
+#define DA_ALL		(~0)
+#define DA_ALT		0x01
+#define DA_RET		0x02
+#define DA_RETPOLINE	0x04
+#define DA_ENDBR	0x08
+#define DA_SMP		0x10
+
+static unsigned int __initdata_or_module debug_alternative;
 
 static int __init debug_alt(char *str)
 {
-	debug_alternative = 1;
+	if (str && *str == '=')
+		str++;
+
+	if (!str || kstrtouint(str, 0, &debug_alternative))
+		debug_alternative = DA_ALL;
+
 	return 1;
 }
 __setup("debug-alternative", debug_alt);
@@ -55,15 +67,15 @@ static int __init setup_noreplace_smp(char *str)
 }
 __setup("noreplace-smp", setup_noreplace_smp);
 
-#define DPRINTK(fmt, args...)						\
+#define DPRINTK(type, fmt, args...)					\
 do {									\
-	if (debug_alternative)						\
+	if (debug_alternative & DA_##type)				\
 		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
 } while (0)
 
-#define DUMP_BYTES(buf, len, fmt, args...)				\
+#define DUMP_BYTES(type, buf, len, fmt, args...)			\
 do {									\
-	if (unlikely(debug_alternative)) {				\
+	if (unlikely(debug_alternative & DA_##type)) {			\
 		int j;							\
 									\
 		if (!(len))						\
@@ -148,7 +160,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
 	tgt_rip  = next_rip + o_dspl;
 	n_dspl = tgt_rip - orig_insn;
 
-	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
+	DPRINTK(ALT, "target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
 
 	if (tgt_rip - orig_insn >= 0) {
 		if (n_dspl - 2 <= 127)
@@ -183,7 +195,7 @@ five_byte_jmp:
 
 done:
 
-	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+	DPRINTK(ALT, "final displ: 0x%08x, JMP 0x%lx",
 		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
 }
 
@@ -217,7 +229,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
 	add_nops(instr + off, nnops);
 	local_irq_restore(flags);
 
-	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+	DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
 
 	return nnops;
 }
@@ -270,7 +282,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 	u8 *instr, *replacement;
 	u8 insn_buff[MAX_PATCH_LEN];
 
-	DPRINTK("alt table %px, -> %px", start, end);
+	DPRINTK(ALT, "alt table %px, -> %px", start, end);
 	/*
 	 * The scan order should be from start to end. A later scanned
 	 * alternative code can overwrite previously scanned alternative code.
@@ -297,15 +309,15 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
 			goto next;
 
-		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
+		DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
 			(a->flags & ALT_FLAG_NOT) ? "!" : "",
 			a->cpuid >> 5,
 			a->cpuid & 0x1f,
 			instr, instr, a->instrlen,
 			replacement, a->replacementlen);
 
-		DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
-		DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
+		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
+		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
 
 		memcpy(insn_buff, replacement, a->replacementlen);
 		insn_buff_sz = a->replacementlen;
@@ -318,7 +330,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 		 */
 		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
 			*(s32 *)(insn_buff + 1) += replacement - instr;
-			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+			DPRINTK(ALT, "Fix CALL offset: 0x%x, CALL 0x%lx",
 				*(s32 *)(insn_buff + 1),
 				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
 		}
@@ -329,7 +341,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
 			insn_buff[insn_buff_sz] = 0x90;
 
-		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
+		DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 
 		text_poke_early(instr, insn_buff, insn_buff_sz);
 
@@ -555,15 +567,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 			continue;
 		}
 
-		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
+		DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
 			addr, addr, insn.length,
 			addr + insn.length + insn.immediate.value);
 
 		len = patch_retpoline(addr, &insn, bytes);
 		if (len == insn.length) {
 			optimize_nops(bytes, len);
-			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
-			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+			DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
+			DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
 			text_poke_early(addr, bytes, len);
 		}
 	}
@@ -630,14 +642,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
 			      addr, dest, 5, addr))
 			continue;
 
-		DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
+		DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
 			addr, addr, insn.length,
 			addr + insn.length + insn.immediate.value);
 
 		len = patch_return(addr, &insn, bytes);
 		if (len == insn.length) {
-			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
-			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+			DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
+			DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
 			text_poke_early(addr, bytes, len);
 		}
 	}
@@ -667,13 +679,13 @@ static void poison_endbr(void *addr, bool warn)
 		return;
 	}
 
-	DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+	DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
 
 	/*
 	 * When we have IBT, the lack of ENDBR will trigger #CP
 	 */
-	DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
-	DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
+	DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
+	DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
 	text_poke_early(addr, &poison, 4);
 }
 
@@ -1148,7 +1160,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 	smp->locks_end	= locks_end;
 	smp->text	= text;
 	smp->text_end	= text_end;
-	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+	DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
 		smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
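
Usage on the kernel command line then becomes, for example (values per
the DA_* defines above; kstrtouint() with base 0 accepts hex):

	debug-alternative		# no value: DA_ALL, report everything
	debug-alternative=0x01		# DA_ALT: alternative patching only
	debug-alternative=0x06		# DA_RET | DA_RETPOLINE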
 

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH] x86/alternatives: Fix section mismatch warnings
  2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
@ 2023-05-13 16:01     ` Borislav Petkov
  2023-05-13 16:10       ` [tip: x86/alternatives] " tip-bot2 for Borislav Petkov (AMD)
  0 siblings, 1 reply; 28+ messages in thread
From: Borislav Petkov @ 2023-05-13 16:01 UTC (permalink / raw)
  To: X86 ML; +Cc: LKML

From: "Borislav Petkov (AMD)" <bp@alien8.de>

Fix stuff like:

  WARNING: modpost: vmlinux.o: section mismatch in reference: \
  __optimize_nops (section: .text) -> debug_alternative (section: .init.data)

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
---
 arch/x86/kernel/alternative.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3bb0a5f61e8c..93aa95afd005 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -193,8 +193,8 @@ static int skip_nops(u8 *instr, int offset, int len)
  * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
  * to the end of the NOP sequence into a single NOP.
  */
-static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn,
-			    int *next, int *prev, int *target)
+static bool __init_or_module
+__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
 {
 	int i = *next - insn->length;
 
@@ -765,7 +765,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
 
 #ifdef CONFIG_X86_KERNEL_IBT
 
-static void poison_endbr(void *addr, bool warn)
+static void __init_or_module poison_endbr(void *addr, bool warn)
 {
 	u32 endbr, poison = gen_endbr_poison();
 
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [tip: x86/alternatives] x86/alternatives: Fix section mismatch warnings
  2023-05-13 16:01     ` [PATCH] x86/alternatives: Fix section mismatch warnings Borislav Petkov
@ 2023-05-13 16:10       ` tip-bot2 for Borislav Petkov (AMD)
  0 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Borislav Petkov (AMD) @ 2023-05-13 16:10 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Borislav Petkov (AMD), x86, linux-kernel

The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     d42a2a89121071cc8dd285235253a4c739641635
Gitweb:        https://git.kernel.org/tip/d42a2a89121071cc8dd285235253a4c739641635
Author:        Borislav Petkov (AMD) <bp@alien8.de>
AuthorDate:    Sat, 13 May 2023 16:01:39 +02:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Sat, 13 May 2023 18:04:42 +02:00

x86/alternatives: Fix section mismatch warnings

Fix stuff like:

  WARNING: modpost: vmlinux.o: section mismatch in reference: \
  __optimize_nops (section: .text) -> debug_alternative (section: .init.data)

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230513160146.16039-1-bp@alien8.de
---
 arch/x86/kernel/alternative.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3bb0a5f..93aa95a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -193,8 +193,8 @@ static int skip_nops(u8 *instr, int offset, int len)
  * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
  * to the end of the NOP sequence into a single NOP.
  */
-static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn,
-			    int *next, int *prev, int *target)
+static bool __init_or_module
+__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
 {
 	int i = *next - insn->length;
 
@@ -765,7 +765,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
 
 #ifdef CONFIG_X86_KERNEL_IBT
 
-static void poison_endbr(void *addr, bool warn)
+static void __init_or_module poison_endbr(void *addr, bool warn)
 {
 	u32 endbr, poison = gen_endbr_poison();
 

^ permalink raw reply related	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2023-05-13 16:10 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-08 17:10 [PATCH v3 0/4] x86: Fully relocatable alternatives and some NOPs Peter Zijlstra
2023-02-08 17:10 ` [PATCH v3 1/4] x86/alternative: Make debug-alternative selective Peter Zijlstra
2023-02-14 11:48   ` Borislav Petkov
2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
2023-02-08 17:10 ` [PATCH v3 2/4] x86/alternative: Support relocations in alternatives Peter Zijlstra
2023-02-17 20:28   ` Borislav Petkov
2023-02-17 22:21   ` Borislav Petkov
2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
2023-02-08 17:10 ` [PATCH v3 3/4] x86/alternative: Rewrite optimize_nops() some Peter Zijlstra
2023-02-08 19:52   ` Andrew.Cooper3
2023-02-08 20:29     ` Peter Zijlstra
2023-02-08 20:36       ` Peter Zijlstra
2023-02-08 20:44         ` Peter Zijlstra
2023-02-08 20:45           ` Peter Zijlstra
2023-02-08 21:01           ` Peter Zijlstra
2023-02-08 21:08           ` Peter Zijlstra
2023-02-08 21:21             ` Peter Zijlstra
2023-02-09  1:11               ` Andrew.Cooper3
2023-02-09 22:27                 ` David Laight
2023-02-09  1:33       ` Andrew.Cooper3
2023-02-08 23:04     ` David Laight
2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
2023-02-08 17:10 ` [PATCH v3 4/4] x86/alternative: Complicate optimize_nops() some more Peter Zijlstra
2023-05-13 13:03   ` [tip: x86/alternatives] " tip-bot2 for Peter Zijlstra
2023-05-13 16:01     ` [PATCH] x86/alternatives: Fix section mismatch warnings Borislav Petkov
2023-05-13 16:10       ` [tip: x86/alternatives] " tip-bot2 for Borislav Petkov (AMD)
2023-02-27 10:49 ` [PATCH] x86/lib/memmove: Decouple ERMS from FSRM Borislav Petkov
2023-04-27  9:22   ` [PATCH TEST] " Yahu Gao
