All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH] x86/alternatives: Optimize optimize_nops()
@ 2021-03-15 15:45 Peter Zijlstra
  2021-03-15 16:23 ` Peter Zijlstra
  2021-03-15 16:45 ` David Laight
  0 siblings, 2 replies; 3+ messages in thread
From: Peter Zijlstra @ 2021-03-15 15:45 UTC (permalink / raw)
  To: x86; +Cc: linux-kernel


Currently optimize_nops() scans to see if the alternative starts with
NOPs. However, the emit pattern is:

  141:	\oldinstr
  142:	.skip (len-(142b-141b)), 0x90

That is, when oldinstr is short, we pad the tail with NOPs. This case
never gets optimized.

Rewrite optimize_nops() to replace any string of NOPs inside the
alternative to larger NOPs. Also run it irrespective of patching,
replacing NOPs in both the original and replaced code.

A direct consequence is that padlen becomes superfluous, so remove it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/alternative.h            |   14 ++---
 arch/x86/kernel/alternative.c                 |   63 ++++++++++++++++----------
 tools/objtool/arch/x86/include/arch/special.h |    2 
 3 files changed, 46 insertions(+), 33 deletions(-)

--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -65,7 +65,6 @@ struct alt_instr {
 	u16 cpuid;		/* cpuid bit set for replacement */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen;	/* length of new instruction */
-	u8  padlen;		/* length of build-time padding */
 } __packed;
 
 /*
@@ -104,7 +103,6 @@ static inline int alternatives_text_rese
 
 #define alt_end_marker		"663"
 #define alt_slen		"662b-661b"
-#define alt_pad_len		alt_end_marker"b-662b"
 #define alt_total_slen		alt_end_marker"b-661b"
 #define alt_rlen(num)		e_replacement(num)"f-"b_replacement(num)"f"
 
@@ -151,8 +149,7 @@ static inline int alternatives_text_rese
 	" .long " b_replacement(num)"f - .\n"		/* new instruction */ \
 	" .word " __stringify(feature) "\n"		/* feature bit     */ \
 	" .byte " alt_total_slen "\n"			/* source len      */ \
-	" .byte " alt_rlen(num) "\n"			/* replacement len */ \
-	" .byte " alt_pad_len "\n"			/* pad len */
+	" .byte " alt_rlen(num) "\n"			/* replacement len */
 
 #define ALTINSTR_REPLACEMENT(newinstr, num)		/* replacement */	\
 	"# ALT: replacement " #num "\n"						\
@@ -315,13 +312,12 @@ static inline int alternatives_text_rese
  * enough information for the alternatives patching code to patch an
  * instruction. See apply_alternatives().
  */
-.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
+.macro altinstruction_entry orig alt feature orig_len alt_len
 	.long \orig - .
 	.long \alt - .
 	.word \feature
 	.byte \orig_len
 	.byte \alt_len
-	.byte \pad_len
 .endm
 
 /*
@@ -338,7 +334,7 @@ static inline int alternatives_text_rese
 142:
 
 	.pushsection .altinstructions,"a"
-	altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+	altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f
 	.popsection
 
 	.pushsection .altinstr_replacement,"ax"
@@ -375,8 +371,8 @@ static inline int alternatives_text_rese
 142:
 
 	.pushsection .altinstructions,"a"
-	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
-	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
+	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
 	.popsection
 
 	.pushsection .altinstr_replacement,"ax"
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -345,19 +345,39 @@ recompute_jump(struct alt_instr *a, u8 *
 static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
 {
 	unsigned long flags;
-	int i;
+	int nops = 0, i = 0;
+	struct insn insn;
+	u8 *nop = NULL;
+
+	do {
+		kernel_insn_init(&insn, &instr[i], MAX_INSN_SIZE);
+		insn_get_length(&insn);
+
+		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) {
+			if (!nop) {
+				nop = &instr[i];
+				nops = 1;
+			} else {
+				nops++;
+			}
+		}
+		i += insn.length;
 
-	for (i = 0; i < a->padlen; i++) {
-		if (instr[i] != 0x90)
-			return;
-	}
+		if ((insn.length != 1 || i == a->instrlen) && nop) {
+
+			local_irq_save(flags);
+			add_nops(nop, nops);
+			local_irq_restore(flags);
+
+			DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
+				   instr, (int)(unsigned long)(nop-instr), nops);
+
+			nop = NULL;
+		}
 
-	local_irq_save(flags);
-	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
-	local_irq_restore(flags);
+	} while (i < a->instrlen);
 
-	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
-		   instr, a->instrlen - a->padlen, a->padlen);
+	WARN_ON_ONCE(nop);
 }
 
 /*
@@ -403,19 +423,15 @@ void __init_or_module noinline apply_alt
 		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
 		 *   patch if feature is *NOT* present.
 		 */
-		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) {
-			if (a->padlen > 1)
-				optimize_nops(a, instr);
+		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
+			goto next;
 
-			continue;
-		}
-
-		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
+		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
 			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
 			feature >> 5,
 			feature & 0x1f,
 			instr, instr, a->instrlen,
-			replacement, a->replacementlen, a->padlen);
+			replacement, a->replacementlen);
 
 		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
 		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -439,14 +455,15 @@ void __init_or_module noinline apply_alt
 		if (a->replacementlen && is_jmp(replacement[0]))
 			recompute_jump(a, instr, replacement, insn_buff);
 
-		if (a->instrlen > a->replacementlen) {
-			add_nops(insn_buff + a->replacementlen,
-				 a->instrlen - a->replacementlen);
-			insn_buff_sz += a->instrlen - a->replacementlen;
-		}
+		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
+			insn_buff[insn_buff_sz] = 0x90;
+
 		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 
 		text_poke_early(instr, insn_buff, insn_buff_sz);
+
+next:
+		optimize_nops(a, instr);
 	}
 }
 
--- a/tools/objtool/arch/x86/include/arch/special.h
+++ b/tools/objtool/arch/x86/include/arch/special.h
@@ -10,7 +10,7 @@
 #define JUMP_ORIG_OFFSET	0
 #define JUMP_NEW_OFFSET		4
 
-#define ALT_ENTRY_SIZE		13
+#define ALT_ENTRY_SIZE		12
 #define ALT_ORIG_OFFSET		0
 #define ALT_NEW_OFFSET		4
 #define ALT_FEATURE_OFFSET	8

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC][PATCH] x86/alternatives: Optimize optimize_nops()
  2021-03-15 15:45 [RFC][PATCH] x86/alternatives: Optimize optimize_nops() Peter Zijlstra
@ 2021-03-15 16:23 ` Peter Zijlstra
  2021-03-15 16:45 ` David Laight
  1 sibling, 0 replies; 3+ messages in thread
From: Peter Zijlstra @ 2021-03-15 16:23 UTC (permalink / raw)
  To: x86; +Cc: linux-kernel

On Mon, Mar 15, 2021 at 04:45:24PM +0100, Peter Zijlstra wrote:
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -345,19 +345,39 @@ recompute_jump(struct alt_instr *a, u8 *
>  static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
>  {
>  	unsigned long flags;
> +	int nops = 0, i = 0;
> +	struct insn insn;
> +	u8 *nop = NULL;
> +
> +	do {
> +		kernel_insn_init(&insn, &instr[i], MAX_INSN_SIZE);
> +		insn_get_length(&insn);
> +

I suppose you'd like to see that replaced with something like this,
rite?


--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -350,8 +350,8 @@ static void __init_or_module noinline op
 	u8 *nop = NULL;
 
 	do {
-		kernel_insn_init(&insn, &instr[i], MAX_INSN_SIZE);
-		insn_get_length(&insn);
+		if (insn_decode(&insn, &instr[i], MAX_INSN_SIZE, INSN_MODE_KERN))
+			return;
 
 		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) {
 			if (!nop) {

^ permalink raw reply	[flat|nested] 3+ messages in thread

* RE: [RFC][PATCH] x86/alternatives: Optimize optimize_nops()
  2021-03-15 15:45 [RFC][PATCH] x86/alternatives: Optimize optimize_nops() Peter Zijlstra
  2021-03-15 16:23 ` Peter Zijlstra
@ 2021-03-15 16:45 ` David Laight
  1 sibling, 0 replies; 3+ messages in thread
From: David Laight @ 2021-03-15 16:45 UTC (permalink / raw)
  To: 'Peter Zijlstra', x86; +Cc: linux-kernel

From: Peter Zijlstra
> Sent: 15 March 2021 15:45
> 
> Currently optimize_nops() scans to see if the alternative starts with
> NOPs. However, the emit pattern is:
> 
>   141:	\oldinstr
>   142:	.skip (len-(142b-141b)), 0x90
> 
> That is, when oldinstr is short, we pad the tail with NOPs. This case
> never gets optimized.
> 
> Rewrite optimize_nops() to replace any string of NOPs inside the
> alternative to larger NOPs. Also run it irrespective of patching,
> replacing NOPs in both the original and replaced code.

This should even speed up old 64bit cpu.
Although the nopl instruction has to go through the 'big decoder'
(so only one per clock) it is much bigger than the 0x90 'nop'
instructions currently used.
So although (probably) 3 'nop' can be decoded each clock
that is less bytes of padding per clock.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-03-15 16:46 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-15 15:45 [RFC][PATCH] x86/alternatives: Optimize optimize_nops() Peter Zijlstra
2021-03-15 16:23 ` Peter Zijlstra
2021-03-15 16:45 ` David Laight

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.