linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Alternative patching for prefetches & cleanup
@ 2003-04-27  2:19 Andi Kleen
  2003-04-27  3:09 ` Linus Torvalds
  0 siblings, 1 reply; 7+ messages in thread
From: Andi Kleen @ 2003-04-27  2:19 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel


This patch extends and cleans up the early generic CPU alternative patching.

It adds automatic patching for (SSE) prefetches and 3dnow! write
prefetches. This is one step towards making the Athlon kernel work
on Intel CPUs again. It's not fully finished yet because the memcpy
still uses 3dnow!.

I had to split the alternative section into two for this. The reason
is that for big kernels gas or ld would sometimes add a few padding
bytes after the replacement instruction in .altinstructions, and that
would confuse the patcher, which relied on knowing the exact length
of the instruction. Now the instructions are in another section 
with a pointer from the main patch record.

It also adds nop types for various CPUs straight from their optimization 
manuals. Now you can always get the fastest nops for K7,K8 and Intel.
I moved them into the include files to make it easy to use them
for padding alternative()s. Some cleanups in the patch function to use this.

Removal of the now obsolete X86_PREFETCH and X86_SSE2 options.

diff -u linux-gencpu/arch/i386/kernel/setup.c-o linux-gencpu/arch/i386/kernel/setup.c
--- linux-gencpu/arch/i386/kernel/setup.c-o	2003-04-25 01:17:20.000000000 +0200
+++ linux-gencpu/arch/i386/kernel/setup.c	2003-04-27 04:12:32.000000000 +0200
@@ -795,41 +795,42 @@
 		pci_mem_start = low_mem_size;
 }
 
+asm("nops: " 
+    ASM_NOP1 ASM_NOP2 ASM_NOP3 ASM_NOP4 ASM_NOP5 ASM_NOP6
+    ASM_NOP7 ASM_NOP8); 
+
+static inline unsigned char *findnop(int num) 
+{
+	extern unsigned char nops[];
+	int i, k = 0;
+	for (i = 1; i < num; i++) 
+		k += i; 
+	return nops + k;
+} 
+
 /* Replace instructions with better alternatives for this CPU type.
 
    This runs before SMP is initialized to avoid SMP problems with
    self modifying code. This implies that assymetric systems where
    APs have less capabilities than the boot processor are not handled. 
-    
    In this case boot with "noreplacement". */ 
 void apply_alternatives(void *start, void *end) 
 { 
 	struct alt_instr *a; 
 	int diff, i, k;
 
-	for (a = start; a < (struct alt_instr *)end; 
-	     a = (void *)ALIGN((unsigned long)(a + 1) + a->instrlen, 4)) { 
+	for (a = start; (void *)a < end; a++) { 
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 		BUG_ON(a->replacementlen > a->instrlen); 
 		memcpy(a->instr, a->replacement, a->replacementlen); 
 		diff = a->instrlen - a->replacementlen; 
+		/* Pad the rest with nops */
 		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			static const char *nops[] = {
-				0,
-				"\x90",
-#if CONFIG_MK7 || CONFIG_MK8
-				"\x66\x90",
-				"\x66\x66\x90",
-				"\x66\x66\x66\x90",
-#else
-				"\x89\xf6",
-				"\x8d\x76\x00",
-				"\x8d\x74\x26\x00",
-#endif
-			};
-			k = min_t(int, diff, ARRAY_SIZE(nops)); 
-			memcpy(a->instr + i, nops[k], k); 
+			k = diff;
+			if (k > ASM_NOP_MAX)
+				k = ASM_NOP_MAX;
+			memcpy(a->instr + i, findnop(k), k); 
 		} 
 	}
 } 
diff -u linux-gencpu/arch/i386/Kconfig-o linux-gencpu/arch/i386/Kconfig
--- linux-gencpu/arch/i386/Kconfig-o	2003-04-27 02:40:32.000000000 +0200
+++ linux-gencpu/arch/i386/Kconfig	2003-04-27 03:50:08.000000000 +0200
@@ -363,16 +370,6 @@
 	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6
 	default y
 
-config X86_PREFETCH
-	bool
-	depends on MPENTIUMIII || MPENTIUM4 || MVIAC3_2
-	default y
-
-config X86_SSE2
-	bool
-	depends on MK8 || MPENTIUM4
-	default y
-
 config HUGETLB_PAGE
 	bool "Huge TLB Page Support"
 	help
diff -u linux-gencpu/arch/i386/vmlinux.lds.S-o linux-gencpu/arch/i386/vmlinux.lds.S
--- linux-gencpu/arch/i386/vmlinux.lds.S-o	2003-04-25 01:17:20.000000000 +0200
+++ linux-gencpu/arch/i386/vmlinux.lds.S	2003-04-27 03:50:08.000000000 +0200
@@ -85,6 +85,7 @@
   __alt_instructions = .;
   .altinstructions : { *(.altinstructions) } 
   __alt_instructions_end = .; 
+ .altinstr_replacement : { *(.altinstr_replacement) }
   . = ALIGN(4096);
   __initramfs_start = .;
   .init.ramfs : { *(.init.ramfs) }
diff -u linux-gencpu/include/asm-i386/processor.h-o linux-gencpu/include/asm-i386/processor.h
--- linux-gencpu/include/asm-i386/processor.h-o	2003-04-20 13:26:38.000000000 +0200
+++ linux-gencpu/include/asm-i386/processor.h	2003-04-27 03:50:08.000000000 +0200
@@ -15,6 +15,7 @@
 #include <asm/sigcontext.h>
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
+#include <asm/system.h>
 #include <linux/cache.h>
 #include <linux/config.h>
 #include <linux/threads.h>
@@ -495,32 +496,61 @@
 
 #define cpu_relax()	rep_nop()
 
-/* Prefetch instructions for Pentium III and AMD Athlon */
-#ifdef 	CONFIG_X86_PREFETCH
+#define ASM_NOP1	".byte 0x90\n"
+#ifdef CONFIG_MK8
+#define ASM_NOP2	".byte 0x66,0x90\n" 
+#define ASM_NOP3	".byte 0x66,0x66,0x90\n" 
+#define ASM_NOP4	".byte 0x66,0x66,0x66,0x90\n" 
+#define ASM_NOP5	ASM_NOP3 ASM_NOP2 
+#define ASM_NOP6	ASM_NOP3 ASM_NOP3
+#define ASM_NOP7	ASM_NOP4 ASM_NOP3
+#define ASM_NOP8	ASM_NOP4 ASM_NOP4
+#elif CONFIG_MK7
+/* uses eax dependencies (arbitrary choice) */
+#define ASM_NOP2	".byte 0x8b,0xc0\n" 
+#define ASM_NOP3	".byte 0x8d,0x04,0x20\n"
+#define ASM_NOP4	".byte 0x8d,0x44,0x20,0x00\n"
+#define ASM_NOP5	ASM_NOP4 ASM_NOP1
+#define ASM_NOP6	".byte 0x8d,0x80,0,0,0,0\n"
+#define ASM_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
+#define ASM_NOP8        ASM_NOP7 ASM_NOP1
+#else
+/* generic versions from gas */
+#define ASM_NOP2    	".byte 0x89,0xf6\n"
+#define ASM_NOP3        ".byte 0x8d,0x76,0x00\n"
+#define ASM_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
+#define ASM_NOP5        ASM_NOP1 ASM_NOP4
+#define ASM_NOP6	".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
+#define ASM_NOP7	".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
+#define ASM_NOP8	ASM_NOP1 ASM_NOP7
+#endif
+#define ASM_NOP_MAX 8
 
+/* Prefetch instructions for Pentium III and AMD Athlon */
+/* It's not worth caring about 3dnow! prefetches for the K6
+   because they are microcoded there and very slow. */
 #define ARCH_HAS_PREFETCH
 extern inline void prefetch(const void *x)
 {
-	__asm__ __volatile__ ("prefetchnta (%0)" : : "r"(x));
+	alternative_input(ASM_NOP3,
+			  "prefetchnta (%1)",
+			  X86_FEATURE_XMM,
+			  "r" (x));
 }
 
-#elif defined CONFIG_X86_USE_3DNOW
-
 #define ARCH_HAS_PREFETCH
 #define ARCH_HAS_PREFETCHW
 #define ARCH_HAS_SPINLOCK_PREFETCH
 
-extern inline void prefetch(const void *x)
-{
-	 __asm__ __volatile__ ("prefetch (%0)" : : "r"(x));
-}
-
+/* 3dnow! prefetch to get an exclusive cache line. Useful for 
+   spinlocks to avoid one state transition in the cache coherency protocol. */
 extern inline void prefetchw(const void *x)
 {
-	 __asm__ __volatile__ ("prefetchw (%0)" : : "r"(x));
+	alternative_input(ASM_NOP3,
+			  "prefetchw (%1)",
+			  X86_FEATURE_3DNOW,
+			  "r" (x));
 }
 #define spin_lock_prefetch(x)	prefetchw(x)
 
-#endif
-
 #endif /* __ASM_I386_PROCESSOR_H */
diff -u linux-gencpu/include/asm-i386/system.h-o linux-gencpu/include/asm-i386/system.h
--- linux-gencpu/include/asm-i386/system.h-o	2003-04-25 01:17:37.000000000 +0200
+++ linux-gencpu/include/asm-i386/system.h	2003-04-27 03:50:08.000000000 +0200
@@ -279,10 +279,11 @@
 
 struct alt_instr { 
 	u8 *instr; 		/* original instruction */
+	u8 *replacement;
 	u8  cpuid;		/* cpuid bit set for replacement */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen; 	/* length of new instruction, <= instrlen */ 
-	u8  replacement[0];   	/* new instruction */
+	u8  pad;
 }; 
 
 /* 
@@ -302,13 +303,40 @@
 		      ".section .altinstructions,\"a\"\n"     	     \
 		      "  .align 4\n"				       \
 		      "  .long 661b\n"            /* label */          \
+		      "  .long 663f\n"		  /* new instruction */ 	\
 		      "  .byte %c0\n"             /* feature bit */    \
 		      "  .byte 662b-661b\n"       /* sourcelen */      \
 		      "  .byte 664f-663f\n"       /* replacementlen */ \
+		      ".previous\n"						\
+		      ".section .altinstr_replacement,\"ax\"\n"			\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */    \
 		      ".previous" :: "i" (feature) : "memory")  
 
 /*
+ * Alternative inline assembly with input.
+ * 
+ * Peculiarities:
+ * No memory clobber here. 
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the 
+ * replacement make sure to pad to the worst case length.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input)			\
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"				\
+		      ".section .altinstructions,\"a\"\n"			\
+		      "  .align 4\n"						\
+		      "  .long 661b\n"            /* label */			\
+		      "  .long 663f\n"		  /* new instruction */ 	\
+		      "  .byte %c0\n"             /* feature bit */		\
+		      "  .byte 662b-661b\n"       /* sourcelen */		\
+		      "  .byte 664f-663f\n"       /* replacementlen */ 		\
+		      ".previous\n"						\
+		      ".section .altinstr_replacement,\"ax\"\n"			\
+		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ 	\
+		      ".previous" :: "i" (feature), input)  
+
+/*
  * Force strict CPU ordering.
  * And yes, this is required on UP too when we're talking
  * to devices.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Alternative patching for prefetches & cleanup
  2003-04-27  2:19 [PATCH] Alternative patching for prefetches & cleanup Andi Kleen
@ 2003-04-27  3:09 ` Linus Torvalds
  2003-04-27  3:14   ` Andi Kleen
  0 siblings, 1 reply; 7+ messages in thread
From: Linus Torvalds @ 2003-04-27  3:09 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel


On Sun, 27 Apr 2003, Andi Kleen wrote:
> 
> It also adds nop types for various CPUs straight from their optimization 
> manuals. Now you can always get the fastest nops for K7,K8 and Intel.
> I moved them into the include files to make it easy to use them
> for padding alternative()s. Some cleanups in the patch function to use this.

Please, this part I definitely want cleaned up.

> +++ linux-gencpu/arch/i386/kernel/setup.c	2003-04-27 04:12:32.000000000 +0200
> @@ -795,41 +795,42 @@
>  		pci_mem_start = low_mem_size;
>  }
>  
> +asm("nops: " 
> +    ASM_NOP1 ASM_NOP2 ASM_NOP3 ASM_NOP4 ASM_NOP5 ASM_NOP6
> +    ASM_NOP7 ASM_NOP8); 
> +

This in particular is just too ugly for words. Why can't you just have a

	static const char *intel_nops[] = {
		NULL,
		INOP1, INOP2, INOP3, INOP4, INOP5, INOP6, INOP7, INOP8
	};

	static const char *k7_nops[] = {
		NULL,
		K7NOP1, K7NOP2, ...
	}

	....

	/*
	 * This will be overridden at boot once we find out what kind of 
	 * CPU we actually have
	 */
	static char **nops = intel_nops;

and then just use

	.. nops[size] ..

when you want a nop of size "size".

		Linus


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Alternative patching for prefetches & cleanup
  2003-04-27  3:09 ` Linus Torvalds
@ 2003-04-27  3:14   ` Andi Kleen
  2003-04-27  3:22     ` Linus Torvalds
  0 siblings, 1 reply; 7+ messages in thread
From: Andi Kleen @ 2003-04-27  3:14 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andi Kleen, linux-kernel

> > +++ linux-gencpu/arch/i386/kernel/setup.c	2003-04-27 04:12:32.000000000 +0200
> > @@ -795,41 +795,42 @@
> >  		pci_mem_start = low_mem_size;
> >  }
> >  
> > +asm("nops: " 
> > +    ASM_NOP1 ASM_NOP2 ASM_NOP3 ASM_NOP4 ASM_NOP5 ASM_NOP6
> > +    ASM_NOP7 ASM_NOP8); 
> > +
> 
> This in particular is just too ugly for words. Why can't you just have a

Hmm. I thought using the Fibonacci sequence for this was clever :-)

But ok I'll change it to the array.

-Andi

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Alternative patching for prefetches & cleanup
  2003-04-27  3:14   ` Andi Kleen
@ 2003-04-27  3:22     ` Linus Torvalds
  2003-04-27  3:27       ` Andi Kleen
                         ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Linus Torvalds @ 2003-04-27  3:22 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel


On Sun, 27 Apr 2003, Andi Kleen wrote:
> 
> Hmm. I thought using the Fibonacci sequence for this was clever :-)

That's not the fibonacci sequence, that's just a regular sigma(i)  
(i=1..n) sequence. And if you were to generate the sequence numbers at
compile-time I might agree with you, if you also were to avoid using 
inline asms.

		Linus


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Alternative patching for prefetches & cleanup
  2003-04-27  3:22     ` Linus Torvalds
@ 2003-04-27  3:27       ` Andi Kleen
  2003-04-27  3:32       ` William Lee Irwin III
  2003-04-27  8:50       ` Rene Herman
  2 siblings, 0 replies; 7+ messages in thread
From: Andi Kleen @ 2003-04-27  3:27 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andi Kleen, linux-kernel

On Sun, Apr 27, 2003 at 05:22:58AM +0200, Linus Torvalds wrote:
> 
> On Sun, 27 Apr 2003, Andi Kleen wrote:
> > 
> > Hmm. I thought using the Fibonacci sequence for this was clever :-)
> 
> That's not the fibonacci sequence, that's just a regular sigma(i)  
> (i=1..n) sequence. And if you were to generate the sequence numbers at
> compile-time I might agree with you, if you also were to avoid using 
> inline asms.

True %)

The real reason I did it was to avoid having two sets of macros - one
with .byte 0x... and another with string "\x..\x..."

Now it needs duplication.

-Andi

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Alternative patching for prefetches & cleanup
  2003-04-27  3:22     ` Linus Torvalds
  2003-04-27  3:27       ` Andi Kleen
@ 2003-04-27  3:32       ` William Lee Irwin III
  2003-04-27  8:50       ` Rene Herman
  2 siblings, 0 replies; 7+ messages in thread
From: William Lee Irwin III @ 2003-04-27  3:32 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andi Kleen, linux-kernel

On Sun, 27 Apr 2003, Andi Kleen wrote:
>> Hmm. I thought using the Fibonacci sequence for this was clever :-)

On Sat, Apr 26, 2003 at 08:22:58PM -0700, Linus Torvalds wrote:
> That's not the fibonacci sequence, that's just a regular sigma(i)  
> (i=1..n) sequence. And if you were to generate the sequence numbers at
> compile-time I might agree with you, if you also were to avoid using 
> inline asms.

Such things may be checked with:

http://www.research.att.com/~njas/sequences/index.html


-- wli

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Alternative patching for prefetches & cleanup
  2003-04-27  3:22     ` Linus Torvalds
  2003-04-27  3:27       ` Andi Kleen
  2003-04-27  3:32       ` William Lee Irwin III
@ 2003-04-27  8:50       ` Rene Herman
  2 siblings, 0 replies; 7+ messages in thread
From: Rene Herman @ 2003-04-27  8:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andi Kleen, linux-kernel

Linus Torvalds wrote:

> That's not the fibonacci sequence, that's just a regular sigma(i)  
> (i=1..n) sequence. And if you were to generate the sequence numbers at
> compile-time I might agree with you, if you also were to avoid using 
> inline asms.

If the sigma(i) method is to stay, please note that

sigma(1 <= i <= n | i) = n*(n+1) / 2

for all n > 0 (trivial proof by induction).

That is, no need to loop; in this case, with n = num - 1

k = ((num - 1) * num) / 2

Rene.



^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2003-04-27  8:41 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-04-27  2:19 [PATCH] Alternative patching for prefetches & cleanup Andi Kleen
2003-04-27  3:09 ` Linus Torvalds
2003-04-27  3:14   ` Andi Kleen
2003-04-27  3:22     ` Linus Torvalds
2003-04-27  3:27       ` Andi Kleen
2003-04-27  3:32       ` William Lee Irwin III
2003-04-27  8:50       ` Rene Herman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).