[patch] raid-xor-2.4.10-A0

From: Ingo Molnar <mingo@elte.hu>
To: Linus Torvalds <torvalds@transmeta.com>
Cc: Neil Brown <neilb@cse.unsw.edu.au>, <linux-raid@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>
Subject: [patch] raid-xor-2.4.10-A0
Date: Fri, 14 Sep 2001 12:56:43 +0200 (CEST)	[thread overview]
Message-ID: <Pine.LNX.4.33.0109141251410.3820-200000@localhost.localdomain> (raw)

[-- Attachment #1: Type: TEXT/PLAIN, Size: 787 bytes --]


another RAID patch against 2.4.10-pre9:

 - update the SSE XOR routines to get compiled and used on recent kernels.

 - change prefetch code to pollute the cache less, and to prefetch in a
   wider window, to give enough time for prefetches to finish.

people with SSE-capable CPUs (PIII, PIV, newer Athlons) should see
something like this in the bootlog:

 raid5: measuring checksumming speed
    8regs     :  1292.400 MB/sec
    32regs    :   607.600 MB/sec
    pIII_sse  :  1407.200 MB/sec
    pII_mmx   :  1600.800 MB/sec
    p5_mmx    :  1670.000 MB/sec
 raid5: using function: pIII_sse (1407.200 MB/sec)

(if present then the SSE code is still picked up exclusively due to its
better cache-properties, even if it's 'cached performance' is lower than
that of MMX routines.)

	Ingo

[-- Attachment #2: Type: TEXT/PLAIN, Size: 2542 bytes --]

--- linux/include/asm-i386/xor.h.orig2	Fri Sep 14 12:10:21 2001
+++ linux/include/asm-i386/xor.h	Fri Sep 14 12:32:36 2001
@@ -527,8 +527,6 @@
 #undef FPU_SAVE
 #undef FPU_RESTORE
 
-#if defined(CONFIG_X86_FXSR) || defined(CONFIG_X86_RUNTIME_FXSR)
-
 /*
  * Cache avoiding checksumming functions utilizing KNI instructions
  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
@@ -559,19 +557,20 @@
 		: "memory")
 
 #define OFFS(x)		"16*("#x")"
-#define	PF0(x)		"	prefetcht0  "OFFS(x)"(%1)   ;\n"
-#define LD(x,y)		"       movaps   "OFFS(x)"(%1), %%xmm"#y"   ;\n"
-#define ST(x,y)		"       movaps %%xmm"#y",   "OFFS(x)"(%1)   ;\n"
-#define PF1(x)		"	prefetchnta "OFFS(x)"(%2)   ;\n"
-#define PF2(x)		"	prefetchnta "OFFS(x)"(%3)   ;\n"
-#define PF3(x)		"	prefetchnta "OFFS(x)"(%4)   ;\n"
-#define PF4(x)		"	prefetchnta "OFFS(x)"(%5)   ;\n"
-#define PF5(x)		"	prefetchnta "OFFS(x)"(%6)   ;\n"
-#define XO1(x,y)	"       xorps   "OFFS(x)"(%2), %%xmm"#y"   ;\n"
-#define XO2(x,y)	"       xorps   "OFFS(x)"(%3), %%xmm"#y"   ;\n"
-#define XO3(x,y)	"       xorps   "OFFS(x)"(%4), %%xmm"#y"   ;\n"
-#define XO4(x,y)	"       xorps   "OFFS(x)"(%5), %%xmm"#y"   ;\n"
-#define XO5(x,y)	"       xorps   "OFFS(x)"(%6), %%xmm"#y"   ;\n"
+#define PF_OFFS(x)	"256+16*("#x")"
+#define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n"
+#define LD(x,y)		"       movaps   "OFFS(x)"(%1), %%xmm"#y"	;\n"
+#define ST(x,y)		"       movaps %%xmm"#y",   "OFFS(x)"(%1)	;\n"
+#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%2)		;\n"
+#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%3)		;\n"
+#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%4)		;\n"
+#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%5)		;\n"
+#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%6)		;\n"
+#define XO1(x,y)	"       xorps   "OFFS(x)"(%2), %%xmm"#y"	;\n"
+#define XO2(x,y)	"       xorps   "OFFS(x)"(%3), %%xmm"#y"	;\n"
+#define XO3(x,y)	"       xorps   "OFFS(x)"(%4), %%xmm"#y"	;\n"
+#define XO4(x,y)	"       xorps   "OFFS(x)"(%5), %%xmm"#y"	;\n"
+#define XO5(x,y)	"       xorps   "OFFS(x)"(%6), %%xmm"#y"	;\n"
 
 
 static void
@@ -849,15 +848,6 @@
    deals with a load to a line that is being prefetched.  */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
 	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
-#else
-
-/* Don't try any SSE2 when FXSR is not enabled, because OSFXSR will not be set
-   -AK */ 
-#define XOR_SSE2 
-#define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
-
-#endif
 
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>