linux-kernel.vger.kernel.org archive mirror
* lib/raid6: SSSE3 optimized recovery functions v2
@ 2012-04-11 19:40 Jim Kukunas
  2012-04-11 19:40 ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
                   ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-11 19:40 UTC (permalink / raw)
  To: linux-raid; +Cc: linux-kernel, neilb, hpa

Hi Folks,

The following patchset adds SSSE3 optimized recovery
functions to RAID6.

This version differs from version 1 in that:
	0) cpu_has_ssse3 is moved inside the #ifdef block
	1) the functions are moved into their own file, recov_ssse3.c
	2) a superfluous memory load is removed from raid6_2data_recov_ssse3

A technical description of the algorithm can be found
at http://www.kernel.org/pub/linux/kernel/people/hpa/raid6.pdf
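
For quick reference, the recovery math being vectorized (my paraphrase
of that paper; all arithmetic is in GF(2^8), addition is XOR, and g is
the generator 0x02):

	P = D_0 + D_1 + ... + D_(n-1)
	Q = g^0*D_0 + g^1*D_1 + ... + g^(n-1)*D_(n-1)

With data disks x and y failed, let Pxy and Qxy be the syndromes
recomputed with D_x = D_y = 0 (the "delta" values the code obtains by
running gen_syndrome over zeroed pages). Then:

	D_x = A*(P + Pxy) + B*(Q + Qxy)
	D_y = (P + Pxy) + D_x

	where A = g^(y-x) * (g^(y-x) + 1)^-1
	      B = g^(-x)  * (g^(y-x) + 1)^-1

The SSSE3 routines fold each constant multiplier into a pair of
16-entry nibble lookup tables (the raid6_vgfmul tables generated in
patch 2), so multiplying 16 bytes by a constant becomes two PSHUFBs
and a PXOR, plus the nibble masking.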

Thanks.




* [PATCH 1/4] lib/raid6: fix test program build
  2012-04-11 19:40 lib/raid6: SSSE3 optimized recovery functions v2 Jim Kukunas
@ 2012-04-11 19:40 ` Jim Kukunas
  2012-04-11 19:40 ` [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions Jim Kukunas
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-11 19:40 UTC (permalink / raw)
  To: linux-raid; +Cc: linux-kernel, neilb, hpa

<linux/module.h> drags in headers which are not visible to userspace,
thus breaking the build for the test program.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 lib/raid6/algos.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 8b02f60..f6a0f78 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -17,11 +17,11 @@
  */
 
 #include <linux/raid/pq.h>
-#include <linux/module.h>
 #ifndef __KERNEL__
 #include <sys/mman.h>
 #include <stdio.h>
 #else
+#include <linux/module.h>
 #include <linux/gfp.h>
 #if !RAID6_USE_EMPTY_ZERO_PAGE
 /* In .bss so it's zeroed */
-- 
1.7.8.5



* [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions
  2012-04-11 19:40 lib/raid6: SSSE3 optimized recovery functions v2 Jim Kukunas
  2012-04-11 19:40 ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
@ 2012-04-11 19:40 ` Jim Kukunas
  2012-04-11 19:40 ` [PATCH 3/4] lib/raid6: update test program for " Jim Kukunas
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-11 19:40 UTC (permalink / raw)
  To: linux-raid; +Cc: linux-kernel, neilb, hpa

Add SSSE3 optimized recovery functions, as well as a system
for selecting the most appropriate recovery functions to use.
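
For illustration, a hypothetical caller sketch (mine, not from this
patch): once raid6_select_algo() has installed the best-priority valid
implementation, callers simply go through the exported pointers:

	static void rebuild(int disks, size_t bytes, int faila,
			    int failb, void **ptrs)
	{
		/* failb < 0 meaning "data disk plus P lost" is an
		 * assumption of this sketch, not an md convention */
		if (failb < 0)
			raid6_datap_recov(disks, bytes, faila, ptrs);
		else
			raid6_2data_recov(disks, bytes, faila, failb,
					  ptrs);
	}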

Originally-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 include/linux/raid/pq.h |   18 +++-
 lib/raid6/Makefile      |    2 +-
 lib/raid6/algos.c       |   37 +++++
 lib/raid6/mktables.c    |   25 ++++
 lib/raid6/recov.c       |   15 ++-
 lib/raid6/recov_ssse3.c |  336 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 426 insertions(+), 7 deletions(-)
 create mode 100644 lib/raid6/recov_ssse3.c

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 53272e9..640c69c 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2;
 extern const struct raid6_calls raid6_altivec4;
 extern const struct raid6_calls raid6_altivec8;
 
+struct raid6_recov_calls {
+	void (*data2)(int, size_t, int, int, void **);
+	void (*datap)(int, size_t, int, void **);
+	int  (*valid)(void);
+	const char *name;
+	int priority;
+};
+
+extern const struct raid6_recov_calls raid6_recov_intx1;
+extern const struct raid6_recov_calls raid6_recov_ssse3;
+
 /* Algorithm list */
 extern const struct raid6_calls * const raid6_algos[];
+extern const struct raid6_recov_calls *const raid6_recov_algos[];
 int raid6_select_algo(void);
 
 /* Return values from chk_syndrome */
@@ -111,14 +123,16 @@ int raid6_select_algo(void);
 
 /* Galois field tables */
 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
+extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
 
 /* Recovery routines */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb,
 		       void **ptrs);
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
+extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila,
+			void **ptrs);
 void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
 		      void **ptrs);
 
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 8a38102..de06dfe 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 
-raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
+raid6_pq-y	+= algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
 		   altivec8.o mmx.o sse1.o sse2.o
 hostprogs-y	+= mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index f6a0f78..5a7f802 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -64,6 +64,20 @@ const struct raid6_calls * const raid6_algos[] = {
 	NULL
 };
 
+void (*raid6_2data_recov)(int, size_t, int, int, void **);
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
+
+void (*raid6_datap_recov)(int, size_t, int, void **);
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+	&raid6_recov_ssse3,
+#endif
+	&raid6_recov_intx1,
+	NULL
+};
+
 #ifdef __KERNEL__
 #define RAID6_TIME_JIFFIES_LG2	4
 #else
@@ -72,6 +86,26 @@ const struct raid6_calls * const raid6_algos[] = {
 #define time_before(x, y) ((x) < (y))
 #endif
 
+static inline void raid6_choose_recov(void)
+{
+	const struct raid6_recov_calls *const *algo;
+	const struct raid6_recov_calls *best;
+
+	for (best = NULL, algo = raid6_recov_algos; *algo; algo++)
+		if (!best || (*algo)->priority > best->priority)
+			if (!(*algo)->valid || (*algo)->valid())
+				best = *algo;
+
+	if (best) {
+		raid6_2data_recov = best->data2;
+		raid6_datap_recov = best->datap;
+
+		printk("raid6: using %s recovery algorithm\n", best->name);
+	} else
+		printk("raid6: Yikes! No recovery algorithm found!\n");
+}
+
+
 /* Try to pick the best algorithm */
 /* This code uses the gfmul table as convenient data set to abuse */
 
@@ -141,6 +175,9 @@ int __init raid6_select_algo(void)
 
 	free_pages((unsigned long)syndromes, 1);
 
+	/* select raid recover functions */
+	raid6_choose_recov();
+
 	return best ? 0 : -EINVAL;
 }
 
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 8a37809..39787db 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -81,6 +81,31 @@ int main(int argc, char *argv[])
 	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
 	printf("#endif\n");
 
+	/* Compute vector multiplication table */
+	printf("\nconst u8  __attribute__((aligned(256)))\n"
+		"raid6_vgfmul[256][32] =\n"
+		"{\n");
+	for (i = 0; i < 256; i++) {
+		printf("\t{\n");
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, j + k),
+				       (k == 7) ? '\n' : ' ');
+		}
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, (j + k) << 4),
+				       (k == 7) ? '\n' : ' ');
+		}
+		printf("\t},\n");
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_vgfmul);\n");
+	printf("#endif\n");
+
 	/* Compute power-of-2 table (exponent) */
 	v = 1;
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index fe275d7..1805a5c 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -22,7 +22,7 @@
 #include <linux/raid/pq.h>
 
 /* Recover two failed data blocks. */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb,
 		       void **ptrs)
 {
 	u8 *p, *q, *dp, *dq;
@@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		p++; q++;
 	}
 }
-EXPORT_SYMBOL_GPL(raid6_2data_recov);
 
 /* Recover failure of one data block plus the P block */
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
+void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs)
 {
 	u8 *p, *q, *dq;
 	const u8 *qmul;		/* Q multiplier table */
@@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
 		q++; dq++;
 	}
 }
-EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+
+const struct raid6_recov_calls raid6_recov_intx1 = {
+	.data2 = raid6_2data_recov_intx1,
+	.datap = raid6_datap_recov_intx1,
+	.valid = NULL,
+	.name = "intx1",
+	.priority = 0,
+};
 
 #ifndef __KERNEL__
 /* Testing only */
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
new file mode 100644
index 0000000..43eb383
--- /dev/null
+++ b/lib/raid6/recov_ssse3.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_ssse3(void)
+{
+	return boot_cpu_has(X86_FEATURE_XMM) &&
+		boot_cpu_has(X86_FEATURE_XMM2) &&
+		boot_cpu_has(X86_FEATURE_SSSE3);
+}
+
+void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb,
+		       void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	static const u8 __attribute__((aligned(16))) x0f[16] = {
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	   Use the dead data pages as temporary storage for
+	   delta p and delta q */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0]));
+
+#ifdef CONFIG_X86_64
+	asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0]));
+	asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0]));
+	asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16]));
+#endif
+
+	/* Now do it... */
+	while (bytes) {
+#ifdef CONFIG_X86_64
+	/* xmm6, xmm14, xmm15 */
+
+		asm volatile("movdqa %0,%%xmm1" : : "m" (q[0]));
+		asm volatile("movdqa %0,%%xmm9" : : "m" (q[16]));
+		asm volatile("movdqa %0,%%xmm0" : : "m" (p[0]));
+		asm volatile("movdqa %0,%%xmm8" : : "m" (p[16]));
+		asm volatile("pxor   %0,%%xmm1" : : "m" (dq[0]));
+		asm volatile("pxor   %0,%%xmm9" : : "m" (dq[16]));
+		asm volatile("pxor   %0,%%xmm0" : : "m" (dp[0]));
+		asm volatile("pxor   %0,%%xmm8" : : "m" (dp[16]));
+
+		/* xmm0/8 = px */
+
+		asm volatile("movdqa %xmm6,%xmm4");
+		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+		asm volatile("movdqa %xmm6,%xmm12");
+		asm volatile("movdqa %xmm5,%xmm13");
+		asm volatile("movdqa %xmm1,%xmm3");
+		asm volatile("movdqa %xmm9,%xmm11");
+		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */
+		asm volatile("movdqa %xmm8,%xmm10");
+		asm volatile("psraw  $4,%xmm1");
+		asm volatile("psraw  $4,%xmm9");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm11");
+		asm volatile("pand   %xmm7,%xmm1");
+		asm volatile("pand   %xmm7,%xmm9");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm11,%xmm12");
+		asm volatile("pshufb %xmm1,%xmm5");
+		asm volatile("pshufb %xmm9,%xmm13");
+		asm volatile("pxor   %xmm4,%xmm5");
+		asm volatile("pxor   %xmm12,%xmm13");
+
+		/* xmm5/13 = qx */
+
+		asm volatile("movdqa %xmm14,%xmm4");
+		asm volatile("movdqa %xmm15,%xmm1");
+		asm volatile("movdqa %xmm14,%xmm12");
+		asm volatile("movdqa %xmm15,%xmm9");
+		asm volatile("movdqa %xmm2,%xmm3");
+		asm volatile("movdqa %xmm10,%xmm11");
+		asm volatile("psraw  $4,%xmm2");
+		asm volatile("psraw  $4,%xmm10");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm11");
+		asm volatile("pand   %xmm7,%xmm2");
+		asm volatile("pand   %xmm7,%xmm10");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm11,%xmm12");
+		asm volatile("pshufb %xmm2,%xmm1");
+		asm volatile("pshufb %xmm10,%xmm9");
+		asm volatile("pxor   %xmm4,%xmm1");
+		asm volatile("pxor   %xmm12,%xmm9");
+
+		/* xmm1/9 = pbmul[px] */
+		asm volatile("pxor   %xmm5,%xmm1");
+		asm volatile("pxor   %xmm13,%xmm9");
+		/* xmm1/9 = db = DQ */
+		asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16]));
+
+		asm volatile("pxor   %xmm1,%xmm0");
+		asm volatile("pxor   %xmm9,%xmm8");
+		asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0]));
+		asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dp += 32;
+		dq += 32;
+#else
+		asm volatile("movdqa %0,%%xmm1" : : "m" (*q));
+		asm volatile("movdqa %0,%%xmm0" : : "m" (*p));
+		asm volatile("pxor   %0,%%xmm1" : : "m" (*dq));
+		asm volatile("pxor   %0,%%xmm0" : : "m" (*dp));
+
+		/* 1 = dq ^ q
+		 * 0 = dp ^ p
+		 */
+		asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0]));
+		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+
+		asm volatile("movdqa %xmm1,%xmm3");
+		asm volatile("psraw  $4,%xmm1");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm1");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm1,%xmm5");
+		asm volatile("pxor   %xmm4,%xmm5");
+
+		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */
+
+		/* xmm5 = qx */
+
+		asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0]));
+		asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16]));
+		asm volatile("movdqa %xmm2,%xmm3");
+		asm volatile("psraw  $4,%xmm2");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm2");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm2,%xmm1");
+		asm volatile("pxor   %xmm4,%xmm1");
+
+		/* xmm1 = pbmul[px] */
+		asm volatile("pxor   %xmm5,%xmm1");
+		/* xmm1 = db = DQ */
+		asm volatile("movdqa %%xmm1,%0" : "=m" (*dq));
+
+		asm volatile("pxor   %xmm1,%xmm0");
+		asm volatile("movdqa %%xmm0,%0" : "=m" (*dp));
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dp += 16;
+		dq += 16;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+
+void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	static const u8 __attribute__((aligned(16))) x0f[16] = {
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	   Use the dead data page as temporary storage for delta q */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0]));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+		asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16]));
+		asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+
+		/* xmm3 = q[0] ^ dq[0] */
+
+		asm volatile("pxor %0, %%xmm4" : : "m" (q[16]));
+		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+		/* xmm4 = q[16] ^ dq[16] */
+
+		asm volatile("movdqa %xmm3, %xmm6");
+		asm volatile("movdqa %xmm4, %xmm8");
+
+		/* xmm4 = xmm8 = q[16] ^ dq[16] */
+
+		asm volatile("psraw $4, %xmm3");
+		asm volatile("pand %xmm7, %xmm6");
+		asm volatile("pand %xmm7, %xmm3");
+		asm volatile("pshufb %xmm6, %xmm0");
+		asm volatile("pshufb %xmm3, %xmm1");
+		asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0]));
+		asm volatile("pxor %xmm0, %xmm1");
+		asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16]));
+
+		/* xmm1 = qmul[q[0] ^ dq[0]] */
+
+		asm volatile("psraw $4, %xmm4");
+		asm volatile("pand %xmm7, %xmm8");
+		asm volatile("pand %xmm7, %xmm4");
+		asm volatile("pshufb %xmm8, %xmm10");
+		asm volatile("pshufb %xmm4, %xmm11");
+		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+		asm volatile("pxor %xmm10, %xmm11");
+		asm volatile("movdqa %0, %%xmm12" : : "m" (p[16]));
+
+		/* xmm11 = qmul[q[16] ^ dq[16]] */
+
+		asm volatile("pxor %xmm1, %xmm2");
+
+		/* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */
+
+		asm volatile("pxor %xmm11, %xmm12");
+
+		/* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */
+
+		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16]));
+
+		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
+		asm volatile("movdqa %%xmm12, %0" : "=m" (p[16]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dq += 32;
+
+#else
+		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+		asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+		/* xmm3 = *q ^ *dq */
+
+		asm volatile("movdqa %xmm3, %xmm6");
+		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+		asm volatile("psraw $4, %xmm3");
+		asm volatile("pand %xmm7, %xmm6");
+		asm volatile("pand %xmm7, %xmm3");
+		asm volatile("pshufb %xmm6, %xmm0");
+		asm volatile("pshufb %xmm3, %xmm1");
+		asm volatile("pxor %xmm0, %xmm1");
+
+		/* xmm1 = qmul[*q ^ *dq] */
+
+		asm volatile("pxor %xmm1, %xmm2");
+
+		/* xmm2 = *p ^ qmul[*q ^ *dq] */
+
+		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dq += 16;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_ssse3 = {
+	.data2 = raid6_2data_recov_ssse3,
+	.datap = raid6_datap_recov_ssse3,
+	.valid = raid6_has_ssse3,
+#ifdef CONFIG_X86_64
+	.name = "ssse3x2",
+#else
+	.name = "ssse3x1",
+#endif
+	.priority = 1,
+};
+
+#endif
+
-- 
1.7.8.5



* [PATCH 3/4] lib/raid6: update test program for recovery functions
  2012-04-11 19:40 lib/raid6: SSSE3 optimized recovery functions v2 Jim Kukunas
  2012-04-11 19:40 ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
  2012-04-11 19:40 ` [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions Jim Kukunas
@ 2012-04-11 19:40 ` Jim Kukunas
  2012-04-11 19:40 ` [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection Jim Kukunas
  2012-04-12  6:18 ` lib/raid6: SSSE3 optimized recovery functions v2 NeilBrown
  4 siblings, 0 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-11 19:40 UTC (permalink / raw)
  To: linux-raid; +Cc: linux-kernel, neilb, hpa

Test each combination of recovery and syndrome generation
functions.
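
Roughly, the new main loop in test.c has this shape (a condensed
sketch; the valid() checks and error accounting are elided -- see the
diff below for the real thing):

	for (ra = raid6_recov_algos; *ra; ra++) {
		raid6_2data_recov = (*ra)->data2;
		raid6_datap_recov = (*ra)->datap;

		for (algo = raid6_algos; *algo; algo++) {
			raid6_call = **algo;
			/* regenerate the reference syndromes, then
			 * exercise every (i, j) failed-disk pair
			 * through test_disks() */
		}
	}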

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 lib/raid6/test/Makefile |    2 +-
 lib/raid6/test/test.c   |   32 +++++++++++++++++++++-----------
 lib/raid6/x86.h         |   13 ++++++++-----
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index aa65169..c76151d 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -23,7 +23,7 @@ RANLIB	 = ranlib
 all:	raid6.a raid6test
 
 raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
-	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \
+	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
 	 tables.o
 	 rm -f $@
 	 $(AR) cq $@ $^
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 7a93031..5a485b7 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -90,25 +90,35 @@ static int test_disks(int i, int j)
 int main(int argc, char *argv[])
 {
 	const struct raid6_calls *const *algo;
+	const struct raid6_recov_calls *const *ra;
 	int i, j;
 	int err = 0;
 
 	makedata();
 
-	for (algo = raid6_algos; *algo; algo++) {
-		if (!(*algo)->valid || (*algo)->valid()) {
-			raid6_call = **algo;
+	for (ra = raid6_recov_algos; *ra; ra++) {
+		if ((*ra)->valid  && !(*ra)->valid())
+			continue;
+		raid6_2data_recov = (*ra)->data2;
+		raid6_datap_recov = (*ra)->datap;
 
-			/* Nuke syndromes */
-			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+		printf("using recovery %s\n", (*ra)->name);
 
-			/* Generate assumed good syndrome */
-			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
-						(void **)&dataptrs);
+		for (algo = raid6_algos; *algo; algo++) {
+			if (!(*algo)->valid || (*algo)->valid()) {
+				raid6_call = **algo;
 
-			for (i = 0; i < NDISKS-1; i++)
-				for (j = i+1; j < NDISKS; j++)
-					err += test_disks(i, j);
+				/* Nuke syndromes */
+				memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+
+				/* Generate assumed good syndrome */
+				raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
+							(void **)&dataptrs);
+
+				for (i = 0; i < NDISKS-1; i++)
+					for (j = i+1; j < NDISKS; j++)
+						err += test_disks(i, j);
+			}
 		}
 		printf("\n");
 	}
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index cb2a8c9..9aff51f 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -40,19 +40,22 @@ static inline void kernel_fpu_end(void)
 					   * (fast save and restore) */
 #define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */
 #define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */
+#define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */
 static inline int boot_cpu_has(int flag)
 {
-	u32 eax = (flag >> 5) ? 0x80000001 : 1;
-	u32 edx;
+	u32 eax = (flag & 0x20) ? 0x80000001 : 1;
+	u32 ecx, edx;
 
 	asm volatile("cpuid"
-		     : "+a" (eax), "=d" (edx)
-		     : : "ecx", "ebx");
+		     : "+a" (eax), "=d" (edx), "=c" (ecx)
+		     : : "ebx");
 
-	return (edx >> (flag & 31)) & 1;
+	return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
 }
 
 #endif /* ndef __KERNEL__ */
-- 
1.7.8.5



* [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection
  2012-04-11 19:40 lib/raid6: SSSE3 optimized recovery functions v2 Jim Kukunas
                   ` (2 preceding siblings ...)
  2012-04-11 19:40 ` [PATCH 3/4] lib/raid6: update test program for " Jim Kukunas
@ 2012-04-11 19:40 ` Jim Kukunas
  2012-04-12  6:18 ` lib/raid6: SSSE3 optimized recovery functions v2 NeilBrown
  4 siblings, 0 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-11 19:40 UTC (permalink / raw)
  To: linux-raid; +Cc: linux-kernel, neilb, hpa

Reorders functions in raid6_algos as well as the preference check
to reduce the number of functions tested on initialization.

Also, creates symmetry between choosing the gen_syndrome functions
and choosing the recovery functions.
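
The saving comes from the reordered preference check; a condensed
sketch of the resulting loop (benchmark() stands in for the jiffies
timing block in the diff):

	for (best = NULL, algo = raid6_algos; *algo; algo++) {
		if (best && (*algo)->prefer < best->prefer)
			continue;	/* never benchmarked */
		if ((*algo)->valid && !(*algo)->valid())
			continue;	/* unsupported on this CPU */

		perf = benchmark(*algo);
		if (perf > bestperf) {
			bestperf = perf;
			best = *algo;
		}
	}

Assuming the SIMD entries carry a higher ->prefer than the generic
intx* ones, listing them first means the intx* entries at the tail are
skipped entirely once any SIMD routine has been timed.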

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 lib/raid6/algos.c |  104 +++++++++++++++++++++++++++++------------------------
 1 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 5a7f802..589f5f5 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -34,10 +34,6 @@ struct raid6_calls raid6_call;
 EXPORT_SYMBOL_GPL(raid6_call);
 
 const struct raid6_calls * const raid6_algos[] = {
-	&raid6_intx1,
-	&raid6_intx2,
-	&raid6_intx4,
-	&raid6_intx8,
 #if defined(__ia64__)
 	&raid6_intx16,
 	&raid6_intx32,
@@ -61,6 +57,10 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_altivec4,
 	&raid6_altivec8,
 #endif
+	&raid6_intx1,
+	&raid6_intx2,
+	&raid6_intx4,
+	&raid6_intx8,
 	NULL
 };
 
@@ -86,7 +86,7 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #define time_before(x, y) ((x) < (y))
 #endif
 
-static inline void raid6_choose_recov(void)
+static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 {
 	const struct raid6_recov_calls *const *algo;
 	const struct raid6_recov_calls *best;
@@ -103,62 +103,38 @@ static inline void raid6_choose_recov(void)
 		printk("raid6: using %s recovery algorithm\n", best->name);
 	} else
 		printk("raid6: Yikes! No recovery algorithm found!\n");
-}
-
 
-/* Try to pick the best algorithm */
-/* This code uses the gfmul table as convenient data set to abuse */
+	return best;
+}
 
-int __init raid6_select_algo(void)
+static inline const struct raid6_calls *raid6_choose_gen(
+	void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
 {
-	const struct raid6_calls * const * algo;
-	const struct raid6_calls * best;
-	char *syndromes;
-	void *dptrs[(65536/PAGE_SIZE)+2];
-	int i, disks;
-	unsigned long perf, bestperf;
-	int bestprefer;
-	unsigned long j0, j1;
-
-	disks = (65536/PAGE_SIZE)+2;
-	for ( i = 0 ; i < disks-2 ; i++ ) {
-		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
-	}
-
-	/* Normal code - use a 2-page allocation to avoid D$ conflict */
-	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
-
-	if ( !syndromes ) {
-		printk("raid6: Yikes!  No memory available.\n");
-		return -ENOMEM;
-	}
-
-	dptrs[disks-2] = syndromes;
-	dptrs[disks-1] = syndromes + PAGE_SIZE;
+	unsigned long perf, bestperf, j0, j1;
+	const struct raid6_calls *const *algo;
+	const struct raid6_calls *best;
 
-	bestperf = 0;  bestprefer = 0;  best = NULL;
+	for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+		if (!best || (*algo)->prefer >= best->prefer) {
+			if ((*algo)->valid && !(*algo)->valid())
+				continue;
 
-	for ( algo = raid6_algos ; *algo ; algo++ ) {
-		if ( !(*algo)->valid || (*algo)->valid() ) {
 			perf = 0;
 
 			preempt_disable();
 			j0 = jiffies;
-			while ( (j1 = jiffies) == j0 )
+			while ((j1 = jiffies) == j0)
 				cpu_relax();
 			while (time_before(jiffies,
 					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
-				(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs);
+				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
 				perf++;
 			}
 			preempt_enable();
 
-			if ( (*algo)->prefer > bestprefer ||
-			     ((*algo)->prefer == bestprefer &&
-			      perf > bestperf) ) {
-				best = *algo;
-				bestprefer = best->prefer;
+			if (perf > bestperf) {
 				bestperf = perf;
+				best = *algo;
 			}
 			printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
 			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
@@ -173,12 +149,46 @@ int __init raid6_select_algo(void)
 	} else
 		printk("raid6: Yikes!  No algorithm found!\n");
 
-	free_pages((unsigned long)syndromes, 1);
+	return best;
+}
+
+
+/* Try to pick the best algorithm */
+/* This code uses the gfmul table as convenient data set to abuse */
+
+int __init raid6_select_algo(void)
+{
+	const int disks = (65536/PAGE_SIZE)+2;
+
+	const struct raid6_calls *gen_best;
+	const struct raid6_recov_calls *rec_best;
+	char *syndromes;
+	void *dptrs[(65536/PAGE_SIZE)+2];
+	int i;
+
+	for (i = 0; i < disks-2; i++)
+		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
+
+	/* Normal code - use a 2-page allocation to avoid D$ conflict */
+	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
+
+	if (!syndromes) {
+		printk("raid6: Yikes!  No memory available.\n");
+		return -ENOMEM;
+	}
+
+	dptrs[disks-2] = syndromes;
+	dptrs[disks-1] = syndromes + PAGE_SIZE;
+
+	/* select raid gen_syndrome function */
+	gen_best = raid6_choose_gen(&dptrs, disks);
 
 	/* select raid recover functions */
-	raid6_choose_recov();
+	rec_best = raid6_choose_recov();
+
+	free_pages((unsigned long)syndromes, 1);
 
-	return best ? 0 : -EINVAL;
+	return gen_best && rec_best ? 0 : -EINVAL;
 }
 
 static void raid6_exit(void)
-- 
1.7.8.5



* Re: lib/raid6: SSSE3 optimized recovery functions v2
  2012-04-11 19:40 lib/raid6: SSSE3 optimized recovery functions v2 Jim Kukunas
                   ` (3 preceding siblings ...)
  2012-04-11 19:40 ` [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection Jim Kukunas
@ 2012-04-12  6:18 ` NeilBrown
  2012-04-12 20:04   ` Jim Kukunas
  4 siblings, 1 reply; 13+ messages in thread
From: NeilBrown @ 2012-04-12  6:18 UTC (permalink / raw)
  To: Jim Kukunas; +Cc: linux-raid, linux-kernel, hpa


On Wed, 11 Apr 2012 12:40:27 -0700 Jim Kukunas
<james.t.kukunas@linux.intel.com> wrote:

> Hi Folks,
> 
> The following patchset adds SSSE3 optimized recovery
> functions to RAID6.
> 
> This version differs from version 1 in that:
>         0) cpu_has_ssse3 is moved inside the #ifdef block
> 	1) the functions are moved into their own file, recov_ssse3.c
> 	2) a superfluous memory load is removed from raid6_2data_recov_ssse3
> 
> A technical description of the algorithm can be found
> at http://www.kernel.org/pub/linux/kernel/people/hpa/raid6.pdf
> 
> Thanks.
> 


Thanks.

Could I trouble you to run 'checkpatch.pl' and fix up some of the more
reasonable complaints?

ERROR: open brace '{' following function declarations go on the next line
#243: FILE: lib/raid6/recov_ssse3.c:28:
+	static const u8 __attribute__((aligned(16))) x0f[16] = {

is clearly bogus, but 

WARNING: __aligned(size) is preferred over __attribute__((aligned(size)))
#243: FILE: lib/raid6/recov_ssse3.c:28:
+	static const u8 __attribute__((aligned(16))) x0f[16] = {

is probably worth considering, as are some others.

NeilBrown




* Re: lib/raid6: SSSE3 optimized recovery functions v2
  2012-04-12  6:18 ` lib/raid6: SSSE3 optimized recovery functions v2 NeilBrown
@ 2012-04-12 20:04   ` Jim Kukunas
  2012-04-12 20:04     ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
                       ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-12 20:04 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, linux-kernel, hpa

On Thu, Apr 12, 2012 at 04:18:05PM +1000, NeilBrown wrote:
> On Wed, 11 Apr 2012 12:40:27 -0700 Jim Kukunas
> <james.t.kukunas@linux.intel.com> wrote:

<snip> 

> Could I trouble you to run 'checkpatch.pl' and fix up some of the more
> reasonable complaints?
>
> ERROR: open brace '{' following function declarations go on the next line
> #243: FILE: lib/raid6/recov_ssse3.c:28:
> +     static const u8 __attribute__((aligned(16))) x0f[16] = {
>
> is clearly bogus, but
>
> WARNING: __aligned(size) is preferred over __attribute__((aligned(size)))
> #243: FILE: lib/raid6/recov_ssse3.c:28:
> +     static const u8 __attribute__((aligned(16))) x0f[16] = {
>
> is probably worth considering, as are some others.

Hi Neil,

Thanks for taking a look at this revision.

Before I submitted these patches, I actually had run checkpatch. I've included
my thought process below, along with the corresponding snippets of the
checkpatch output.

> WARNING: __aligned(size) is preferred over __attribute__((aligned(size)))
> #50: FILE: include/linux/raid/pq.h:126:
> +extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));

This follows the existing code conventions. The code block is:

extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));

> WARNING: printk() should include KERN_ facility level
> #120: FILE: lib/raid6/algos.c:103:
> +		printk("raid6: using %s recovery algorith\n", nest->name);
> 
> WARNING: printk() should include KERN_ facility level
> #122: FILE: lib/raid6/algos.c:105:
> +		printk("raid6: Yikes! No recovery algorithm found!\n");
> 
> WARNING: printk() should include KERN_ facility level
> #159: FILE: lib/raid6/algos.c:176:
> +		printk("raid6: Yikes!  No memory available.\n");

Again, these follow the conventions of the existing code, such as:

printk("raid6: using algorithm %s (%ld MB/s)\n",

In fact, the last printk, about no memory available, was simply moved to a 
different line in my patch.

> ERROR: open brace '{' following function declarations go on the next line
> #423: FILE: lib/raid6/recov_ssse3.c:201:
> +	static const u8 __attribute__((aligned(16))) x0f[16] = {
> 
> ERROR: open brace '{' following function declarations go on the next line
> #250: FILE: lib/raid6/recov_ssse3.c:28:
> +	static const u8 __attribute__((aligned(16))) x0f[16] = {

As you pointed out, these are both bogus.

> WARNING: suspect code indent for conditional statements (8, 8)
> #291: FILE: lib/raid6/recov_ssse3.c:69:
> +	while (bytes) {
> [...]
> +	/* xmm6, xmm14, xmm15 */

This one slipped through the cracks. I've corrected it in the attached patches.

> WARNING: __aligned(size) is preferred over __attribute__((aligned(size)))
> #423: FILE: lib/raid6/recov_ssse3.c:201:
> +	static const u8 __attribute__((aligned(16))) x0f[16] = {
> 
> WARNING: __aligned(size) is preferred over __attribute__((aligned(size)))
> #250: FILE: lib/raid6/recov_ssse3.c:28:
> +	static const u8 __attribute__((aligned(16))) x0f[16] = {

There were two reasons why I didn't use __aligned. The first is the
existing code conventions:

const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));

The second is that, by using __attribute__((aligned(size))), there isn't
a need to add

#ifndef __KERNEL__
#define __aligned(x) __attribute__((aligned(x)))
#endif

to x86.h for the test program.

That being said, I've changed these two in the attached patches.

> ERROR: need consistent spacing around '+' (ctx:VxW)
> #88: FILE: lib/raid6/x86.h:43:
> +#define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
> 
> ERROR: need consistent spacing around '+' (ctx:VxW)
> #89: FILE: lib/raid6/x86.h:44:
> +#define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
                          	     ^
These defines are plucked from arch/x86/include/asm/cpufeature.h.

In a perfect world, we'd just include that header, but it isn't visible from
userspace, so instead we copy the values.

Thus, it makes sense to me to keep the symmetry here, but I can change
this if you want.
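
For what it's worth, the encoding those copied values follow, as I
read it, is word*32 + bit; a hypothetical decoder (mine, not code from
the patch) would be:

	static inline void decode_flag(int flag, u32 *leaf,
				       int *in_ecx, int *bit)
	{
		*leaf   = (flag & 0x20) ? 0x80000001 : 1; /* word 1 */
		*in_ecx = !!(flag & 0x80);	/* word 4 lives in ECX */
		*bit    = flag & 31;
	}

which matches the arithmetic in the updated boot_cpu_has() below.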

>
> NeilBrown
>

Thanks again.

--
Jim Kukunas
Intel Open Source Technology Center




* [PATCH 1/4] lib/raid6: fix test program build
  2012-04-12 20:04   ` Jim Kukunas
@ 2012-04-12 20:04     ` Jim Kukunas
  2012-04-19  0:33       ` NeilBrown
  2012-04-12 20:04     ` [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions Jim Kukunas
                       ` (3 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Jim Kukunas @ 2012-04-12 20:04 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, linux-kernel, hpa

<linux/module.h> drags in headers which are not visible to userspace,
thus breaking the build for the test program.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 lib/raid6/algos.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 8b02f60..f6a0f78 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -17,11 +17,11 @@
  */
 
 #include <linux/raid/pq.h>
-#include <linux/module.h>
 #ifndef __KERNEL__
 #include <sys/mman.h>
 #include <stdio.h>
 #else
+#include <linux/module.h>
 #include <linux/gfp.h>
 #if !RAID6_USE_EMPTY_ZERO_PAGE
 /* In .bss so it's zeroed */
-- 
1.7.8.5



* [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions
  2012-04-12 20:04   ` Jim Kukunas
  2012-04-12 20:04     ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
@ 2012-04-12 20:04     ` Jim Kukunas
  2012-04-12 20:04     ` [PATCH 3/4] lib/raid6: update test program for " Jim Kukunas
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-12 20:04 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, linux-kernel, hpa

Add SSSE3 optimized recovery functions, as well as a system
for selecting the most appropriate recovery functions to use.

Originally-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 include/linux/raid/pq.h |   18 +++-
 lib/raid6/Makefile      |    2 +-
 lib/raid6/algos.c       |   37 +++++
 lib/raid6/mktables.c    |   25 ++++
 lib/raid6/recov.c       |   15 ++-
 lib/raid6/recov_ssse3.c |  336 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 426 insertions(+), 7 deletions(-)
 create mode 100644 lib/raid6/recov_ssse3.c

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 53272e9..640c69c 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2;
 extern const struct raid6_calls raid6_altivec4;
 extern const struct raid6_calls raid6_altivec8;
 
+struct raid6_recov_calls {
+	void (*data2)(int, size_t, int, int, void **);
+	void (*datap)(int, size_t, int, void **);
+	int  (*valid)(void);
+	const char *name;
+	int priority;
+};
+
+extern const struct raid6_recov_calls raid6_recov_intx1;
+extern const struct raid6_recov_calls raid6_recov_ssse3;
+
 /* Algorithm list */
 extern const struct raid6_calls * const raid6_algos[];
+extern const struct raid6_recov_calls *const raid6_recov_algos[];
 int raid6_select_algo(void);
 
 /* Return values from chk_syndrome */
@@ -111,14 +123,16 @@ int raid6_select_algo(void);
 
 /* Galois field tables */
 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
+extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
 
 /* Recovery routines */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb,
 		       void **ptrs);
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
+extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila,
+			void **ptrs);
 void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
 		      void **ptrs);
 
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 8a38102..de06dfe 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 
-raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
+raid6_pq-y	+= algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
 		   altivec8.o mmx.o sse1.o sse2.o
 hostprogs-y	+= mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index f6a0f78..5a7f802 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -64,6 +64,20 @@ const struct raid6_calls * const raid6_algos[] = {
 	NULL
 };
 
+void (*raid6_2data_recov)(int, size_t, int, int, void **);
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
+
+void (*raid6_datap_recov)(int, size_t, int, void **);
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+	&raid6_recov_ssse3,
+#endif
+	&raid6_recov_intx1,
+	NULL
+};
+
 #ifdef __KERNEL__
 #define RAID6_TIME_JIFFIES_LG2	4
 #else
@@ -72,6 +86,26 @@ const struct raid6_calls * const raid6_algos[] = {
 #define time_before(x, y) ((x) < (y))
 #endif
 
+static inline void raid6_choose_recov(void)
+{
+	const struct raid6_recov_calls *const *algo;
+	const struct raid6_recov_calls *best;
+
+	for (best = NULL, algo = raid6_recov_algos; *algo; algo++)
+		if (!best || (*algo)->priority > best->priority)
+			if (!(*algo)->valid || (*algo)->valid())
+				best = *algo;
+
+	if (best) {
+		raid6_2data_recov = best->data2;
+		raid6_datap_recov = best->datap;
+
+		printk("raid6: using %s recovery algorithm\n", best->name);
+	} else
+		printk("raid6: Yikes! No recovery algorithm found!\n");
+}
+
+
 /* Try to pick the best algorithm */
 /* This code uses the gfmul table as convenient data set to abuse */
 
@@ -141,6 +175,9 @@ int __init raid6_select_algo(void)
 
 	free_pages((unsigned long)syndromes, 1);
 
+	/* select raid recover functions */
+	raid6_choose_recov();
+
 	return best ? 0 : -EINVAL;
 }
 
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 8a37809..39787db 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -81,6 +81,31 @@ int main(int argc, char *argv[])
 	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
 	printf("#endif\n");
 
+	/* Compute vector multiplication table */
+	printf("\nconst u8  __attribute__((aligned(256)))\n"
+		"raid6_vgfmul[256][32] =\n"
+		"{\n");
+	for (i = 0; i < 256; i++) {
+		printf("\t{\n");
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, j + k),
+				       (k == 7) ? '\n' : ' ');
+		}
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, (j + k) << 4),
+				       (k == 7) ? '\n' : ' ');
+		}
+		printf("\t},\n");
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_vgfmul);\n");
+	printf("#endif\n");
+
 	/* Compute power-of-2 table (exponent) */
 	v = 1;
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index fe275d7..1805a5c 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -22,7 +22,7 @@
 #include <linux/raid/pq.h>
 
 /* Recover two failed data blocks. */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb,
 		       void **ptrs)
 {
 	u8 *p, *q, *dp, *dq;
@@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		p++; q++;
 	}
 }
-EXPORT_SYMBOL_GPL(raid6_2data_recov);
 
 /* Recover failure of one data block plus the P block */
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
+void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs)
 {
 	u8 *p, *q, *dq;
 	const u8 *qmul;		/* Q multiplier table */
@@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
 		q++; dq++;
 	}
 }
-EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+
+const struct raid6_recov_calls raid6_recov_intx1 = {
+	.data2 = raid6_2data_recov_intx1,
+	.datap = raid6_datap_recov_intx1,
+	.valid = NULL,
+	.name = "intx1",
+	.priority = 0,
+};
 
 #ifndef __KERNEL__
 /* Testing only */
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
new file mode 100644
index 0000000..b77266f
--- /dev/null
+++ b/lib/raid6/recov_ssse3.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_ssse3(void)
+{
+	return boot_cpu_has(X86_FEATURE_XMM) &&
+		boot_cpu_has(X86_FEATURE_XMM2) &&
+		boot_cpu_has(X86_FEATURE_SSSE3);
+}
+
+void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb,
+		       void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	static const u8 __aligned(16) x0f[16] = {
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	   Use the dead data pages as temporary storage for
+	   delta p and delta q */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0]));
+
+#ifdef CONFIG_X86_64
+	asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0]));
+	asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0]));
+	asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16]));
+#endif
+
+	/* Now do it... */
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		/* xmm6, xmm14, xmm15 */
+
+		asm volatile("movdqa %0,%%xmm1" : : "m" (q[0]));
+		asm volatile("movdqa %0,%%xmm9" : : "m" (q[16]));
+		asm volatile("movdqa %0,%%xmm0" : : "m" (p[0]));
+		asm volatile("movdqa %0,%%xmm8" : : "m" (p[16]));
+		asm volatile("pxor   %0,%%xmm1" : : "m" (dq[0]));
+		asm volatile("pxor   %0,%%xmm9" : : "m" (dq[16]));
+		asm volatile("pxor   %0,%%xmm0" : : "m" (dp[0]));
+		asm volatile("pxor   %0,%%xmm8" : : "m" (dp[16]));
+
+		/* xmm0/8 = px */
+
+		asm volatile("movdqa %xmm6,%xmm4");
+		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+		asm volatile("movdqa %xmm6,%xmm12");
+		asm volatile("movdqa %xmm5,%xmm13");
+		asm volatile("movdqa %xmm1,%xmm3");
+		asm volatile("movdqa %xmm9,%xmm11");
+		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */
+		asm volatile("movdqa %xmm8,%xmm10");
+		asm volatile("psraw  $4,%xmm1");
+		asm volatile("psraw  $4,%xmm9");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm11");
+		asm volatile("pand   %xmm7,%xmm1");
+		asm volatile("pand   %xmm7,%xmm9");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm11,%xmm12");
+		asm volatile("pshufb %xmm1,%xmm5");
+		asm volatile("pshufb %xmm9,%xmm13");
+		asm volatile("pxor   %xmm4,%xmm5");
+		asm volatile("pxor   %xmm12,%xmm13");
+
+		/* xmm5/13 = qx */
+
+		asm volatile("movdqa %xmm14,%xmm4");
+		asm volatile("movdqa %xmm15,%xmm1");
+		asm volatile("movdqa %xmm14,%xmm12");
+		asm volatile("movdqa %xmm15,%xmm9");
+		asm volatile("movdqa %xmm2,%xmm3");
+		asm volatile("movdqa %xmm10,%xmm11");
+		asm volatile("psraw  $4,%xmm2");
+		asm volatile("psraw  $4,%xmm10");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm11");
+		asm volatile("pand   %xmm7,%xmm2");
+		asm volatile("pand   %xmm7,%xmm10");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm11,%xmm12");
+		asm volatile("pshufb %xmm2,%xmm1");
+		asm volatile("pshufb %xmm10,%xmm9");
+		asm volatile("pxor   %xmm4,%xmm1");
+		asm volatile("pxor   %xmm12,%xmm9");
+
+		/* xmm1/9 = pbmul[px] */
+		asm volatile("pxor   %xmm5,%xmm1");
+		asm volatile("pxor   %xmm13,%xmm9");
+		/* xmm1/9 = db = DQ */
+		asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16]));
+
+		asm volatile("pxor   %xmm1,%xmm0");
+		asm volatile("pxor   %xmm9,%xmm8");
+		asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0]));
+		asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dp += 32;
+		dq += 32;
+#else
+		asm volatile("movdqa %0,%%xmm1" : : "m" (*q));
+		asm volatile("movdqa %0,%%xmm0" : : "m" (*p));
+		asm volatile("pxor   %0,%%xmm1" : : "m" (*dq));
+		asm volatile("pxor   %0,%%xmm0" : : "m" (*dp));
+
+		/* 1 = dq ^ q
+		 * 0 = dp ^ p
+		 */
+		asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0]));
+		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+
+		asm volatile("movdqa %xmm1,%xmm3");
+		asm volatile("psraw  $4,%xmm1");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm1");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm1,%xmm5");
+		asm volatile("pxor   %xmm4,%xmm5");
+
+		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */
+
+		/* xmm5 = qx */
+
+		asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0]));
+		asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16]));
+		asm volatile("movdqa %xmm2,%xmm3");
+		asm volatile("psraw  $4,%xmm2");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm2");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm2,%xmm1");
+		asm volatile("pxor   %xmm4,%xmm1");
+
+		/* xmm1 = pbmul[px] */
+		asm volatile("pxor   %xmm5,%xmm1");
+		/* xmm1 = db = DQ */
+		asm volatile("movdqa %%xmm1,%0" : "=m" (*dq));
+
+		asm volatile("pxor   %xmm1,%xmm0");
+		asm volatile("movdqa %%xmm0,%0" : "=m" (*dp));
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dp += 16;
+		dq += 16;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+
+void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	static const u8 __aligned(16) x0f[16] = {
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	   Use the dead data page as temporary storage for delta q */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0]));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+		asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16]));
+		asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+
+		/* xmm3 = q[0] ^ dq[0] */
+
+		asm volatile("pxor %0, %%xmm4" : : "m" (q[16]));
+		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+		/* xmm4 = q[16] ^ dq[16] */
+
+		asm volatile("movdqa %xmm3, %xmm6");
+		asm volatile("movdqa %xmm4, %xmm8");
+
+		/* xmm4 = xmm8 = q[16] ^ dq[16] */
+
+		asm volatile("psraw $4, %xmm3");
+		asm volatile("pand %xmm7, %xmm6");
+		asm volatile("pand %xmm7, %xmm3");
+		asm volatile("pshufb %xmm6, %xmm0");
+		asm volatile("pshufb %xmm3, %xmm1");
+		asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0]));
+		asm volatile("pxor %xmm0, %xmm1");
+		asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16]));
+
+		/* xmm1 = qmul[q[0] ^ dq[0]] */
+
+		asm volatile("psraw $4, %xmm4");
+		asm volatile("pand %xmm7, %xmm8");
+		asm volatile("pand %xmm7, %xmm4");
+		asm volatile("pshufb %xmm8, %xmm10");
+		asm volatile("pshufb %xmm4, %xmm11");
+		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+		asm volatile("pxor %xmm10, %xmm11");
+		asm volatile("movdqa %0, %%xmm12" : : "m" (p[16]));
+
+		/* xmm11 = qmul[q[16] ^ dq[16]] */
+
+		asm volatile("pxor %xmm1, %xmm2");
+
+		/* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */
+
+		asm volatile("pxor %xmm11, %xmm12");
+
+		/* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */
+
+		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16]));
+
+		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
+		asm volatile("movdqa %%xmm12, %0" : "=m" (p[16]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dq += 32;
+
+#else
+		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+		asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+		/* xmm3 = *q ^ *dq */
+
+		asm volatile("movdqa %xmm3, %xmm6");
+		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+		asm volatile("psraw $4, %xmm3");
+		asm volatile("pand %xmm7, %xmm6");
+		asm volatile("pand %xmm7, %xmm3");
+		asm volatile("pshufb %xmm6, %xmm0");
+		asm volatile("pshufb %xmm3, %xmm1");
+		asm volatile("pxor %xmm0, %xmm1");
+
+		/* xmm1 = qmul[*q ^ *dq] */
+
+		asm volatile("pxor %xmm1, %xmm2");
+
+		/* xmm2 = *p ^ qmul[*q ^ *dq] */
+
+		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dq += 16;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_ssse3 = {
+	.data2 = raid6_2data_recov_ssse3,
+	.datap = raid6_datap_recov_ssse3,
+	.valid = raid6_has_ssse3,
+#ifdef CONFIG_X86_64
+	.name = "ssse3x2",
+#else
+	.name = "ssse3x1",
+#endif
+	.priority = 1,
+};
+
+#endif
+
-- 
1.7.8.5



* [PATCH 3/4] lib/raid6: update test program for recovery functions
  2012-04-12 20:04   ` Jim Kukunas
  2012-04-12 20:04     ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
  2012-04-12 20:04     ` [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions Jim Kukunas
@ 2012-04-12 20:04     ` Jim Kukunas
  2012-04-12 20:04     ` [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection Jim Kukunas
  2012-04-12 20:14     ` lib/raid6: SSSE3 optimized recovery functions v2 H. Peter Anvin
  4 siblings, 0 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-12 20:04 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, linux-kernel, hpa

Test each combination of recovery and syndrome generation
functions.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 lib/raid6/test/Makefile |    2 +-
 lib/raid6/test/test.c   |   32 +++++++++++++++++++++-----------
 lib/raid6/x86.h         |   15 ++++++++++-----
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index aa65169..c76151d 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -23,7 +23,7 @@ RANLIB	 = ranlib
 all:	raid6.a raid6test
 
 raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
-	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \
+	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
 	 tables.o
 	 rm -f $@
 	 $(AR) cq $@ $^
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 7a93031..5a485b7 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -90,25 +90,35 @@ static int test_disks(int i, int j)
 int main(int argc, char *argv[])
 {
 	const struct raid6_calls *const *algo;
+	const struct raid6_recov_calls *const *ra;
 	int i, j;
 	int err = 0;
 
 	makedata();
 
-	for (algo = raid6_algos; *algo; algo++) {
-		if (!(*algo)->valid || (*algo)->valid()) {
-			raid6_call = **algo;
+	for (ra = raid6_recov_algos; *ra; ra++) {
+		if ((*ra)->valid  && !(*ra)->valid())
+			continue;
+		raid6_2data_recov = (*ra)->data2;
+		raid6_datap_recov = (*ra)->datap;
 
-			/* Nuke syndromes */
-			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+		printf("using recovery %s\n", (*ra)->name);
 
-			/* Generate assumed good syndrome */
-			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
-						(void **)&dataptrs);
+		for (algo = raid6_algos; *algo; algo++) {
+			if (!(*algo)->valid || (*algo)->valid()) {
+				raid6_call = **algo;
 
-			for (i = 0; i < NDISKS-1; i++)
-				for (j = i+1; j < NDISKS; j++)
-					err += test_disks(i, j);
+				/* Nuke syndromes */
+				memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+
+				/* Generate assumed good syndrome */
+				raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
+							(void **)&dataptrs);
+
+				for (i = 0; i < NDISKS-1; i++)
+					for (j = i+1; j < NDISKS; j++)
+						err += test_disks(i, j);
+			}
 		}
 		printf("\n");
 	}
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index cb2a8c9..d55d632 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void)
 {
 }
 
+#define __aligned(x) __attribute__((aligned(x)))
+
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
 #define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions
 					   * (fast save and restore) */
 #define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */
 #define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */
+#define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */
 static inline int boot_cpu_has(int flag)
 {
-	u32 eax = (flag >> 5) ? 0x80000001 : 1;
-	u32 edx;
+	u32 eax = (flag & 0x20) ? 0x80000001 : 1;
+	u32 ecx, edx;
 
 	asm volatile("cpuid"
-		     : "+a" (eax), "=d" (edx)
-		     : : "ecx", "ebx");
+		     : "+a" (eax), "=d" (edx), "=c" (ecx)
+		     : : "ebx");
 
-	return (edx >> (flag & 31)) & 1;
+	return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
 }
 
 #endif /* ndef __KERNEL__ */
-- 
1.7.8.5


^ permalink raw reply related	[flat|nested] 13+ messages in thread
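
(A note on the flag encoding in the x86.h hunk above: each index
packs a cpuid word and bit number as word*32 + bit. Bit 5 of the
index set means word 1, i.e. the extended leaf 0x80000001; bit 7 set
means word 4, whose flags are reported in ECX rather than EDX, which
is why X86_FEATURE_SSSE3 = 4*32+9 works. The userspace program below
is a standalone sketch of the same logic, assuming GCC-style inline
asm on x86; has_flag is a hypothetical name.)

#include <stdio.h>

#define X86_FEATURE_SSSE3	(4*32+ 9)	/* leaf 1, ECX bit 9 */

static int has_flag(int flag)
{
	/* word-1 flags live in the extended leaf */
	unsigned int eax = (flag & 0x20) ? 0x80000001 : 1;
	unsigned int ecx, edx;

	asm volatile("cpuid"
		     : "+a" (eax), "=c" (ecx), "=d" (edx)
		     : : "ebx");

	/* word-4 flags (e.g. SSSE3) are reported in ECX */
	return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
}

int main(void)
{
	printf("ssse3: %s\n", has_flag(X86_FEATURE_SSSE3) ? "yes" : "no");
	return 0;
}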

* [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection
  2012-04-12 20:04   ` Jim Kukunas
                       ` (2 preceding siblings ...)
  2012-04-12 20:04     ` [PATCH 3/4] lib/raid6: update test program for " Jim Kukunas
@ 2012-04-12 20:04     ` Jim Kukunas
  2012-04-12 20:14     ` lib/raid6: SSSE3 optimized recovery functions v2 H. Peter Anvin
  4 siblings, 0 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-04-12 20:04 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, linux-kernel, hpa

Reorder the functions in raid6_algos, and check each algorithm's
preference before benchmarking it, to reduce the number of functions
tested during initialization.

Also, creates symmetry between choosing the gen_syndrome functions
and choosing the recovery functions.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 lib/raid6/algos.c |  104 +++++++++++++++++++++++++++++------------------------
 1 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 5a7f802..589f5f5 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -34,10 +34,6 @@ struct raid6_calls raid6_call;
 EXPORT_SYMBOL_GPL(raid6_call);
 
 const struct raid6_calls * const raid6_algos[] = {
-	&raid6_intx1,
-	&raid6_intx2,
-	&raid6_intx4,
-	&raid6_intx8,
 #if defined(__ia64__)
 	&raid6_intx16,
 	&raid6_intx32,
@@ -61,6 +57,10 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_altivec4,
 	&raid6_altivec8,
 #endif
+	&raid6_intx1,
+	&raid6_intx2,
+	&raid6_intx4,
+	&raid6_intx8,
 	NULL
 };
 
@@ -86,7 +86,7 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #define time_before(x, y) ((x) < (y))
 #endif
 
-static inline void raid6_choose_recov(void)
+static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 {
 	const struct raid6_recov_calls *const *algo;
 	const struct raid6_recov_calls *best;
@@ -103,62 +103,38 @@ static inline void raid6_choose_recov(void)
 		printk("raid6: using %s recovery algorithm\n", best->name);
 	} else
 		printk("raid6: Yikes! No recovery algorithm found!\n");
-}
-
 
-/* Try to pick the best algorithm */
-/* This code uses the gfmul table as convenient data set to abuse */
+	return best;
+}
 
-int __init raid6_select_algo(void)
+static inline const struct raid6_calls *raid6_choose_gen(
+	void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
 {
-	const struct raid6_calls * const * algo;
-	const struct raid6_calls * best;
-	char *syndromes;
-	void *dptrs[(65536/PAGE_SIZE)+2];
-	int i, disks;
-	unsigned long perf, bestperf;
-	int bestprefer;
-	unsigned long j0, j1;
-
-	disks = (65536/PAGE_SIZE)+2;
-	for ( i = 0 ; i < disks-2 ; i++ ) {
-		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
-	}
-
-	/* Normal code - use a 2-page allocation to avoid D$ conflict */
-	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
-
-	if ( !syndromes ) {
-		printk("raid6: Yikes!  No memory available.\n");
-		return -ENOMEM;
-	}
-
-	dptrs[disks-2] = syndromes;
-	dptrs[disks-1] = syndromes + PAGE_SIZE;
+	unsigned long perf, bestperf, j0, j1;
+	const struct raid6_calls *const *algo;
+	const struct raid6_calls *best;
 
-	bestperf = 0;  bestprefer = 0;  best = NULL;
+	for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+		if (!best || (*algo)->prefer >= best->prefer) {
+			if ((*algo)->valid && !(*algo)->valid())
+				continue;
 
-	for ( algo = raid6_algos ; *algo ; algo++ ) {
-		if ( !(*algo)->valid || (*algo)->valid() ) {
 			perf = 0;
 
 			preempt_disable();
 			j0 = jiffies;
-			while ( (j1 = jiffies) == j0 )
+			while ((j1 = jiffies) == j0)
 				cpu_relax();
 			while (time_before(jiffies,
 					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
-				(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs);
+				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
 				perf++;
 			}
 			preempt_enable();
 
-			if ( (*algo)->prefer > bestprefer ||
-			     ((*algo)->prefer == bestprefer &&
-			      perf > bestperf) ) {
-				best = *algo;
-				bestprefer = best->prefer;
+			if (perf > bestperf) {
 				bestperf = perf;
+				best = *algo;
 			}
 			printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
 			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
@@ -173,12 +149,46 @@ int __init raid6_select_algo(void)
 	} else
 		printk("raid6: Yikes!  No algorithm found!\n");
 
-	free_pages((unsigned long)syndromes, 1);
+	return best;
+}
+
+
+/* Try to pick the best algorithm */
+/* This code uses the gfmul table as convenient data set to abuse */
+
+int __init raid6_select_algo(void)
+{
+	const int disks = (65536/PAGE_SIZE)+2;
+
+	const struct raid6_calls *gen_best;
+	const struct raid6_recov_calls *rec_best;
+	char *syndromes;
+	void *dptrs[(65536/PAGE_SIZE)+2];
+	int i;
+
+	for (i = 0; i < disks-2; i++)
+		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
+
+	/* Normal code - use a 2-page allocation to avoid D$ conflict */
+	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
+
+	if (!syndromes) {
+		printk("raid6: Yikes!  No memory available.\n");
+		return -ENOMEM;
+	}
+
+	dptrs[disks-2] = syndromes;
+	dptrs[disks-1] = syndromes + PAGE_SIZE;
+
+	/* select raid gen_syndrome function */
+	gen_best = raid6_choose_gen(&dptrs, disks);
 
 	/* select raid recover functions */
-	raid6_choose_recov();
+	rec_best = raid6_choose_recov();
+
+	free_pages((unsigned long)syndromes, 1);
 
-	return best ? 0 : -EINVAL;
+	return gen_best && rec_best ? 0 : -EINVAL;
 }
 
 static void raid6_exit(void)
-- 
1.7.8.5


^ permalink raw reply related	[flat|nested] 13+ messages in thread
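
(A note on the MB/s figures this loop prints: perf counts
gen_syndrome() calls over a (1 << RAID6_TIME_JIFFIES_LG2)-jiffy
window, and each call walks 2^16 bytes of data, so the conversion in
the printk works out as below. gen_mbps is a hypothetical helper
written out only to show the arithmetic; it assumes the kernel's HZ
and the RAID6_TIME_JIFFIES_LG2 constant from <linux/raid/pq.h>.)

/*
 * perf calls x 2^16 bytes each, over (1 << LG2)/HZ seconds,
 * divided by 2^20 bytes per megabyte:
 *
 *   MB/s = perf * 2^16 * HZ / 2^LG2 / 2^20
 *        = (perf * HZ) >> (20 - 16 + LG2)
 */
static unsigned long gen_mbps(unsigned long perf)
{
	return (perf * HZ) >> (20 - 16 + RAID6_TIME_JIFFIES_LG2);
}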

* Re: lib/raid6: SSSE3 optimized recovery functions v2
  2012-04-12 20:04   ` Jim Kukunas
                       ` (3 preceding siblings ...)
  2012-04-12 20:04     ` [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection Jim Kukunas
@ 2012-04-12 20:14     ` H. Peter Anvin
  4 siblings, 0 replies; 13+ messages in thread
From: H. Peter Anvin @ 2012-04-12 20:14 UTC (permalink / raw)
  To: Jim Kukunas; +Cc: neilb, linux-raid, linux-kernel

On 04/12/2012 01:04 PM, Jim Kukunas wrote:
> 
> This keeps with the existing code conventions. The code block is:
> 
> extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
> extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
> extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
> extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
> extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
> 
>> WARNING: printk() should include KERN_ facility level
>> #120: FILE: lib/raid6/algos.c:103:
>> +		printk("raid6: using %s recovery algorithm\n", best->name);
>>
>> WARNING: printk() should include KERN_ facility level
>> #122: FILE: lib/raid6/algos.c:105:
>> +		printk("raid6: Yikes! No recovery algorithm found!\n");
>>
>> WARNING: printk() should include KERN_ facility level
>> #159: FILE: lib/raid6/algos.c:176:
>> +		printk("raid6: Yikes!  No memory available.\n");
> 
> Again, these are following the conventions of the existing code such as:
> 
> printk("raid6: using algorithm %s (%ld MB/s)\n",
> 
> In fact, the last printk, about no memory available, was simply moved to a 
> different line in my patch.
> 

A lot of the RAID-6 code predates the modern kernel conventions.  It
would be good to clean that up, but that is largely orthogonal to this
patch.

	-hpa


^ permalink raw reply	[flat|nested] 13+ messages in thread
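
(For reference, the cleanup hpa alludes to would amount to tagging
each message with a facility level, roughly as sketched below; this
is a possible later cleanup, not part of this series.)

printk(KERN_INFO "raid6: using %s recovery algorithm\n", best->name);
printk(KERN_ERR  "raid6: Yikes! No recovery algorithm found!\n");
printk(KERN_ERR  "raid6: Yikes!  No memory available.\n");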

* Re: [PATCH 1/4] lib/raid6: fix test program build
  2012-04-12 20:04     ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
@ 2012-04-19  0:33       ` NeilBrown
  0 siblings, 0 replies; 13+ messages in thread
From: NeilBrown @ 2012-04-19  0:33 UTC (permalink / raw)
  To: Jim Kukunas; +Cc: linux-raid, linux-kernel, hpa


On Thu, 12 Apr 2012 13:04:30 -0700 Jim Kukunas
<james.t.kukunas@linux.intel.com> wrote:

> <linux/module.h> drags in headers which are not visible to userspace,
> thus breaking the build for the test program.
> 
> Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>

I've applied this and the following three.
They should be visible in
   git://neil.brown.name/md/ for-next
   git://github.com/neilbrown/linux md/for-next
(a fair way down the list from HEAD).

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2012-04-19  0:33 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-11 19:40 lib/raid6: SSSE3 optimized recovery functions v2 Jim Kukunas
2012-04-11 19:40 ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
2012-04-11 19:40 ` [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions Jim Kukunas
2012-04-11 19:40 ` [PATCH 3/4] lib/raid6: update test program for " Jim Kukunas
2012-04-11 19:40 ` [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection Jim Kukunas
2012-04-12  6:18 ` lib/raid6: SSSE3 optimized recovery functions v2 NeilBrown
2012-04-12 20:04   ` Jim Kukunas
2012-04-12 20:04     ` [PATCH 1/4] lib/raid6: fix test program build Jim Kukunas
2012-04-19  0:33       ` NeilBrown
2012-04-12 20:04     ` [PATCH 2/4] lib/raid6: Add SSSE3 optimized recovery functions Jim Kukunas
2012-04-12 20:04     ` [PATCH 3/4] lib/raid6: update test program for " Jim Kukunas
2012-04-12 20:04     ` [PATCH 4/4] lib/raid6: cleanup gen_syndrome function selection Jim Kukunas
2012-04-12 20:14     ` lib/raid6: SSSE3 optimized recovery functions v2 H. Peter Anvin
