* [PATCH] lib/raid6: Add AVX2 optimized recovery functions
@ 2012-11-08 21:47 Jim Kukunas
  2012-11-09 11:35 ` Paul Menzel
  2012-11-29 20:09 ` Andi Kleen
  0 siblings, 2 replies; 13+ messages in thread
From: Jim Kukunas @ 2012-11-08 21:47 UTC (permalink / raw)
  To: Linux Raid; +Cc: Linux Kernel, Neil Brown, H. Peter Anvin

Optimize RAID6 recovery functions to take advantage of
the 256-bit YMM integer instructions introduced in AVX2.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 arch/x86/Makefile       |   5 +-
 include/linux/raid/pq.h |   1 +
 lib/raid6/Makefile      |   2 +-
 lib/raid6/algos.c       |   3 +
 lib/raid6/recov_avx2.c  | 327 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/raid6/test/Makefile |   2 +-
 lib/raid6/x86.h         |  14 ++-
 7 files changed, 345 insertions(+), 9 deletions(-)
 create mode 100644 lib/raid6/recov_avx2.c

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 682e9c2..f24c037 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -123,9 +123,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
+avx2_instr := $(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 640c69c..3156347 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -109,6 +109,7 @@ struct raid6_recov_calls {
 
 extern const struct raid6_recov_calls raid6_recov_intx1;
 extern const struct raid6_recov_calls raid6_recov_ssse3;
+extern const struct raid6_recov_calls raid6_recov_avx2;
 
 /* Algorithm list */
 extern const struct raid6_calls * const raid6_algos[];
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index de06dfe..8c2e22b 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 
-raid6_pq-y	+= algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
+raid6_pq-y	+= algos.o recov.o recov_ssse3.o recov_avx2.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
 		   altivec8.o mmx.o sse1.o sse2.o
 hostprogs-y	+= mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 589f5f5..8b7f55c 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -72,6 +72,9 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
 const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+#ifdef CONFIG_AS_AVX2
+	&raid6_recov_avx2,
+#endif
 	&raid6_recov_ssse3,
 #endif
 	&raid6_recov_intx1,
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
new file mode 100644
index 0000000..43a9bab
--- /dev/null
+++ b/lib/raid6/recov_avx2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#if CONFIG_AS_AVX2
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx2(void)
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&
+		boot_cpu_has(X86_FEATURE_AVX);
+}
+
+static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
+		int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	   Use the dead data pages as temporary storage for
+	   delta p and delta q */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+		/* ymm7 = x0f[32] */
+	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
+		asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
+		asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
+		asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));
+		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0]));
+		asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32]));
+		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0]));
+		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32]));
+
+		/*
+		 * 1 = dq[0]  ^ q[0]
+		 * 9 = dq[32] ^ q[32]
+		 * 0 = dp[0]  ^ p[0]
+		 * 8 = dp[32] ^ p[32]
+		 */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %ymm1, %ymm3");
+		asm volatile("vpsraw $4, %ymm9, %ymm12");
+		asm volatile("vpand %ymm7, %ymm1, %ymm1");
+		asm volatile("vpand %ymm7, %ymm9, %ymm9");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpand %ymm7, %ymm12, %ymm12");
+		asm volatile("vpshufb %ymm9, %ymm4, %ymm14");
+		asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm12, %ymm5, %ymm15");
+		asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+		asm volatile("vpxor %ymm14, %ymm15, %ymm15");
+		asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+		/*
+		 * 5 = qx[0]
+		 * 15 = qx[32]
+		 */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+		asm volatile("vpsraw $4, %ymm0, %ymm2");
+		asm volatile("vpsraw $4, %ymm8, %ymm6");
+		asm volatile("vpand %ymm7, %ymm0, %ymm3");
+		asm volatile("vpand %ymm7, %ymm8, %ymm14");
+		asm volatile("vpand %ymm7, %ymm2, %ymm2");
+		asm volatile("vpand %ymm7, %ymm6, %ymm6");
+		asm volatile("vpshufb %ymm14, %ymm4, %ymm12");
+		asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm6, %ymm1, %ymm13");
+		asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm12, %ymm13, %ymm13");
+
+		/*
+		 * 1  = pbmul[px[0]]
+		 * 13 = pbmul[px[32]]
+		 */
+		asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm15, %ymm13, %ymm13");
+
+		/*
+		 * 1 = db = DQ
+		 * 13 = db[32] = DQ[32]
+		 */
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+		asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32]));
+		asm volatile("vpxor %ymm1, %ymm0, %ymm0");
+		asm volatile("vpxor %ymm13, %ymm8, %ymm8");
+
+		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+		asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+#else
+		asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q));
+		asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p));
+		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq));
+		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp));
+
+		/* 1 = dq ^ q;  0 = dp ^ p */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
+
+		/*
+		 * 1 = dq ^ q
+		 * 3 = (dq ^ q) >> 4
+		 */
+		asm volatile("vpsraw $4, %ymm1, %ymm3");
+		asm volatile("vpand %ymm7, %ymm1, %ymm1");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+		asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+		/* 5 = qx */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+
+		asm volatile("vpsraw $4, %ymm0, %ymm2");
+		asm volatile("vpand %ymm7, %ymm0, %ymm3");
+		asm volatile("vpand %ymm7, %ymm2, %ymm2");
+		asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+
+		/* 1 = pbmul[px] */
+		asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+		/* 1 = db = DQ */
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+
+		asm volatile("vpxor %ymm1, %ymm0, %ymm0");
+		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dp += 32;
+		dq += 32;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
+		void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	   Use the dead data page as temporary storage for delta q */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+		asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32]));
+		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32]));
+
+		/*
+		 * 3 = q[0] ^ dq[0]
+		 * 8 = q[32] ^ dq[32]
+		 */
+		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+		asm volatile("vmovapd %ymm0, %ymm13");
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+		asm volatile("vmovapd %ymm1, %ymm14");
+
+		asm volatile("vpsraw $4, %ymm3, %ymm6");
+		asm volatile("vpsraw $4, %ymm8, %ymm12");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpand %ymm7, %ymm8, %ymm8");
+		asm volatile("vpand %ymm7, %ymm6, %ymm6");
+		asm volatile("vpand %ymm7, %ymm12, %ymm12");
+		asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+		asm volatile("vpshufb %ymm8, %ymm13, %ymm13");
+		asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+		asm volatile("vpshufb %ymm12, %ymm14, %ymm14");
+		asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm13, %ymm14, %ymm14");
+
+		/*
+		 * 1  = qmul[q[0]  ^ dq[0]]
+		 * 14 = qmul[q[32] ^ dq[32]]
+		 */
+		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+		asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32]));
+		asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+		asm volatile("vpxor %ymm14, %ymm12, %ymm12");
+
+		/*
+		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
+		 * 12 = p[32] ^ qmul[q[32] ^ dq[32]]
+		 */
+
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+		asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32]));
+		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));
+		asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+#else
+		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+
+		/* 3 = q ^ dq */
+
+		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %ymm3, %ymm6");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpand %ymm7, %ymm6, %ymm6");
+		asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+		asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+
+		/* 1 = qmul[q ^ dq] */
+
+		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+		asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+
+		/* 2 = p ^ qmul[q ^ dq] */
+
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dq += 32;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx2 = {
+	.data2 = raid6_2data_recov_avx2,
+	.datap = raid6_datap_recov_avx2,
+	.valid = raid6_has_avx2,
+#ifdef CONFIG_X86_64
+	.name = "avx2x2",
+#else
+	.name = "avx2x1",
+#endif
+	.priority = 2,
+};
+
+#else
+#warning "your version of binutils lacks AVX2 support"
+#endif
+
+#endif
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index c76151d..d919c98 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -23,7 +23,7 @@ RANLIB	 = ranlib
 all:	raid6.a raid6test
 
 raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
-	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
+	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o recov_avx2.o algos.o \
 	 tables.o
 	 rm -f $@
 	 $(AR) cq $@ $^
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index d55d632..b759548 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -45,19 +45,23 @@ static inline void kernel_fpu_end(void)
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
 #define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_AVX2        (9*32+ 5) /* AVX2 instructions */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */
 static inline int boot_cpu_has(int flag)
 {
-	u32 eax = (flag & 0x20) ? 0x80000001 : 1;
-	u32 ecx, edx;
+	u32 eax, ebx, ecx, edx;
+
+	eax = (flag & 0x100) ? 7 :
+		(flag & 0x20) ? 0x80000001 : 1;
+	ecx = 0;
 
 	asm volatile("cpuid"
-		     : "+a" (eax), "=d" (edx), "=c" (ecx)
-		     : : "ebx");
+		     : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx));
 
-	return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
+	return ((flag & 0x100 ? ebx :
+		(flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1;
 }
 
 #endif /* ndef __KERNEL__ */
-- 
1.8.0



* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-08 21:47 [PATCH] lib/raid6: Add AVX2 optimized recovery functions Jim Kukunas
@ 2012-11-09 11:35 ` Paul Menzel
  2012-11-09 11:39   ` H. Peter Anvin
  2012-11-29 20:09 ` Andi Kleen
  1 sibling, 1 reply; 13+ messages in thread
From: Paul Menzel @ 2012-11-09 11:35 UTC (permalink / raw)
  To: Jim Kukunas; +Cc: Linux Raid, Linux Kernel, Neil Brown, H. Peter Anvin


Dear Jim,


On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
> Optimize RAID6 recovery functions to take advantage of
> the 256-bit YMM integer instructions introduced in AVX2.

in my experience, optimizations always have to be backed up by
benchmarks. Could you add those to the commit message, please?

> Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>

[…]


Thanks,

Paul


* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-09 11:35 ` Paul Menzel
@ 2012-11-09 11:39   ` H. Peter Anvin
  2012-11-09 11:46     ` Paul Menzel
  2012-11-09 11:50     ` NeilBrown
  0 siblings, 2 replies; 13+ messages in thread
From: H. Peter Anvin @ 2012-11-09 11:39 UTC (permalink / raw)
  To: Paul Menzel, Jim Kukunas; +Cc: Linux Raid, Linux Kernel, Neil Brown

Sorry, we cannot share those at this time since the hardware is not yet released.

Paul Menzel <pm.debian@googlemail.com> wrote:

>Dear Jim,
>
>
>On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
>> Optimize RAID6 recovery functions to take advantage of
>> the 256-bit YMM integer instructions introduced in AVX2.
>
>in my experience, optimizations always have to be backed up by
>benchmarks. Could you add those to the commit message, please?
>
>> Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
>
>[…]
>
>
>Thanks,
>
>Paul

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.


* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-09 11:39   ` H. Peter Anvin
@ 2012-11-09 11:46     ` Paul Menzel
  2012-11-09 11:50     ` NeilBrown
  1 sibling, 0 replies; 13+ messages in thread
From: Paul Menzel @ 2012-11-09 11:46 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Jim Kukunas, Linux Raid, Linux Kernel, Neil Brown


On Friday, 09.11.2012, 12:39 +0100, H. Peter Anvin wrote:
> Sorry, we cannot share those at this time since the hardware is not yet released.

Too bad. Then I suggest an additional run-time switch to enable and
disable that code path, so people can later easily test it themselves.
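
Something like the following would do it (a hypothetical sketch only, not
part of the posted patch; the parameter name and the module_param wiring
are invented for illustration):

#include <linux/module.h>

/* Hypothetical knob: let users skip the AVX2 recovery path at load time. */
static bool raid6_disable_avx2;
module_param_named(disable_avx2, raid6_disable_avx2, bool, 0444);
MODULE_PARM_DESC(disable_avx2, "skip the AVX2 RAID6 recovery routines");

static int raid6_has_avx2(void)
{
	/* Reporting "not valid" here makes algos.c fall back to the next
	 * entry in raid6_recov_algos[]. */
	return !raid6_disable_avx2 &&
		boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX);
}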


Thanks,

Paul


* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-09 11:39   ` H. Peter Anvin
  2012-11-09 11:46     ` Paul Menzel
@ 2012-11-09 11:50     ` NeilBrown
  2012-11-09 12:24       ` H. Peter Anvin
  2012-11-09 19:56       ` Jim Kukunas
  1 sibling, 2 replies; 13+ messages in thread
From: NeilBrown @ 2012-11-09 11:50 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Paul Menzel, Jim Kukunas, Linux Raid, Linux Kernel


On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <hpa@zytor.com> wrote:

> Sorry, we cannot share those at this time since the hardware is not yet released.

Can I take that to imply "Acked-by: "H. Peter Anvin" <hpa@zytor.com>" ??

It would be nice to have at least a statement like:
 These patches have been tested both with the user-space testing tool and in 
 a RAID6 md array and they pass all tests.  While we cannot release performance
 numbers as the hardware is not released, we can confirm that on that hardware
 the performance with these patches is faster than without.

I guess I should be able to assume that - surely the patches would not be
posted if it were not true...  But I like to avoid assuming when I can.

Thanks,
NeilBrown


> 
> Paul Menzel <pm.debian@googlemail.com> wrote:
> 
> >Dear Jim,
> >
> >
> >On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
> >> Optimize RAID6 recovery functions to take advantage of
> >> the 256-bit YMM integer instructions introduced in AVX2.
> >
> >in my experience, optimizations always have to be backed up by
> >benchmarks. Could you add those to the commit message, please?
> >
> >> Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
> >
> >[…]
> >
> >
> >Thanks,
> >
> >Paul
> 



* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-09 11:50     ` NeilBrown
@ 2012-11-09 12:24       ` H. Peter Anvin
  2012-11-09 19:56       ` Jim Kukunas
  1 sibling, 0 replies; 13+ messages in thread
From: H. Peter Anvin @ 2012-11-09 12:24 UTC (permalink / raw)
  To: NeilBrown; +Cc: Paul Menzel, Jim Kukunas, Linux Raid, Linux Kernel

Yes, consider it an implicit Acked-by.

NeilBrown <neilb@suse.de> wrote:

>On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <hpa@zytor.com>
>wrote:
>
>> Sorry, we cannot share those at this time since the hardware is not
>yet released.
>
>Can I take that to imply "Acked-by: "H. Peter Anvin" <hpa@zytor.com>"
>??
>
>It would be nice to have at least a statement like:
>These patches have been tested both with the user-space testing tool
>and in 
>a RAID6 md array and they pass all tests.  While we cannot release
>performance
>numbers as the hardware is not released, we can confirm that on that
>hardware
> the performance with these patches is faster than without.
>
>I guess I should be able to assume that - surely the patches would not
>be
>posted if it were not true...  But I like to avoid assuming when I can.
>
>Thanks,
>NeilBrown
>
>
>> 
>> Paul Menzel <pm.debian@googlemail.com> wrote:
>> 
>> >Dear Jim,
>> >
>> >
>> >On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
>> >> Optimize RAID6 recovery functions to take advantage of
>> >> the 256-bit YMM integer instructions introduced in AVX2.
>> >
>> >in my experience, optimizations always have to be backed up by
>> >benchmarks. Could you add those to the commit message, please?
>> >
>> >> Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
>> >
>> >[…]
>> >
>> >
>> >Thanks,
>> >
>> >Paul
>> 

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.


* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-09 11:50     ` NeilBrown
  2012-11-09 12:24       ` H. Peter Anvin
@ 2012-11-09 19:56       ` Jim Kukunas
  2012-11-19  0:57         ` NeilBrown
  1 sibling, 1 reply; 13+ messages in thread
From: Jim Kukunas @ 2012-11-09 19:56 UTC (permalink / raw)
  To: NeilBrown; +Cc: H. Peter Anvin, Paul Menzel, Linux Raid, Linux Kernel


On Fri, Nov 09, 2012 at 10:50:25PM +1100, Neil Brown wrote:
> On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <hpa@zytor.com> wrote:
> 
> > > Sorry, we cannot share those at this time since the hardware is not yet released.
> 
> Can I take that to imply "Acked-by: "H. Peter Anvin" <hpa@zytor.com>" ??
> 
> It would be nice to have at least a statement like:
>  These patches have been tested both with the user-space testing tool and in 
> >  a RAID6 md array and they pass all tests.  While we cannot release performance
> >  numbers as the hardware is not released, we can confirm that on that hardware
>  the performance with these patches is faster than without.
> 
> I guess I should be able to assume that - surely the patches would not be
> posted if it were not true...  But I like to avoid assuming when I can.

Hi Neil,

That assumption is correct. The patch was tested and benchmarked before submission.

You'll notice that this code is very similar to the SSSE3-optimized
recovery routines I wrote earlier. This implementation extends that same
algorithm from 128-bit registers to 256-bit registers.
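
For comparison, the core GF(2^8) table-lookup step has the same shape in
both; roughly (a paraphrase rather than verbatim code, with qmul as in the
patch and xmm1/ymm1 holding the already-masked low nibbles):

	/* SSSE3: 16 bytes per register.  Load the 16-byte nibble table
	 * and do a per-byte lookup with pshufb. */
	asm volatile("movdqa %0, %%xmm4" : : "m" (qmul[0]));
	asm volatile("pshufb %xmm1, %xmm4");

	/* AVX2: 32 bytes per register.  vbroadcasti128 replicates the same
	 * 16-byte table into both 128-bit lanes, and vpshufb then performs
	 * the identical lookup independently in each lane. */
	asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
	asm volatile("vpshufb %ymm1, %ymm4, %ymm4");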

Thanks.

-- 
Jim Kukunas
Intel Open Source Technology Center


* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-09 19:56       ` Jim Kukunas
@ 2012-11-19  0:57         ` NeilBrown
  0 siblings, 0 replies; 13+ messages in thread
From: NeilBrown @ 2012-11-19  0:57 UTC (permalink / raw)
  To: Jim Kukunas; +Cc: H. Peter Anvin, Paul Menzel, Linux Raid, Linux Kernel


On Fri, 9 Nov 2012 11:56:10 -0800 Jim Kukunas
<james.t.kukunas@linux.intel.com> wrote:

> On Fri, Nov 09, 2012 at 10:50:25PM +1100, Neil Brown wrote:
> > On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <hpa@zytor.com> wrote:
> > 
> > > Sorry, we cannot share those at this time since the hardware is not yet released.
> > 
> > Can I take that to imply "Acked-by: "H. Peter Anvin" <hpa@zytor.com>" ??
> > 
> > It would be nice to have at least a statement like:
> >  These patches have been tested both with the user-space testing tool and in 
> >  a RAID6 md array and they pass all tests.  While we cannot release performance
> >  numbers as the hardware is not released, we can confirm that on that hardware
> >  the performance with these patches is faster than without.
> > 
> > I guess I should be able to assume that - surely the patches would not be
> > posted if it were not true...  But I like to avoid assuming when I can.
> 
> Hi Neil,
> 
> That assumption is correct. The patch was tested and benchmarked before submission.

Thanks.  I've queued the patch up.
> 
> You'll notice that this code is very similar to the SSSE3-optimized
> recovery routines I wrote earlier. This implementation extends that same
> algorithm from 128-bit registers to 256-bit registers.

I might notice that if I actually looked, but it all starts swimming before
my eyes when I try :-)
If both you and hpa like it, then that is good enough for me.

Thanks,
NeilBrown


> 
> Thanks.
> 



* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-08 21:47 [PATCH] lib/raid6: Add AVX2 optimized recovery functions Jim Kukunas
  2012-11-09 11:35 ` Paul Menzel
@ 2012-11-29 20:09 ` Andi Kleen
  2012-11-29 21:13   ` H. Peter Anvin
  1 sibling, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2012-11-29 20:09 UTC (permalink / raw)
  To: Jim Kukunas; +Cc: Linux Raid, Linux Kernel, Neil Brown, H. Peter Anvin

Jim Kukunas <james.t.kukunas@linux.intel.com> writes:
> +
> +	/* ymm7 = x0f[32] */
> +	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
> +
> +	while (bytes) {
> +#ifdef CONFIG_X86_64
> +		asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
> +		asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
> +		asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
> +		asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));

It is somewhat dangerous to assume that registers do not get changed
between assembler statements, or that assembler statements do not get
reordered. Better to always put such values into explicit variables or
merge them into a single asm statement.

asm volatile is also not enough to prevent reordering. If anything
you would need a memory clobber.
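
For illustration, the merged form with explicit memory operands and a
clobber would look something like this (a sketch only, reusing the p/q/dp/dq
pointers from the patch, not code taken from it):

	asm volatile("vmovdqa %0, %%ymm1\n\t"
		     "vmovdqa %1, %%ymm0\n\t"
		     "vpxor   %2, %%ymm1, %%ymm1\n\t"
		     "vpxor   %3, %%ymm0, %%ymm0"
		     : /* no outputs */
		     : "m" (q[0]), "m" (p[0]), "m" (dq[0]), "m" (dp[0])
		     : "memory");

With the loads expressed as operands of a single statement and a "memory"
clobber, the compiler cannot reorder the block against surrounding memory
accesses, and nothing depends on register contents surviving between
separate asm statements.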

-Andi


-- 
ak@linux.intel.com -- Speaking for myself only


* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-29 20:09 ` Andi Kleen
@ 2012-11-29 21:13   ` H. Peter Anvin
  2012-11-29 21:18     ` Andi Kleen
  0 siblings, 1 reply; 13+ messages in thread
From: H. Peter Anvin @ 2012-11-29 21:13 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Jim Kukunas, Linux Raid, Linux Kernel, Neil Brown

On 11/29/2012 12:09 PM, Andi Kleen wrote:
> Jim Kukunas <james.t.kukunas@linux.intel.com> writes:
>> +
>> +	/* ymm7 = x0f[32] */
>> +	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
>> +
>> +	while (bytes) {
>> +#ifdef CONFIG_X86_64
>> +		asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
>> +		asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
>> +		asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
>> +		asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));
> 
> It is somewhat dangerous to assume that registers do not get changed
> between assembler statements, or that assembler statements do not get
> reordered. Better to always put such values into explicit variables or
> merge them into a single asm statement.
> 
> asm volatile is also not enough to prevent reordering. If anything
> you would need a memory clobber.
> 

The code is compiled so that the xmm/ymm registers are not available to
the compiler.  Do you have any known examples of asm volatiles being
reordered *with respect to each other*?  My understanding of gcc is
that volatile operations are ordered with respect to each other (not
necessarily with respect to non-volatile operations, though.)
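
For reference, the uses in question are always bracketed like this (a sketch
assuming the kernel_fpu_begin()/kernel_fpu_end() wrappers from lib/raid6/x86.h
and placeholder src/dst byte pointers, not code from the patch itself):

	kernel_fpu_begin();	/* saves FPU/SIMD state, disables preemption */
	asm volatile("vmovdqa %0, %%ymm0" : : "m" (src[0]));
	asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dst[0]));
	asm volatile("vmovdqa %%ymm0, %0" : "=m" (dst[0]));
	kernel_fpu_end();	/* restores FPU/SIMD state */

and the kernel itself is built without compiler-generated SSE/AVX code, so
gcc never has uses of these registers of its own to schedule in between.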

Either way, this implementation technique has been used for the MMX/SSE
implementations without any problems for 9 years now.

	-hpa




* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-29 21:13   ` H. Peter Anvin
@ 2012-11-29 21:18     ` Andi Kleen
  2012-11-29 22:27       ` H. Peter Anvin
  2012-11-29 22:28       ` H. Peter Anvin
  0 siblings, 2 replies; 13+ messages in thread
From: Andi Kleen @ 2012-11-29 21:18 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Jim Kukunas, Linux Raid, Linux Kernel, Neil Brown

> The code is compiled so that the xmm/ymm registers are not available to
> the compiler.  Do you have any known examples of asm volatiles being
> reordered *with respect to each other*?  My understanding of gcc is
> that volatile operations are ordered with respect to each other (not
> necessarily with respect to non-volatile operations, though.)

Can you quote it from the manual? As I understand it, volatile, as usual,
is not clearly defined.

gcc has a lot of optimization passes and volatile bugs are common.


>> Either way, this implementation technique has been used for the MMX/SSE
>> implementations without any problems for 9 years now.

It's still wrong.

Lying to the compiler usually bites you at some point.

-Andi


* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-29 21:18     ` Andi Kleen
@ 2012-11-29 22:27       ` H. Peter Anvin
  2012-11-29 22:28       ` H. Peter Anvin
  1 sibling, 0 replies; 13+ messages in thread
From: H. Peter Anvin @ 2012-11-29 22:27 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Jim Kukunas, Linux Raid, Linux Kernel, Neil Brown

On 11/29/2012 01:18 PM, Andi Kleen wrote:
> 
> Can you quote it from the manual? As I understand it, volatile, as usual,
> is not clearly defined.
> 
> gcc has a lot of optimization passes and volatile bugs are common.
> 

I talked it over with H.J. and we walked through the code.  The
documentation is lacking, of course, as is common with gcc.

However, that piece was pretty clear.

	-hpa



* Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions
  2012-11-29 21:18     ` Andi Kleen
  2012-11-29 22:27       ` H. Peter Anvin
@ 2012-11-29 22:28       ` H. Peter Anvin
  1 sibling, 0 replies; 13+ messages in thread
From: H. Peter Anvin @ 2012-11-29 22:28 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Jim Kukunas, Linux Raid, Linux Kernel, Neil Brown

On 11/29/2012 01:18 PM, Andi Kleen wrote:
> 
>> Either way, this implementation technique has been used for the MMX/SSE
>> implementations without any problems for 9 years now.
> 
> It's still wrong.
> 
> Lying to the compiler usually bites you at some point.
> 

It's not lying to the compiler.  It is telling the compiler "do
something that you have no way of knowing about".

It would be lying to the compiler if the compiler were allowed to touch
the FPU state, but it's not.

	-hpa



