* arch/x86: AVX RAID5 xor checksumming
From: Jim Kukunas @ 2012-04-10 17:22 UTC
To: x86; +Cc: hpa, neilb, linux-kernel, mingo, tglx

Hi Folks,

The following patch adds an AVX implementation of the RAID5 xor
checksumming functions.

Based on xor_speed, the AVX implementation appears to be ~32% faster
than the SSE implementation on my i7 2600:

    generic_sse: 15088.000 MB/sec
    avx:         19936.000 MB/sec

Thanks.
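[For context on where the speedup comes from: an SSE XMM register is 128 bits wide, while an AVX YMM register is 256 bits, so each AVX instruction loads, xors, and stores twice as much data. Below is a minimal userspace sketch of the same 256-bit-wide XOR using compiler intrinsics -- a hypothetical illustration only; the function name is invented here, and the actual patch that follows uses inline asm and must save/restore YMM state by hand.]

    /* Hypothetical userspace sketch -- not code from the patch. */
    #include <immintrin.h>  /* AVX intrinsics */
    #include <stddef.h>

    /* XOR 'bytes' bytes of src into dst, one 32-byte YMM register at a
     * time. Assumes 32-byte-aligned buffers and a multiple-of-32 size. */
    static void xor_avx_sketch(size_t bytes, unsigned char *dst,
                               const unsigned char *src)
    {
            for (size_t i = 0; i < bytes; i += 32) {
                    __m256 a = _mm256_load_ps((const float *)(dst + i));
                    __m256 b = _mm256_load_ps((const float *)(src + i));
                    /* vxorps: bitwise XOR of two 256-bit registers */
                    _mm256_store_ps((float *)(dst + i),
                                    _mm256_xor_ps(a, b));
            }
    }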
* [PATCH] raid5: add AVX optimized RAID5 checksumming
From: Jim Kukunas @ 2012-04-10 17:22 UTC
To: x86; +Cc: hpa, neilb, linux-kernel, mingo, tglx

Optimize RAID5 xor checksumming by taking advantage of the 256-bit YMM
registers introduced in AVX.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Reviewed-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: NeilBrown <neilb@suse.de>
---
 arch/x86/include/asm/xor_32.h  |    8 ++-
 arch/x86/include/asm/xor_64.h  |   10 ++-
 arch/x86/include/asm/xor_avx.h |  184 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 200 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/xor_avx.h

diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a..1799baa 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,8 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +888,7 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	(cpu_has_avx ? &xor_block_avx : \
+	 cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e..d331b41 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,23 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : \
+	 &xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 0000000..dda165b
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <asm/i387.h>
+
+#define ALIGN32 __attribute__((aligned(32)))
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+	BLOCK(32 * i, 0) \
+	BLOCK(32 * (i + 1), 1) \
+	BLOCK(32 * (i + 2), 2) \
+	BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+	BLOCK4(0) \
+	BLOCK4(4) \
+	BLOCK4(8) \
+	BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (p0[i / sizeof(*p0)]));
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#endif
-- 
1.7.8.5
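[A note on the YMMS_SAVE/YMMS_RESTORE pairs above: kernel code cannot clobber SIMD registers at will, since they may hold user-space state. The patch therefore disables preemption, clears CR0.TS with clts(), and manually spills and restores only the four YMM registers it uses, avoiding a full state save. Later kernels reworked this file to use the generic FPU helpers instead; a hedged sketch of that shape is below -- details vary by kernel version, and the loop body is elided.]

    #include <asm/i387.h>   /* kernel_fpu_begin()/kernel_fpu_end() */

    static void xor_avx_2_alt(unsigned long bytes, unsigned long *p0,
                              unsigned long *p1)
    {
            /* Disables preemption and saves user FPU/SIMD state so the
             * kernel may freely use the YMM registers. */
            kernel_fpu_begin();
            /* ... same vmovdqa/vxorps/vmovdqa loop as in the patch ... */
            kernel_fpu_end();
    }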
* Re: arch/x86: AVX RAID5 xor checksumming
From: H. Peter Anvin @ 2012-04-10 17:37 UTC
To: Jim Kukunas; +Cc: x86, neilb, linux-kernel, mingo, tglx, linux-raid

On 04/10/2012 10:22 AM, Jim Kukunas wrote:
> Hi Folks,
>
> The following patch adds an AVX implementation of the RAID5 xor
> checksumming functions.
>
> Based on xor_speed, the AVX implementation appears to be ~32% faster
> than the SSE implementation on my i7 2600:
>
>     generic_sse: 15088.000 MB/sec
>     avx:         19936.000 MB/sec
>
> Thanks.

I think this (and the SSSE3 patches for RAID-6) should go in via the
RAID tree.  Neil, do you agree?

However,

Acked-by: H. Peter Anvin <hpa@zytor.com>

	-hpa
* Re: arch/x86: AVX RAID5 xor checksumming
From: NeilBrown @ 2012-04-10 20:14 UTC
To: H. Peter Anvin; +Cc: Jim Kukunas, x86, linux-kernel, mingo, tglx, linux-raid

On Tue, 10 Apr 2012 10:37:00 -0700 "H. Peter Anvin" <hpa@zytor.com> wrote:

> On 04/10/2012 10:22 AM, Jim Kukunas wrote:
> > Hi Folks,
> >
> > The following patch adds an AVX implementation of the RAID5 xor
> > checksumming functions.
> >
> > Based on xor_speed, the AVX implementation appears to be ~32% faster
> > than the SSE implementation on my i7 2600:
> >
> >     generic_sse: 15088.000 MB/sec
> >     avx:         19936.000 MB/sec
> >
> > Thanks.
>
> I think this (and the SSSE3 patches for RAID-6) should go in via the
> RAID tree.  Neil, do you agree?

I'm happy to take the SSSE3 RAID-6 patches as they only affect lib/raid6, but
I felt that this one - being purely in arch/x86 - should probably go via the
x86 tree.
However I'm flexible, and if you (collectively) are happy with these patches
going in through my tree, I'll do that.

Thanks,
NeilBrown

> However,
>
> Acked-by: H. Peter Anvin <hpa@zytor.com>
>
> -hpa
* Re: arch/x86: AVX RAID5 xor checksumming
From: H. Peter Anvin @ 2012-04-10 21:20 UTC
To: NeilBrown; +Cc: Jim Kukunas, x86, linux-kernel, mingo, tglx, linux-raid

On 04/10/2012 01:14 PM, NeilBrown wrote:
>>
>> I think this (and the SSSE3 patches for RAID-6) should go in via the
>> RAID tree.  Neil, do you agree?
>
> I'm happy to take the SSSE3 RAID-6 patches as they only affect lib/raid6, but
> I felt that this one - being purely in arch/x86 - should probably go via the
> x86 tree.
> However I'm flexible, and if you (collectively) are happy with these patches
> going in through my tree, I'll do that.

Go ahead and take them with my Acked-by: -- I think that's easier.

	-hpa
* Re: arch/x86: AVX RAID5 xor checksumming
From: Ingo Molnar @ 2012-04-14 12:02 UTC
To: H. Peter Anvin; +Cc: NeilBrown, Jim Kukunas, x86, linux-kernel, mingo, tglx, linux-raid

* H. Peter Anvin <hpa@zytor.com> wrote:

> On 04/10/2012 01:14 PM, NeilBrown wrote:
> >>
> >> I think this (and the SSSE3 patches for RAID-6) should go in via the
> >> RAID tree.  Neil, do you agree?
> >
> > I'm happy to take the SSSE3 RAID-6 patches as they only affect lib/raid6, but
> > I felt that this one - being purely in arch/x86 - should probably go via the
> > x86 tree.
> > However I'm flexible, and if you (collectively) are happy with these patches
> > going in through my tree, I'll do that.
>
> Go ahead and take them with my Acked-by: -- I think that's easier.

Yeah - and x86/include/asm/xor*.h should probably be renamed raid_xor*.h
or so, to make this really obvious to do in the future.

Thanks,

	Ingo
* arch/x86: AVX RAID5 xor checksumming v1
From: Jim Kukunas @ 2012-04-18 22:58 UTC
To: neilb; +Cc: hpa, linux-kernel, linux-raid

Hi Folks,

The following patch adds an AVX implementation of the RAID5 xor
checksumming functions. This version differs from version 0 in that it:

	0) checks whether the assembler supports AVX
	1) replaces __attribute__((aligned(32))) with __aligned(32)
	2) adds do { ... } while (0) around multi-statement macro blocks
	   (illustrated after this message)

Based on xor_speed, the AVX implementation appears to be ~32% faster
than the SSE implementation on my i7 2600:

    generic_sse: 15088.000 MB/sec
    avx:         19936.000 MB/sec

Thanks.
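[On point 2: a multi-statement macro that is not wrapped in do { ... } while (0) expands to several statements, and only the first one lands under an un-braced if. A hypothetical illustration follows; stmt_a/stmt_b are placeholder names, not identifiers from the patch.]

    extern void stmt_a(int), stmt_b(int);

    /* Unsafe: two statements; only the first is guarded by 'if'. */
    #define BAD_BLOCK(x)    stmt_a(x); stmt_b(x)

    /* Safe: acts as one statement and still consumes a trailing ';'. */
    #define GOOD_BLOCK(x)   do { stmt_a(x); stmt_b(x); } while (0)

    void example(int flag, int x)
    {
            if (flag)
                    BAD_BLOCK(x);   /* bug: stmt_b(x) runs even when !flag */
            if (flag)
                    GOOD_BLOCK(x);  /* both statements correctly guarded */
    }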
* [PATCH] raid5: add AVX optimized RAID5 checksumming
From: Jim Kukunas @ 2012-04-18 22:58 UTC
To: neilb; +Cc: hpa, linux-kernel, linux-raid

Optimize RAID5 xor checksumming by taking advantage of the 256-bit YMM
registers introduced in AVX.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
---
 arch/x86/Makefile              |    5 +-
 arch/x86/include/asm/xor_32.h  |    6 +-
 arch/x86/include/asm/xor_64.h  |    8 ++-
 arch/x86/include/asm/xor_avx.h |  215 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 230 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/include/asm/xor_avx.h

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 209ba12..803c76d 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -99,9 +99,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a..4545708 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	AVX_XOR_SPEED; \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e..b9b2323 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	AVX_XOR_SPEED; \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 0000000..a2c9aa5
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,215 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+	BLOCK(32 * i, 0) \
+	BLOCK(32 * (i + 1), 1) \
+	BLOCK(32 * (i + 2), 2) \
+	BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+	BLOCK4(0) \
+	BLOCK4(4) \
+	BLOCK4(8) \
+	BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
-- 
1.7.8.5
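[Both versions of the patch gate the fast path on cpu_has_avx. In the kernel that is normally sufficient, since the kernel itself enables YMM state saving (OSXSAVE/XCR0) at boot on capable hardware; user-space code probing for AVX must additionally verify that the OS did so. A hedged userspace sketch of the complete check follows, using GCC/Clang builtins -- an illustration only, not part of the patch.]

    #include <stdbool.h>
    #include <cpuid.h>      /* __get_cpuid() on GCC/Clang */

    static bool cpu_supports_avx(void)
    {
            unsigned int eax, ebx, ecx, edx, xcr0_lo, xcr0_hi;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return false;
            /* CPUID.1:ECX bit 27 = OSXSAVE, bit 28 = AVX */
            if (!(ecx & (1u << 27)) || !(ecx & (1u << 28)))
                    return false;
            /* XGETBV(0): XCR0 bits 1 (SSE) and 2 (AVX) must be OS-enabled */
            asm volatile("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
            return (xcr0_lo & 0x6) == 0x6;
    }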
end of thread, other threads:[~2012-04-18 22:58 UTC | newest]

Thread overview: 8+ messages
2012-04-10 17:22 arch/x86: AVX RAID5 xor checksumming  Jim Kukunas
2012-04-10 17:22 ` [PATCH] raid5: add AVX optimized RAID5 checksumming  Jim Kukunas
2012-04-10 17:37 ` arch/x86: AVX RAID5 xor checksumming  H. Peter Anvin
2012-04-10 20:14   ` NeilBrown
2012-04-10 21:20     ` H. Peter Anvin
2012-04-14 12:02       ` Ingo Molnar
  -- strict thread matches above, loose matches on Subject: below --
2012-04-18 22:58 arch/x86: AVX RAID5 xor checksumming v1  Jim Kukunas
2012-04-18 22:58 ` [PATCH] raid5: add AVX optimized RAID5 checksumming  Jim Kukunas