From: "Jason A. Donenfeld" <Jason@zx2c4.com>
To: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
davem@davemloft.net
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>,
Andy Lutomirski <luto@kernel.org>,
Greg KH <gregkh@linuxfoundation.org>,
Samuel Neves <sneves@dei.uc.pt>,
Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>,
Andy Polyakov <appro@openssl.org>,
Thomas Gleixner <tglx@linutronix.de>,
Ingo Molnar <mingo@redhat.com>,
x86@kernel.org, linux-crypto@vger.kernel.org
Subject: [PATCH v2 05/17] zinc: ChaCha20 x86_64 implementation
Date: Fri, 24 Aug 2018 15:38:37 -0600 [thread overview]
Message-ID: <20180824213849.23647-6-Jason@zx2c4.com> (raw)
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>
This provides SSSE3, AVX-2, AVX-512F, and AVX-512VL implementations for
ChaCha20. The AVX-512F implementation is disabled on Skylake, due to
throttling, and the VL ymm implementation is used instead. These come
from Andy Polyakov's implementation, with some heavy modifications from
Samuel Neves and me.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
Cc: linux-crypto@vger.kernel.org
---
lib/zinc/Makefile | 4 +
lib/zinc/chacha20/chacha20-x86_64-glue.h | 84 +
lib/zinc/chacha20/chacha20-x86_64.S | 2630 ++++++++++++++++++++++
3 files changed, 2718 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-x86_64-glue.h
create mode 100644 lib/zinc/chacha20/chacha20-x86_64.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 87be627f3354..fc6ca1c84cd1 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -4,6 +4,10 @@ ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
ifeq ($(CONFIG_ZINC_CHACHA20),y)
zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_X86_64),y)
+zinc-y += chacha20/chacha20-x86_64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-x86_64-glue.h
+endif
ifeq ($(CONFIG_ARM),y)
zinc-y += chacha20/chacha20-arm.o
CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
diff --git a/lib/zinc/chacha20/chacha20-x86_64-glue.h b/lib/zinc/chacha20/chacha20-x86_64-glue.h
new file mode 100644
index 000000000000..ece8279e93e0
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64-glue.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+#ifdef CONFIG_AS_SSSE3
+asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key);
+asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_ssse3 __ro_after_init;
+static bool chacha20_use_avx2 __ro_after_init;
+static bool chacha20_use_avx512 __ro_after_init;
+static bool chacha20_use_avx512vl __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+#ifndef CONFIG_UML
+ chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
+ chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ chacha20_use_avx512 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+ chacha20_use_avx512vl = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL);
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, const u32 key[8], const u32 counter[4], simd_context_t simd_context)
+{
+ if (simd_context != HAVE_FULL_SIMD)
+ return false;
+
+#ifdef CONFIG_AS_AVX512
+ if (chacha20_use_avx512) {
+ chacha20_avx512(dst, src, len, key, counter);
+ return true;
+ }
+ if (chacha20_use_avx512vl) {
+ chacha20_avx512vl(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (chacha20_use_avx2) {
+ chacha20_avx2(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_SSSE3
+ if (chacha20_use_ssse3) {
+ chacha20_ssse3(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, const u8 *key, simd_context_t simd_context)
+{
+#if defined(CONFIG_AS_SSSE3)
+ if (simd_context == HAVE_FULL_SIMD && chacha20_use_ssse3) {
+ hchacha20_ssse3(derived_key, nonce, key);
+ return true;
+ }
+#endif
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-x86_64.S b/lib/zinc/chacha20/chacha20-x86_64.S
new file mode 100644
index 000000000000..39883f3718b6
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64.S
@@ -0,0 +1,2630 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.Lzero, "aM", @progbits, 16
+.align 16
+.Lzero:
+.long 0,0,0,0
+.section .rodata.cst16.Lone, "aM", @progbits, 16
+.align 16
+.Lone:
+.long 1,0,0,0
+.section .rodata.cst16.Linc, "aM", @progbits, 16
+.align 16
+.Linc:
+.long 0,1,2,3
+.section .rodata.cst16.Lfour, "aM", @progbits, 16
+.align 16
+.Lfour:
+.long 4,4,4,4
+.section .rodata.cst32.Lincy, "aM", @progbits, 32
+.align 32
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.section .rodata.cst32.Leight, "aM", @progbits, 32
+.align 32
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.section .rodata.cst16.Lrot16, "aM", @progbits, 16
+.align 16
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.section .rodata.cst16.Lrot24, "aM", @progbits, 16
+.align 16
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.section .rodata.cst16.Lsigma, "aM", @progbits, 16
+.align 16
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.section .rodata.cst64.Lfourz, "aM", @progbits, 64
+.align 64
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.section .rodata.cst64.Lincz, "aM", @progbits, 64
+.align 64
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
+.align 64
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
+.align 64
+.Ltwoy:
+.long 2,0,0,0, 2,0,0,0
+
+.text
+
+#ifdef CONFIG_AS_SSSE3
+.align 32
+ENTRY(hchacha20_ssse3)
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rdx),%xmm1
+ movdqu 16(%rdx),%xmm2
+ movdqu (%rsi),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+ movq $10,%r8
+ .align 32
+.Loop_hssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_hssse3
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ ret
+ENDPROC(hchacha20_ssse3)
+
+.align 32
+ENTRY(chacha20_ssse3)
+.Lchacha20_ssse3:
+ cmpq $0,%rdx
+ je .Lssse3_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_4x
+
+.Ldo_sse3_after_all:
+ subq $64+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ leaq -8(%r10),%rsp
+
+.Lssse3_epilogue:
+ ret
+
+.align 32
+.Lchacha20_4x:
+ leaq 8(%rsp),%r10
+
+.Lproceed4x:
+ subq $0x140+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r9),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm7,%xmm0
+ pshufb %xmm7,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm7,%xmm2
+ pshufb %xmm7,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm6,%xmm2
+ pshufb %xmm6,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm7,%xmm3
+ pshufb %xmm7,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm6,%xmm3
+ pshufb %xmm6,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm7,%xmm1
+ pshufb %xmm7,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r9,%r9
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ leaq -8(%r10),%rsp
+
+.L4x_epilogue:
+ ret
+ENDPROC(chacha20_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX2
+.align 32
+ENTRY(chacha20_avx2)
+.Lchacha20_avx2:
+ cmpq $0,%rdx
+ je .L8x_epilogue
+ leaq 8(%rsp),%r10
+
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r9),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r9,%r9
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L8x_epilogue:
+ ret
+ENDPROC(chacha20_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align 32
+ENTRY(chacha20_avx512)
+.Lchacha20_avx512:
+ cmpq $0,%rdx
+ je .Lavx512_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $512,%rdx
+ ja .Lchacha20_16x
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti32x4 .Lsigma(%rip),%zmm0
+ vbroadcasti32x4 (%rcx),%zmm1
+ vbroadcasti32x4 16(%rcx),%zmm2
+ vbroadcasti32x4 (%r8),%zmm3
+
+ vmovdqa32 %zmm0,%zmm16
+ vmovdqa32 %zmm1,%zmm17
+ vmovdqa32 %zmm2,%zmm18
+ vpaddd .Lzeroz(%rip),%zmm3,%zmm3
+ vmovdqa32 .Lfourz(%rip),%zmm20
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 16
+.Loop_outer_avx512:
+ vmovdqa32 %zmm16,%zmm0
+ vmovdqa32 %zmm17,%zmm1
+ vmovdqa32 %zmm18,%zmm2
+ vpaddd %zmm20,%zmm19,%zmm3
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $57,%zmm1,%zmm1
+ vpshufd $147,%zmm3,%zmm3
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $147,%zmm1,%zmm1
+ vpshufd $57,%zmm3,%zmm3
+ decq %r8
+ jnz .Loop_avx512
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $1,%zmm0,%xmm4
+ vextracti32x4 $1,%zmm1,%xmm5
+ vextracti32x4 $1,%zmm2,%xmm6
+ vextracti32x4 $1,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $2,%zmm0,%xmm4
+ vextracti32x4 $2,%zmm1,%xmm5
+ vextracti32x4 $2,%zmm2,%xmm6
+ vextracti32x4 $2,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $3,%zmm0,%xmm4
+ vextracti32x4 $3,%zmm1,%xmm5
+ vextracti32x4 $3,%zmm2,%xmm6
+ vextracti32x4 $3,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jnz .Loop_outer_avx512
+
+ jmp .Ldone_avx512
+
+.align 16
+.Ltail64_avx512:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512
+
+.align 16
+.Ltail_avx512:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512
+
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone_avx512:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.Lavx512_epilogue:
+ ret
+
+.align 32
+.Lchacha20_16x:
+ leaq 8(%rsp),%r10
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti32x4 (%r9),%zmm3
+ vbroadcasti32x4 (%rcx),%zmm7
+ vbroadcasti32x4 16(%rcx),%zmm11
+ vbroadcasti32x4 (%r8),%zmm15
+
+ vpshufd $0x00,%zmm3,%zmm0
+ vpshufd $0x55,%zmm3,%zmm1
+ vpshufd $0xaa,%zmm3,%zmm2
+ vpshufd $0xff,%zmm3,%zmm3
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ vpshufd $0x00,%zmm7,%zmm4
+ vpshufd $0x55,%zmm7,%zmm5
+ vpshufd $0xaa,%zmm7,%zmm6
+ vpshufd $0xff,%zmm7,%zmm7
+ vmovdqa64 %zmm4,%zmm20
+ vmovdqa64 %zmm5,%zmm21
+ vmovdqa64 %zmm6,%zmm22
+ vmovdqa64 %zmm7,%zmm23
+
+ vpshufd $0x00,%zmm11,%zmm8
+ vpshufd $0x55,%zmm11,%zmm9
+ vpshufd $0xaa,%zmm11,%zmm10
+ vpshufd $0xff,%zmm11,%zmm11
+ vmovdqa64 %zmm8,%zmm24
+ vmovdqa64 %zmm9,%zmm25
+ vmovdqa64 %zmm10,%zmm26
+ vmovdqa64 %zmm11,%zmm27
+
+ vpshufd $0x00,%zmm15,%zmm12
+ vpshufd $0x55,%zmm15,%zmm13
+ vpshufd $0xaa,%zmm15,%zmm14
+ vpshufd $0xff,%zmm15,%zmm15
+ vpaddd .Lincz(%rip),%zmm12,%zmm12
+ vmovdqa64 %zmm12,%zmm28
+ vmovdqa64 %zmm13,%zmm29
+ vmovdqa64 %zmm14,%zmm30
+ vmovdqa64 %zmm15,%zmm31
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop_outer16x:
+ vpbroadcastd 0(%r9),%zmm0
+ vpbroadcastd 4(%r9),%zmm1
+ vpbroadcastd 8(%r9),%zmm2
+ vpbroadcastd 12(%r9),%zmm3
+ vpaddd .Lsixteen(%rip),%zmm28,%zmm28
+ vmovdqa64 %zmm20,%zmm4
+ vmovdqa64 %zmm21,%zmm5
+ vmovdqa64 %zmm22,%zmm6
+ vmovdqa64 %zmm23,%zmm7
+ vmovdqa64 %zmm24,%zmm8
+ vmovdqa64 %zmm25,%zmm9
+ vmovdqa64 %zmm26,%zmm10
+ vmovdqa64 %zmm27,%zmm11
+ vmovdqa64 %zmm28,%zmm12
+ vmovdqa64 %zmm29,%zmm13
+ vmovdqa64 %zmm30,%zmm14
+ vmovdqa64 %zmm31,%zmm15
+
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop16x:
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ decl %eax
+ jnz .Loop16x
+
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ vpunpckldq %zmm1,%zmm0,%zmm18
+ vpunpckldq %zmm3,%zmm2,%zmm19
+ vpunpckhdq %zmm1,%zmm0,%zmm0
+ vpunpckhdq %zmm3,%zmm2,%zmm2
+ vpunpcklqdq %zmm19,%zmm18,%zmm1
+ vpunpckhqdq %zmm19,%zmm18,%zmm18
+ vpunpcklqdq %zmm2,%zmm0,%zmm3
+ vpunpckhqdq %zmm2,%zmm0,%zmm0
+ vpaddd %zmm20,%zmm4,%zmm4
+ vpaddd %zmm21,%zmm5,%zmm5
+ vpaddd %zmm22,%zmm6,%zmm6
+ vpaddd %zmm23,%zmm7,%zmm7
+
+ vpunpckldq %zmm5,%zmm4,%zmm2
+ vpunpckldq %zmm7,%zmm6,%zmm19
+ vpunpckhdq %zmm5,%zmm4,%zmm4
+ vpunpckhdq %zmm7,%zmm6,%zmm6
+ vpunpcklqdq %zmm19,%zmm2,%zmm5
+ vpunpckhqdq %zmm19,%zmm2,%zmm2
+ vpunpcklqdq %zmm6,%zmm4,%zmm7
+ vpunpckhqdq %zmm6,%zmm4,%zmm4
+ vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19
+ vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
+ vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
+ vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
+ vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
+ vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
+ vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
+ vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
+ vpaddd %zmm24,%zmm8,%zmm8
+ vpaddd %zmm25,%zmm9,%zmm9
+ vpaddd %zmm26,%zmm10,%zmm10
+ vpaddd %zmm27,%zmm11,%zmm11
+
+ vpunpckldq %zmm9,%zmm8,%zmm6
+ vpunpckldq %zmm11,%zmm10,%zmm0
+ vpunpckhdq %zmm9,%zmm8,%zmm8
+ vpunpckhdq %zmm11,%zmm10,%zmm10
+ vpunpcklqdq %zmm0,%zmm6,%zmm9
+ vpunpckhqdq %zmm0,%zmm6,%zmm6
+ vpunpcklqdq %zmm10,%zmm8,%zmm11
+ vpunpckhqdq %zmm10,%zmm8,%zmm8
+ vpaddd %zmm28,%zmm12,%zmm12
+ vpaddd %zmm29,%zmm13,%zmm13
+ vpaddd %zmm30,%zmm14,%zmm14
+ vpaddd %zmm31,%zmm15,%zmm15
+
+ vpunpckldq %zmm13,%zmm12,%zmm10
+ vpunpckldq %zmm15,%zmm14,%zmm0
+ vpunpckhdq %zmm13,%zmm12,%zmm12
+ vpunpckhdq %zmm15,%zmm14,%zmm14
+ vpunpcklqdq %zmm0,%zmm10,%zmm13
+ vpunpckhqdq %zmm0,%zmm10,%zmm10
+ vpunpcklqdq %zmm14,%zmm12,%zmm15
+ vpunpckhqdq %zmm14,%zmm12,%zmm12
+ vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0
+ vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
+ vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
+ vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
+ vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
+ vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
+ vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
+ vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
+ vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16
+ vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
+ vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
+ vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
+ vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
+ vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
+ vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
+ vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
+ vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
+ vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
+ vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
+ vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
+ vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
+ vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
+ vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
+ vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
+ cmpq $1024,%rdx
+ jb .Ltail16x
+
+ vpxord 0(%rsi),%zmm16,%zmm16
+ vpxord 64(%rsi),%zmm17,%zmm17
+ vpxord 128(%rsi),%zmm14,%zmm14
+ vpxord 192(%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm16,0(%rdi)
+ vmovdqu32 %zmm17,64(%rdi)
+ vmovdqu32 %zmm14,128(%rdi)
+ vmovdqu32 %zmm8,192(%rdi)
+
+ vpxord 256(%rsi),%zmm19,%zmm19
+ vpxord 320(%rsi),%zmm1,%zmm1
+ vpxord 384(%rsi),%zmm18,%zmm18
+ vpxord 448(%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm19,256(%rdi)
+ vmovdqu32 %zmm1,320(%rdi)
+ vmovdqu32 %zmm18,384(%rdi)
+ vmovdqu32 %zmm3,448(%rdi)
+
+ vpxord 512(%rsi),%zmm0,%zmm0
+ vpxord 576(%rsi),%zmm9,%zmm9
+ vpxord 640(%rsi),%zmm6,%zmm6
+ vpxord 704(%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm0,512(%rdi)
+ vmovdqu32 %zmm9,576(%rdi)
+ vmovdqu32 %zmm6,640(%rdi)
+ vmovdqu32 %zmm11,704(%rdi)
+
+ vpxord 768(%rsi),%zmm13,%zmm13
+ vpxord 832(%rsi),%zmm10,%zmm10
+ vpxord 896(%rsi),%zmm15,%zmm15
+ vpxord 960(%rsi),%zmm12,%zmm12
+ leaq 1024(%rsi),%rsi
+ vmovdqu32 %zmm13,768(%rdi)
+ vmovdqu32 %zmm10,832(%rdi)
+ vmovdqu32 %zmm15,896(%rdi)
+ vmovdqu32 %zmm12,960(%rdi)
+ leaq 1024(%rdi),%rdi
+
+ subq $1024,%rdx
+ jnz .Loop_outer16x
+
+ jmp .Ldone16x
+
+.align 32
+.Ltail16x:
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm16,%zmm16
+ vmovdqu32 %zmm16,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm17,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm17,%zmm17
+ vmovdqu32 %zmm17,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm14,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm14,%zmm14
+ vmovdqu32 %zmm14,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm8,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm8,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm19,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm19,%zmm19
+ vmovdqu32 %zmm19,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm1,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm1,%zmm1
+ vmovdqu32 %zmm1,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm18,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm18,%zmm18
+ vmovdqu32 %zmm18,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm3,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $512,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm3,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm0,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $576,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm0,%zmm0
+ vmovdqu32 %zmm0,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm9,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $640,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm9,%zmm9
+ vmovdqu32 %zmm9,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm6,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $704,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm6,%zmm6
+ vmovdqu32 %zmm6,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm11,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $768,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm11,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm13,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $832,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm13,%zmm13
+ vmovdqu32 %zmm13,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm10,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $896,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm10,%zmm10
+ vmovdqu32 %zmm10,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm15,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $960,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm15,%zmm15
+ vmovdqu32 %zmm15,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm12,%zmm16
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_16x:
+ vmovdqa32 %zmm16,0(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail16x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail16x
+
+ vpxord %zmm16,%zmm16,%zmm16
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone16x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L16x_epilogue:
+ ret
+ENDPROC(chacha20_avx512)
+
+.align 32
+ENTRY(chacha20_avx512vl)
+ cmpq $0,%rdx
+ je .Lavx512vl_epilogue
+
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_8xvl
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti128 .Lsigma(%rip),%ymm0
+ vbroadcasti128 (%rcx),%ymm1
+ vbroadcasti128 16(%rcx),%ymm2
+ vbroadcasti128 (%r8),%ymm3
+
+ vmovdqa32 %ymm0,%ymm16
+ vmovdqa32 %ymm1,%ymm17
+ vmovdqa32 %ymm2,%ymm18
+ vpaddd .Lzeroz(%rip),%ymm3,%ymm3
+ vmovdqa32 .Ltwoy(%rip),%ymm20
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 16
+.Loop_outer_avx512vl:
+ vmovdqa32 %ymm18,%ymm2
+ vpaddd %ymm20,%ymm19,%ymm3
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 32
+.Loop_avx512vl:
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $57,%ymm1,%ymm1
+ vpshufd $147,%ymm3,%ymm3
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $147,%ymm1,%ymm1
+ vpshufd $57,%ymm3,%ymm3
+ decq %r8
+ jnz .Loop_avx512vl
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512vl
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512vl
+
+ vextracti128 $1,%ymm0,%xmm4
+ vextracti128 $1,%ymm1,%xmm5
+ vextracti128 $1,%ymm2,%xmm6
+ vextracti128 $1,%ymm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512vl
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ vmovdqa32 %ymm16,%ymm0
+ vmovdqa32 %ymm17,%ymm1
+ jnz .Loop_outer_avx512vl
+
+ jmp .Ldone_avx512vl
+
+.align 16
+.Ltail64_avx512vl:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512vl
+
+.align 16
+.Ltail_avx512vl:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512vl:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512vl
+
+ vmovdqa32 %ymm16,0(%rsp)
+ vmovdqa32 %ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.Lavx512vl_epilogue:
+ ret
+
+.align 32
+.Lchacha20_8xvl:
+ leaq 8(%rsp),%r10
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti128 (%r9),%ymm3
+ vbroadcasti128 (%rcx),%ymm7
+ vbroadcasti128 16(%rcx),%ymm11
+ vbroadcasti128 (%r8),%ymm15
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vpshufd $0xaa,%ymm3,%ymm2
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpshufd $0xaa,%ymm7,%ymm6
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vpshufd $0xaa,%ymm11,%ymm10
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vpshufd $0xaa,%ymm15,%ymm14
+ vpshufd $0xff,%ymm15,%ymm15
+ vpaddd .Lincy(%rip),%ymm12,%ymm12
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop_outer8xvl:
+
+
+ vpbroadcastd 8(%r9),%ymm2
+ vpbroadcastd 12(%r9),%ymm3
+ vpaddd .Leight(%rip),%ymm28,%ymm28
+ vmovdqa64 %ymm20,%ymm4
+ vmovdqa64 %ymm21,%ymm5
+ vmovdqa64 %ymm22,%ymm6
+ vmovdqa64 %ymm23,%ymm7
+ vmovdqa64 %ymm24,%ymm8
+ vmovdqa64 %ymm25,%ymm9
+ vmovdqa64 %ymm26,%ymm10
+ vmovdqa64 %ymm27,%ymm11
+ vmovdqa64 %ymm28,%ymm12
+ vmovdqa64 %ymm29,%ymm13
+ vmovdqa64 %ymm30,%ymm14
+ vmovdqa64 %ymm31,%ymm15
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop8xvl:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ decl %eax
+ jnz .Loop8xvl
+
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm18
+ vpunpckldq %ymm3,%ymm2,%ymm19
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm19,%ymm18,%ymm1
+ vpunpckhqdq %ymm19,%ymm18,%ymm18
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm2
+ vpunpckldq %ymm7,%ymm6,%ymm19
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm19,%ymm2,%ymm5
+ vpunpckhqdq %ymm19,%ymm2,%ymm2
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vshufi32x4 $0,%ymm5,%ymm1,%ymm19
+ vshufi32x4 $3,%ymm5,%ymm1,%ymm5
+ vshufi32x4 $0,%ymm2,%ymm18,%ymm1
+ vshufi32x4 $3,%ymm2,%ymm18,%ymm2
+ vshufi32x4 $0,%ymm7,%ymm3,%ymm18
+ vshufi32x4 $3,%ymm7,%ymm3,%ymm7
+ vshufi32x4 $0,%ymm4,%ymm0,%ymm3
+ vshufi32x4 $3,%ymm4,%ymm0,%ymm4
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm6
+ vpunpckldq %ymm11,%ymm10,%ymm0
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm0,%ymm6,%ymm9
+ vpunpckhqdq %ymm0,%ymm6,%ymm6
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ vpunpckldq %ymm13,%ymm12,%ymm10
+ vpunpckldq %ymm15,%ymm14,%ymm0
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm0,%ymm10,%ymm13
+ vpunpckhqdq %ymm0,%ymm10,%ymm10
+ vpunpcklqdq %ymm14,%ymm12,%ymm15
+ vpunpckhqdq %ymm14,%ymm12,%ymm12
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+ vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+ cmpq $512,%rdx
+ jb .Ltail8xvl
+
+ movl $0x80,%eax
+ vpxord 0(%rsi),%ymm19,%ymm19
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vpxor 64(%rsi),%ymm5,%ymm5
+ vpxor 96(%rsi),%ymm13,%ymm13
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm19,0(%rdi)
+ vmovdqu %ymm0,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm13,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm2,%ymm2
+ vpxor 96(%rsi),%ymm10,%ymm10
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm1,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm10,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vpxor 64(%rsi),%ymm7,%ymm7
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm18,0(%rdi)
+ vmovdqu %ymm6,32(%rdi)
+ vmovdqu %ymm7,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vpxor 64(%rsi),%ymm4,%ymm4
+ vpxor 96(%rsi),%ymm12,%ymm12
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vmovdqu %ymm12,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpbroadcastd 0(%r9),%ymm0
+ vpbroadcastd 4(%r9),%ymm1
+
+ subq $512,%rdx
+ jnz .Loop_outer8xvl
+
+ jmp .Ldone8xvl
+
+.align 32
+.Ltail8xvl:
+ vmovdqa64 %ymm19,%ymm8
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm8,%ymm8
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vmovdqu %ymm8,0(%rdi,%rsi,1)
+ vmovdqu %ymm0,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm5,%ymm8
+ vmovdqa %ymm13,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm5,%ymm5
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm5,0(%rdi,%rsi,1)
+ vmovdqu %ymm13,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm1,%ymm8
+ vmovdqa %ymm9,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm1,0(%rdi,%rsi,1)
+ vmovdqu %ymm9,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm2,%ymm8
+ vmovdqa %ymm10,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm2,%ymm2
+ vpxor 32(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm2,0(%rdi,%rsi,1)
+ vmovdqu %ymm10,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa32 %ymm18,%ymm8
+ vmovdqa %ymm6,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_8xvl
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vmovdqu32 %ymm18,0(%rdi,%rsi,1)
+ vmovdqu %ymm6,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm7,%ymm8
+ vmovdqa %ymm15,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm7,%ymm7
+ vpxor 32(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm7,0(%rdi,%rsi,1)
+ vmovdqu %ymm15,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm3,%ymm8
+ vmovdqa %ymm11,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi,%rsi,1)
+ vmovdqu %ymm11,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm4,%ymm8
+ vmovdqa %ymm12,%ymm0
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm0,32(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail8xvl:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8xvl
+
+ vpxor %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+
+.Ldone8xvl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.L8xvl_epilogue:
+ ret
+ENDPROC(chacha20_avx512vl)
+
+#endif /* CONFIG_AS_AVX512 */
--
2.18.0
next prev parent reply other threads:[~2018-08-24 21:40 UTC|newest]
Thread overview: 44+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-08-24 21:38 [PATCH v2 00/17] WireGuard: Secure Network Tunnel Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 01/17] asm: simd context helper API Jason A. Donenfeld
2018-08-26 12:10 ` Thomas Gleixner
2018-08-26 13:45 ` Jason A. Donenfeld
2018-08-26 14:06 ` Thomas Gleixner
2018-08-26 14:18 ` Jason A. Donenfeld
2018-08-26 14:25 ` Andy Lutomirski
2018-08-26 14:18 ` Andy Lutomirski
2018-08-26 16:53 ` Rik van Riel
2018-09-01 20:19 ` Jason A. Donenfeld
2018-09-01 20:32 ` Andy Lutomirski
2018-09-01 20:34 ` Jason A. Donenfeld
2018-09-06 13:42 ` Thomas Gleixner
2018-09-06 15:52 ` Jason A. Donenfeld
2018-08-27 19:50 ` Palmer Dabbelt
2018-08-24 21:38 ` [PATCH v2 02/17] zinc: introduce minimal cryptography library Jason A. Donenfeld
2018-08-25 6:29 ` Eric Biggers
2018-08-25 16:16 ` Andrew Lunn
2018-08-25 16:40 ` Jason A. Donenfeld
2018-08-25 17:26 ` Andrew Lunn
2018-08-26 15:59 ` Jason A. Donenfeld
2018-08-25 10:17 ` Ard Biesheuvel
2018-08-25 17:06 ` Jason A. Donenfeld
2018-08-25 17:17 ` Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 03/17] zinc: ChaCha20 generic C implementation Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 04/17] zinc: ChaCha20 ARM and ARM64 implementations Jason A. Donenfeld
2018-08-24 21:38 ` Jason A. Donenfeld [this message]
2018-08-24 21:38 ` [PATCH v2 06/17] zinc: ChaCha20 MIPS32r2 implementation Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 07/17] zinc: Poly1305 generic C implementation and selftest Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 08/17] zinc: Poly1305 ARM and ARM64 implementations Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 09/17] zinc: Poly1305 x86_64 implementation Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 10/17] zinc: Poly1305 MIPS32r2 and MIPS64 implementations Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 11/17] zinc: ChaCha20Poly1305 construction and selftest Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 12/17] zinc: BLAKE2s generic C implementation " Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 13/17] zinc: BLAKE2s x86_64 implementation Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 14/17] zinc: Curve25519 generic C implementations and selftest Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 15/17] zinc: Curve25519 ARM implementation Jason A. Donenfeld
2018-08-26 13:18 ` Ard Biesheuvel
2018-08-29 5:06 ` Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 16/17] zinc: Curve25519 x86_64 implementation Jason A. Donenfeld
2018-08-24 21:38 ` [PATCH v2 17/17] net: WireGuard secure network tunnel Jason A. Donenfeld
2018-08-24 23:00 ` Andrew Lunn
2018-08-27 11:13 ` kbuild test robot
2018-08-27 12:52 ` kbuild test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180824213849.23647-6-Jason@zx2c4.com \
--to=jason@zx2c4.com \
--cc=appro@openssl.org \
--cc=davem@davemloft.net \
--cc=gregkh@linuxfoundation.org \
--cc=jeanphilippe.aumasson@gmail.com \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=luto@kernel.org \
--cc=mingo@redhat.com \
--cc=netdev@vger.kernel.org \
--cc=sneves@dei.uc.pt \
--cc=tglx@linutronix.de \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).