From: "Jason A. Donenfeld" <Jason@zx2c4.com>
To: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	davem@davemloft.net, gregkh@linuxfoundation.org
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>,
	Andy Polyakov <appro@openssl.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>,
	x86@kernel.org, Samuel Neves <sneves@dei.uc.pt>,
	Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>,
	Andy Lutomirski <luto@kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	kernel-hardening@lists.openwall.com,
	linux-crypto@vger.kernel.org
Subject: [PATCH net-next v7 12/28] zinc: import Andy Polyakov's Poly1305 x86_64 implementation
Date: Sat,  6 Oct 2018 04:56:53 +0200
Message-ID: <20181006025709.4019-13-Jason@zx2c4.com>
In-Reply-To: <20181006025709.4019-1-Jason@zx2c4.com>

These x86_64 vectorized implementations come from Andy Polyakov's
CRYPTOGAMS code and are included here in raw form, without modification,
so that subsequent commits that fix them up for the kernel can show how
the code has changed.

While this is CRYPTOGAMS code, the originating code for this happens to
be the same as OpenSSL's commit 4dfe4310c31c4483705991d9a798ce9be1ed1c68.
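
For orientation, the scalar __poly1305_block routine in this file computes
the standard Poly1305 per-block update h = (h + m + 2^128) * r mod 2^130 - 5
on three 64-bit limbs. A rough C sketch of that update follows; it is a
sketch only -- the poly1305_block_ref name, the typedefs, and the use of
unsigned __int128 are illustrative and not taken from this file:

  typedef unsigned long long u64;
  typedef unsigned __int128 u128;

  /* One block: h = (h + m + hibit*2^128) * r mod 2^130 - 5.
   * Limb names and layout are illustrative; the arithmetic is the
   * usual base 2^64 scheme with a partial reduction at the end. */
  static void poly1305_block_ref(u64 h[3], const u64 r[2],
				 const u64 m[2], u64 hibit)
  {
	u64 r0 = r[0], r1 = r[1];
	u64 s1 = r1 + (r1 >> 2);	/* 5*r1/4; valid since clamping makes r1 % 4 == 0 */
	u64 h0 = h[0], h1 = h[1], h2 = h[2], c;
	u128 d0, d1, t;

	/* h += m, plus the 2^128 padding bit for a full 16-byte block */
	t = (u128)h0 + m[0];              h0 = (u64)t;
	t = (u128)h1 + m[1] + (t >> 64);  h1 = (u64)t;
	h2 += hibit + (u64)(t >> 64);

	/* h *= r, folding the terms at and above 2^128 down via s1 */
	d0 = (u128)h0 * r0 + (u128)h1 * s1;
	d1 = (u128)h0 * r1 + (u128)h1 * r0 + (u128)h2 * s1;
	d1 += d0 >> 64;
	h0 = (u64)d0;
	h1 = (u64)d1;
	h2 = h2 * r0 + (u64)(d1 >> 64);

	/* partial reduction: fold the overflow back using 5 = 4 + 1 */
	c  = (h2 >> 2) + (h2 & ~(u64)3);
	h2 &= 3;
	t = (u128)h0 + c;         h0 = (u64)t;
	t = (u128)h1 + (t >> 64); h1 = (u64)t;
	h2 += (u64)(t >> 64);

	h[0] = h0; h[1] = h1; h[2] = h2;
  }

The AVX/AVX2/AVX-512 paths below compute the same thing on 26-bit limbs
(and the vpmadd52 path on 44-bit limbs) across several blocks in parallel.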

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Based-on-code-from: Andy Polyakov <appro@openssl.org>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: kernel-hardening@lists.openwall.com
Cc: linux-crypto@vger.kernel.org
---
 .../poly1305/poly1305-x86_64-cryptogams.S     | 3565 +++++++++++++++++
 1 file changed, 3565 insertions(+)
 create mode 100644 lib/zinc/poly1305/poly1305-x86_64-cryptogams.S

diff --git a/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S b/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
new file mode 100644
index 000000000000..ed634757354b
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
@@ -0,0 +1,3565 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+.text	
+
+
+
+.globl	poly1305_init
+.hidden	poly1305_init
+.globl	poly1305_blocks
+.hidden	poly1305_blocks
+.globl	poly1305_emit
+.hidden	poly1305_emit
+
+.type	poly1305_init,@function
+.align	32
+poly1305_init:
+	xorq	%rax,%rax
+	movq	%rax,0(%rdi)
+	movq	%rax,8(%rdi)
+	movq	%rax,16(%rdi)
+
+	cmpq	$0,%rsi
+	je	.Lno_key
+
+	leaq	poly1305_blocks(%rip),%r10
+	leaq	poly1305_emit(%rip),%r11
+	movq	OPENSSL_ia32cap_P+4(%rip),%r9
+	leaq	poly1305_blocks_avx(%rip),%rax
+	leaq	poly1305_emit_avx(%rip),%rcx
+	btq	$28,%r9
+	cmovcq	%rax,%r10
+	cmovcq	%rcx,%r11
+	leaq	poly1305_blocks_avx2(%rip),%rax
+	btq	$37,%r9
+	cmovcq	%rax,%r10
+	movq	$2149646336,%rax
+	shrq	$32,%r9
+	andq	%rax,%r9
+	cmpq	%rax,%r9
+	je	.Linit_base2_44
+	movq	$0x0ffffffc0fffffff,%rax
+	movq	$0x0ffffffc0ffffffc,%rcx
+	andq	0(%rsi),%rax
+	andq	8(%rsi),%rcx
+	movq	%rax,24(%rdi)
+	movq	%rcx,32(%rdi)
+	movq	%r10,0(%rdx)
+	movq	%r11,8(%rdx)
+	movl	$1,%eax
+.Lno_key:
+	.byte	0xf3,0xc3
+.size	poly1305_init,.-poly1305_init
+
+.type	poly1305_blocks,@function
+.align	32
+poly1305_blocks:
+.cfi_startproc	
+.Lblocks:
+	shrq	$4,%rdx
+	jz	.Lno_data
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lblocks_body:
+
+	movq	%rdx,%r15
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rbp
+
+	movq	%r13,%r12
+	shrq	$2,%r13
+	movq	%r12,%rax
+	addq	%r12,%r13
+	jmp	.Loop
+
+.align	32
+.Loop:
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r11,%rax
+	movq	%rdx,%r10
+
+	mulq	%r14
+	movq	%rax,%r14
+	movq	%r11,%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	%rdx,%r10
+
+	mulq	%rbx
+	movq	%rbp,%rbx
+	addq	%rax,%r14
+	adcq	%rdx,%r8
+
+	imulq	%r13,%rbx
+	addq	%rbx,%r9
+	movq	%r8,%rbx
+	adcq	$0,%r10
+
+	imulq	%r11,%rbp
+	addq	%r9,%rbx
+	movq	$-4,%rax
+	adcq	%rbp,%r10
+
+	andq	%r10,%rax
+	movq	%r10,%rbp
+	shrq	$2,%r10
+	andq	$3,%rbp
+	addq	%r10,%rax
+	addq	%rax,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+	movq	%r12,%rax
+	decq	%r15
+	jnz	.Loop
+
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rbp,16(%rdi)
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lno_data:
+.Lblocks_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks,.-poly1305_blocks
+
+.type	poly1305_emit,@function
+.align	32
+poly1305_emit:
+.Lemit:
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10
+	cmovnzq	%r8,%rax
+	cmovnzq	%r9,%rcx
+
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	.byte	0xf3,0xc3
+.size	poly1305_emit,.-poly1305_emit
+.type	__poly1305_block,@function
+.align	32
+__poly1305_block:
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r11,%rax
+	movq	%rdx,%r10
+
+	mulq	%r14
+	movq	%rax,%r14
+	movq	%r11,%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	%rdx,%r10
+
+	mulq	%rbx
+	movq	%rbp,%rbx
+	addq	%rax,%r14
+	adcq	%rdx,%r8
+
+	imulq	%r13,%rbx
+	addq	%rbx,%r9
+	movq	%r8,%rbx
+	adcq	$0,%r10
+
+	imulq	%r11,%rbp
+	addq	%r9,%rbx
+	movq	$-4,%rax
+	adcq	%rbp,%r10
+
+	andq	%r10,%rax
+	movq	%r10,%rbp
+	shrq	$2,%r10
+	andq	$3,%rbp
+	addq	%r10,%rax
+	addq	%rax,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+	.byte	0xf3,0xc3
+.size	__poly1305_block,.-__poly1305_block
+
+.type	__poly1305_init_avx,@function
+.align	32
+__poly1305_init_avx:
+	movq	%r11,%r14
+	movq	%r12,%rbx
+	xorq	%rbp,%rbp
+
+	leaq	48+64(%rdi),%rdi
+
+	movq	%r12,%rax
+	call	__poly1305_block
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	movq	%r11,%r9
+	andl	%r11d,%edx
+	movl	%eax,-64(%rdi)
+	shrq	$26,%r8
+	movl	%edx,-60(%rdi)
+	shrq	$26,%r9
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%eax
+	andl	%r9d,%edx
+	movl	%eax,-48(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,-44(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,-32(%rdi)
+	shrq	$26,%r8
+	movl	%edx,-28(%rdi)
+	shrq	$26,%r9
+
+	movq	%rbx,%rax
+	movq	%r12,%rdx
+	shlq	$12,%rax
+	shlq	$12,%rdx
+	orq	%r8,%rax
+	orq	%r9,%rdx
+	andl	$0x3ffffff,%eax
+	andl	$0x3ffffff,%edx
+	movl	%eax,-16(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,-12(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,0(%rdi)
+	movq	%rbx,%r8
+	movl	%edx,4(%rdi)
+	movq	%r12,%r9
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	shrq	$14,%r9
+	andl	%r8d,%eax
+	andl	%r9d,%edx
+	movl	%eax,16(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,20(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,32(%rdi)
+	shrq	$26,%r8
+	movl	%edx,36(%rdi)
+	shrq	$26,%r9
+
+	movq	%rbp,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,48(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r9d,52(%rdi)
+	leaq	(%r9,%r9,4),%r9
+	movl	%r8d,64(%rdi)
+	movl	%r9d,68(%rdi)
+
+	movq	%r12,%rax
+	call	__poly1305_block
+
+	movl	$0x3ffffff,%eax
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	shrq	$26,%r8
+	movl	%eax,-52(%rdi)
+
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%edx
+	movl	%edx,-36(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,-20(%rdi)
+
+	movq	%rbx,%rax
+	shlq	$12,%rax
+	orq	%r8,%rax
+	andl	$0x3ffffff,%eax
+	movl	%eax,-4(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movq	%rbx,%r8
+	movl	%eax,12(%rdi)
+
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	andl	%r8d,%edx
+	movl	%edx,28(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,44(%rdi)
+
+	movq	%rbp,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,60(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r8d,76(%rdi)
+
+	movq	%r12,%rax
+	call	__poly1305_block
+
+	movl	$0x3ffffff,%eax
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	shrq	$26,%r8
+	movl	%eax,-56(%rdi)
+
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%edx
+	movl	%edx,-40(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,-24(%rdi)
+
+	movq	%rbx,%rax
+	shlq	$12,%rax
+	orq	%r8,%rax
+	andl	$0x3ffffff,%eax
+	movl	%eax,-8(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movq	%rbx,%r8
+	movl	%eax,8(%rdi)
+
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	andl	%r8d,%edx
+	movl	%edx,24(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,40(%rdi)
+
+	movq	%rbp,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,56(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r8d,72(%rdi)
+
+	leaq	-48-64(%rdi),%rdi
+	.byte	0xf3,0xc3
+.size	__poly1305_init_avx,.-__poly1305_init_avx
+
+.type	poly1305_blocks_avx,@function
+.align	32
+poly1305_blocks_avx:
+.cfi_startproc	
+	movl	20(%rdi),%r8d
+	cmpq	$128,%rdx
+	jae	.Lblocks_avx
+	testl	%r8d,%r8d
+	jz	.Lblocks
+
+.Lblocks_avx:
+	andq	$-16,%rdx
+	jz	.Lno_data_avx
+
+	vzeroupper
+
+	testl	%r8d,%r8d
+	jz	.Lbase2_64_avx
+
+	testq	$31,%rdx
+	jz	.Leven_avx
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lblocks_avx_body:
+
+	movq	%rdx,%r15
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movl	16(%rdi),%ebp
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+
+	movl	%r8d,%r14d
+	andq	$-2147483648,%r8
+	movq	%r9,%r12
+	movl	%r9d,%ebx
+	andq	$-2147483648,%r9
+
+	shrq	$6,%r8
+	shlq	$52,%r12
+	addq	%r8,%r14
+	shrq	$12,%rbx
+	shrq	$18,%r9
+	addq	%r12,%r14
+	adcq	%r9,%rbx
+
+	movq	%rbp,%r8
+	shlq	$40,%r8
+	shrq	$24,%rbp
+	addq	%r8,%rbx
+	adcq	$0,%rbp
+
+	movq	$-4,%r9
+	movq	%rbp,%r8
+	andq	%rbp,%r9
+	shrq	$2,%r8
+	andq	$3,%rbp
+	addq	%r9,%r8
+	addq	%r8,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+
+	call	__poly1305_block
+
+	testq	%rcx,%rcx
+	jz	.Lstore_base2_64_avx
+
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r11
+	movq	%rbx,%r12
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r11
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r11,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r12
+	andq	$0x3ffffff,%rbx
+	orq	%r12,%rbp
+
+	subq	$16,%r15
+	jz	.Lstore_base2_26_avx
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	jmp	.Lproceed_avx
+
+.align	32
+.Lstore_base2_64_avx:
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rbp,16(%rdi)
+	jmp	.Ldone_avx
+
+.align	16
+.Lstore_base2_26_avx:
+	movl	%eax,0(%rdi)
+	movl	%edx,4(%rdi)
+	movl	%r14d,8(%rdi)
+	movl	%ebx,12(%rdi)
+	movl	%ebp,16(%rdi)
+.align	16
+.Ldone_avx:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+
+.align	32
+.Lbase2_64_avx:
+.cfi_startproc	
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lbase2_64_avx_body:
+
+	movq	%rdx,%r15
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movl	16(%rdi),%ebp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	testq	$31,%rdx
+	jz	.Linit_avx
+
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	subq	$16,%r15
+
+	call	__poly1305_block
+
+.Linit_avx:
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r8
+	movq	%rbx,%r9
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r8
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r8,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r9
+	andq	$0x3ffffff,%rbx
+	orq	%r9,%rbp
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	movl	$1,20(%rdi)
+
+	call	__poly1305_init_avx
+
+.Lproceed_avx:
+	movq	%r15,%rdx
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rax
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lbase2_64_avx_epilogue:
+	jmp	.Ldo_avx
+.cfi_endproc	
+
+.align	32
+.Leven_avx:
+.cfi_startproc	
+	vmovd	0(%rdi),%xmm0
+	vmovd	4(%rdi),%xmm1
+	vmovd	8(%rdi),%xmm2
+	vmovd	12(%rdi),%xmm3
+	vmovd	16(%rdi),%xmm4
+
+.Ldo_avx:
+	leaq	-88(%rsp),%r11
+.cfi_def_cfa	%r11,0x60
+	subq	$0x178,%rsp
+	subq	$64,%rdx
+	leaq	-32(%rsi),%rax
+	cmovcq	%rax,%rsi
+
+	vmovdqu	48(%rdi),%xmm14
+	leaq	112(%rdi),%rdi
+	leaq	.Lconst(%rip),%rcx
+
+
+
+	vmovdqu	32(%rsi),%xmm5
+	vmovdqu	48(%rsi),%xmm6
+	vmovdqa	64(%rcx),%xmm15
+
+	vpsrldq	$6,%xmm5,%xmm7
+	vpsrldq	$6,%xmm6,%xmm8
+	vpunpckhqdq	%xmm6,%xmm5,%xmm9
+	vpunpcklqdq	%xmm6,%xmm5,%xmm5
+	vpunpcklqdq	%xmm8,%xmm7,%xmm8
+
+	vpsrlq	$40,%xmm9,%xmm9
+	vpsrlq	$26,%xmm5,%xmm6
+	vpand	%xmm15,%xmm5,%xmm5
+	vpsrlq	$4,%xmm8,%xmm7
+	vpand	%xmm15,%xmm6,%xmm6
+	vpsrlq	$30,%xmm8,%xmm8
+	vpand	%xmm15,%xmm7,%xmm7
+	vpand	%xmm15,%xmm8,%xmm8
+	vpor	32(%rcx),%xmm9,%xmm9
+
+	jbe	.Lskip_loop_avx
+
+
+	vmovdqu	-48(%rdi),%xmm11
+	vmovdqu	-32(%rdi),%xmm12
+	vpshufd	$0xEE,%xmm14,%xmm13
+	vpshufd	$0x44,%xmm14,%xmm10
+	vmovdqa	%xmm13,-144(%r11)
+	vmovdqa	%xmm10,0(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm14
+	vmovdqu	-16(%rdi),%xmm10
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm14,-128(%r11)
+	vmovdqa	%xmm11,16(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm13
+	vmovdqu	0(%rdi),%xmm11
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm13,-112(%r11)
+	vmovdqa	%xmm12,32(%rsp)
+	vpshufd	$0xEE,%xmm10,%xmm14
+	vmovdqu	16(%rdi),%xmm12
+	vpshufd	$0x44,%xmm10,%xmm10
+	vmovdqa	%xmm14,-96(%r11)
+	vmovdqa	%xmm10,48(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm13
+	vmovdqu	32(%rdi),%xmm10
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm13,-80(%r11)
+	vmovdqa	%xmm11,64(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm14
+	vmovdqu	48(%rdi),%xmm11
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm14,-64(%r11)
+	vmovdqa	%xmm12,80(%rsp)
+	vpshufd	$0xEE,%xmm10,%xmm13
+	vmovdqu	64(%rdi),%xmm12
+	vpshufd	$0x44,%xmm10,%xmm10
+	vmovdqa	%xmm13,-48(%r11)
+	vmovdqa	%xmm10,96(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm14
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm14,-32(%r11)
+	vmovdqa	%xmm11,112(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm13
+	vmovdqa	0(%rsp),%xmm14
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm13,-16(%r11)
+	vmovdqa	%xmm12,128(%rsp)
+
+	jmp	.Loop_avx
+
+.align	32
+.Loop_avx:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%xmm5,%xmm14,%xmm10
+	vpmuludq	%xmm6,%xmm14,%xmm11
+	vmovdqa	%xmm2,32(%r11)
+	vpmuludq	%xmm7,%xmm14,%xmm12
+	vmovdqa	16(%rsp),%xmm2
+	vpmuludq	%xmm8,%xmm14,%xmm13
+	vpmuludq	%xmm9,%xmm14,%xmm14
+
+	vmovdqa	%xmm0,0(%r11)
+	vpmuludq	32(%rsp),%xmm9,%xmm0
+	vmovdqa	%xmm1,16(%r11)
+	vpmuludq	%xmm8,%xmm2,%xmm1
+	vpaddq	%xmm0,%xmm10,%xmm10
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vmovdqa	%xmm3,48(%r11)
+	vpmuludq	%xmm7,%xmm2,%xmm0
+	vpmuludq	%xmm6,%xmm2,%xmm1
+	vpaddq	%xmm0,%xmm13,%xmm13
+	vmovdqa	48(%rsp),%xmm3
+	vpaddq	%xmm1,%xmm12,%xmm12
+	vmovdqa	%xmm4,64(%r11)
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpmuludq	%xmm7,%xmm3,%xmm0
+	vpaddq	%xmm2,%xmm11,%xmm11
+
+	vmovdqa	64(%rsp),%xmm4
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpmuludq	%xmm6,%xmm3,%xmm1
+	vpmuludq	%xmm5,%xmm3,%xmm3
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vmovdqa	80(%rsp),%xmm2
+	vpaddq	%xmm3,%xmm12,%xmm12
+	vpmuludq	%xmm9,%xmm4,%xmm0
+	vpmuludq	%xmm8,%xmm4,%xmm4
+	vpaddq	%xmm0,%xmm11,%xmm11
+	vmovdqa	96(%rsp),%xmm3
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vmovdqa	128(%rsp),%xmm4
+	vpmuludq	%xmm6,%xmm2,%xmm1
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vpaddq	%xmm2,%xmm13,%xmm13
+	vpmuludq	%xmm9,%xmm3,%xmm0
+	vpmuludq	%xmm8,%xmm3,%xmm1
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vmovdqu	0(%rsi),%xmm0
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpmuludq	%xmm7,%xmm3,%xmm3
+	vpmuludq	%xmm7,%xmm4,%xmm7
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	vmovdqu	16(%rsi),%xmm1
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vpmuludq	%xmm8,%xmm4,%xmm8
+	vpmuludq	%xmm9,%xmm4,%xmm9
+	vpsrldq	$6,%xmm0,%xmm2
+	vpaddq	%xmm8,%xmm12,%xmm12
+	vpaddq	%xmm9,%xmm13,%xmm13
+	vpsrldq	$6,%xmm1,%xmm3
+	vpmuludq	112(%rsp),%xmm5,%xmm9
+	vpmuludq	%xmm6,%xmm4,%xmm5
+	vpunpckhqdq	%xmm1,%xmm0,%xmm4
+	vpaddq	%xmm9,%xmm14,%xmm14
+	vmovdqa	-144(%r11),%xmm9
+	vpaddq	%xmm5,%xmm10,%xmm10
+
+	vpunpcklqdq	%xmm1,%xmm0,%xmm0
+	vpunpcklqdq	%xmm3,%xmm2,%xmm3
+
+
+	vpsrldq	$5,%xmm4,%xmm4
+	vpsrlq	$26,%xmm0,%xmm1
+	vpand	%xmm15,%xmm0,%xmm0
+	vpsrlq	$4,%xmm3,%xmm2
+	vpand	%xmm15,%xmm1,%xmm1
+	vpand	0(%rcx),%xmm4,%xmm4
+	vpsrlq	$30,%xmm3,%xmm3
+	vpand	%xmm15,%xmm2,%xmm2
+	vpand	%xmm15,%xmm3,%xmm3
+	vpor	32(%rcx),%xmm4,%xmm4
+
+	vpaddq	0(%r11),%xmm0,%xmm0
+	vpaddq	16(%r11),%xmm1,%xmm1
+	vpaddq	32(%r11),%xmm2,%xmm2
+	vpaddq	48(%r11),%xmm3,%xmm3
+	vpaddq	64(%r11),%xmm4,%xmm4
+
+	leaq	32(%rsi),%rax
+	leaq	64(%rsi),%rsi
+	subq	$64,%rdx
+	cmovcq	%rax,%rsi
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%xmm0,%xmm9,%xmm5
+	vpmuludq	%xmm1,%xmm9,%xmm6
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vmovdqa	-128(%r11),%xmm7
+	vpmuludq	%xmm2,%xmm9,%xmm5
+	vpmuludq	%xmm3,%xmm9,%xmm6
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm4,%xmm9,%xmm9
+	vpmuludq	-112(%r11),%xmm4,%xmm5
+	vpaddq	%xmm9,%xmm14,%xmm14
+
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpmuludq	%xmm2,%xmm7,%xmm6
+	vpmuludq	%xmm3,%xmm7,%xmm5
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vmovdqa	-96(%r11),%xmm8
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpmuludq	%xmm1,%xmm7,%xmm6
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm6,%xmm12,%xmm12
+	vpaddq	%xmm7,%xmm11,%xmm11
+
+	vmovdqa	-80(%r11),%xmm9
+	vpmuludq	%xmm2,%xmm8,%xmm5
+	vpmuludq	%xmm1,%xmm8,%xmm6
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vmovdqa	-64(%r11),%xmm7
+	vpmuludq	%xmm0,%xmm8,%xmm8
+	vpmuludq	%xmm4,%xmm9,%xmm5
+	vpaddq	%xmm8,%xmm12,%xmm12
+	vpaddq	%xmm5,%xmm11,%xmm11
+	vmovdqa	-48(%r11),%xmm8
+	vpmuludq	%xmm3,%xmm9,%xmm9
+	vpmuludq	%xmm1,%xmm7,%xmm6
+	vpaddq	%xmm9,%xmm10,%xmm10
+
+	vmovdqa	-16(%r11),%xmm9
+	vpaddq	%xmm6,%xmm14,%xmm14
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpmuludq	%xmm4,%xmm8,%xmm5
+	vpaddq	%xmm7,%xmm13,%xmm13
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vmovdqu	32(%rsi),%xmm5
+	vpmuludq	%xmm3,%xmm8,%xmm7
+	vpmuludq	%xmm2,%xmm8,%xmm8
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vmovdqu	48(%rsi),%xmm6
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+	vpmuludq	%xmm2,%xmm9,%xmm2
+	vpmuludq	%xmm3,%xmm9,%xmm3
+	vpsrldq	$6,%xmm5,%xmm7
+	vpaddq	%xmm2,%xmm11,%xmm11
+	vpmuludq	%xmm4,%xmm9,%xmm4
+	vpsrldq	$6,%xmm6,%xmm8
+	vpaddq	%xmm3,%xmm12,%xmm2
+	vpaddq	%xmm4,%xmm13,%xmm3
+	vpmuludq	-32(%r11),%xmm0,%xmm4
+	vpmuludq	%xmm1,%xmm9,%xmm0
+	vpunpckhqdq	%xmm6,%xmm5,%xmm9
+	vpaddq	%xmm4,%xmm14,%xmm4
+	vpaddq	%xmm0,%xmm10,%xmm0
+
+	vpunpcklqdq	%xmm6,%xmm5,%xmm5
+	vpunpcklqdq	%xmm8,%xmm7,%xmm8
+
+
+	vpsrldq	$5,%xmm9,%xmm9
+	vpsrlq	$26,%xmm5,%xmm6
+	vmovdqa	0(%rsp),%xmm14
+	vpand	%xmm15,%xmm5,%xmm5
+	vpsrlq	$4,%xmm8,%xmm7
+	vpand	%xmm15,%xmm6,%xmm6
+	vpand	0(%rcx),%xmm9,%xmm9
+	vpsrlq	$30,%xmm8,%xmm8
+	vpand	%xmm15,%xmm7,%xmm7
+	vpand	%xmm15,%xmm8,%xmm8
+	vpor	32(%rcx),%xmm9,%xmm9
+
+
+
+
+
+	vpsrlq	$26,%xmm3,%xmm13
+	vpand	%xmm15,%xmm3,%xmm3
+	vpaddq	%xmm13,%xmm4,%xmm4
+
+	vpsrlq	$26,%xmm0,%xmm10
+	vpand	%xmm15,%xmm0,%xmm0
+	vpaddq	%xmm10,%xmm11,%xmm1
+
+	vpsrlq	$26,%xmm4,%xmm10
+	vpand	%xmm15,%xmm4,%xmm4
+
+	vpsrlq	$26,%xmm1,%xmm11
+	vpand	%xmm15,%xmm1,%xmm1
+	vpaddq	%xmm11,%xmm2,%xmm2
+
+	vpaddq	%xmm10,%xmm0,%xmm0
+	vpsllq	$2,%xmm10,%xmm10
+	vpaddq	%xmm10,%xmm0,%xmm0
+
+	vpsrlq	$26,%xmm2,%xmm12
+	vpand	%xmm15,%xmm2,%xmm2
+	vpaddq	%xmm12,%xmm3,%xmm3
+
+	vpsrlq	$26,%xmm0,%xmm10
+	vpand	%xmm15,%xmm0,%xmm0
+	vpaddq	%xmm10,%xmm1,%xmm1
+
+	vpsrlq	$26,%xmm3,%xmm13
+	vpand	%xmm15,%xmm3,%xmm3
+	vpaddq	%xmm13,%xmm4,%xmm4
+
+	ja	.Loop_avx
+
+.Lskip_loop_avx:
+
+
+
+	vpshufd	$0x10,%xmm14,%xmm14
+	addq	$32,%rdx
+	jnz	.Long_tail_avx
+
+	vpaddq	%xmm2,%xmm7,%xmm7
+	vpaddq	%xmm0,%xmm5,%xmm5
+	vpaddq	%xmm1,%xmm6,%xmm6
+	vpaddq	%xmm3,%xmm8,%xmm8
+	vpaddq	%xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+	vmovdqa	%xmm2,32(%r11)
+	vmovdqa	%xmm0,0(%r11)
+	vmovdqa	%xmm1,16(%r11)
+	vmovdqa	%xmm3,48(%r11)
+	vmovdqa	%xmm4,64(%r11)
+
+
+
+
+
+
+
+	vpmuludq	%xmm7,%xmm14,%xmm12
+	vpmuludq	%xmm5,%xmm14,%xmm10
+	vpshufd	$0x10,-48(%rdi),%xmm2
+	vpmuludq	%xmm6,%xmm14,%xmm11
+	vpmuludq	%xmm8,%xmm14,%xmm13
+	vpmuludq	%xmm9,%xmm14,%xmm14
+
+	vpmuludq	%xmm8,%xmm2,%xmm0
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpshufd	$0x10,-32(%rdi),%xmm3
+	vpmuludq	%xmm7,%xmm2,%xmm1
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vpshufd	$0x10,-16(%rdi),%xmm4
+	vpmuludq	%xmm6,%xmm2,%xmm0
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm11,%xmm11
+	vpmuludq	%xmm9,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	vpshufd	$0x10,0(%rdi),%xmm2
+	vpmuludq	%xmm7,%xmm4,%xmm1
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vpmuludq	%xmm6,%xmm4,%xmm0
+	vpaddq	%xmm0,%xmm13,%xmm13
+	vpshufd	$0x10,16(%rdi),%xmm3
+	vpmuludq	%xmm5,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm12,%xmm12
+	vpmuludq	%xmm9,%xmm2,%xmm1
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpshufd	$0x10,32(%rdi),%xmm4
+	vpmuludq	%xmm8,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm10,%xmm10
+
+	vpmuludq	%xmm6,%xmm3,%xmm0
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpmuludq	%xmm5,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm13,%xmm13
+	vpshufd	$0x10,48(%rdi),%xmm2
+	vpmuludq	%xmm9,%xmm4,%xmm1
+	vpaddq	%xmm1,%xmm12,%xmm12
+	vpshufd	$0x10,64(%rdi),%xmm3
+	vpmuludq	%xmm8,%xmm4,%xmm0
+	vpaddq	%xmm0,%xmm11,%xmm11
+	vpmuludq	%xmm7,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm14,%xmm14
+	vpmuludq	%xmm9,%xmm3,%xmm1
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vpmuludq	%xmm8,%xmm3,%xmm0
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vpmuludq	%xmm7,%xmm3,%xmm1
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpmuludq	%xmm6,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	jz	.Lshort_tail_avx
+
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+
+	vpsrldq	$6,%xmm0,%xmm2
+	vpsrldq	$6,%xmm1,%xmm3
+	vpunpckhqdq	%xmm1,%xmm0,%xmm4
+	vpunpcklqdq	%xmm1,%xmm0,%xmm0
+	vpunpcklqdq	%xmm3,%xmm2,%xmm3
+
+	vpsrlq	$40,%xmm4,%xmm4
+	vpsrlq	$26,%xmm0,%xmm1
+	vpand	%xmm15,%xmm0,%xmm0
+	vpsrlq	$4,%xmm3,%xmm2
+	vpand	%xmm15,%xmm1,%xmm1
+	vpsrlq	$30,%xmm3,%xmm3
+	vpand	%xmm15,%xmm2,%xmm2
+	vpand	%xmm15,%xmm3,%xmm3
+	vpor	32(%rcx),%xmm4,%xmm4
+
+	vpshufd	$0x32,-64(%rdi),%xmm9
+	vpaddq	0(%r11),%xmm0,%xmm0
+	vpaddq	16(%r11),%xmm1,%xmm1
+	vpaddq	32(%r11),%xmm2,%xmm2
+	vpaddq	48(%r11),%xmm3,%xmm3
+	vpaddq	64(%r11),%xmm4,%xmm4
+
+
+
+
+	vpmuludq	%xmm0,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpmuludq	%xmm1,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpmuludq	%xmm2,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpshufd	$0x32,-48(%rdi),%xmm7
+	vpmuludq	%xmm3,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm4,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm14,%xmm14
+
+	vpmuludq	%xmm3,%xmm7,%xmm5
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpshufd	$0x32,-32(%rdi),%xmm8
+	vpmuludq	%xmm2,%xmm7,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpshufd	$0x32,-16(%rdi),%xmm9
+	vpmuludq	%xmm1,%xmm7,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vpmuludq	%xmm4,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+	vpshufd	$0x32,0(%rdi),%xmm7
+	vpmuludq	%xmm2,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm14,%xmm14
+	vpmuludq	%xmm1,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm13,%xmm13
+	vpshufd	$0x32,16(%rdi),%xmm8
+	vpmuludq	%xmm0,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm12,%xmm12
+	vpmuludq	%xmm4,%xmm7,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpshufd	$0x32,32(%rdi),%xmm9
+	vpmuludq	%xmm3,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm10,%xmm10
+
+	vpmuludq	%xmm1,%xmm8,%xmm5
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpmuludq	%xmm0,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm13,%xmm13
+	vpshufd	$0x32,48(%rdi),%xmm7
+	vpmuludq	%xmm4,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm12,%xmm12
+	vpshufd	$0x32,64(%rdi),%xmm8
+	vpmuludq	%xmm3,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm11,%xmm11
+	vpmuludq	%xmm2,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm10,%xmm10
+
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm14,%xmm14
+	vpmuludq	%xmm4,%xmm8,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm3,%xmm8,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpmuludq	%xmm2,%xmm8,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpmuludq	%xmm1,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
+
+
+	vpsrldq	$8,%xmm14,%xmm9
+	vpsrldq	$8,%xmm13,%xmm8
+	vpsrldq	$8,%xmm11,%xmm6
+	vpsrldq	$8,%xmm10,%xmm5
+	vpsrldq	$8,%xmm12,%xmm7
+	vpaddq	%xmm8,%xmm13,%xmm13
+	vpaddq	%xmm9,%xmm14,%xmm14
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpaddq	%xmm7,%xmm12,%xmm12
+
+
+
+
+	vpsrlq	$26,%xmm13,%xmm3
+	vpand	%xmm15,%xmm13,%xmm13
+	vpaddq	%xmm3,%xmm14,%xmm14
+
+	vpsrlq	$26,%xmm10,%xmm0
+	vpand	%xmm15,%xmm10,%xmm10
+	vpaddq	%xmm0,%xmm11,%xmm11
+
+	vpsrlq	$26,%xmm14,%xmm4
+	vpand	%xmm15,%xmm14,%xmm14
+
+	vpsrlq	$26,%xmm11,%xmm1
+	vpand	%xmm15,%xmm11,%xmm11
+	vpaddq	%xmm1,%xmm12,%xmm12
+
+	vpaddq	%xmm4,%xmm10,%xmm10
+	vpsllq	$2,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vpsrlq	$26,%xmm12,%xmm2
+	vpand	%xmm15,%xmm12,%xmm12
+	vpaddq	%xmm2,%xmm13,%xmm13
+
+	vpsrlq	$26,%xmm10,%xmm0
+	vpand	%xmm15,%xmm10,%xmm10
+	vpaddq	%xmm0,%xmm11,%xmm11
+
+	vpsrlq	$26,%xmm13,%xmm3
+	vpand	%xmm15,%xmm13,%xmm13
+	vpaddq	%xmm3,%xmm14,%xmm14
+
+	vmovd	%xmm10,-112(%rdi)
+	vmovd	%xmm11,-108(%rdi)
+	vmovd	%xmm12,-104(%rdi)
+	vmovd	%xmm13,-100(%rdi)
+	vmovd	%xmm14,-96(%rdi)
+	leaq	88(%r11),%rsp
+.cfi_def_cfa	%rsp,8
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks_avx,.-poly1305_blocks_avx
+
+.type	poly1305_emit_avx,@function
+.align	32
+poly1305_emit_avx:
+	cmpl	$0,20(%rdi)
+	je	.Lemit
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ecx
+	movl	8(%rdi),%r8d
+	movl	12(%rdi),%r11d
+	movl	16(%rdi),%r10d
+
+	shlq	$26,%rcx
+	movq	%r8,%r9
+	shlq	$52,%r8
+	addq	%rcx,%rax
+	shrq	$12,%r9
+	addq	%rax,%r8
+	adcq	$0,%r9
+
+	shlq	$14,%r11
+	movq	%r10,%rax
+	shrq	$24,%r10
+	addq	%r11,%r9
+	shlq	$40,%rax
+	addq	%rax,%r9
+	adcq	$0,%r10
+
+	movq	%r10,%rax
+	movq	%r10,%rcx
+	andq	$3,%r10
+	shrq	$2,%rax
+	andq	$-4,%rcx
+	addq	%rcx,%rax
+	addq	%rax,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10
+	cmovnzq	%r8,%rax
+	cmovnzq	%r9,%rcx
+
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	.byte	0xf3,0xc3
+.size	poly1305_emit_avx,.-poly1305_emit_avx
+.type	poly1305_blocks_avx2,@function
+.align	32
+poly1305_blocks_avx2:
+.cfi_startproc	
+	movl	20(%rdi),%r8d
+	cmpq	$128,%rdx
+	jae	.Lblocks_avx2
+	testl	%r8d,%r8d
+	jz	.Lblocks
+
+.Lblocks_avx2:
+	andq	$-16,%rdx
+	jz	.Lno_data_avx2
+
+	vzeroupper
+
+	testl	%r8d,%r8d
+	jz	.Lbase2_64_avx2
+
+	testq	$63,%rdx
+	jz	.Leven_avx2
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lblocks_avx2_body:
+
+	movq	%rdx,%r15
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movl	16(%rdi),%ebp
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+
+	movl	%r8d,%r14d
+	andq	$-2147483648,%r8
+	movq	%r9,%r12
+	movl	%r9d,%ebx
+	andq	$-2147483648,%r9
+
+	shrq	$6,%r8
+	shlq	$52,%r12
+	addq	%r8,%r14
+	shrq	$12,%rbx
+	shrq	$18,%r9
+	addq	%r12,%r14
+	adcq	%r9,%rbx
+
+	movq	%rbp,%r8
+	shlq	$40,%r8
+	shrq	$24,%rbp
+	addq	%r8,%rbx
+	adcq	$0,%rbp
+
+	movq	$-4,%r9
+	movq	%rbp,%r8
+	andq	%rbp,%r9
+	shrq	$2,%r8
+	andq	$3,%rbp
+	addq	%r9,%r8
+	addq	%r8,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+.Lbase2_26_pre_avx2:
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	subq	$16,%r15
+
+	call	__poly1305_block
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_26_pre_avx2
+
+	testq	%rcx,%rcx
+	jz	.Lstore_base2_64_avx2
+
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r11
+	movq	%rbx,%r12
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r11
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r11,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r12
+	andq	$0x3ffffff,%rbx
+	orq	%r12,%rbp
+
+	testq	%r15,%r15
+	jz	.Lstore_base2_26_avx2
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	jmp	.Lproceed_avx2
+
+.align	32
+.Lstore_base2_64_avx2:
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rbp,16(%rdi)
+	jmp	.Ldone_avx2
+
+.align	16
+.Lstore_base2_26_avx2:
+	movl	%eax,0(%rdi)
+	movl	%edx,4(%rdi)
+	movl	%r14d,8(%rdi)
+	movl	%ebx,12(%rdi)
+	movl	%ebp,16(%rdi)
+.align	16
+.Ldone_avx2:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+
+.align	32
+.Lbase2_64_avx2:
+.cfi_startproc	
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lbase2_64_avx2_body:
+
+	movq	%rdx,%r15
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movl	16(%rdi),%ebp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	testq	$63,%rdx
+	jz	.Linit_avx2
+
+.Lbase2_64_pre_avx2:
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	subq	$16,%r15
+
+	call	__poly1305_block
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_64_pre_avx2
+
+.Linit_avx2:
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r8
+	movq	%rbx,%r9
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r8
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r8,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r9
+	andq	$0x3ffffff,%rbx
+	orq	%r9,%rbp
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	movl	$1,20(%rdi)
+
+	call	__poly1305_init_avx
+
+.Lproceed_avx2:
+	movq	%r15,%rdx
+	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
+	movl	$3221291008,%r11d
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rax
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lbase2_64_avx2_epilogue:
+	jmp	.Ldo_avx2
+.cfi_endproc	
+
+.align	32
+.Leven_avx2:
+.cfi_startproc	
+	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
+	vmovd	0(%rdi),%xmm0
+	vmovd	4(%rdi),%xmm1
+	vmovd	8(%rdi),%xmm2
+	vmovd	12(%rdi),%xmm3
+	vmovd	16(%rdi),%xmm4
+
+.Ldo_avx2:
+	cmpq	$512,%rdx
+	jb	.Lskip_avx512
+	andl	%r11d,%r10d
+	testl	$65536,%r10d
+	jnz	.Lblocks_avx512
+.Lskip_avx512:
+	leaq	-8(%rsp),%r11
+.cfi_def_cfa	%r11,16
+	subq	$0x128,%rsp
+	leaq	.Lconst(%rip),%rcx
+	leaq	48+64(%rdi),%rdi
+	vmovdqa	96(%rcx),%ymm7
+
+
+	vmovdqu	-64(%rdi),%xmm9
+	andq	$-512,%rsp
+	vmovdqu	-48(%rdi),%xmm10
+	vmovdqu	-32(%rdi),%xmm6
+	vmovdqu	-16(%rdi),%xmm11
+	vmovdqu	0(%rdi),%xmm12
+	vmovdqu	16(%rdi),%xmm13
+	leaq	144(%rsp),%rax
+	vmovdqu	32(%rdi),%xmm14
+	vpermd	%ymm9,%ymm7,%ymm9
+	vmovdqu	48(%rdi),%xmm15
+	vpermd	%ymm10,%ymm7,%ymm10
+	vmovdqu	64(%rdi),%xmm5
+	vpermd	%ymm6,%ymm7,%ymm6
+	vmovdqa	%ymm9,0(%rsp)
+	vpermd	%ymm11,%ymm7,%ymm11
+	vmovdqa	%ymm10,32-144(%rax)
+	vpermd	%ymm12,%ymm7,%ymm12
+	vmovdqa	%ymm6,64-144(%rax)
+	vpermd	%ymm13,%ymm7,%ymm13
+	vmovdqa	%ymm11,96-144(%rax)
+	vpermd	%ymm14,%ymm7,%ymm14
+	vmovdqa	%ymm12,128-144(%rax)
+	vpermd	%ymm15,%ymm7,%ymm15
+	vmovdqa	%ymm13,160-144(%rax)
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqa	%ymm14,192-144(%rax)
+	vmovdqa	%ymm15,224-144(%rax)
+	vmovdqa	%ymm5,256-144(%rax)
+	vmovdqa	64(%rcx),%ymm5
+
+
+
+	vmovdqu	0(%rsi),%xmm7
+	vmovdqu	16(%rsi),%xmm8
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpsrldq	$6,%ymm7,%ymm9
+	vpsrldq	$6,%ymm8,%ymm10
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+	vpunpcklqdq	%ymm10,%ymm9,%ymm9
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+
+	vpsrlq	$30,%ymm9,%ymm10
+	vpsrlq	$4,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+	vpsrlq	$40,%ymm6,%ymm6
+	vpand	%ymm5,%ymm9,%ymm9
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+
+	vpaddq	%ymm2,%ymm9,%ymm2
+	subq	$64,%rdx
+	jz	.Ltail_avx2
+	jmp	.Loop_avx2
+
+.align	32
+.Loop_avx2:
+
+
+
+
+
+
+
+
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqa	0(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqa	32(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqa	96(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqa	48(%rax),%ymm10
+	vmovdqa	112(%rax),%ymm5
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%ymm2,%ymm7,%ymm13
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	64(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+	vmovdqa	-16(%rax),%ymm8
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vmovdqu	0(%rsi),%xmm7
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vmovdqu	16(%rsi),%xmm8
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqa	16(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpsrldq	$6,%ymm7,%ymm9
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpsrldq	$6,%ymm8,%ymm10
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpunpcklqdq	%ymm10,%ymm9,%ymm10
+	vpmuludq	80(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+
+
+
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$4,%ymm10,%ymm9
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpand	%ymm5,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpaddq	%ymm9,%ymm2,%ymm2
+	vpsrlq	$30,%ymm10,%ymm10
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$40,%ymm6,%ymm6
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+
+	subq	$64,%rdx
+	jnz	.Loop_avx2
+
+.byte	0x66,0x90
+.Ltail_avx2:
+
+
+
+
+
+
+
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqu	4(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqu	36(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqu	100(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqu	52(%rax),%ymm10
+	vmovdqu	116(%rax),%ymm5
+
+	vpmuludq	%ymm2,%ymm7,%ymm13
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	68(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vmovdqu	-12(%rax),%ymm8
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqu	20(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpmuludq	84(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+
+
+
+
+	vpsrldq	$8,%ymm12,%ymm8
+	vpsrldq	$8,%ymm2,%ymm9
+	vpsrldq	$8,%ymm3,%ymm10
+	vpsrldq	$8,%ymm4,%ymm6
+	vpsrldq	$8,%ymm0,%ymm7
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+
+	vpermq	$0x2,%ymm3,%ymm10
+	vpermq	$0x2,%ymm4,%ymm6
+	vpermq	$0x2,%ymm0,%ymm7
+	vpermq	$0x2,%ymm12,%ymm8
+	vpermq	$0x2,%ymm2,%ymm9
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+
+
+
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vmovd	%xmm0,-112(%rdi)
+	vmovd	%xmm1,-108(%rdi)
+	vmovd	%xmm2,-104(%rdi)
+	vmovd	%xmm3,-100(%rdi)
+	vmovd	%xmm4,-96(%rdi)
+	leaq	8(%r11),%rsp
+.cfi_def_cfa	%rsp,8
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
+.type	poly1305_blocks_avx512,@function
+.align	32
+poly1305_blocks_avx512:
+.cfi_startproc	
+.Lblocks_avx512:
+	movl	$15,%eax
+	kmovw	%eax,%k2
+	leaq	-8(%rsp),%r11
+.cfi_def_cfa	%r11,16
+	subq	$0x128,%rsp
+	leaq	.Lconst(%rip),%rcx
+	leaq	48+64(%rdi),%rdi
+	vmovdqa	96(%rcx),%ymm9
+
+
+	vmovdqu	-64(%rdi),%xmm11
+	andq	$-512,%rsp
+	vmovdqu	-48(%rdi),%xmm12
+	movq	$0x20,%rax
+	vmovdqu	-32(%rdi),%xmm7
+	vmovdqu	-16(%rdi),%xmm13
+	vmovdqu	0(%rdi),%xmm8
+	vmovdqu	16(%rdi),%xmm14
+	vmovdqu	32(%rdi),%xmm10
+	vmovdqu	48(%rdi),%xmm15
+	vmovdqu	64(%rdi),%xmm6
+	vpermd	%zmm11,%zmm9,%zmm16
+	vpbroadcastq	64(%rcx),%zmm5
+	vpermd	%zmm12,%zmm9,%zmm17
+	vpermd	%zmm7,%zmm9,%zmm21
+	vpermd	%zmm13,%zmm9,%zmm18
+	vmovdqa64	%zmm16,0(%rsp){%k2}
+	vpsrlq	$32,%zmm16,%zmm7
+	vpermd	%zmm8,%zmm9,%zmm22
+	vmovdqu64	%zmm17,0(%rsp,%rax,1){%k2}
+	vpsrlq	$32,%zmm17,%zmm8
+	vpermd	%zmm14,%zmm9,%zmm19
+	vmovdqa64	%zmm21,64(%rsp){%k2}
+	vpermd	%zmm10,%zmm9,%zmm23
+	vpermd	%zmm15,%zmm9,%zmm20
+	vmovdqu64	%zmm18,64(%rsp,%rax,1){%k2}
+	vpermd	%zmm6,%zmm9,%zmm24
+	vmovdqa64	%zmm22,128(%rsp){%k2}
+	vmovdqu64	%zmm19,128(%rsp,%rax,1){%k2}
+	vmovdqa64	%zmm23,192(%rsp){%k2}
+	vmovdqu64	%zmm20,192(%rsp,%rax,1){%k2}
+	vmovdqa64	%zmm24,256(%rsp){%k2}
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%zmm7,%zmm16,%zmm11
+	vpmuludq	%zmm7,%zmm17,%zmm12
+	vpmuludq	%zmm7,%zmm18,%zmm13
+	vpmuludq	%zmm7,%zmm19,%zmm14
+	vpmuludq	%zmm7,%zmm20,%zmm15
+	vpsrlq	$32,%zmm18,%zmm9
+
+	vpmuludq	%zmm8,%zmm24,%zmm25
+	vpmuludq	%zmm8,%zmm16,%zmm26
+	vpmuludq	%zmm8,%zmm17,%zmm27
+	vpmuludq	%zmm8,%zmm18,%zmm28
+	vpmuludq	%zmm8,%zmm19,%zmm29
+	vpsrlq	$32,%zmm19,%zmm10
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+
+	vpmuludq	%zmm9,%zmm23,%zmm25
+	vpmuludq	%zmm9,%zmm24,%zmm26
+	vpmuludq	%zmm9,%zmm17,%zmm28
+	vpmuludq	%zmm9,%zmm18,%zmm29
+	vpmuludq	%zmm9,%zmm16,%zmm27
+	vpsrlq	$32,%zmm20,%zmm6
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm10,%zmm22,%zmm25
+	vpmuludq	%zmm10,%zmm16,%zmm28
+	vpmuludq	%zmm10,%zmm17,%zmm29
+	vpmuludq	%zmm10,%zmm23,%zmm26
+	vpmuludq	%zmm10,%zmm24,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm6,%zmm24,%zmm28
+	vpmuludq	%zmm6,%zmm16,%zmm29
+	vpmuludq	%zmm6,%zmm21,%zmm25
+	vpmuludq	%zmm6,%zmm22,%zmm26
+	vpmuludq	%zmm6,%zmm23,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+
+
+	vmovdqu64	0(%rsi),%zmm10
+	vmovdqu64	64(%rsi),%zmm6
+	leaq	128(%rsi),%rsi
+
+
+
+
+	vpsrlq	$26,%zmm14,%zmm28
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm28,%zmm15,%zmm15
+
+	vpsrlq	$26,%zmm11,%zmm25
+	vpandq	%zmm5,%zmm11,%zmm11
+	vpaddq	%zmm25,%zmm12,%zmm12
+
+	vpsrlq	$26,%zmm15,%zmm29
+	vpandq	%zmm5,%zmm15,%zmm15
+
+	vpsrlq	$26,%zmm12,%zmm26
+	vpandq	%zmm5,%zmm12,%zmm12
+	vpaddq	%zmm26,%zmm13,%zmm13
+
+	vpaddq	%zmm29,%zmm11,%zmm11
+	vpsllq	$2,%zmm29,%zmm29
+	vpaddq	%zmm29,%zmm11,%zmm11
+
+	vpsrlq	$26,%zmm13,%zmm27
+	vpandq	%zmm5,%zmm13,%zmm13
+	vpaddq	%zmm27,%zmm14,%zmm14
+
+	vpsrlq	$26,%zmm11,%zmm25
+	vpandq	%zmm5,%zmm11,%zmm11
+	vpaddq	%zmm25,%zmm12,%zmm12
+
+	vpsrlq	$26,%zmm14,%zmm28
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm28,%zmm15,%zmm15
+
+
+
+
+
+	vpunpcklqdq	%zmm6,%zmm10,%zmm7
+	vpunpckhqdq	%zmm6,%zmm10,%zmm6
+
+
+
+
+
+
+	vmovdqa32	128(%rcx),%zmm25
+	movl	$0x7777,%eax
+	kmovw	%eax,%k1
+
+	vpermd	%zmm16,%zmm25,%zmm16
+	vpermd	%zmm17,%zmm25,%zmm17
+	vpermd	%zmm18,%zmm25,%zmm18
+	vpermd	%zmm19,%zmm25,%zmm19
+	vpermd	%zmm20,%zmm25,%zmm20
+
+	vpermd	%zmm11,%zmm25,%zmm16{%k1}
+	vpermd	%zmm12,%zmm25,%zmm17{%k1}
+	vpermd	%zmm13,%zmm25,%zmm18{%k1}
+	vpermd	%zmm14,%zmm25,%zmm19{%k1}
+	vpermd	%zmm15,%zmm25,%zmm20{%k1}
+
+	vpslld	$2,%zmm17,%zmm21
+	vpslld	$2,%zmm18,%zmm22
+	vpslld	$2,%zmm19,%zmm23
+	vpslld	$2,%zmm20,%zmm24
+	vpaddd	%zmm17,%zmm21,%zmm21
+	vpaddd	%zmm18,%zmm22,%zmm22
+	vpaddd	%zmm19,%zmm23,%zmm23
+	vpaddd	%zmm20,%zmm24,%zmm24
+
+	vpbroadcastq	32(%rcx),%zmm30
+
+	vpsrlq	$52,%zmm7,%zmm9
+	vpsllq	$12,%zmm6,%zmm10
+	vporq	%zmm10,%zmm9,%zmm9
+	vpsrlq	$26,%zmm7,%zmm8
+	vpsrlq	$14,%zmm6,%zmm10
+	vpsrlq	$40,%zmm6,%zmm6
+	vpandq	%zmm5,%zmm9,%zmm9
+	vpandq	%zmm5,%zmm7,%zmm7
+
+
+
+
+	vpaddq	%zmm2,%zmm9,%zmm2
+	subq	$192,%rdx
+	jbe	.Ltail_avx512
+	jmp	.Loop_avx512
+
+.align	32
+.Loop_avx512:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%zmm2,%zmm17,%zmm14
+	vpaddq	%zmm0,%zmm7,%zmm0
+	vpmuludq	%zmm2,%zmm18,%zmm15
+	vpandq	%zmm5,%zmm8,%zmm8
+	vpmuludq	%zmm2,%zmm23,%zmm11
+	vpandq	%zmm5,%zmm10,%zmm10
+	vpmuludq	%zmm2,%zmm24,%zmm12
+	vporq	%zmm30,%zmm6,%zmm6
+	vpmuludq	%zmm2,%zmm16,%zmm13
+	vpaddq	%zmm1,%zmm8,%zmm1
+	vpaddq	%zmm3,%zmm10,%zmm3
+	vpaddq	%zmm4,%zmm6,%zmm4
+
+	vmovdqu64	0(%rsi),%zmm10
+	vmovdqu64	64(%rsi),%zmm6
+	leaq	128(%rsi),%rsi
+	vpmuludq	%zmm0,%zmm19,%zmm28
+	vpmuludq	%zmm0,%zmm20,%zmm29
+	vpmuludq	%zmm0,%zmm16,%zmm25
+	vpmuludq	%zmm0,%zmm17,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+
+	vpmuludq	%zmm1,%zmm18,%zmm28
+	vpmuludq	%zmm1,%zmm19,%zmm29
+	vpmuludq	%zmm1,%zmm24,%zmm25
+	vpmuludq	%zmm0,%zmm18,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpunpcklqdq	%zmm6,%zmm10,%zmm7
+	vpunpckhqdq	%zmm6,%zmm10,%zmm6
+
+	vpmuludq	%zmm3,%zmm16,%zmm28
+	vpmuludq	%zmm3,%zmm17,%zmm29
+	vpmuludq	%zmm1,%zmm16,%zmm26
+	vpmuludq	%zmm1,%zmm17,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm24,%zmm28
+	vpmuludq	%zmm4,%zmm16,%zmm29
+	vpmuludq	%zmm3,%zmm22,%zmm25
+	vpmuludq	%zmm3,%zmm23,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpmuludq	%zmm3,%zmm24,%zmm27
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm21,%zmm25
+	vpmuludq	%zmm4,%zmm22,%zmm26
+	vpmuludq	%zmm4,%zmm23,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm0
+	vpaddq	%zmm26,%zmm12,%zmm1
+	vpaddq	%zmm27,%zmm13,%zmm2
+
+
+
+
+	vpsrlq	$52,%zmm7,%zmm9
+	vpsllq	$12,%zmm6,%zmm10
+
+	vpsrlq	$26,%zmm14,%zmm3
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm3,%zmm15,%zmm4
+
+	vporq	%zmm10,%zmm9,%zmm9
+
+	vpsrlq	$26,%zmm0,%zmm11
+	vpandq	%zmm5,%zmm0,%zmm0
+	vpaddq	%zmm11,%zmm1,%zmm1
+
+	vpandq	%zmm5,%zmm9,%zmm9
+
+	vpsrlq	$26,%zmm4,%zmm15
+	vpandq	%zmm5,%zmm4,%zmm4
+
+	vpsrlq	$26,%zmm1,%zmm12
+	vpandq	%zmm5,%zmm1,%zmm1
+	vpaddq	%zmm12,%zmm2,%zmm2
+
+	vpaddq	%zmm15,%zmm0,%zmm0
+	vpsllq	$2,%zmm15,%zmm15
+	vpaddq	%zmm15,%zmm0,%zmm0
+
+	vpaddq	%zmm9,%zmm2,%zmm2
+	vpsrlq	$26,%zmm7,%zmm8
+
+	vpsrlq	$26,%zmm2,%zmm13
+	vpandq	%zmm5,%zmm2,%zmm2
+	vpaddq	%zmm13,%zmm14,%zmm3
+
+	vpsrlq	$14,%zmm6,%zmm10
+
+	vpsrlq	$26,%zmm0,%zmm11
+	vpandq	%zmm5,%zmm0,%zmm0
+	vpaddq	%zmm11,%zmm1,%zmm1
+
+	vpsrlq	$40,%zmm6,%zmm6
+
+	vpsrlq	$26,%zmm3,%zmm14
+	vpandq	%zmm5,%zmm3,%zmm3
+	vpaddq	%zmm14,%zmm4,%zmm4
+
+	vpandq	%zmm5,%zmm7,%zmm7
+
+
+
+
+	subq	$128,%rdx
+	ja	.Loop_avx512
+
+.Ltail_avx512:
+
+
+
+
+
+	vpsrlq	$32,%zmm16,%zmm16
+	vpsrlq	$32,%zmm17,%zmm17
+	vpsrlq	$32,%zmm18,%zmm18
+	vpsrlq	$32,%zmm23,%zmm23
+	vpsrlq	$32,%zmm24,%zmm24
+	vpsrlq	$32,%zmm19,%zmm19
+	vpsrlq	$32,%zmm20,%zmm20
+	vpsrlq	$32,%zmm21,%zmm21
+	vpsrlq	$32,%zmm22,%zmm22
+
+
+
+	leaq	(%rsi,%rdx,1),%rsi
+
+
+	vpaddq	%zmm0,%zmm7,%zmm0
+
+	vpmuludq	%zmm2,%zmm17,%zmm14
+	vpmuludq	%zmm2,%zmm18,%zmm15
+	vpmuludq	%zmm2,%zmm23,%zmm11
+	vpandq	%zmm5,%zmm8,%zmm8
+	vpmuludq	%zmm2,%zmm24,%zmm12
+	vpandq	%zmm5,%zmm10,%zmm10
+	vpmuludq	%zmm2,%zmm16,%zmm13
+	vporq	%zmm30,%zmm6,%zmm6
+	vpaddq	%zmm1,%zmm8,%zmm1
+	vpaddq	%zmm3,%zmm10,%zmm3
+	vpaddq	%zmm4,%zmm6,%zmm4
+
+	vmovdqu	0(%rsi),%xmm7
+	vpmuludq	%zmm0,%zmm19,%zmm28
+	vpmuludq	%zmm0,%zmm20,%zmm29
+	vpmuludq	%zmm0,%zmm16,%zmm25
+	vpmuludq	%zmm0,%zmm17,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+
+	vmovdqu	16(%rsi),%xmm8
+	vpmuludq	%zmm1,%zmm18,%zmm28
+	vpmuludq	%zmm1,%zmm19,%zmm29
+	vpmuludq	%zmm1,%zmm24,%zmm25
+	vpmuludq	%zmm0,%zmm18,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+	vpmuludq	%zmm3,%zmm16,%zmm28
+	vpmuludq	%zmm3,%zmm17,%zmm29
+	vpmuludq	%zmm1,%zmm16,%zmm26
+	vpmuludq	%zmm1,%zmm17,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	vpmuludq	%zmm4,%zmm24,%zmm28
+	vpmuludq	%zmm4,%zmm16,%zmm29
+	vpmuludq	%zmm3,%zmm22,%zmm25
+	vpmuludq	%zmm3,%zmm23,%zmm26
+	vpmuludq	%zmm3,%zmm24,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm3
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm21,%zmm25
+	vpmuludq	%zmm4,%zmm22,%zmm26
+	vpmuludq	%zmm4,%zmm23,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm0
+	vpaddq	%zmm26,%zmm12,%zmm1
+	vpaddq	%zmm27,%zmm13,%zmm2
+
+
+
+
+	movl	$1,%eax
+	vpermq	$0xb1,%zmm3,%zmm14
+	vpermq	$0xb1,%zmm15,%zmm4
+	vpermq	$0xb1,%zmm0,%zmm11
+	vpermq	$0xb1,%zmm1,%zmm12
+	vpermq	$0xb1,%zmm2,%zmm13
+	vpaddq	%zmm14,%zmm3,%zmm3
+	vpaddq	%zmm15,%zmm4,%zmm4
+	vpaddq	%zmm11,%zmm0,%zmm0
+	vpaddq	%zmm12,%zmm1,%zmm1
+	vpaddq	%zmm13,%zmm2,%zmm2
+
+	kmovw	%eax,%k3
+	vpermq	$0x2,%zmm3,%zmm14
+	vpermq	$0x2,%zmm4,%zmm15
+	vpermq	$0x2,%zmm0,%zmm11
+	vpermq	$0x2,%zmm1,%zmm12
+	vpermq	$0x2,%zmm2,%zmm13
+	vpaddq	%zmm14,%zmm3,%zmm3
+	vpaddq	%zmm15,%zmm4,%zmm4
+	vpaddq	%zmm11,%zmm0,%zmm0
+	vpaddq	%zmm12,%zmm1,%zmm1
+	vpaddq	%zmm13,%zmm2,%zmm2
+
+	vextracti64x4	$0x1,%zmm3,%ymm14
+	vextracti64x4	$0x1,%zmm4,%ymm15
+	vextracti64x4	$0x1,%zmm0,%ymm11
+	vextracti64x4	$0x1,%zmm1,%ymm12
+	vextracti64x4	$0x1,%zmm2,%ymm13
+	vpaddq	%zmm14,%zmm3,%zmm3{%k3}{z}
+	vpaddq	%zmm15,%zmm4,%zmm4{%k3}{z}
+	vpaddq	%zmm11,%zmm0,%zmm0{%k3}{z}
+	vpaddq	%zmm12,%zmm1,%zmm1{%k3}{z}
+	vpaddq	%zmm13,%zmm2,%zmm2{%k3}{z}
+
+
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpsrldq	$6,%ymm7,%ymm9
+	vpsrldq	$6,%ymm8,%ymm10
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpunpcklqdq	%ymm10,%ymm9,%ymm9
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpsrlq	$30,%ymm9,%ymm10
+	vpsrlq	$4,%ymm9,%ymm9
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0
+	vpsllq	$2,%ymm15,%ymm15
+	vpsrlq	$26,%ymm7,%ymm8
+	vpsrlq	$40,%ymm6,%ymm6
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpand	%ymm5,%ymm9,%ymm9
+	vpand	%ymm5,%ymm7,%ymm7
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm2,%ymm9,%ymm2
+	vpand	%ymm5,%ymm8,%ymm8
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	leaq	144(%rsp),%rax
+	addq	$64,%rdx
+	jnz	.Ltail_avx2
+
+	vpsubq	%ymm9,%ymm2,%ymm2
+	vmovd	%xmm0,-112(%rdi)
+	vmovd	%xmm1,-108(%rdi)
+	vmovd	%xmm2,-104(%rdi)
+	vmovd	%xmm3,-100(%rdi)
+	vmovd	%xmm4,-96(%rdi)
+	vzeroall
+	leaq	8(%r11),%rsp
+.cfi_def_cfa	%rsp,8
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
+.type	poly1305_init_base2_44,@function
+.align	32
+poly1305_init_base2_44:
+	xorq	%rax,%rax
+	movq	%rax,0(%rdi)
+	movq	%rax,8(%rdi)
+	movq	%rax,16(%rdi)
+
+.Linit_base2_44:
+	leaq	poly1305_blocks_vpmadd52(%rip),%r10
+	leaq	poly1305_emit_base2_44(%rip),%r11
+
+	movq	$0x0ffffffc0fffffff,%rax
+	movq	$0x0ffffffc0ffffffc,%rcx
+	andq	0(%rsi),%rax
+	movq	$0x00000fffffffffff,%r8
+	andq	8(%rsi),%rcx
+	movq	$0x00000fffffffffff,%r9
+	andq	%rax,%r8
+	shrdq	$44,%rcx,%rax
+	movq	%r8,40(%rdi)
+	andq	%r9,%rax
+	shrq	$24,%rcx
+	movq	%rax,48(%rdi)
+	leaq	(%rax,%rax,4),%rax
+	movq	%rcx,56(%rdi)
+	shlq	$2,%rax
+	leaq	(%rcx,%rcx,4),%rcx
+	shlq	$2,%rcx
+	movq	%rax,24(%rdi)
+	movq	%rcx,32(%rdi)
+	movq	$-1,64(%rdi)
+	movq	%r10,0(%rdx)
+	movq	%r11,8(%rdx)
+	movl	$1,%eax
+	.byte	0xf3,0xc3
+.size	poly1305_init_base2_44,.-poly1305_init_base2_44
+.type	poly1305_blocks_vpmadd52,@function
+.align	32
+poly1305_blocks_vpmadd52:
+	shrq	$4,%rdx
+	jz	.Lno_data_vpmadd52
+
+	shlq	$40,%rcx
+	movq	64(%rdi),%r8
+
+
+
+
+
+
+	movq	$3,%rax
+	movq	$1,%r10
+	cmpq	$4,%rdx
+	cmovaeq	%r10,%rax
+	testq	%r8,%r8
+	cmovnsq	%r10,%rax
+
+	andq	%rdx,%rax
+	jz	.Lblocks_vpmadd52_4x
+
+	subq	%rax,%rdx
+	movl	$7,%r10d
+	movl	$1,%r11d
+	kmovw	%r10d,%k7
+	leaq	.L2_44_inp_permd(%rip),%r10
+	kmovw	%r11d,%k1
+
+	vmovq	%rcx,%xmm21
+	vmovdqa64	0(%r10),%ymm19
+	vmovdqa64	32(%r10),%ymm20
+	vpermq	$0xcf,%ymm21,%ymm21
+	vmovdqa64	64(%r10),%ymm22
+
+	vmovdqu64	0(%rdi),%ymm16{%k7}{z}
+	vmovdqu64	40(%rdi),%ymm3{%k7}{z}
+	vmovdqu64	32(%rdi),%ymm4{%k7}{z}
+	vmovdqu64	24(%rdi),%ymm5{%k7}{z}
+
+	vmovdqa64	96(%r10),%ymm23
+	vmovdqa64	128(%r10),%ymm24
+
+	jmp	.Loop_vpmadd52
+
+.align	32
+.Loop_vpmadd52:
+	vmovdqu32	0(%rsi),%xmm18
+	leaq	16(%rsi),%rsi
+
+	vpermd	%ymm18,%ymm19,%ymm18
+	vpsrlvq	%ymm20,%ymm18,%ymm18
+	vpandq	%ymm22,%ymm18,%ymm18
+	vporq	%ymm21,%ymm18,%ymm18
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+
+	vpermq	$0,%ymm16,%ymm0{%k7}{z}
+	vpermq	$85,%ymm16,%ymm1{%k7}{z}
+	vpermq	$170,%ymm16,%ymm2{%k7}{z}
+
+	vpxord	%ymm16,%ymm16,%ymm16
+	vpxord	%ymm17,%ymm17,%ymm17
+
+	vpmadd52luq	%ymm3,%ymm0,%ymm16
+	vpmadd52huq	%ymm3,%ymm0,%ymm17
+
+	vpmadd52luq	%ymm4,%ymm1,%ymm16
+	vpmadd52huq	%ymm4,%ymm1,%ymm17
+
+	vpmadd52luq	%ymm5,%ymm2,%ymm16
+	vpmadd52huq	%ymm5,%ymm2,%ymm17
+
+	vpsrlvq	%ymm23,%ymm16,%ymm18
+	vpsllvq	%ymm24,%ymm17,%ymm17
+	vpandq	%ymm22,%ymm16,%ymm16
+
+	vpaddq	%ymm18,%ymm17,%ymm17
+
+	vpermq	$147,%ymm17,%ymm17
+
+	vpaddq	%ymm17,%ymm16,%ymm16
+
+	vpsrlvq	%ymm23,%ymm16,%ymm18
+	vpandq	%ymm22,%ymm16,%ymm16
+
+	vpermq	$147,%ymm18,%ymm18
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+
+	vpermq	$147,%ymm16,%ymm18{%k1}{z}
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+	vpsllq	$2,%ymm18,%ymm18
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+
+	decq	%rax
+	jnz	.Loop_vpmadd52
+
+	vmovdqu64	%ymm16,0(%rdi){%k7}
+
+	testq	%rdx,%rdx
+	jnz	.Lblocks_vpmadd52_4x
+
+.Lno_data_vpmadd52:
+	.byte	0xf3,0xc3
+.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+.type	poly1305_blocks_vpmadd52_4x,@function
+.align	32
+poly1305_blocks_vpmadd52_4x:
+	shrq	$4,%rdx
+	jz	.Lno_data_vpmadd52_4x
+
+	shlq	$40,%rcx
+	movq	64(%rdi),%r8
+
+.Lblocks_vpmadd52_4x:
+	vpbroadcastq	%rcx,%ymm31
+
+	vmovdqa64	.Lx_mask44(%rip),%ymm28
+	movl	$5,%eax
+	vmovdqa64	.Lx_mask42(%rip),%ymm29
+	kmovw	%eax,%k1
+
+	testq	%r8,%r8
+	js	.Linit_vpmadd52
+
+	vmovq	0(%rdi),%xmm0
+	vmovq	8(%rdi),%xmm1
+	vmovq	16(%rdi),%xmm2
+
+	testq	$3,%rdx
+	jnz	.Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+	vpbroadcastq	64(%rdi),%ymm3
+	vpbroadcastq	96(%rdi),%ymm4
+	vpbroadcastq	128(%rdi),%ymm5
+	vpbroadcastq	160(%rdi),%ymm16
+
+.Lblocks_vpmadd52_4x_key_loaded:
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm17,%ymm17
+
+	testq	$7,%rdx
+	jz	.Lblocks_vpmadd52_8x
+
+	vmovdqu64	0(%rsi),%ymm26
+	vmovdqu64	32(%rsi),%ymm27
+	leaq	64(%rsi),%rsi
+
+	vpunpcklqdq	%ymm27,%ymm26,%ymm25
+	vpunpckhqdq	%ymm27,%ymm26,%ymm27
+
+
+
+	vpsrlq	$24,%ymm27,%ymm26
+	vporq	%ymm31,%ymm26,%ymm26
+	vpaddq	%ymm26,%ymm2,%ymm2
+	vpandq	%ymm28,%ymm25,%ymm24
+	vpsrlq	$44,%ymm25,%ymm25
+	vpsllq	$20,%ymm27,%ymm27
+	vporq	%ymm27,%ymm25,%ymm25
+	vpandq	%ymm28,%ymm25,%ymm25
+
+	subq	$4,%rdx
+	jz	.Ltail_vpmadd52_4x
+	jmp	.Loop_vpmadd52_4x
+	ud2
+
+.align	32
+.Linit_vpmadd52:
+	vmovq	24(%rdi),%xmm16
+	vmovq	56(%rdi),%xmm2
+	vmovq	32(%rdi),%xmm17
+	vmovq	40(%rdi),%xmm3
+	vmovq	48(%rdi),%xmm4
+
+	vmovdqa	%ymm3,%ymm0
+	vmovdqa	%ymm4,%ymm1
+	vmovdqa	%ymm2,%ymm5
+
+	movl	$2,%eax
+
+.Lmul_init_vpmadd52:
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm2,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm2,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm2,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm2,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm2,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm2,%ymm3,%ymm23
+
+	vpmadd52luq	%ymm0,%ymm3,%ymm18
+	vpmadd52huq	%ymm0,%ymm3,%ymm19
+	vpmadd52luq	%ymm0,%ymm4,%ymm20
+	vpmadd52huq	%ymm0,%ymm4,%ymm21
+	vpmadd52luq	%ymm0,%ymm5,%ymm22
+	vpmadd52huq	%ymm0,%ymm5,%ymm23
+
+	vpmadd52luq	%ymm1,%ymm17,%ymm18
+	vpmadd52huq	%ymm1,%ymm17,%ymm19
+	vpmadd52luq	%ymm1,%ymm3,%ymm20
+	vpmadd52huq	%ymm1,%ymm3,%ymm21
+	vpmadd52luq	%ymm1,%ymm4,%ymm22
+	vpmadd52huq	%ymm1,%ymm4,%ymm23
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+	decl	%eax
+	jz	.Ldone_init_vpmadd52
+
+	vpunpcklqdq	%ymm4,%ymm1,%ymm4
+	vpbroadcastq	%xmm1,%xmm1
+	vpunpcklqdq	%ymm5,%ymm2,%ymm5
+	vpbroadcastq	%xmm2,%xmm2
+	vpunpcklqdq	%ymm3,%ymm0,%ymm3
+	vpbroadcastq	%xmm0,%xmm0
+
+	vpsllq	$2,%ymm4,%ymm16
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm4,%ymm16,%ymm16
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm16,%ymm16
+	vpsllq	$2,%ymm17,%ymm17
+
+	jmp	.Lmul_init_vpmadd52
+	ud2
+
+.align	32
+.Ldone_init_vpmadd52:
+	vinserti128	$1,%xmm4,%ymm1,%ymm4
+	vinserti128	$1,%xmm5,%ymm2,%ymm5
+	vinserti128	$1,%xmm3,%ymm0,%ymm3
+
+	vpermq	$216,%ymm4,%ymm4
+	vpermq	$216,%ymm5,%ymm5
+	vpermq	$216,%ymm3,%ymm3
+
+	vpsllq	$2,%ymm4,%ymm16
+	vpaddq	%ymm4,%ymm16,%ymm16
+	vpsllq	$2,%ymm16,%ymm16
+
+	vmovq	0(%rdi),%xmm0
+	vmovq	8(%rdi),%xmm1
+	vmovq	16(%rdi),%xmm2
+
+	testq	$3,%rdx
+	jnz	.Ldone_init_vpmadd52_2x
+
+	vmovdqu64	%ymm3,64(%rdi)
+	vpbroadcastq	%xmm3,%ymm3
+	vmovdqu64	%ymm4,96(%rdi)
+	vpbroadcastq	%xmm4,%ymm4
+	vmovdqu64	%ymm5,128(%rdi)
+	vpbroadcastq	%xmm5,%ymm5
+	vmovdqu64	%ymm16,160(%rdi)
+	vpbroadcastq	%xmm16,%ymm16
+
+	jmp	.Lblocks_vpmadd52_4x_key_loaded
+	ud2
+
+.align	32
+.Ldone_init_vpmadd52_2x:
+	vmovdqu64	%ymm3,64(%rdi)
+	vpsrldq	$8,%ymm3,%ymm3
+	vmovdqu64	%ymm4,96(%rdi)
+	vpsrldq	$8,%ymm4,%ymm4
+	vmovdqu64	%ymm5,128(%rdi)
+	vpsrldq	$8,%ymm5,%ymm5
+	vmovdqu64	%ymm16,160(%rdi)
+	vpsrldq	$8,%ymm16,%ymm16
+	jmp	.Lblocks_vpmadd52_2x_key_loaded
+	ud2
+
+.align	32
+.Lblocks_vpmadd52_2x_do:
+	vmovdqu64	128+8(%rdi),%ymm5{%k1}{z}
+	vmovdqu64	160+8(%rdi),%ymm16{%k1}{z}
+	vmovdqu64	64+8(%rdi),%ymm3{%k1}{z}
+	vmovdqu64	96+8(%rdi),%ymm4{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+	vmovdqu64	0(%rsi),%ymm26
+	vpxorq	%ymm27,%ymm27,%ymm27
+	leaq	32(%rsi),%rsi
+
+	vpunpcklqdq	%ymm27,%ymm26,%ymm25
+	vpunpckhqdq	%ymm27,%ymm26,%ymm27
+
+
+
+	vpsrlq	$24,%ymm27,%ymm26
+	vporq	%ymm31,%ymm26,%ymm26
+	vpaddq	%ymm26,%ymm2,%ymm2
+	vpandq	%ymm28,%ymm25,%ymm24
+	vpsrlq	$44,%ymm25,%ymm25
+	vpsllq	$20,%ymm27,%ymm27
+	vporq	%ymm27,%ymm25,%ymm25
+	vpandq	%ymm28,%ymm25,%ymm25
+
+	jmp	.Ltail_vpmadd52_2x
+	ud2
+
+.align	32
+.Loop_vpmadd52_4x:
+
+	vpaddq	%ymm24,%ymm0,%ymm0
+	vpaddq	%ymm25,%ymm1,%ymm1
+
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm2,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm2,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm2,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm2,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm2,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm2,%ymm3,%ymm23
+
+	vmovdqu64	0(%rsi),%ymm26
+	vmovdqu64	32(%rsi),%ymm27
+	leaq	64(%rsi),%rsi
+	vpmadd52luq	%ymm0,%ymm3,%ymm18
+	vpmadd52huq	%ymm0,%ymm3,%ymm19
+	vpmadd52luq	%ymm0,%ymm4,%ymm20
+	vpmadd52huq	%ymm0,%ymm4,%ymm21
+	vpmadd52luq	%ymm0,%ymm5,%ymm22
+	vpmadd52huq	%ymm0,%ymm5,%ymm23
+
+	vpunpcklqdq	%ymm27,%ymm26,%ymm25
+	vpunpckhqdq	%ymm27,%ymm26,%ymm27
+	vpmadd52luq	%ymm1,%ymm17,%ymm18
+	vpmadd52huq	%ymm1,%ymm17,%ymm19
+	vpmadd52luq	%ymm1,%ymm3,%ymm20
+	vpmadd52huq	%ymm1,%ymm3,%ymm21
+	vpmadd52luq	%ymm1,%ymm4,%ymm22
+	vpmadd52huq	%ymm1,%ymm4,%ymm23
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpsrlq	$24,%ymm27,%ymm26
+	vporq	%ymm31,%ymm26,%ymm26
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpandq	%ymm28,%ymm25,%ymm24
+	vpsrlq	$44,%ymm25,%ymm25
+	vpsllq	$20,%ymm27,%ymm27
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm26,%ymm2,%ymm2
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vporq	%ymm27,%ymm25,%ymm25
+	vpandq	%ymm28,%ymm25,%ymm25
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+	subq	$4,%rdx
+	jnz	.Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+	vmovdqu64	128(%rdi),%ymm5
+	vmovdqu64	160(%rdi),%ymm16
+	vmovdqu64	64(%rdi),%ymm3
+	vmovdqu64	96(%rdi),%ymm4
+
+.Ltail_vpmadd52_2x:
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm17,%ymm17
+
+
+	vpaddq	%ymm24,%ymm0,%ymm0
+	vpaddq	%ymm25,%ymm1,%ymm1
+
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm2,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm2,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm2,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm2,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm2,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm2,%ymm3,%ymm23
+
+	vpmadd52luq	%ymm0,%ymm3,%ymm18
+	vpmadd52huq	%ymm0,%ymm3,%ymm19
+	vpmadd52luq	%ymm0,%ymm4,%ymm20
+	vpmadd52huq	%ymm0,%ymm4,%ymm21
+	vpmadd52luq	%ymm0,%ymm5,%ymm22
+	vpmadd52huq	%ymm0,%ymm5,%ymm23
+
+	vpmadd52luq	%ymm1,%ymm17,%ymm18
+	vpmadd52huq	%ymm1,%ymm17,%ymm19
+	vpmadd52luq	%ymm1,%ymm3,%ymm20
+	vpmadd52huq	%ymm1,%ymm3,%ymm21
+	vpmadd52luq	%ymm1,%ymm4,%ymm22
+	vpmadd52huq	%ymm1,%ymm4,%ymm23
+
+
+
+
+	movl	$1,%eax
+	kmovw	%eax,%k1
+	vpsrldq	$8,%ymm18,%ymm24
+	vpsrldq	$8,%ymm19,%ymm0
+	vpsrldq	$8,%ymm20,%ymm25
+	vpsrldq	$8,%ymm21,%ymm1
+	vpaddq	%ymm24,%ymm18,%ymm18
+	vpaddq	%ymm0,%ymm19,%ymm19
+	vpsrldq	$8,%ymm22,%ymm26
+	vpsrldq	$8,%ymm23,%ymm2
+	vpaddq	%ymm25,%ymm20,%ymm20
+	vpaddq	%ymm1,%ymm21,%ymm21
+	vpermq	$0x2,%ymm18,%ymm24
+	vpermq	$0x2,%ymm19,%ymm0
+	vpaddq	%ymm26,%ymm22,%ymm22
+	vpaddq	%ymm2,%ymm23,%ymm23
+
+	vpermq	$0x2,%ymm20,%ymm25
+	vpermq	$0x2,%ymm21,%ymm1
+	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
+	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
+	vpermq	$0x2,%ymm22,%ymm26
+	vpermq	$0x2,%ymm23,%ymm2
+	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
+	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
+	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
+	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+
+	subq	$2,%rdx
+	ja	.Lblocks_vpmadd52_4x_do
+
+	vmovq	%xmm0,0(%rdi)
+	vmovq	%xmm1,8(%rdi)
+	vmovq	%xmm2,16(%rdi)
+	vzeroall
+
+.Lno_data_vpmadd52_4x:
+	.byte	0xf3,0xc3
+.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+.type	poly1305_blocks_vpmadd52_8x,@function
+.align	32
+poly1305_blocks_vpmadd52_8x:
+	shrq	$4,%rdx
+	jz	.Lno_data_vpmadd52_8x
+
+	shlq	$40,%rcx
+	movq	64(%rdi),%r8
+
+	vmovdqa64	.Lx_mask44(%rip),%ymm28
+	vmovdqa64	.Lx_mask42(%rip),%ymm29
+
+	testq	%r8,%r8
+	js	.Linit_vpmadd52
+
+	vmovq	0(%rdi),%xmm0
+	vmovq	8(%rdi),%xmm1
+	vmovq	16(%rdi),%xmm2
+
+.Lblocks_vpmadd52_8x:
+
+
+
+	vmovdqu64	128(%rdi),%ymm5
+	vmovdqu64	160(%rdi),%ymm16
+	vmovdqu64	64(%rdi),%ymm3
+	vmovdqu64	96(%rdi),%ymm4
+
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm17,%ymm17
+
+	vpbroadcastq	%xmm5,%ymm8
+	vpbroadcastq	%xmm3,%ymm6
+	vpbroadcastq	%xmm4,%ymm7
+
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm8,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm8,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm8,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm8,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm8,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm8,%ymm3,%ymm23
+
+	vpmadd52luq	%ymm6,%ymm3,%ymm18
+	vpmadd52huq	%ymm6,%ymm3,%ymm19
+	vpmadd52luq	%ymm6,%ymm4,%ymm20
+	vpmadd52huq	%ymm6,%ymm4,%ymm21
+	vpmadd52luq	%ymm6,%ymm5,%ymm22
+	vpmadd52huq	%ymm6,%ymm5,%ymm23
+
+	vpmadd52luq	%ymm7,%ymm17,%ymm18
+	vpmadd52huq	%ymm7,%ymm17,%ymm19
+	vpmadd52luq	%ymm7,%ymm3,%ymm20
+	vpmadd52huq	%ymm7,%ymm3,%ymm21
+	vpmadd52luq	%ymm7,%ymm4,%ymm22
+	vpmadd52huq	%ymm7,%ymm4,%ymm23
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm6
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm7
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm8
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm6,%ymm6
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm6,%ymm6
+
+	vpsrlq	$44,%ymm6,%ymm30
+	vpandq	%ymm28,%ymm6,%ymm6
+
+	vpaddq	%ymm30,%ymm7,%ymm7
+
+
+
+
+
+	vpunpcklqdq	%ymm5,%ymm8,%ymm26
+	vpunpckhqdq	%ymm5,%ymm8,%ymm5
+	vpunpcklqdq	%ymm3,%ymm6,%ymm24
+	vpunpckhqdq	%ymm3,%ymm6,%ymm3
+	vpunpcklqdq	%ymm4,%ymm7,%ymm25
+	vpunpckhqdq	%ymm4,%ymm7,%ymm4
+	vshufi64x2	$0x44,%zmm5,%zmm26,%zmm8
+	vshufi64x2	$0x44,%zmm3,%zmm24,%zmm6
+	vshufi64x2	$0x44,%zmm4,%zmm25,%zmm7
+
+	vmovdqu64	0(%rsi),%zmm26
+	vmovdqu64	64(%rsi),%zmm27
+	leaq	128(%rsi),%rsi
+
+	vpsllq	$2,%zmm8,%zmm10
+	vpsllq	$2,%zmm7,%zmm9
+	vpaddq	%zmm8,%zmm10,%zmm10
+	vpaddq	%zmm7,%zmm9,%zmm9
+	vpsllq	$2,%zmm10,%zmm10
+	vpsllq	$2,%zmm9,%zmm9
+
+	vpbroadcastq	%rcx,%zmm31
+	vpbroadcastq	%xmm28,%zmm28
+	vpbroadcastq	%xmm29,%zmm29
+
+	vpbroadcastq	%xmm9,%zmm16
+	vpbroadcastq	%xmm10,%zmm17
+	vpbroadcastq	%xmm6,%zmm3
+	vpbroadcastq	%xmm7,%zmm4
+	vpbroadcastq	%xmm8,%zmm5
+
+	vpunpcklqdq	%zmm27,%zmm26,%zmm25
+	vpunpckhqdq	%zmm27,%zmm26,%zmm27
+
+
+
+	vpsrlq	$24,%zmm27,%zmm26
+	vporq	%zmm31,%zmm26,%zmm26
+	vpaddq	%zmm26,%zmm2,%zmm2
+	vpandq	%zmm28,%zmm25,%zmm24
+	vpsrlq	$44,%zmm25,%zmm25
+	vpsllq	$20,%zmm27,%zmm27
+	vporq	%zmm27,%zmm25,%zmm25
+	vpandq	%zmm28,%zmm25,%zmm25
+
+	subq	$8,%rdx
+	jz	.Ltail_vpmadd52_8x
+	jmp	.Loop_vpmadd52_8x
+
+.align	32
+.Loop_vpmadd52_8x:
+
+	vpaddq	%zmm24,%zmm0,%zmm0
+	vpaddq	%zmm25,%zmm1,%zmm1
+
+	vpxorq	%zmm18,%zmm18,%zmm18
+	vpmadd52luq	%zmm2,%zmm16,%zmm18
+	vpxorq	%zmm19,%zmm19,%zmm19
+	vpmadd52huq	%zmm2,%zmm16,%zmm19
+	vpxorq	%zmm20,%zmm20,%zmm20
+	vpmadd52luq	%zmm2,%zmm17,%zmm20
+	vpxorq	%zmm21,%zmm21,%zmm21
+	vpmadd52huq	%zmm2,%zmm17,%zmm21
+	vpxorq	%zmm22,%zmm22,%zmm22
+	vpmadd52luq	%zmm2,%zmm3,%zmm22
+	vpxorq	%zmm23,%zmm23,%zmm23
+	vpmadd52huq	%zmm2,%zmm3,%zmm23
+
+	vmovdqu64	0(%rsi),%zmm26
+	vmovdqu64	64(%rsi),%zmm27
+	leaq	128(%rsi),%rsi
+	vpmadd52luq	%zmm0,%zmm3,%zmm18
+	vpmadd52huq	%zmm0,%zmm3,%zmm19
+	vpmadd52luq	%zmm0,%zmm4,%zmm20
+	vpmadd52huq	%zmm0,%zmm4,%zmm21
+	vpmadd52luq	%zmm0,%zmm5,%zmm22
+	vpmadd52huq	%zmm0,%zmm5,%zmm23
+
+	vpunpcklqdq	%zmm27,%zmm26,%zmm25
+	vpunpckhqdq	%zmm27,%zmm26,%zmm27
+	vpmadd52luq	%zmm1,%zmm17,%zmm18
+	vpmadd52huq	%zmm1,%zmm17,%zmm19
+	vpmadd52luq	%zmm1,%zmm3,%zmm20
+	vpmadd52huq	%zmm1,%zmm3,%zmm21
+	vpmadd52luq	%zmm1,%zmm4,%zmm22
+	vpmadd52huq	%zmm1,%zmm4,%zmm23
+
+
+
+	vpsrlq	$44,%zmm18,%zmm30
+	vpsllq	$8,%zmm19,%zmm19
+	vpandq	%zmm28,%zmm18,%zmm0
+	vpaddq	%zmm30,%zmm19,%zmm19
+
+	vpsrlq	$24,%zmm27,%zmm26
+	vporq	%zmm31,%zmm26,%zmm26
+	vpaddq	%zmm19,%zmm20,%zmm20
+
+	vpsrlq	$44,%zmm20,%zmm30
+	vpsllq	$8,%zmm21,%zmm21
+	vpandq	%zmm28,%zmm20,%zmm1
+	vpaddq	%zmm30,%zmm21,%zmm21
+
+	vpandq	%zmm28,%zmm25,%zmm24
+	vpsrlq	$44,%zmm25,%zmm25
+	vpsllq	$20,%zmm27,%zmm27
+	vpaddq	%zmm21,%zmm22,%zmm22
+
+	vpsrlq	$42,%zmm22,%zmm30
+	vpsllq	$10,%zmm23,%zmm23
+	vpandq	%zmm29,%zmm22,%zmm2
+	vpaddq	%zmm30,%zmm23,%zmm23
+
+	vpaddq	%zmm26,%zmm2,%zmm2
+	vpaddq	%zmm23,%zmm0,%zmm0
+	vpsllq	$2,%zmm23,%zmm23
+
+	vpaddq	%zmm23,%zmm0,%zmm0
+	vporq	%zmm27,%zmm25,%zmm25
+	vpandq	%zmm28,%zmm25,%zmm25
+
+	vpsrlq	$44,%zmm0,%zmm30
+	vpandq	%zmm28,%zmm0,%zmm0
+
+	vpaddq	%zmm30,%zmm1,%zmm1
+
+	subq	$8,%rdx
+	jnz	.Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+
+	vpaddq	%zmm24,%zmm0,%zmm0
+	vpaddq	%zmm25,%zmm1,%zmm1
+
+	vpxorq	%zmm18,%zmm18,%zmm18
+	vpmadd52luq	%zmm2,%zmm9,%zmm18
+	vpxorq	%zmm19,%zmm19,%zmm19
+	vpmadd52huq	%zmm2,%zmm9,%zmm19
+	vpxorq	%zmm20,%zmm20,%zmm20
+	vpmadd52luq	%zmm2,%zmm10,%zmm20
+	vpxorq	%zmm21,%zmm21,%zmm21
+	vpmadd52huq	%zmm2,%zmm10,%zmm21
+	vpxorq	%zmm22,%zmm22,%zmm22
+	vpmadd52luq	%zmm2,%zmm6,%zmm22
+	vpxorq	%zmm23,%zmm23,%zmm23
+	vpmadd52huq	%zmm2,%zmm6,%zmm23
+
+	vpmadd52luq	%zmm0,%zmm6,%zmm18
+	vpmadd52huq	%zmm0,%zmm6,%zmm19
+	vpmadd52luq	%zmm0,%zmm7,%zmm20
+	vpmadd52huq	%zmm0,%zmm7,%zmm21
+	vpmadd52luq	%zmm0,%zmm8,%zmm22
+	vpmadd52huq	%zmm0,%zmm8,%zmm23
+
+	vpmadd52luq	%zmm1,%zmm10,%zmm18
+	vpmadd52huq	%zmm1,%zmm10,%zmm19
+	vpmadd52luq	%zmm1,%zmm6,%zmm20
+	vpmadd52huq	%zmm1,%zmm6,%zmm21
+	vpmadd52luq	%zmm1,%zmm7,%zmm22
+	vpmadd52huq	%zmm1,%zmm7,%zmm23
+
+
+
+
+	movl	$1,%eax
+	kmovw	%eax,%k1
+	vpsrldq	$8,%zmm18,%zmm24
+	vpsrldq	$8,%zmm19,%zmm0
+	vpsrldq	$8,%zmm20,%zmm25
+	vpsrldq	$8,%zmm21,%zmm1
+	vpaddq	%zmm24,%zmm18,%zmm18
+	vpaddq	%zmm0,%zmm19,%zmm19
+	vpsrldq	$8,%zmm22,%zmm26
+	vpsrldq	$8,%zmm23,%zmm2
+	vpaddq	%zmm25,%zmm20,%zmm20
+	vpaddq	%zmm1,%zmm21,%zmm21
+	vpermq	$0x2,%zmm18,%zmm24
+	vpermq	$0x2,%zmm19,%zmm0
+	vpaddq	%zmm26,%zmm22,%zmm22
+	vpaddq	%zmm2,%zmm23,%zmm23
+
+	vpermq	$0x2,%zmm20,%zmm25
+	vpermq	$0x2,%zmm21,%zmm1
+	vpaddq	%zmm24,%zmm18,%zmm18
+	vpaddq	%zmm0,%zmm19,%zmm19
+	vpermq	$0x2,%zmm22,%zmm26
+	vpermq	$0x2,%zmm23,%zmm2
+	vpaddq	%zmm25,%zmm20,%zmm20
+	vpaddq	%zmm1,%zmm21,%zmm21
+	vextracti64x4	$1,%zmm18,%ymm24
+	vextracti64x4	$1,%zmm19,%ymm0
+	vpaddq	%zmm26,%zmm22,%zmm22
+	vpaddq	%zmm2,%zmm23,%zmm23
+
+	vextracti64x4	$1,%zmm20,%ymm25
+	vextracti64x4	$1,%zmm21,%ymm1
+	vextracti64x4	$1,%zmm22,%ymm26
+	vextracti64x4	$1,%zmm23,%ymm2
+	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
+	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
+	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
+	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
+	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
+	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+
+
+	vmovq	%xmm0,0(%rdi)
+	vmovq	%xmm1,8(%rdi)
+	vmovq	%xmm2,16(%rdi)
+	vzeroall
+
+.Lno_data_vpmadd52_8x:
+	.byte	0xf3,0xc3
+.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
+.type	poly1305_emit_base2_44,@function
+.align	32
+poly1305_emit_base2_44:
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+
+	movq	%r9,%rax
+	shrq	$20,%r9
+	shlq	$44,%rax
+	movq	%r10,%rcx
+	shrq	$40,%r10
+	shlq	$24,%rcx
+
+	addq	%rax,%r8
+	adcq	%rcx,%r9
+	adcq	$0,%r10
+
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10
+	cmovnzq	%r8,%rax
+	cmovnzq	%r9,%rcx
+
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	.byte	0xf3,0xc3
+.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
+.align	64
+.Lconst:
+.Lmask24:
+.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long	16777216,0,16777216,0,16777216,0,16777216,0
+.Lmask26:
+.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long	2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long	0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad	0,12,24,64
+.L2_44_mask:
+.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad	44,44,42,64
+.L2_44_shift_lft:
+.quad	8,8,10,64
+
+.align	64
+.Lx_mask44:
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
+.globl	xor128_encrypt_n_pad
+.type	xor128_encrypt_n_pad,@function
+.align	16
+xor128_encrypt_n_pad:
+	subq	%rdx,%rsi
+	subq	%rdx,%rdi
+	movq	%rcx,%r10
+	shrq	$4,%rcx
+	jz	.Ltail_enc
+	nop
+.Loop_enc_xmm:
+	movdqu	(%rsi,%rdx,1),%xmm0
+	pxor	(%rdx),%xmm0
+	movdqu	%xmm0,(%rdi,%rdx,1)
+	movdqa	%xmm0,(%rdx)
+	leaq	16(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_enc_xmm
+
+	andq	$15,%r10
+	jz	.Ldone_enc
+
+.Ltail_enc:
+	movq	$16,%rcx
+	subq	%r10,%rcx
+	xorl	%eax,%eax
+.Loop_enc_byte:
+	movb	(%rsi,%rdx,1),%al
+	xorb	(%rdx),%al
+	movb	%al,(%rdi,%rdx,1)
+	movb	%al,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%r10
+	jnz	.Loop_enc_byte
+
+	xorl	%eax,%eax
+.Loop_enc_pad:
+	movb	%al,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_enc_pad
+
+.Ldone_enc:
+	movq	%rdx,%rax
+	.byte	0xf3,0xc3
+.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl	xor128_decrypt_n_pad
+.type	xor128_decrypt_n_pad,@function
+.align	16
+xor128_decrypt_n_pad:
+	subq	%rdx,%rsi
+	subq	%rdx,%rdi
+	movq	%rcx,%r10
+	shrq	$4,%rcx
+	jz	.Ltail_dec
+	nop
+.Loop_dec_xmm:
+	movdqu	(%rsi,%rdx,1),%xmm0
+	movdqa	(%rdx),%xmm1
+	pxor	%xmm0,%xmm1
+	movdqu	%xmm1,(%rdi,%rdx,1)
+	movdqa	%xmm0,(%rdx)
+	leaq	16(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_dec_xmm
+
+	pxor	%xmm1,%xmm1
+	andq	$15,%r10
+	jz	.Ldone_dec
+
+.Ltail_dec:
+	movq	$16,%rcx
+	subq	%r10,%rcx
+	xorl	%eax,%eax
+	xorq	%r11,%r11
+.Loop_dec_byte:
+	movb	(%rsi,%rdx,1),%r11b
+	movb	(%rdx),%al
+	xorb	%r11b,%al
+	movb	%al,(%rdi,%rdx,1)
+	movb	%r11b,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%r10
+	jnz	.Loop_dec_byte
+
+	xorl	%eax,%eax
+.Loop_dec_pad:
+	movb	%al,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_dec_pad
+
+.Ldone_dec:
+	movq	%rdx,%rax
+	.byte	0xf3,0xc3
+.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
-- 
2.19.0
