linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Eric Biggers <ebiggers@kernel.org>
To: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: linux-crypto@vger.kernel.org,
	Herbert Xu <herbert@gondor.apana.org.au>,
	David Miller <davem@davemloft.net>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Samuel Neves <sneves@dei.uc.pt>, Arnd Bergmann <arnd@arndb.de>,
	Andy Lutomirski <luto@kernel.org>,
	Martin Willi <martin@strongswan.org>,
	Rene van Dorst <opensource@vdorst.com>,
	David Sterba <dsterba@suse.com>
Subject: Re: [PATCH v4 25/35] crypto: BLAKE2s - x86_64 SIMD implementation
Date: Tue, 22 Oct 2019 21:55:11 -0700	[thread overview]
Message-ID: <20191023045511.GC361298@sol.localdomain> (raw)
In-Reply-To: <20191017190932.1947-26-ard.biesheuvel@linaro.org>

On Thu, Oct 17, 2019 at 09:09:22PM +0200, Ard Biesheuvel wrote:
> From: "Jason A. Donenfeld" <Jason@zx2c4.com>
> 
> These implementations from Samuel Neves support AVX and AVX-512VL.
> Originally this used AVX-512F, but Skylake thermal throttling made
> AVX-512VL more attractive and possible to do with negligible difference.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
> Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
> [ardb: move to arch/x86/crypto, wire into lib/crypto framework]
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/x86/crypto/Makefile       |   2 +
>  arch/x86/crypto/blake2s-core.S | 685 ++++++++++++++++++++
>  arch/x86/crypto/blake2s-glue.c | 235 +++++++
>  crypto/Kconfig                 |   6 +
>  4 files changed, 928 insertions(+)
> 
> diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
> index 759b1a927826..922c8ecfa00f 100644
> --- a/arch/x86/crypto/Makefile
> +++ b/arch/x86/crypto/Makefile
> @@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
>  	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
> +	obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
>  endif
>  
>  # These modules require assembler to support AVX2.
> @@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
>  aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
>  
>  nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
> +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
>  
>  ifeq ($(avx_supported),yes)
>  	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
> diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
> new file mode 100644
> index 000000000000..675288fa4cca
> --- /dev/null
> +++ b/arch/x86/crypto/blake2s-core.S
> @@ -0,0 +1,685 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +
> +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
> +.align 32
> +IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
> +	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
> +.section .rodata.cst16.ROT16, "aM", @progbits, 16
> +.align 16
> +ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
> +.section .rodata.cst16.ROR328, "aM", @progbits, 16
> +.align 16
> +ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
> +#ifdef CONFIG_AS_AVX512
> +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
> +.align 64
> +SIGMA:
> +.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
> +.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
> +.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
> +.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
> +.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
> +.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
> +.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
> +.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
> +.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
> +.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
> +#endif /* CONFIG_AS_AVX512 */
> +
> +.text
> +#ifdef CONFIG_AS_AVX
> +ENTRY(blake2s_compress_avx)
> +	movl		%ecx, %ecx
> +	testq		%rdx, %rdx
> +	je		.Lendofloop
> +	.align 32
> +.Lbeginofloop:
> +	addq		%rcx, 32(%rdi)
> +	vmovdqu		IV+16(%rip), %xmm1
> +	vmovdqu		(%rsi), %xmm4
> +	vpxor		32(%rdi), %xmm1, %xmm1
> +	vmovdqu		16(%rsi), %xmm3
> +	vshufps		$136, %xmm3, %xmm4, %xmm6
> +	vmovdqa		ROT16(%rip), %xmm7
> +	vpaddd		(%rdi), %xmm6, %xmm6
> +	vpaddd		16(%rdi), %xmm6, %xmm6
> +	vpxor		%xmm6, %xmm1, %xmm1
> +	vmovdqu		IV(%rip), %xmm8
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vmovdqu		48(%rsi), %xmm5
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		16(%rdi), %xmm8, %xmm9
> +	vmovdqu		32(%rsi), %xmm2
> +	vpblendw	$12, %xmm3, %xmm5, %xmm13
> +	vshufps		$221, %xmm5, %xmm2, %xmm12
> +	vpunpckhqdq	%xmm2, %xmm4, %xmm14
> +	vpslld		$20, %xmm9, %xmm0
> +	vpsrld		$12, %xmm9, %xmm9
> +	vpxor		%xmm0, %xmm9, %xmm0
> +	vshufps		$221, %xmm3, %xmm4, %xmm9
> +	vpaddd		%xmm9, %xmm6, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vmovdqa		ROR328(%rip), %xmm6
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm0
> +	vpshufd		$147, %xmm1, %xmm1
> +	vpshufd		$78, %xmm8, %xmm8
> +	vpslld		$25, %xmm0, %xmm10
> +	vpsrld		$7, %xmm0, %xmm0
> +	vpxor		%xmm10, %xmm0, %xmm0
> +	vshufps		$136, %xmm5, %xmm2, %xmm10
> +	vpshufd		$57, %xmm0, %xmm0
> +	vpaddd		%xmm10, %xmm9, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpaddd		%xmm12, %xmm9, %xmm9
> +	vpblendw	$12, %xmm2, %xmm3, %xmm12
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm10
> +	vpslld		$20, %xmm10, %xmm0
> +	vpsrld		$12, %xmm10, %xmm10
> +	vpxor		%xmm0, %xmm10, %xmm0
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
[...]

There are no comments in this 685-line assembly language file.
Is this the original version, or is it a generated/stripped version?

- Eric

  reply	other threads:[~2019-10-23  4:55 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-17 19:08 [PATCH v4 00/35] crypto: crypto API library interfaces for WireGuard Ard Biesheuvel
2019-10-17 19:08 ` [PATCH v4 01/35] crypto: tidy up lib/crypto Kconfig and Makefile Ard Biesheuvel
2019-10-17 19:08 ` [PATCH v4 02/35] crypto: chacha - move existing library code into lib/crypto Ard Biesheuvel
2019-10-23  3:05   ` Eric Biggers
2019-11-04  9:06     ` Ard Biesheuvel
2019-10-23  3:12   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 03/35] crypto: x86/chacha - depend on generic chacha library instead of crypto driver Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 04/35] crypto: x86/chacha - expose SIMD ChaCha routine as library function Ard Biesheuvel
2019-10-23  3:10   ` Eric Biggers
2019-10-23  4:40   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 05/35] crypto: arm64/chacha - depend on generic chacha library instead of crypto driver Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 06/35] crypto: arm64/chacha - expose arm64 ChaCha routine as library function Ard Biesheuvel
2019-10-23  3:16   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 07/35] crypto: arm/chacha - import Eric Biggers's scalar accelerated ChaCha code Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 08/35] crypto: arm/chacha - remove dependency on generic ChaCha driver Ard Biesheuvel
2019-10-23  3:21   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 09/35] crypto: arm/chacha - expose ARM ChaCha routine as library function Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 10/35] crypto: mips/chacha - import 32r2 ChaCha code from Zinc Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 11/35] crypto: mips/chacha - wire up accelerated 32r2 " Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 12/35] crypto: chacha - unexport chacha_generic routines Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 13/35] crypto: poly1305 - move core routines into a separate library Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 14/35] crypto: x86/poly1305 - unify Poly1305 state struct with generic code Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 15/35] crypto: poly1305 - expose init/update/final library interface Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 16/35] crypto: x86/poly1305 - depend on generic library not generic shash Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 17/35] crypto: x86/poly1305 - expose existing driver as poly1305 library Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 18/35] crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 19/35] crypto: arm/poly1305 " Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 20/35] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS optimized implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 21/35] int128: move __uint128_t compiler test to Kconfig Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 22/35] crypto: BLAKE2s - generic C library implementation and selftest Ard Biesheuvel
2019-10-23  4:51   ` Eric Biggers
2019-11-06 16:41     ` Ard Biesheuvel
2019-11-08 11:28       ` Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 23/35] crypto: testmgr - add test cases for Blake2s Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 24/35] crypto: blake2s - implement generic shash driver Ard Biesheuvel
2019-10-23  3:25   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 25/35] crypto: BLAKE2s - x86_64 SIMD implementation Ard Biesheuvel
2019-10-23  4:55   ` Eric Biggers [this message]
2019-10-23 14:08     ` Jason A. Donenfeld
2019-10-23 15:04       ` Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 26/35] crypto: Curve25519 - generic C library implementations Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 27/35] crypto: testmgr - implement testing for KPP failures Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 28/35] crypto: curve25519 - add kpp selftest Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 29/35] crypto: curve25519 - implement generic KPP driver Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 30/35] crypto: lib/curve25519 - work around Clang stack spilling issue Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 31/35] crypto: Curve25519 - x86_64 library and KPP implementations Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 32/35] crypto: arm - import Bernstein and Schwabe's Curve25519 ARM implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 33/35] crypto: arm/Curve25519 - wire up NEON implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 34/35] crypto: chacha20poly1305 - import construction and selftest from Zinc Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 35/35] crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine Ard Biesheuvel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191023045511.GC361298@sol.localdomain \
    --to=ebiggers@kernel.org \
    --cc=Jason@zx2c4.com \
    --cc=ard.biesheuvel@linaro.org \
    --cc=arnd@arndb.de \
    --cc=davem@davemloft.net \
    --cc=dsterba@suse.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-crypto@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=martin@strongswan.org \
    --cc=opensource@vdorst.com \
    --cc=sneves@dei.uc.pt \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).