All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Biggers <ebiggers@kernel.org>
To: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: linux-crypto@vger.kernel.org,
	Herbert Xu <herbert@gondor.apana.org.au>,
	David Miller <davem@davemloft.net>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Samuel Neves <sneves@dei.uc.pt>, Arnd Bergmann <arnd@arndb.de>,
	Andy Lutomirski <luto@kernel.org>,
	Martin Willi <martin@strongswan.org>,
	Rene van Dorst <opensource@vdorst.com>,
	David Sterba <dsterba@suse.com>
Subject: Re: [PATCH v4 25/35] crypto: BLAKE2s - x86_64 SIMD implementation
Date: Tue, 22 Oct 2019 21:55:11 -0700	[thread overview]
Message-ID: <20191023045511.GC361298@sol.localdomain> (raw)
In-Reply-To: <20191017190932.1947-26-ard.biesheuvel@linaro.org>

On Thu, Oct 17, 2019 at 09:09:22PM +0200, Ard Biesheuvel wrote:
> From: "Jason A. Donenfeld" <Jason@zx2c4.com>
> 
> These implementations from Samuel Neves support AVX and AVX-512VL.
> Originally this used AVX-512F, but Skylake thermal throttling made
> AVX-512VL more attractive and possible to do with negligible difference.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
> Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
> [ardb: move to arch/x86/crypto, wire into lib/crypto framework]
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/x86/crypto/Makefile       |   2 +
>  arch/x86/crypto/blake2s-core.S | 685 ++++++++++++++++++++
>  arch/x86/crypto/blake2s-glue.c | 235 +++++++
>  crypto/Kconfig                 |   6 +
>  4 files changed, 928 insertions(+)
> 
> diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
> index 759b1a927826..922c8ecfa00f 100644
> --- a/arch/x86/crypto/Makefile
> +++ b/arch/x86/crypto/Makefile
> @@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
>  	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
>  	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
> +	obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
>  endif
>  
>  # These modules require assembler to support AVX2.
> @@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
>  aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
>  
>  nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
> +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
>  
>  ifeq ($(avx_supported),yes)
>  	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
> diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
> new file mode 100644
> index 000000000000..675288fa4cca
> --- /dev/null
> +++ b/arch/x86/crypto/blake2s-core.S
> @@ -0,0 +1,685 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +
> +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
> +.align 32
> +IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
> +	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
> +.section .rodata.cst16.ROT16, "aM", @progbits, 16
> +.align 16
> +ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
> +.section .rodata.cst16.ROR328, "aM", @progbits, 16
> +.align 16
> +ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
> +#ifdef CONFIG_AS_AVX512
> +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
> +.align 64
> +SIGMA:
> +.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
> +.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
> +.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
> +.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
> +.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
> +.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
> +.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
> +.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
> +.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
> +.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
> +#endif /* CONFIG_AS_AVX512 */
> +
> +.text
> +#ifdef CONFIG_AS_AVX
> +ENTRY(blake2s_compress_avx)
> +	movl		%ecx, %ecx
> +	testq		%rdx, %rdx
> +	je		.Lendofloop
> +	.align 32
> +.Lbeginofloop:
> +	addq		%rcx, 32(%rdi)
> +	vmovdqu		IV+16(%rip), %xmm1
> +	vmovdqu		(%rsi), %xmm4
> +	vpxor		32(%rdi), %xmm1, %xmm1
> +	vmovdqu		16(%rsi), %xmm3
> +	vshufps		$136, %xmm3, %xmm4, %xmm6
> +	vmovdqa		ROT16(%rip), %xmm7
> +	vpaddd		(%rdi), %xmm6, %xmm6
> +	vpaddd		16(%rdi), %xmm6, %xmm6
> +	vpxor		%xmm6, %xmm1, %xmm1
> +	vmovdqu		IV(%rip), %xmm8
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vmovdqu		48(%rsi), %xmm5
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		16(%rdi), %xmm8, %xmm9
> +	vmovdqu		32(%rsi), %xmm2
> +	vpblendw	$12, %xmm3, %xmm5, %xmm13
> +	vshufps		$221, %xmm5, %xmm2, %xmm12
> +	vpunpckhqdq	%xmm2, %xmm4, %xmm14
> +	vpslld		$20, %xmm9, %xmm0
> +	vpsrld		$12, %xmm9, %xmm9
> +	vpxor		%xmm0, %xmm9, %xmm0
> +	vshufps		$221, %xmm3, %xmm4, %xmm9
> +	vpaddd		%xmm9, %xmm6, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vmovdqa		ROR328(%rip), %xmm6
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm0
> +	vpshufd		$147, %xmm1, %xmm1
> +	vpshufd		$78, %xmm8, %xmm8
> +	vpslld		$25, %xmm0, %xmm10
> +	vpsrld		$7, %xmm0, %xmm0
> +	vpxor		%xmm10, %xmm0, %xmm0
> +	vshufps		$136, %xmm5, %xmm2, %xmm10
> +	vpshufd		$57, %xmm0, %xmm0
> +	vpaddd		%xmm10, %xmm9, %xmm9
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpaddd		%xmm12, %xmm9, %xmm9
> +	vpblendw	$12, %xmm2, %xmm3, %xmm12
> +	vpshufb		%xmm7, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
> +	vpxor		%xmm8, %xmm0, %xmm10
> +	vpslld		$20, %xmm10, %xmm0
> +	vpsrld		$12, %xmm10, %xmm10
> +	vpxor		%xmm0, %xmm10, %xmm0
> +	vpaddd		%xmm0, %xmm9, %xmm9
> +	vpxor		%xmm9, %xmm1, %xmm1
> +	vpshufb		%xmm6, %xmm1, %xmm1
> +	vpaddd		%xmm1, %xmm8, %xmm8
[...]

There are no comments in this 685-line assembly language file.
Is this the original version, or is it a generated/stripped version?

- Eric

  reply	other threads:[~2019-10-23  4:55 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-17 19:08 [PATCH v4 00/35] crypto: crypto API library interfaces for WireGuard Ard Biesheuvel
2019-10-17 19:08 ` [PATCH v4 01/35] crypto: tidy up lib/crypto Kconfig and Makefile Ard Biesheuvel
2019-10-17 19:08 ` [PATCH v4 02/35] crypto: chacha - move existing library code into lib/crypto Ard Biesheuvel
2019-10-23  3:05   ` Eric Biggers
2019-11-04  9:06     ` Ard Biesheuvel
2019-10-23  3:12   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 03/35] crypto: x86/chacha - depend on generic chacha library instead of crypto driver Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 04/35] crypto: x86/chacha - expose SIMD ChaCha routine as library function Ard Biesheuvel
2019-10-23  3:10   ` Eric Biggers
2019-10-23  4:40   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 05/35] crypto: arm64/chacha - depend on generic chacha library instead of crypto driver Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 06/35] crypto: arm64/chacha - expose arm64 ChaCha routine as library function Ard Biesheuvel
2019-10-23  3:16   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 07/35] crypto: arm/chacha - import Eric Biggers's scalar accelerated ChaCha code Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 08/35] crypto: arm/chacha - remove dependency on generic ChaCha driver Ard Biesheuvel
2019-10-23  3:21   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 09/35] crypto: arm/chacha - expose ARM ChaCha routine as library function Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 10/35] crypto: mips/chacha - import 32r2 ChaCha code from Zinc Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 11/35] crypto: mips/chacha - wire up accelerated 32r2 " Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 12/35] crypto: chacha - unexport chacha_generic routines Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 13/35] crypto: poly1305 - move core routines into a separate library Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 14/35] crypto: x86/poly1305 - unify Poly1305 state struct with generic code Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 15/35] crypto: poly1305 - expose init/update/final library interface Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 16/35] crypto: x86/poly1305 - depend on generic library not generic shash Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 17/35] crypto: x86/poly1305 - expose existing driver as poly1305 library Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 18/35] crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 19/35] crypto: arm/poly1305 " Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 20/35] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS optimized implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 21/35] int128: move __uint128_t compiler test to Kconfig Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 22/35] crypto: BLAKE2s - generic C library implementation and selftest Ard Biesheuvel
2019-10-23  4:51   ` Eric Biggers
2019-11-06 16:41     ` Ard Biesheuvel
2019-11-08 11:28       ` Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 23/35] crypto: testmgr - add test cases for Blake2s Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 24/35] crypto: blake2s - implement generic shash driver Ard Biesheuvel
2019-10-23  3:25   ` Eric Biggers
2019-10-17 19:09 ` [PATCH v4 25/35] crypto: BLAKE2s - x86_64 SIMD implementation Ard Biesheuvel
2019-10-23  4:55   ` Eric Biggers [this message]
2019-10-23 14:08     ` Jason A. Donenfeld
2019-10-23 15:04       ` Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 26/35] crypto: Curve25519 - generic C library implementations Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 27/35] crypto: testmgr - implement testing for KPP failures Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 28/35] crypto: curve25519 - add kpp selftest Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 29/35] crypto: curve25519 - implement generic KPP driver Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 30/35] crypto: lib/curve25519 - work around Clang stack spilling issue Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 31/35] crypto: Curve25519 - x86_64 library and KPP implementations Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 32/35] crypto: arm - import Bernstein and Schwabe's Curve25519 ARM implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 33/35] crypto: arm/Curve25519 - wire up NEON implementation Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 34/35] crypto: chacha20poly1305 - import construction and selftest from Zinc Ard Biesheuvel
2019-10-17 19:09 ` [PATCH v4 35/35] crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine Ard Biesheuvel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191023045511.GC361298@sol.localdomain \
    --to=ebiggers@kernel.org \
    --cc=Jason@zx2c4.com \
    --cc=ard.biesheuvel@linaro.org \
    --cc=arnd@arndb.de \
    --cc=davem@davemloft.net \
    --cc=dsterba@suse.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-crypto@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=martin@strongswan.org \
    --cc=opensource@vdorst.com \
    --cc=sneves@dei.uc.pt \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.