I ran the crypto self-tests on big endian arm and arm64 kernels in QEMU. The arm64 implementation of ChaCha (and XChaCha) is failing, so this series fixes it. Please consider sending to Linus for v5.0 if there's time, as this fixes commits that are new in v5.0. Eric Biggers (2): crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian crypto: arm64/chacha - fix hchacha_block_neon() for big endian arch/arm64/crypto/chacha-neon-core.S | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) -- 2.20.1
From: Eric Biggers <ebiggers@google.com> The change to encrypt a fifth ChaCha block using scalar instructions caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests to start failing on big endian arm64 kernels. The bug is that the keystream block produced in 32-bit scalar registers is directly XOR'd with the data words, which are loaded and stored in native endianness. Thus in big endian mode the data bytes end up XOR'd with the wrong bytes. Fix it by byte-swapping the keystream words in big endian mode. Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed") Signed-off-by: Eric Biggers <ebiggers@google.com> --- arch/arm64/crypto/chacha-neon-core.S | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index 021bb9e9784b2..bfb80e10ff7b0 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon) add v3.4s, v3.4s, v19.4s add a2, a2, w8 add a3, a3, w9 +CPU_BE( rev a0, a0 ) +CPU_BE( rev a1, a1 ) +CPU_BE( rev a2, a2 ) +CPU_BE( rev a3, a3 ) ld4r {v24.4s-v27.4s}, [x0], #16 ld4r {v28.4s-v31.4s}, [x0] @@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon) add v7.4s, v7.4s, v23.4s add a6, a6, w8 add a7, a7, w9 +CPU_BE( rev a4, a4 ) +CPU_BE( rev a5, a5 ) +CPU_BE( rev a6, a6 ) +CPU_BE( rev a7, a7 ) // x8[0-3] += s2[0] // x9[0-3] += s2[1] @@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon) add v11.4s, v11.4s, v27.4s add a10, a10, w8 add a11, a11, w9 +CPU_BE( rev a8, a8 ) +CPU_BE( rev a9, a9 ) +CPU_BE( rev a10, a10 ) +CPU_BE( rev a11, a11 ) // x12[0-3] += s3[0] // x13[0-3] += s3[1] @@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon) add v15.4s, v15.4s, v31.4s add a14, a14, w8 add a15, a15, w9 +CPU_BE( rev a12, a12 ) +CPU_BE( rev a13, a13 ) +CPU_BE( rev a14, a14 ) +CPU_BE( rev a15, a15 ) // interleave 32-bit words in state n, n+1 ldp w6, w7, [x2], #64 -- 2.20.1
From: Eric Biggers <ebiggers@google.com> On big endian arm64 kernels, the xchacha20-neon and xchacha12-neon self-tests fail because hchacha_block_neon() outputs little endian words but the C code expects native endianness. Fix it to output the words in native endianness (which also makes it match the arm32 version). Fixes: cc7cf991e9eb ("crypto: arm64/chacha20 - add XChaCha20 support") Signed-off-by: Eric Biggers <ebiggers@google.com> --- arch/arm64/crypto/chacha-neon-core.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index bfb80e10ff7b0..706c4e10e9e29 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -158,8 +158,8 @@ ENTRY(hchacha_block_neon) mov w3, w2 bl chacha_permute - st1 {v0.16b}, [x1], #16 - st1 {v3.16b}, [x1] + st1 {v0.4s}, [x1], #16 + st1 {v3.4s}, [x1] ldp x29, x30, [sp], #16 ret -- 2.20.1
On Sat, 23 Feb 2019 at 07:54, Eric Biggers <ebiggers@kernel.org> wrote: > > From: Eric Biggers <ebiggers@google.com> > > The change to encrypt a fifth ChaCha block using scalar instructions > caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests > to start failing on big endian arm64 kernels. The bug is that the > keystream block produced in 32-bit scalar registers is directly XOR'd > with the data words, which are loaded and stored in native endianness. > Thus in big endian mode the data bytes end up XOR'd with the wrong > bytes. Fix it by byte-swapping the keystream words in big endian mode. > > Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed") > Signed-off-by: Eric Biggers <ebiggers@google.com> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > --- > arch/arm64/crypto/chacha-neon-core.S | 16 ++++++++++++++++ > 1 file changed, 16 insertions(+) > > diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S > index 021bb9e9784b2..bfb80e10ff7b0 100644 > --- a/arch/arm64/crypto/chacha-neon-core.S > +++ b/arch/arm64/crypto/chacha-neon-core.S > @@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon) > add v3.4s, v3.4s, v19.4s > add a2, a2, w8 > add a3, a3, w9 > +CPU_BE( rev a0, a0 ) > +CPU_BE( rev a1, a1 ) > +CPU_BE( rev a2, a2 ) > +CPU_BE( rev a3, a3 ) > > ld4r {v24.4s-v27.4s}, [x0], #16 > ld4r {v28.4s-v31.4s}, [x0] > @@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon) > add v7.4s, v7.4s, v23.4s > add a6, a6, w8 > add a7, a7, w9 > +CPU_BE( rev a4, a4 ) > +CPU_BE( rev a5, a5 ) > +CPU_BE( rev a6, a6 ) > +CPU_BE( rev a7, a7 ) > > // x8[0-3] += s2[0] > // x9[0-3] += s2[1] > @@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon) > add v11.4s, v11.4s, v27.4s > add a10, a10, w8 > add a11, a11, w9 > +CPU_BE( rev a8, a8 ) > +CPU_BE( rev a9, a9 ) > +CPU_BE( rev a10, a10 ) > +CPU_BE( rev a11, a11 ) > > // x12[0-3] += s3[0] > // x13[0-3] += s3[1] > @@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon) > add v15.4s, v15.4s, v31.4s > add a14, a14, w8 > add a15, a15, w9 > +CPU_BE( rev a12, a12 ) > +CPU_BE( rev a13, a13 ) > +CPU_BE( rev a14, a14 ) > +CPU_BE( rev a15, a15 ) > > // interleave 32-bit words in state n, n+1 > ldp w6, w7, [x2], #64 > -- > 2.20.1 >
On Sat, 23 Feb 2019 at 07:54, Eric Biggers <ebiggers@kernel.org> wrote: > > From: Eric Biggers <ebiggers@google.com> > > On big endian arm64 kernels, the xchacha20-neon and xchacha12-neon > self-tests fail because hchacha_block_neon() outputs little endian words > but the C code expects native endianness. Fix it to output the words in > native endianness (which also makes it match the arm32 version). > > Fixes: cc7cf991e9eb ("crypto: arm64/chacha20 - add XChaCha20 support") > Signed-off-by: Eric Biggers <ebiggers@google.com> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > --- > arch/arm64/crypto/chacha-neon-core.S | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S > index bfb80e10ff7b0..706c4e10e9e29 100644 > --- a/arch/arm64/crypto/chacha-neon-core.S > +++ b/arch/arm64/crypto/chacha-neon-core.S > @@ -158,8 +158,8 @@ ENTRY(hchacha_block_neon) > mov w3, w2 > bl chacha_permute > > - st1 {v0.16b}, [x1], #16 > - st1 {v3.16b}, [x1] > + st1 {v0.4s}, [x1], #16 > + st1 {v3.4s}, [x1] > > ldp x29, x30, [sp], #16 > ret > -- > 2.20.1 >
On Fri, Feb 22, 2019 at 10:54:06PM -0800, Eric Biggers wrote: > I ran the crypto self-tests on big endian arm and arm64 kernels in QEMU. > The arm64 implementation of ChaCha (and XChaCha) is failing, so this > series fixes it. > > Please consider sending to Linus for v5.0 if there's time, as this fixes > commits that are new in v5.0. > > Eric Biggers (2): > crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian > crypto: arm64/chacha - fix hchacha_block_neon() for big endian > > arch/arm64/crypto/chacha-neon-core.S | 20 ++++++++++++++++++-- > 1 file changed, 18 insertions(+), 2 deletions(-) All applied. Thanks. -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt