* [PATCH] arm64/crypto: fix and improve GHASH secure hash implementation
@ 2014-06-12 14:43 Ard Biesheuvel
2014-06-12 14:48 ` Catalin Marinas
2014-06-16 9:31 ` Catalin Marinas
0 siblings, 2 replies; 4+ messages in thread
From: Ard Biesheuvel @ 2014-06-12 14:43 UTC (permalink / raw)
To: linux-arm-kernel
This fixes a bug in the arm64 GHASH implementation, and switches to a faster,
polynomial-multiplication-based reduction instead of one that uses
shifts and rotates.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
This is a bug fix and a performance optimization in a single patch. As the code
has never worked correctly and was merged just a couple of days ago, I am
assuming this is OK but if anyone would prefer the bug fix separately, I'm happy
to split them as well.
Ard.
arch/arm64/crypto/ghash-ce-core.S | 92 ++++++++++++++++-----------------------
arch/arm64/crypto/ghash-ce-glue.c | 5 ++-
2 files changed, 41 insertions(+), 56 deletions(-)
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index b9e6eaf41c9b..dc457015884e 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -3,14 +3,6 @@
*
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
- * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
- *
- * Copyright (c) 2009 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- * Vinodh Gopal
- * Erdinc Ozturk
- * Deniz Karakoyunlu
- *
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
@@ -19,13 +11,15 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
- DATA .req v0
- SHASH .req v1
- IN1 .req v2
+ SHASH .req v0
+ SHASH2 .req v1
T1 .req v2
T2 .req v3
- T3 .req v4
- VZR .req v5
+ MASK .req v4
+ XL .req v5
+ XM .req v6
+ XH .req v7
+ IN1 .req v7
.text
.arch armv8-a+crypto
@@ -35,61 +29,51 @@
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update)
- ld1 {DATA.16b}, [x1]
ld1 {SHASH.16b}, [x3]
- eor VZR.16b, VZR.16b, VZR.16b
+ ld1 {XL.16b}, [x1]
+ movi MASK.16b, #0xe1
+ ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
+ shl MASK.2d, MASK.2d, #57
+ eor SHASH2.16b, SHASH2.16b, SHASH.16b
/* do the head block first, if supplied */
cbz x4, 0f
- ld1 {IN1.2d}, [x4]
+ ld1 {T1.2d}, [x4]
b 1f
-0: ld1 {IN1.2d}, [x2], #16
+0: ld1 {T1.2d}, [x2], #16
sub w0, w0, #1
-1: ext IN1.16b, IN1.16b, IN1.16b, #8
-CPU_LE( rev64 IN1.16b, IN1.16b )
- eor DATA.16b, DATA.16b, IN1.16b
- /* multiply DATA by SHASH in GF(2^128) */
- ext T2.16b, DATA.16b, DATA.16b, #8
- ext T3.16b, SHASH.16b, SHASH.16b, #8
- eor T2.16b, T2.16b, DATA.16b
- eor T3.16b, T3.16b, SHASH.16b
+1: /* multiply XL by SHASH in GF(2^128) */
+CPU_LE( rev64 T1.16b, T1.16b )
- pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
- pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
- pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
- eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
- eor T2.16b, T2.16b, DATA.16b
+ ext T2.16b, XL.16b, XL.16b, #8
+ ext IN1.16b, T1.16b, T1.16b, #8
+ eor T1.16b, T1.16b, T2.16b
+ eor XL.16b, XL.16b, IN1.16b
- ext T3.16b, VZR.16b, T2.16b, #8
- ext T2.16b, T2.16b, VZR.16b, #8
- eor DATA.16b, DATA.16b, T3.16b
- eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
- // carry-less multiplication
+ pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
+ eor T1.16b, T1.16b, XL.16b
+ pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
+ pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
- /* first phase of the reduction */
- shl T3.2d, DATA.2d, #1
- eor T3.16b, T3.16b, DATA.16b
- shl T3.2d, T3.2d, #5
- eor T3.16b, T3.16b, DATA.16b
- shl T3.2d, T3.2d, #57
- ext T2.16b, VZR.16b, T3.16b, #8
- ext T3.16b, T3.16b, VZR.16b, #8
- eor DATA.16b, DATA.16b, T2.16b
- eor T1.16b, T1.16b, T3.16b
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor T2.16b, XL.16b, XH.16b
+ eor XM.16b, XM.16b, T1.16b
+ eor XM.16b, XM.16b, T2.16b
+ pmull T2.1q, XL.1d, MASK.1d
- /* second phase of the reduction */
- ushr T2.2d, DATA.2d, #5
- eor T2.16b, T2.16b, DATA.16b
- ushr T2.2d, T2.2d, #1
- eor T2.16b, T2.16b, DATA.16b
- ushr T2.2d, T2.2d, #1
- eor T1.16b, T1.16b, T2.16b
- eor DATA.16b, DATA.16b, T1.16b
+ mov XH.d[0], XM.d[1]
+ mov XM.d[1], XL.d[0]
+
+ eor XL.16b, XM.16b, T2.16b
+ ext T2.16b, XL.16b, XL.16b, #8
+ pmull XL.1q, XL.1d, MASK.1d
+ eor T2.16b, T2.16b, XH.16b
+ eor XL.16b, XL.16b, T2.16b
cbnz w0, 0b
- st1 {DATA.16b}, [x1]
+ st1 {XL.16b}, [x1]
ret
ENDPROC(pmull_ghash_update)
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index b92baf3f68c7..833ec1e3f3e9 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -67,11 +67,12 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
blocks = len / GHASH_BLOCK_SIZE;
len %= GHASH_BLOCK_SIZE;
- kernel_neon_begin_partial(6);
+ kernel_neon_begin_partial(8);
pmull_ghash_update(blocks, ctx->digest, src, key,
partial ? ctx->buf : NULL);
kernel_neon_end();
src += blocks * GHASH_BLOCK_SIZE;
+ partial = 0;
}
if (len)
memcpy(ctx->buf + partial, src, len);
@@ -88,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
- kernel_neon_begin_partial(6);
+ kernel_neon_begin_partial(8);
pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
kernel_neon_end();
}
--
1.8.3.2
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH] arm64/crypto: fix and improve GHASH secure hash implementation
2014-06-12 14:43 [PATCH] arm64/crypto: fix and improve GHASH secure hash implementation Ard Biesheuvel
@ 2014-06-12 14:48 ` Catalin Marinas
2014-06-12 14:54 ` Ard Biesheuvel
2014-06-16 9:31 ` Catalin Marinas
1 sibling, 1 reply; 4+ messages in thread
From: Catalin Marinas @ 2014-06-12 14:48 UTC (permalink / raw)
To: linux-arm-kernel
On Thu, Jun 12, 2014 at 03:43:07PM +0100, Ard Biesheuvel wrote:
> This fixes a bug in the arm64 GHASH implementation, and switches to a faster,
> polynomial multiplication based reduction instead of one that uses
> shifts and rotates.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>
> This is a bug fix and a performance optimization in a single patch.
Thanks. I'll push it after -rc1.
> As the code has never worked correctly and was merged just a couple of
> days ago,
IIRC you said you tested the crypto patches.
--
Catalin
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH] arm64/crypto: fix and improve GHASH secure hash implementation
2014-06-12 14:48 ` Catalin Marinas
@ 2014-06-12 14:54 ` Ard Biesheuvel
0 siblings, 0 replies; 4+ messages in thread
From: Ard Biesheuvel @ 2014-06-12 14:54 UTC (permalink / raw)
To: linux-arm-kernel
On 12 June 2014 16:48, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On Thu, Jun 12, 2014 at 03:43:07PM +0100, Ard Biesheuvel wrote:
>> This fixes a bug in the arm64 GHASH implementation, and switches to a faster,
>> polynomial multiplication based reduction instead of one that uses
>> shifts and rotates.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>>
>> This is a bug fix and a performance optimization in a single patch.
>
> Thanks. I'll push it after -rc1.
>
>> As the code has never worked correctly and was merged just a couple of
>> days ago,
>
> IIRC you said you tested the crypto patches.
>
Yes, I did. Unfortunately [as I found out today] the built-in test
suite for GHASH consists of a single test vector, which happens to
pass with the old code. I will be submitting a patch to linux-crypto
shortly to add more test vectors that would have caught this
particular bug.
--
Ard.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH] arm64/crypto: fix and improve GHASH secure hash implementation
2014-06-12 14:43 [PATCH] arm64/crypto: fix and improve GHASH secure hash implementation Ard Biesheuvel
2014-06-12 14:48 ` Catalin Marinas
@ 2014-06-16 9:31 ` Catalin Marinas
1 sibling, 0 replies; 4+ messages in thread
From: Catalin Marinas @ 2014-06-16 9:31 UTC (permalink / raw)
To: linux-arm-kernel
On Thu, Jun 12, 2014 at 03:43:07PM +0100, Ard Biesheuvel wrote:
> This fixes a bug in the arm64 GHASH implementation, and switches to a faster,
> polynomial multiplication based reduction instead of one that uses
> shifts and rotates.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>
> This is a bug fix and a performance optimization in a single patch. As the code
> has never worked correctly and was merged just a couple of days ago, I am
> assuming this is OK but if anyone would prefer the bug fix separately, I'm happy
> to split them as well.
I can push a performance optimisation patch as well but I would prefer
to separate the bug-fixing part. Could you please also describe what the
bug was for future reference?
Thanks.
--
Catalin
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2014-06-16 9:31 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-12 14:43 [PATCH] arm64/crypto: fix and improve GHASH secure hash implementation Ard Biesheuvel
2014-06-12 14:48 ` Catalin Marinas
2014-06-12 14:54 ` Ard Biesheuvel
2014-06-16 9:31 ` Catalin Marinas
This is an external index of several public inboxes;
see the mirroring instructions on how to clone and mirror
all data and code used by this external index.