From: Ard Biesheuvel <ardb@kernel.org> To: linux-crypto@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org, will@kernel.org, mark.rutland@arm.com, catalin.marinas@arm.com, herbert@gondor.apana.org.au, Ard Biesheuvel <ardb@kernel.org>, Dave Martin <dave.martin@arm.com>, Eric Biggers <ebiggers@google.com> Subject: [PATCH v2 2/9] crypto: arm64/sha1-ce - simplify NEON yield Date: Wed, 3 Feb 2021 12:36:19 +0100 [thread overview] Message-ID: <20210203113626.220151-3-ardb@kernel.org> (raw) In-Reply-To: <20210203113626.220151-1-ardb@kernel.org> Instead of calling into kernel_neon_end() and kernel_neon_begin() (and potentially into schedule()) from the assembler code when running in task mode and a reschedule is pending, perform only the preempt count check in assembler, but simply return early in this case, and let the C code deal with the consequences. This reverts commit 7df8d164753e6e6f229b72767595072bc6a71f48. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- arch/arm64/crypto/sha1-ce-core.S | 47 +++++++------------- arch/arm64/crypto/sha1-ce-glue.c | 22 ++++----- 2 files changed, 29 insertions(+), 40 deletions(-) diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index 92d0d2753e81..8c02bbc2684e 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -62,40 +62,34 @@ .endm /* - * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - * int blocks) + * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + * int blocks) */ SYM_FUNC_START(sha1_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load round constants */ -0: loadrc k0.4s, 0x5a827999, w6 + loadrc k0.4s, 0x5a827999, w6 loadrc k1.4s, 0x6ed9eba1, w6 loadrc k2.4s, 0x8f1bbcdc, w6 loadrc k3.4s, 0xca62c1d6, w6 /* load state */ - ld1 {dgav.4s}, [x19] - ldr dgb, [x19, #16] + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ ldr_l w4, sha1_ce_offsetof_finalize, x4 - ldr w4, [x19, x4] + ldr w4, [x0, x4] /* load input */ -1: ld1 {v8.4s-v11.4s}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev32 v8.16b, v8.16b ) CPU_LE( rev32 v9.16b, v9.16b ) CPU_LE( rev32 v10.16b, v10.16b ) CPU_LE( rev32 v11.16b, v11.16b ) -2: add t0.4s, v8.4s, k0.4s +1: add t0.4s, v8.4s, k0.4s mov dg0v.16b, dgav.16b add_update c, ev, k0, 8, 9, 10, 11, dgb @@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b ) add dgbv.2s, dgbv.2s, dg1v.2s add dgav.4s, dgav.4s, dg0v.4s - cbz w21, 3f - - if_will_cond_yield_neon - st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - do_cond_yield_neon + cbz w2, 2f + cond_yield 3f, x5 b 0b - endif_yield_neon - - b 1b /* * Final block: add padding and total bit count. * Skip if the input size was not a round multiple of the block size, * the padding is handled by the C code in that case. */ -3: cbz x4, 4f +2: cbz x4, 3f ldr_l w4, sha1_ce_offsetof_count, x4 - ldr x4, [x19, x4] + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 @@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b ) mov x4, #0 mov v11.d[0], xzr mov v11.d[1], x7 - b 2b + b 1b /* store new state */ -4: st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - frame_pop +3: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov w0, w2 ret SYM_FUNC_END(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index c1362861765f..71fa4f1122d7 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -29,14 +29,22 @@ struct sha1_ce_state { extern const u32 sha1_ce_offsetof_count; extern const u32 sha1_ce_offsetof_finalize; -asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - int blocks); +asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + int blocks); static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src, int blocks) { - sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src, - blocks); + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA1_BLOCK_SIZE; + blocks = rem; + } } const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); @@ -51,9 +59,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, return crypto_sha1_update(desc, data, len); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); - kernel_neon_end(); return 0; } @@ -73,11 +79,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, */ sctx->finalize = finalize; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); if (!finalize) sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } @@ -89,9 +93,7 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) return crypto_sha1_finup(desc, NULL, 0, out); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } -- 2.30.0
WARNING: multiple messages have this Message-ID (diff)
From: Ard Biesheuvel <ardb@kernel.org> To: linux-crypto@vger.kernel.org Cc: mark.rutland@arm.com, herbert@gondor.apana.org.au, Eric Biggers <ebiggers@google.com>, catalin.marinas@arm.com, Dave Martin <dave.martin@arm.com>, will@kernel.org, Ard Biesheuvel <ardb@kernel.org>, linux-arm-kernel@lists.infradead.org Subject: [PATCH v2 2/9] crypto: arm64/sha1-ce - simplify NEON yield Date: Wed, 3 Feb 2021 12:36:19 +0100 [thread overview] Message-ID: <20210203113626.220151-3-ardb@kernel.org> (raw) In-Reply-To: <20210203113626.220151-1-ardb@kernel.org> Instead of calling into kernel_neon_end() and kernel_neon_begin() (and potentially into schedule()) from the assembler code when running in task mode and a reschedule is pending, perform only the preempt count check in assembler, but simply return early in this case, and let the C code deal with the consequences. This reverts commit 7df8d164753e6e6f229b72767595072bc6a71f48. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- arch/arm64/crypto/sha1-ce-core.S | 47 +++++++------------- arch/arm64/crypto/sha1-ce-glue.c | 22 ++++----- 2 files changed, 29 insertions(+), 40 deletions(-) diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index 92d0d2753e81..8c02bbc2684e 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -62,40 +62,34 @@ .endm /* - * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - * int blocks) + * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + * int blocks) */ SYM_FUNC_START(sha1_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load round constants */ -0: loadrc k0.4s, 0x5a827999, w6 + loadrc k0.4s, 0x5a827999, w6 loadrc k1.4s, 0x6ed9eba1, w6 loadrc k2.4s, 0x8f1bbcdc, w6 loadrc k3.4s, 0xca62c1d6, w6 /* load state */ - ld1 {dgav.4s}, [x19] - ldr dgb, [x19, #16] + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ ldr_l w4, sha1_ce_offsetof_finalize, x4 - ldr w4, [x19, x4] + ldr w4, [x0, x4] /* load input */ -1: ld1 {v8.4s-v11.4s}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev32 v8.16b, v8.16b ) CPU_LE( rev32 v9.16b, v9.16b ) CPU_LE( rev32 v10.16b, v10.16b ) CPU_LE( rev32 v11.16b, v11.16b ) -2: add t0.4s, v8.4s, k0.4s +1: add t0.4s, v8.4s, k0.4s mov dg0v.16b, dgav.16b add_update c, ev, k0, 8, 9, 10, 11, dgb @@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b ) add dgbv.2s, dgbv.2s, dg1v.2s add dgav.4s, dgav.4s, dg0v.4s - cbz w21, 3f - - if_will_cond_yield_neon - st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - do_cond_yield_neon + cbz w2, 2f + cond_yield 3f, x5 b 0b - endif_yield_neon - - b 1b /* * Final block: add padding and total bit count. * Skip if the input size was not a round multiple of the block size, * the padding is handled by the C code in that case. */ -3: cbz x4, 4f +2: cbz x4, 3f ldr_l w4, sha1_ce_offsetof_count, x4 - ldr x4, [x19, x4] + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 @@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b ) mov x4, #0 mov v11.d[0], xzr mov v11.d[1], x7 - b 2b + b 1b /* store new state */ -4: st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - frame_pop +3: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov w0, w2 ret SYM_FUNC_END(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index c1362861765f..71fa4f1122d7 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -29,14 +29,22 @@ struct sha1_ce_state { extern const u32 sha1_ce_offsetof_count; extern const u32 sha1_ce_offsetof_finalize; -asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - int blocks); +asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + int blocks); static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src, int blocks) { - sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src, - blocks); + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA1_BLOCK_SIZE; + blocks = rem; + } } const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); @@ -51,9 +59,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, return crypto_sha1_update(desc, data, len); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); - kernel_neon_end(); return 0; } @@ -73,11 +79,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, */ sctx->finalize = finalize; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); if (!finalize) sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } @@ -89,9 +93,7 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) return crypto_sha1_finup(desc, NULL, 0, out); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } -- 2.30.0 _______________________________________________ linux-arm-kernel mailing list linux-arm-kernel@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
next prev parent reply other threads:[~2021-02-03 11:37 UTC|newest] Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-02-03 11:36 [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 1/9] arm64: assembler: add cond_yield macro Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel [this message] 2021-02-03 11:36 ` [PATCH v2 2/9] crypto: arm64/sha1-ce - simplify NEON yield Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 3/9] crypto: arm64/sha2-ce " Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 4/9] crypto: arm64/sha3-ce " Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 5/9] crypto: arm64/sha512-ce " Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 6/9] crypto: arm64/aes-neonbs - remove NEON yield calls Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 7/9] crypto: arm64/aes-ce-mac - simplify NEON yield Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 8/9] crypto: arm64/crc-t10dif - move NEON yield to C code Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 11:36 ` [PATCH v2 9/9] arm64: assembler: remove conditional NEON yield macros Ard Biesheuvel 2021-02-03 11:36 ` Ard Biesheuvel 2021-02-03 21:31 ` (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code Will Deacon 2021-02-03 21:31 ` Will Deacon 2021-02-04 2:44 ` Herbert Xu 2021-02-04 2:44 ` Herbert Xu 2021-02-04 8:29 ` Ard Biesheuvel 2021-02-04 8:29 ` Ard Biesheuvel 2021-02-04 11:10 ` Herbert Xu 2021-02-04 11:10 ` Herbert Xu 2021-02-04 13:03 ` Will Deacon 2021-02-04 13:03 ` Will Deacon 2021-02-04 19:45 ` Herbert Xu 2021-02-04 19:45 ` Herbert Xu 2021-02-04 10:33 ` Will Deacon 2021-02-04 10:33 ` Will Deacon 2021-02-10 7:23 ` Herbert Xu 2021-02-10 7:23 ` Herbert Xu
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20210203113626.220151-3-ardb@kernel.org \ --to=ardb@kernel.org \ --cc=catalin.marinas@arm.com \ --cc=dave.martin@arm.com \ --cc=ebiggers@google.com \ --cc=herbert@gondor.apana.org.au \ --cc=linux-arm-kernel@lists.infradead.org \ --cc=linux-crypto@vger.kernel.org \ --cc=mark.rutland@arm.com \ --cc=will@kernel.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: linkBe sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.