* [PATCH v2 1/9] arm64: assembler: add cond_yield macro
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
Add a cond_yield macro that branches to a specified label when called if
the TIF_NEED_RESCHED flag is set and decreasing the preempt count would
make the task preemptible again, allowing a reschedule to occur. This
can be used by kernel mode SIMD code that keeps a lot of state in SIMD
registers, which would make chunking the input in order to perform the
cond_resched() check from C code disproportionately costly.
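As an illustration of the calling convention this enables: an asm routine can
use cond_yield to bail out early and report how many blocks it did not
process, while a small C wrapper retries until all input has been consumed.
A minimal sketch, where my_transform(), struct my_state and MY_BLOCK_SIZE are
hypothetical names:

  /*
   * Hypothetical C-side retry loop; my_transform() is assumed to be an asm
   * routine that uses cond_yield and returns the number of unprocessed blocks.
   */
  static void my_do_blocks(struct my_state *st, const u8 *src, int blocks)
  {
          while (blocks) {
                  int rem;

                  kernel_neon_begin();
                  rem = my_transform(st, src, blocks);    /* may stop early */
                  kernel_neon_end();                      /* preemption point */

                  src += (blocks - rem) * MY_BLOCK_SIZE;
                  blocks = rem;
          }
  }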
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/include/asm/assembler.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index bf125c591116..27b1ea721c2d 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -745,6 +745,22 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU
.Lyield_out_\@ :
.endm
+ /*
+ * Check whether preempt-disabled code should yield as soon as it
+ * is able. This is the case if re-enabling preemption a single
+ * time results in a preempt count of zero, and the TIF_NEED_RESCHED
+ * flag is set. (Note that the latter is stored negated in the
+ * top word of the thread_info::preempt_count field)
+ */
+ .macro cond_yield, lbl:req, tmp:req
+#ifdef CONFIG_PREEMPTION
+ get_current_task \tmp
+ ldr \tmp, [\tmp, #TSK_TI_PREEMPT]
+ sub \tmp, \tmp, #PREEMPT_DISABLE_OFFSET
+ cbz \tmp, \lbl
+#endif
+ .endm
+
/*
* This macro emits a program property note section identifying
* architecture features which require special handling, mainly for
--
2.30.0
* [PATCH v2 2/9] crypto: arm64/sha1-ce - simplify NEON yield
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
Instead of calling into kernel_neon_end() and kernel_neon_begin() (and
potentially into schedule()) from the assembler code when running in
task mode and a reschedule is pending, perform only the preempt count
check in assembler, return early in that case, and let the C code deal
with the consequences.
This reverts commit 7df8d164753e6e6f229b72767595072bc6a71f48.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/crypto/sha1-ce-core.S | 47 +++++++-------------
arch/arm64/crypto/sha1-ce-glue.c | 22 ++++-----
2 files changed, 29 insertions(+), 40 deletions(-)
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index 92d0d2753e81..8c02bbc2684e 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -62,40 +62,34 @@
.endm
/*
- * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- * int blocks)
+ * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
+ * int blocks)
*/
SYM_FUNC_START(sha1_ce_transform)
- frame_push 3
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
-
/* load round constants */
-0: loadrc k0.4s, 0x5a827999, w6
+ loadrc k0.4s, 0x5a827999, w6
loadrc k1.4s, 0x6ed9eba1, w6
loadrc k2.4s, 0x8f1bbcdc, w6
loadrc k3.4s, 0xca62c1d6, w6
/* load state */
- ld1 {dgav.4s}, [x19]
- ldr dgb, [x19, #16]
+ ld1 {dgav.4s}, [x0]
+ ldr dgb, [x0, #16]
/* load sha1_ce_state::finalize */
ldr_l w4, sha1_ce_offsetof_finalize, x4
- ldr w4, [x19, x4]
+ ldr w4, [x0, x4]
/* load input */
-1: ld1 {v8.4s-v11.4s}, [x20], #64
- sub w21, w21, #1
+0: ld1 {v8.4s-v11.4s}, [x1], #64
+ sub w2, w2, #1
CPU_LE( rev32 v8.16b, v8.16b )
CPU_LE( rev32 v9.16b, v9.16b )
CPU_LE( rev32 v10.16b, v10.16b )
CPU_LE( rev32 v11.16b, v11.16b )
-2: add t0.4s, v8.4s, k0.4s
+1: add t0.4s, v8.4s, k0.4s
mov dg0v.16b, dgav.16b
add_update c, ev, k0, 8, 9, 10, 11, dgb
@@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s
- cbz w21, 3f
-
- if_will_cond_yield_neon
- st1 {dgav.4s}, [x19]
- str dgb, [x19, #16]
- do_cond_yield_neon
+ cbz w2, 2f
+ cond_yield 3f, x5
b 0b
- endif_yield_neon
-
- b 1b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
-3: cbz x4, 4f
+2: cbz x4, 3f
ldr_l w4, sha1_ce_offsetof_count, x4
- ldr x4, [x19, x4]
+ ldr x4, [x0, x4]
movi v9.2d, #0
mov x8, #0x80000000
movi v10.2d, #0
@@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b )
mov x4, #0
mov v11.d[0], xzr
mov v11.d[1], x7
- b 2b
+ b 1b
/* store new state */
-4: st1 {dgav.4s}, [x19]
- str dgb, [x19, #16]
- frame_pop
+3: st1 {dgav.4s}, [x0]
+ str dgb, [x0, #16]
+ mov w0, w2
ret
SYM_FUNC_END(sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index c1362861765f..71fa4f1122d7 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -29,14 +29,22 @@ struct sha1_ce_state {
extern const u32 sha1_ce_offsetof_count;
extern const u32 sha1_ce_offsetof_finalize;
-asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- int blocks);
+asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
+ int blocks);
static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src,
int blocks)
{
- sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src,
- blocks);
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state,
+ sst), src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA1_BLOCK_SIZE;
+ blocks = rem;
+ }
}
const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
@@ -51,9 +59,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
return crypto_sha1_update(desc, data, len);
sctx->finalize = 0;
- kernel_neon_begin();
sha1_base_do_update(desc, data, len, __sha1_ce_transform);
- kernel_neon_end();
return 0;
}
@@ -73,11 +79,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
- kernel_neon_begin();
sha1_base_do_update(desc, data, len, __sha1_ce_transform);
if (!finalize)
sha1_base_do_finalize(desc, __sha1_ce_transform);
- kernel_neon_end();
return sha1_base_finish(desc, out);
}
@@ -89,9 +93,7 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out)
return crypto_sha1_finup(desc, NULL, 0, out);
sctx->finalize = 0;
- kernel_neon_begin();
sha1_base_do_finalize(desc, __sha1_ce_transform);
- kernel_neon_end();
return sha1_base_finish(desc, out);
}
--
2.30.0
* [PATCH v2 3/9] crypto: arm64/sha2-ce - simplify NEON yield
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
Instead of calling into kernel_neon_end() and kernel_neon_begin() (and
potentially into schedule()) from the assembler code when running in
task mode and a reschedule is pending, perform only the preempt count
check in assembler, return early in that case, and let the C code deal
with the consequences.
This reverts commit d82f37ab5e2426287013eba38b1212e8b71e5be3.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/crypto/sha2-ce-core.S | 38 +++++++-------------
arch/arm64/crypto/sha2-ce-glue.c | 22 ++++++------
2 files changed, 25 insertions(+), 35 deletions(-)
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 3f9d0f326987..6cdea7d56059 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -76,36 +76,30 @@
*/
.text
SYM_FUNC_START(sha2_ce_transform)
- frame_push 3
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
-
/* load round constants */
-0: adr_l x8, .Lsha2_rcon
+ adr_l x8, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [x8], #64
ld1 { v4.4s- v7.4s}, [x8], #64
ld1 { v8.4s-v11.4s}, [x8], #64
ld1 {v12.4s-v15.4s}, [x8]
/* load state */
- ld1 {dgav.4s, dgbv.4s}, [x19]
+ ld1 {dgav.4s, dgbv.4s}, [x0]
/* load sha256_ce_state::finalize */
ldr_l w4, sha256_ce_offsetof_finalize, x4
- ldr w4, [x19, x4]
+ ldr w4, [x0, x4]
/* load input */
-1: ld1 {v16.4s-v19.4s}, [x20], #64
- sub w21, w21, #1
+0: ld1 {v16.4s-v19.4s}, [x1], #64
+ sub w2, w2, #1
CPU_LE( rev32 v16.16b, v16.16b )
CPU_LE( rev32 v17.16b, v17.16b )
CPU_LE( rev32 v18.16b, v18.16b )
CPU_LE( rev32 v19.16b, v19.16b )
-2: add t0.4s, v16.4s, v0.4s
+1: add t0.4s, v16.4s, v0.4s
mov dg0v.16b, dgav.16b
mov dg1v.16b, dgbv.16b
@@ -134,24 +128,18 @@ CPU_LE( rev32 v19.16b, v19.16b )
add dgbv.4s, dgbv.4s, dg1v.4s
/* handled all input blocks? */
- cbz w21, 3f
-
- if_will_cond_yield_neon
- st1 {dgav.4s, dgbv.4s}, [x19]
- do_cond_yield_neon
+ cbz w2, 2f
+ cond_yield 3f, x5
b 0b
- endif_yield_neon
-
- b 1b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
-3: cbz x4, 4f
+2: cbz x4, 3f
ldr_l w4, sha256_ce_offsetof_count, x4
- ldr x4, [x19, x4]
+ ldr x4, [x0, x4]
movi v17.2d, #0
mov x8, #0x80000000
movi v18.2d, #0
@@ -160,10 +148,10 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov x4, #0
mov v19.d[0], xzr
mov v19.d[1], x7
- b 2b
+ b 1b
/* store new state */
-4: st1 {dgav.4s, dgbv.4s}, [x19]
- frame_pop
+3: st1 {dgav.4s, dgbv.4s}, [x0]
+ mov w0, w2
ret
SYM_FUNC_END(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index ded3a6488f81..c57a6119fefc 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -30,14 +30,22 @@ struct sha256_ce_state {
extern const u32 sha256_ce_offsetof_count;
extern const u32 sha256_ce_offsetof_finalize;
-asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
- int blocks);
+asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
+ int blocks);
static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src,
int blocks)
{
- sha2_ce_transform(container_of(sst, struct sha256_ce_state, sst), src,
- blocks);
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state,
+ sst), src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA256_BLOCK_SIZE;
+ blocks = rem;
+ }
}
const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
@@ -63,9 +71,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
__sha256_block_data_order);
sctx->finalize = 0;
- kernel_neon_begin();
sha256_base_do_update(desc, data, len, __sha2_ce_transform);
- kernel_neon_end();
return 0;
}
@@ -90,11 +96,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
- kernel_neon_begin();
sha256_base_do_update(desc, data, len, __sha2_ce_transform);
if (!finalize)
sha256_base_do_finalize(desc, __sha2_ce_transform);
- kernel_neon_end();
return sha256_base_finish(desc, out);
}
@@ -108,9 +112,7 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
}
sctx->finalize = 0;
- kernel_neon_begin();
sha256_base_do_finalize(desc, __sha2_ce_transform);
- kernel_neon_end();
return sha256_base_finish(desc, out);
}
--
2.30.0
* [PATCH v2 4/9] crypto: arm64/sha3-ce - simplify NEON yield
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
Instead of calling into kernel_neon_end() and kernel_neon_begin() (and
potentially into schedule()) from the assembler code when running in
task mode and a reschedule is pending, perform only the preempt count
check in assembler, return early in that case, and let the C code deal
with the consequences.
This reverts commit 7edc86cb1c18b4c274672232117586ea2bef1d9a.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/crypto/sha3-ce-core.S | 81 ++++++++------------
arch/arm64/crypto/sha3-ce-glue.c | 14 ++--
2 files changed, 39 insertions(+), 56 deletions(-)
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
index 1cfb768df350..6f5208414fe3 100644
--- a/arch/arm64/crypto/sha3-ce-core.S
+++ b/arch/arm64/crypto/sha3-ce-core.S
@@ -37,20 +37,13 @@
.endm
/*
- * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
+ * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
*/
.text
SYM_FUNC_START(sha3_ce_transform)
- frame_push 4
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
-
-0: /* load state */
- add x8, x19, #32
- ld1 { v0.1d- v3.1d}, [x19]
+ /* load state */
+ add x8, x0, #32
+ ld1 { v0.1d- v3.1d}, [x0]
ld1 { v4.1d- v7.1d}, [x8], #32
ld1 { v8.1d-v11.1d}, [x8], #32
ld1 {v12.1d-v15.1d}, [x8], #32
@@ -58,13 +51,13 @@ SYM_FUNC_START(sha3_ce_transform)
ld1 {v20.1d-v23.1d}, [x8], #32
ld1 {v24.1d}, [x8]
-1: sub w21, w21, #1
+0: sub w2, w2, #1
mov w8, #24
adr_l x9, .Lsha3_rcon
/* load input */
- ld1 {v25.8b-v28.8b}, [x20], #32
- ld1 {v29.8b-v31.8b}, [x20], #24
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b-v31.8b}, [x1], #24
eor v0.8b, v0.8b, v25.8b
eor v1.8b, v1.8b, v26.8b
eor v2.8b, v2.8b, v27.8b
@@ -73,10 +66,10 @@ SYM_FUNC_START(sha3_ce_transform)
eor v5.8b, v5.8b, v30.8b
eor v6.8b, v6.8b, v31.8b
- tbnz x22, #6, 3f // SHA3-512
+ tbnz x3, #6, 2f // SHA3-512
- ld1 {v25.8b-v28.8b}, [x20], #32
- ld1 {v29.8b-v30.8b}, [x20], #16
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b-v30.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
eor v9.8b, v9.8b, v27.8b
@@ -84,34 +77,34 @@ SYM_FUNC_START(sha3_ce_transform)
eor v11.8b, v11.8b, v29.8b
eor v12.8b, v12.8b, v30.8b
- tbnz x22, #4, 2f // SHA3-384 or SHA3-224
+ tbnz x3, #4, 1f // SHA3-384 or SHA3-224
// SHA3-256
- ld1 {v25.8b-v28.8b}, [x20], #32
+ ld1 {v25.8b-v28.8b}, [x1], #32
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
- b 4f
+ b 3f
-2: tbz x22, #2, 4f // bit 2 cleared? SHA-384
+1: tbz x3, #2, 3f // bit 2 cleared? SHA-384
// SHA3-224
- ld1 {v25.8b-v28.8b}, [x20], #32
- ld1 {v29.8b}, [x20], #8
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b}, [x1], #8
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
eor v17.8b, v17.8b, v29.8b
- b 4f
+ b 3f
// SHA3-512
-3: ld1 {v25.8b-v26.8b}, [x20], #16
+2: ld1 {v25.8b-v26.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
-4: sub w8, w8, #1
+3: sub w8, w8, #1
eor3 v29.16b, v4.16b, v9.16b, v14.16b
eor3 v26.16b, v1.16b, v6.16b, v11.16b
@@ -190,33 +183,19 @@ SYM_FUNC_START(sha3_ce_transform)
eor v0.16b, v0.16b, v31.16b
- cbnz w8, 4b
- cbz w21, 5f
-
- if_will_cond_yield_neon
- add x8, x19, #32
- st1 { v0.1d- v3.1d}, [x19]
- st1 { v4.1d- v7.1d}, [x8], #32
- st1 { v8.1d-v11.1d}, [x8], #32
- st1 {v12.1d-v15.1d}, [x8], #32
- st1 {v16.1d-v19.1d}, [x8], #32
- st1 {v20.1d-v23.1d}, [x8], #32
- st1 {v24.1d}, [x8]
- do_cond_yield_neon
- b 0b
- endif_yield_neon
-
- b 1b
+ cbnz w8, 3b
+ cond_yield 3f, x8
+ cbnz w2, 0b
/* save state */
-5: st1 { v0.1d- v3.1d}, [x19], #32
- st1 { v4.1d- v7.1d}, [x19], #32
- st1 { v8.1d-v11.1d}, [x19], #32
- st1 {v12.1d-v15.1d}, [x19], #32
- st1 {v16.1d-v19.1d}, [x19], #32
- st1 {v20.1d-v23.1d}, [x19], #32
- st1 {v24.1d}, [x19]
- frame_pop
+3: st1 { v0.1d- v3.1d}, [x0], #32
+ st1 { v4.1d- v7.1d}, [x0], #32
+ st1 { v8.1d-v11.1d}, [x0], #32
+ st1 {v12.1d-v15.1d}, [x0], #32
+ st1 {v16.1d-v19.1d}, [x0], #32
+ st1 {v20.1d-v23.1d}, [x0], #32
+ st1 {v24.1d}, [x0]
+ mov w0, w2
ret
SYM_FUNC_END(sha3_ce_transform)
diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
index 7288d3046354..8c65cecf560a 100644
--- a/arch/arm64/crypto/sha3-ce-glue.c
+++ b/arch/arm64/crypto/sha3-ce-glue.c
@@ -28,8 +28,8 @@ MODULE_ALIAS_CRYPTO("sha3-256");
MODULE_ALIAS_CRYPTO("sha3-384");
MODULE_ALIAS_CRYPTO("sha3-512");
-asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks,
- int md_len);
+asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
+ int md_len);
static int sha3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
@@ -59,11 +59,15 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,
blocks = len / sctx->rsiz;
len %= sctx->rsiz;
- if (blocks) {
+ while (blocks) {
+ int rem;
+
kernel_neon_begin();
- sha3_ce_transform(sctx->st, data, blocks, digest_size);
+ rem = sha3_ce_transform(sctx->st, data, blocks,
+ digest_size);
kernel_neon_end();
- data += blocks * sctx->rsiz;
+ data += (blocks - rem) * sctx->rsiz;
+ blocks = rem;
}
}
--
2.30.0
* [PATCH v2 5/9] crypto: arm64/sha512-ce - simplify NEON yield
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
Instead of calling into kernel_neon_end() and kernel_neon_begin() (and
potentially into schedule()) from the assembler code when running in
task mode and a reschedule is pending, perform only the preempt count
check in assembler, return early in that case, and let the C code deal
with the consequences.
This reverts commit 6caf7adc5e458f77f550b6c6ca8effa152d61b4a.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/crypto/sha512-ce-core.S | 29 +++--------
arch/arm64/crypto/sha512-ce-glue.c | 53 ++++++++++----------
2 files changed, 34 insertions(+), 48 deletions(-)
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
index cde606c0323e..d6e7f6c95fa6 100644
--- a/arch/arm64/crypto/sha512-ce-core.S
+++ b/arch/arm64/crypto/sha512-ce-core.S
@@ -107,23 +107,17 @@
*/
.text
SYM_FUNC_START(sha512_ce_transform)
- frame_push 3
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
-
/* load state */
-0: ld1 {v8.2d-v11.2d}, [x19]
+ ld1 {v8.2d-v11.2d}, [x0]
/* load first 4 round constants */
adr_l x3, .Lsha512_rcon
ld1 {v20.2d-v23.2d}, [x3], #64
/* load input */
-1: ld1 {v12.2d-v15.2d}, [x20], #64
- ld1 {v16.2d-v19.2d}, [x20], #64
- sub w21, w21, #1
+0: ld1 {v12.2d-v15.2d}, [x1], #64
+ ld1 {v16.2d-v19.2d}, [x1], #64
+ sub w2, w2, #1
CPU_LE( rev64 v12.16b, v12.16b )
CPU_LE( rev64 v13.16b, v13.16b )
@@ -201,19 +195,12 @@ CPU_LE( rev64 v19.16b, v19.16b )
add v10.2d, v10.2d, v2.2d
add v11.2d, v11.2d, v3.2d
+ cond_yield 3f, x4
/* handled all input blocks? */
- cbz w21, 3f
-
- if_will_cond_yield_neon
- st1 {v8.2d-v11.2d}, [x19]
- do_cond_yield_neon
- b 0b
- endif_yield_neon
-
- b 1b
+ cbnz w2, 0b
/* store new state */
-3: st1 {v8.2d-v11.2d}, [x19]
- frame_pop
+3: st1 {v8.2d-v11.2d}, [x0]
+ mov w0, w2
ret
SYM_FUNC_END(sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
index a6b1adf31c56..e62a094a9d52 100644
--- a/arch/arm64/crypto/sha512-ce-glue.c
+++ b/arch/arm64/crypto/sha512-ce-glue.c
@@ -26,11 +26,25 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha512");
-asmlinkage void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks);
+asmlinkage int sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ int blocks);
asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
+static void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ int blocks)
+{
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = sha512_ce_transform(sst, src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA512_BLOCK_SIZE;
+ blocks = rem;
+ }
+}
+
static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
int blocks)
{
@@ -40,45 +54,30 @@ static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- if (!crypto_simd_usable())
- return sha512_base_do_update(desc, data, len,
- __sha512_block_data_order);
-
- kernel_neon_begin();
- sha512_base_do_update(desc, data, len, sha512_ce_transform);
- kernel_neon_end();
+ sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform
+ : __sha512_block_data_order;
+ sha512_base_do_update(desc, data, len, fn);
return 0;
}
static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!crypto_simd_usable()) {
- if (len)
- sha512_base_do_update(desc, data, len,
- __sha512_block_data_order);
- sha512_base_do_finalize(desc, __sha512_block_data_order);
- return sha512_base_finish(desc, out);
- }
+ sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform
+ : __sha512_block_data_order;
- kernel_neon_begin();
- sha512_base_do_update(desc, data, len, sha512_ce_transform);
- sha512_base_do_finalize(desc, sha512_ce_transform);
- kernel_neon_end();
+ sha512_base_do_update(desc, data, len, fn);
+ sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
static int sha512_ce_final(struct shash_desc *desc, u8 *out)
{
- if (!crypto_simd_usable()) {
- sha512_base_do_finalize(desc, __sha512_block_data_order);
- return sha512_base_finish(desc, out);
- }
+ sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform
+ : __sha512_block_data_order;
- kernel_neon_begin();
- sha512_base_do_finalize(desc, sha512_ce_transform);
- kernel_neon_end();
+ sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
--
2.30.0
* [PATCH v2 6/9] crypto: arm64/aes-neonbs - remove NEON yield calls
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
There is no need for elaborate yield handling in the bit-sliced NEON
implementation of AES, given that skciphers are naturally bounded by the
size of the chunks returned by the skcipher_walk API. So remove the
yield calls from the asm code.
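For context, a rough sketch of why a typical skcipher walk already bounds
each kernel mode NEON section: every iteration of the walk hands back at most
a page worth of data. my_neon_encrypt() is a hypothetical asm routine, and
the sketch assumes the request size is a multiple of AES_BLOCK_SIZE:

  static int my_skcipher_encrypt(struct skcipher_request *req)
  {
          struct skcipher_walk walk;
          int err;

          err = skcipher_walk_virt(&walk, req, false);

          while (walk.nbytes) {
                  unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;

                  /* each NEON section covers at most one page of input */
                  kernel_neon_begin();
                  my_neon_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
                                  blocks);
                  kernel_neon_end();

                  err = skcipher_walk_done(&walk,
                                           walk.nbytes % AES_BLOCK_SIZE);
          }
          return err;
  }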
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/crypto/aes-neonbs-core.S | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index 63a52ad9a75c..a3405b8c344b 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -613,7 +613,6 @@ SYM_FUNC_END(aesbs_decrypt8)
st1 {\o7\().16b}, [x19], #16
cbz x23, 1f
- cond_yield_neon
b 99b
1: frame_pop
@@ -715,7 +714,6 @@ SYM_FUNC_START(aesbs_cbc_decrypt)
1: st1 {v24.16b}, [x24] // store IV
cbz x23, 2f
- cond_yield_neon
b 99b
2: frame_pop
@@ -801,7 +799,7 @@ SYM_FUNC_END(__xts_crypt8)
mov x23, x4
mov x24, x5
-0: movi v30.2s, #0x1
+ movi v30.2s, #0x1
movi v25.2s, #0x87
uzp1 v30.4s, v30.4s, v25.4s
ld1 {v25.16b}, [x24]
@@ -846,7 +844,6 @@ SYM_FUNC_END(__xts_crypt8)
cbz x23, 1f
st1 {v25.16b}, [x24]
- cond_yield_neon 0b
b 99b
1: st1 {v25.16b}, [x24]
@@ -889,7 +886,7 @@ SYM_FUNC_START(aesbs_ctr_encrypt)
cset x26, ne
add x23, x23, x26 // do one extra block if final
-98: ldp x7, x8, [x24]
+ ldp x7, x8, [x24]
ld1 {v0.16b}, [x24]
CPU_LE( rev x7, x7 )
CPU_LE( rev x8, x8 )
@@ -967,7 +964,6 @@ CPU_LE( rev x8, x8 )
st1 {v0.16b}, [x24]
cbz x23, .Lctr_done
- cond_yield_neon 98b
b 99b
.Lctr_done:
--
2.30.0
* [PATCH v2 7/9] crypto: arm64/aes-ce-mac - simplify NEON yield
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
As with the other drivers in this series, instead of calling into
kernel_neon_end() and kernel_neon_begin() from the assembler code when a
reschedule is pending, perform only the preempt count check in assembler,
return early in that case, and let the C glue retry the call with the
remaining blocks.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/crypto/aes-glue.c | 21 +++++---
arch/arm64/crypto/aes-modes.S | 52 +++++++-------------
2 files changed, 33 insertions(+), 40 deletions(-)
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index e7f116d833b9..17e735931a0c 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -105,9 +105,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u8 iv[],
u32 const rk2[]);
-asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
- int blocks, u8 dg[], int enc_before,
- int enc_after);
+asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+ int blocks, u8 dg[], int enc_before,
+ int enc_after);
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
@@ -856,10 +856,17 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
int rounds = 6 + ctx->key_length / 4;
if (crypto_simd_usable()) {
- kernel_neon_begin();
- aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
- enc_after);
- kernel_neon_end();
+ int rem;
+
+ do {
+ kernel_neon_begin();
+ rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
+ dg, enc_before, enc_after);
+ kernel_neon_end();
+ in += (blocks - rem) * AES_BLOCK_SIZE;
+ blocks = rem;
+ enc_before = 0;
+ } while (blocks);
} else {
if (enc_before)
aes_encrypt(ctx, dg, dg);
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 3d1f97799899..bbdb54702aa7 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -678,61 +678,47 @@ AES_FUNC_END(aes_xts_decrypt)
* int blocks, u8 dg[], int enc_before, int enc_after)
*/
AES_FUNC_START(aes_mac_update)
- frame_push 6
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x6
-
- ld1 {v0.16b}, [x23] /* get dg */
+ ld1 {v0.16b}, [x4] /* get dg */
enc_prepare w2, x1, x7
cbz w5, .Lmacloop4x
encrypt_block v0, w2, x1, x7, w8
.Lmacloop4x:
- subs w22, w22, #4
+ subs w3, w3, #4
bmi .Lmac1x
- ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */
+ ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v2.16b
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v3.16b
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v4.16b
- cmp w22, wzr
- csinv x5, x24, xzr, eq
+ cmp w3, wzr
+ csinv x5, x6, xzr, eq
cbz w5, .Lmacout
- encrypt_block v0, w21, x20, x7, w8
- st1 {v0.16b}, [x23] /* return dg */
- cond_yield_neon .Lmacrestart
+ encrypt_block v0, w2, x1, x7, w8
+ st1 {v0.16b}, [x4] /* return dg */
+ cond_yield .Lmacout, x7
b .Lmacloop4x
.Lmac1x:
- add w22, w22, #4
+ add w3, w3, #4
.Lmacloop:
- cbz w22, .Lmacout
- ld1 {v1.16b}, [x19], #16 /* get next pt block */
+ cbz w3, .Lmacout
+ ld1 {v1.16b}, [x0], #16 /* get next pt block */
eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
- subs w22, w22, #1
- csinv x5, x24, xzr, eq
+ subs w3, w3, #1
+ csinv x5, x6, xzr, eq
cbz w5, .Lmacout
.Lmacenc:
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
b .Lmacloop
.Lmacout:
- st1 {v0.16b}, [x23] /* return dg */
- frame_pop
+ st1 {v0.16b}, [x4] /* return dg */
+ mov w0, w3
ret
-
-.Lmacrestart:
- ld1 {v0.16b}, [x23] /* get dg */
- enc_prepare w21, x20, x0
- b .Lmacloop4x
AES_FUNC_END(aes_mac_update)
--
2.30.0
* [PATCH v2 8/9] crypto: arm64/crc-t10dif - move NEON yield to C code
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
Instead of yielding from the bowels of the asm routine if a reschedule
is needed, divide up the input into 4 KB chunks in the C glue. This
simplifies the code substantially, and avoids scheduling out the task
with the asm routine on the call stack, which is undesirable from a
CFI/instrumentation point of view.
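Worth noting about the hunks below: the remaining length is compared against
SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE rather than plain SZ_4K, so the final
call into the asm routine is never handed fewer than
CRC_T10DIF_PMULL_CHUNK_SIZE bytes (the asm assumes len >= 16). A standalone
sketch of that chunk-size choice, using a hypothetical helper name:

  /* Hypothetical helper illustrating the chunk size picked by the glue. */
  static unsigned int next_chunk(unsigned int length)
  {
          /*
           * Cap each NEON section at 4 KB, but only when doing so leaves at
           * least CRC_T10DIF_PMULL_CHUNK_SIZE bytes for the final call, since
           * the asm routine requires a minimum input length.
           */
          if (length > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
                  return SZ_4K;
          return length;
  }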
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/crypto/crct10dif-ce-core.S | 43 +++++---------------
arch/arm64/crypto/crct10dif-ce-glue.c | 30 +++++++++++---
2 files changed, 35 insertions(+), 38 deletions(-)
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 111d9c9abddd..dce6dcebfca1 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -68,10 +68,10 @@
.text
.arch armv8-a+crypto
- init_crc .req w19
- buf .req x20
- len .req x21
- fold_consts_ptr .req x22
+ init_crc .req w0
+ buf .req x1
+ len .req x2
+ fold_consts_ptr .req x3
fold_consts .req v10
@@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
.endm
.macro crc_t10dif_pmull, p
- frame_push 4, 128
-
- mov init_crc, w0
- mov buf, x1
- mov len, x2
-
__pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
fold_32_bytes \p, v6, v7
subs len, len, #128
- b.lt .Lfold_128_bytes_loop_done_\@
-
- if_will_cond_yield_neon
- stp q0, q1, [sp, #.Lframe_local_offset]
- stp q2, q3, [sp, #.Lframe_local_offset + 32]
- stp q4, q5, [sp, #.Lframe_local_offset + 64]
- stp q6, q7, [sp, #.Lframe_local_offset + 96]
- do_cond_yield_neon
- ldp q0, q1, [sp, #.Lframe_local_offset]
- ldp q2, q3, [sp, #.Lframe_local_offset + 32]
- ldp q4, q5, [sp, #.Lframe_local_offset + 64]
- ldp q6, q7, [sp, #.Lframe_local_offset + 96]
- ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_init_\p
- __pmull_pre_\p fold_consts
- endif_yield_neon
-
- b .Lfold_128_bytes_loop_\@
-
-.Lfold_128_bytes_loop_done_\@:
+ b.ge .Lfold_128_bytes_loop_\@
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
@@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
umov w0, v0.h[0]
- frame_pop
+ .ifc \p, p8
+ ldp x29, x30, [sp], #16
+ .endif
ret
.Lless_than_256_bytes_\@:
@@ -489,7 +466,9 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
- crc_t10dif_pmull p8
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)
.align 5
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index ccc3f6067742..09eb1456aed4 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p8(*crc, data, length);
- kernel_neon_end();
+ do {
+ unsigned int chunk = length;
+
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
+
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
+ kernel_neon_end();
+ data += chunk;
+ length -= chunk;
+ } while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p64(*crc, data, length);
- kernel_neon_end();
+ do {
+ unsigned int chunk = length;
+
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
+
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
+ kernel_neon_end();
+ data += chunk;
+ length -= chunk;
+ } while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
--
2.30.0
* [PATCH v2 9/9] arm64: assembler: remove conditional NEON yield macros
From: Ard Biesheuvel @ 2021-02-03 11:36 UTC (permalink / raw)
To: linux-crypto
Cc: linux-arm-kernel, will, mark.rutland, catalin.marinas, herbert,
Ard Biesheuvel, Dave Martin, Eric Biggers
The users of the conditional NEON yield macros have all been switched to
the simplified cond_yield macro, so the NEON-specific ones can be
removed.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/include/asm/assembler.h | 70 --------------------
1 file changed, 70 deletions(-)
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 27b1ea721c2d..0bb5829ff06f 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -675,76 +675,6 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU
.endif
.endm
-/*
- * Check whether to yield to another runnable task from kernel mode NEON code
- * (which runs with preemption disabled).
- *
- * if_will_cond_yield_neon
- * // pre-yield patchup code
- * do_cond_yield_neon
- * // post-yield patchup code
- * endif_yield_neon <label>
- *
- * where <label> is optional, and marks the point where execution will resume
- * after a yield has been performed. If omitted, execution resumes right after
- * the endif_yield_neon invocation. Note that the entire sequence, including
- * the provided patchup code, will be omitted from the image if
- * CONFIG_PREEMPTION is not defined.
- *
- * As a convenience, in the case where no patchup code is required, the above
- * sequence may be abbreviated to
- *
- * cond_yield_neon <label>
- *
- * Note that the patchup code does not support assembler directives that change
- * the output section, any use of such directives is undefined.
- *
- * The yield itself consists of the following:
- * - Check whether the preempt count is exactly 1 and a reschedule is also
- * needed. If so, calling of preempt_enable() in kernel_neon_end() will
- * trigger a reschedule. If it is not the case, yielding is pointless.
- * - Disable and re-enable kernel mode NEON, and branch to the yield fixup
- * code.
- *
- * This macro sequence may clobber all CPU state that is not guaranteed by the
- * AAPCS to be preserved across an ordinary function call.
- */
-
- .macro cond_yield_neon, lbl
- if_will_cond_yield_neon
- do_cond_yield_neon
- endif_yield_neon \lbl
- .endm
-
- .macro if_will_cond_yield_neon
-#ifdef CONFIG_PREEMPTION
- get_current_task x0
- ldr x0, [x0, #TSK_TI_PREEMPT]
- sub x0, x0, #PREEMPT_DISABLE_OFFSET
- cbz x0, .Lyield_\@
- /* fall through to endif_yield_neon */
- .subsection 1
-.Lyield_\@ :
-#else
- .section ".discard.cond_yield_neon", "ax"
-#endif
- .endm
-
- .macro do_cond_yield_neon
- bl kernel_neon_end
- bl kernel_neon_begin
- .endm
-
- .macro endif_yield_neon, lbl
- .ifnb \lbl
- b \lbl
- .else
- b .Lyield_out_\@
- .endif
- .previous
-.Lyield_out_\@ :
- .endm
-
/*
* Check whether preempt-disabled code should yield as soon as it
* is able. This is the case if re-enabling preemption a single
--
2.30.0
* (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Will Deacon @ 2021-02-03 21:31 UTC (permalink / raw)
To: Ard Biesheuvel, linux-crypto
Cc: catalin.marinas, kernel-team, Will Deacon, mark.rutland,
Dave Martin, Eric Biggers, herbert, linux-arm-kernel
On Wed, 3 Feb 2021 12:36:17 +0100, Ard Biesheuvel wrote:
> Given how kernel mode NEON code disables preemption (to ensure that the
> FP/SIMD register state is protected without having to context switch it),
> we need to take care not to let those algorithms operate on unbounded
> input data, or we may end up with excessive scheduling blackouts on
> CONFIG_PREEMPT kernels.
>
> This is currently handled by the cond_yield_neon macros, which check the
> preempt count and the TIF_NEED_RESCHED flag from assembler code, and call
> into kernel_neon_end()+kernel_neon_begin(), triggering a reschedule.
> This works as expected, but is a bit messy, given how much of the state
> preserve/restore code in the algorithm needs to be duplicated, as well as
> causing the need to manage the stack frame explicitly. All of this is better
> handled by the compiler, especially now that we have enabled features such
> as the shadow call stack and BTI, and are working to improve call stack
> validation.
>
> [...]
Applied first patch only to arm64 (for-next/crypto), thanks!
[1/9] arm64: assembler: add cond_yield macro
https://git.kernel.org/arm64/c/d13c613f136c
This is the only patch on the branch, so I'm happy for it to be pulled
into the crypto tree too if it enables some of the other patches to land
in 5.12.
Cheers,
--
Will
https://fixes.arm64.dev
https://next.arm64.dev
https://will.arm64.dev
* Re: (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Herbert Xu @ 2021-02-04 2:44 UTC (permalink / raw)
To: Will Deacon
Cc: Ard Biesheuvel, linux-crypto, catalin.marinas, kernel-team,
mark.rutland, Dave Martin, Eric Biggers, linux-arm-kernel
On Wed, Feb 03, 2021 at 09:31:31PM +0000, Will Deacon wrote:
>
> Applied first patch only to arm64 (for-next/crypto), thanks!
>
> [1/9] arm64: assembler: add cond_yield macro
> https://git.kernel.org/arm64/c/d13c613f136c
>
> This is the only patch on the branch, so I'm happy for it to be pulled
> into the crypto tree too if it enables some of the other patches to land
> in 5.12.
Hi Will:
I think it might be easier if you take the lot.
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Ard Biesheuvel @ 2021-02-04 8:29 UTC (permalink / raw)
To: Herbert Xu
Cc: Will Deacon, Linux Crypto Mailing List, Catalin Marinas,
Android Kernel Team, Mark Rutland, Dave Martin, Eric Biggers,
Linux ARM
On Thu, 4 Feb 2021 at 03:44, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Wed, Feb 03, 2021 at 09:31:31PM +0000, Will Deacon wrote:
> >
> > Applied first patch only to arm64 (for-next/crypto), thanks!
> >
> > [1/9] arm64: assembler: add cond_yield macro
> > https://git.kernel.org/arm64/c/d13c613f136c
> >
> > This is the only patch on the branch, so I'm happy for it to be pulled
> > into the crypto tree too if it enables some of the other patches to land
> > in 5.12.
>
> Hi Will:
>
> I think it might be easier if you take the lot.
>
> Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
>
Half of the patches in this series conflict with
0df07d8117c3 crypto: arm64/sha - add missing module aliases
in the cryptodev tree, so that won't work.
* Re: (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Herbert Xu @ 2021-02-04 11:10 UTC (permalink / raw)
To: Ard Biesheuvel
Cc: Will Deacon, Linux Crypto Mailing List, Catalin Marinas,
Android Kernel Team, Mark Rutland, Dave Martin, Eric Biggers,
Linux ARM
On Thu, Feb 04, 2021 at 09:29:16AM +0100, Ard Biesheuvel wrote:
>
> Half of the patches in this series conflict with
>
> 0df07d8117c3 crypto: arm64/sha - add missing module aliases
>
> in the cryptodev tree, so that won't work.
Fair enough. I'll take the patches.
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Will Deacon @ 2021-02-04 13:03 UTC (permalink / raw)
To: Herbert Xu
Cc: Ard Biesheuvel, Linux Crypto Mailing List, Catalin Marinas,
Android Kernel Team, Mark Rutland, Dave Martin, Eric Biggers,
Linux ARM
On Thu, Feb 04, 2021 at 10:10:26PM +1100, Herbert Xu wrote:
> On Thu, Feb 04, 2021 at 09:29:16AM +0100, Ard Biesheuvel wrote:
> >
> > Half of the patches in this series conflict with
> >
> > 0df07d8117c3 crypto: arm64/sha - add missing module aliases
> >
> > in the cryptodev tree, so that won't work.
>
> Fair enough. I'll take the patches.
Cheers, Herbert. Please just leave the final patch ("arm64: assembler:
remove conditional NEON yield macro"), as we can come back to that for
5.13.
Will
* Re: (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Herbert Xu @ 2021-02-04 19:45 UTC (permalink / raw)
To: Will Deacon
Cc: Ard Biesheuvel, Linux Crypto Mailing List, Catalin Marinas,
Android Kernel Team, Mark Rutland, Dave Martin, Eric Biggers,
Linux ARM
On Thu, Feb 04, 2021 at 01:03:11PM +0000, Will Deacon wrote:
>
> Cheers, Herbert. Please just leave the final patch ("arm64: assembler:
> remove conditional NEON yield macro"), as we can come back to that for
> 5.13.
Sure I'll leave out the last patch.
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Will Deacon @ 2021-02-04 10:33 UTC (permalink / raw)
To: Ard Biesheuvel, linux-crypto
Cc: catalin.marinas, kernel-team, mark.rutland, Dave Martin,
Eric Biggers, herbert, linux-arm-kernel
On Wed, Feb 03, 2021 at 09:31:31PM +0000, Will Deacon wrote:
> On Wed, 3 Feb 2021 12:36:17 +0100, Ard Biesheuvel wrote:
> > Given how kernel mode NEON code disables preemption (to ensure that the
> > FP/SIMD register state is protected without having to context switch it),
> > we need to take care not to let those algorithms operate on unbounded
> > input data, or we may end up with excessive scheduling blackouts on
> > CONFIG_PREEMPT kernels.
> >
> > This is currently handled by the cond_yield_neon macros, which check the
> > preempt count and the TIF_NEED_RESCHED flag from assembler code, and call
> > into kernel_neon_end()+kernel_neon_begin(), triggering a reschedule.
> > This works as expected, but is a bit messy, given how much of the state
> > preserve/restore code in the algorithm needs to be duplicated, as well as
> > causing the need to manage the stack frame explicitly. All of this is better
> > handled by the compiler, especially now that we have enabled features such
> > as the shadow call stack and BTI, and are working to improve call stack
> > validation.
> >
> > [...]
>
> Applied first patch only to arm64 (for-next/crypto), thanks!
Oops, looks like I typo'd the external branch (for-next/crypo). No offense
intended! I'll rename it now; SHAs will stay the same.
Will
* Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code
From: Herbert Xu @ 2021-02-10 7:23 UTC (permalink / raw)
To: Ard Biesheuvel
Cc: linux-crypto, linux-arm-kernel, will, mark.rutland,
catalin.marinas, Dave Martin, Eric Biggers
On Wed, Feb 03, 2021 at 12:36:17PM +0100, Ard Biesheuvel wrote:
> Given how kernel mode NEON code disables preemption (to ensure that the
> FP/SIMD register state is protected without having to context switch it),
> we need to take care not to let those algorithms operate on unbounded
> input data, or we may end up with excessive scheduling blackouts on
> CONFIG_PREEMPT kernels.
>
> This is currently handled by the cond_yield_neon macros, which check the
> preempt count and the TIF_NEED_RESCHED flag from assembler code, and call
> into kernel_neon_end()+kernel_neon_begin(), triggering a reschedule.
> This works as expected, but is a bit messy, given how much of the state
> preserve/restore code in the algorithm needs to be duplicated, as well as
> causing the need to manage the stack frame explicitly. All of this is better
> handled by the compiler, especially now that we have enabled features such
> as the shadow call stack and BTI, and are working to improve call stack
> validation.
>
> In some cases, yielding is not necessary at all: algorithms that implement
> skciphers and use the skcipher walk API will be invoked at page granularity,
> which is granular enough for our purpose.
>
> In other cases, it is better to simply return early from the assembler
> routine if a reschedule is pending, and let the C code handle with this, by
> retrying the call until it completes. This removes any voluntary schedule()
> calls from the call stack, making the code much easier to reason about in
> the context of stack validation, rcu_tasks synchronization, etc.
>
> Practical note: assuming there are no objections to these changes, it may
> be the most convenient to take patch #1 into the arm64 tree for v5.12,
> and postpone the rest for merging via the crypto tree. (Note that this
> series was created against the cryptodev tree, and so the arm64 maintainers
> are also welcome to take the whole set if it applies cleanly to the arm64
> tree)
>
> Will: if you stick #1 on a separate branch, please base it on v5.11-rc1
>
> Changes since v1:
> - use sub+cbz instead of cmp+b.eq to avoid clobbering the flags in cond_yield
> (patch #1)
>
> Cc: Dave Martin <dave.martin@arm.com>
> Cc: Eric Biggers <ebiggers@google.com>
>
> Ard Biesheuvel (9):
> arm64: assembler: add cond_yield macro
> crypto: arm64/sha1-ce - simplify NEON yield
> crypto: arm64/sha2-ce - simplify NEON yield
> crypto: arm64/sha3-ce - simplify NEON yield
> crypto: arm64/sha512-ce - simplify NEON yield
> crypto: arm64/aes-neonbs - remove NEON yield calls
> crypto: arm64/aes-ce-mac - simplify NEON yield
> crypto: arm64/crc-t10dif - move NEON yield to C code
> arm64: assembler: remove conditional NEON yield macros
>
> arch/arm64/crypto/aes-glue.c | 21 +++--
> arch/arm64/crypto/aes-modes.S | 52 +++++--------
> arch/arm64/crypto/aes-neonbs-core.S | 8 +-
> arch/arm64/crypto/crct10dif-ce-core.S | 43 +++--------
> arch/arm64/crypto/crct10dif-ce-glue.c | 30 ++++++--
> arch/arm64/crypto/sha1-ce-core.S | 47 ++++--------
> arch/arm64/crypto/sha1-ce-glue.c | 22 +++---
> arch/arm64/crypto/sha2-ce-core.S | 38 ++++-----
> arch/arm64/crypto/sha2-ce-glue.c | 22 +++---
> arch/arm64/crypto/sha3-ce-core.S | 81 ++++++++------------
> arch/arm64/crypto/sha3-ce-glue.c | 14 ++--
> arch/arm64/crypto/sha512-ce-core.S | 29 ++-----
> arch/arm64/crypto/sha512-ce-glue.c | 53 +++++++------
> arch/arm64/include/asm/assembler.h | 78 +++----------------
> 14 files changed, 209 insertions(+), 329 deletions(-)
Patches 2-8 applied. Thanks.
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt