All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ard Biesheuvel <ardb@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org, will@kernel.org,
	mark.rutland@arm.com, catalin.marinas@arm.com,
	herbert@gondor.apana.org.au, Ard Biesheuvel <ardb@kernel.org>,
	Dave Martin <dave.martin@arm.com>,
	Eric Biggers <ebiggers@google.com>
Subject: [PATCH v2 3/9] crypto: arm64/sha2-ce - simplify NEON yield
Date: Wed,  3 Feb 2021 12:36:20 +0100	[thread overview]
Message-ID: <20210203113626.220151-4-ardb@kernel.org> (raw)
In-Reply-To: <20210203113626.220151-1-ardb@kernel.org>

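Instead of calling into kernel_neon_end() and kernel_neon_begin() (and
potentially into schedule()) from the assembler code when running in
task mode and a reschedule is pending, perform only the preempt count
check in assembler. If a yield is required, store the current state and
return early with the number of blocks that remain unprocessed, and let
the C code end and restart the kernel mode NEON section before calling
back into the assembler routine with the remaining input.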

This reverts commit d82f37ab5e2426287013eba38b1212e8b71e5be3.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/crypto/sha2-ce-core.S | 38 +++++++-------------
 arch/arm64/crypto/sha2-ce-glue.c | 22 ++++++------
 2 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 3f9d0f326987..6cdea7d56059 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -76,36 +76,30 @@
 	 */
 	.text
 SYM_FUNC_START(sha2_ce_transform)
-	frame_push	3
-
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-
 	/* load round constants */
-0:	adr_l		x8, .Lsha2_rcon
+	adr_l		x8, .Lsha2_rcon
 	ld1		{ v0.4s- v3.4s}, [x8], #64
 	ld1		{ v4.4s- v7.4s}, [x8], #64
 	ld1		{ v8.4s-v11.4s}, [x8], #64
 	ld1		{v12.4s-v15.4s}, [x8]
 
 	/* load state */
-	ld1		{dgav.4s, dgbv.4s}, [x19]
+	ld1		{dgav.4s, dgbv.4s}, [x0]
 
 	/* load sha256_ce_state::finalize */
 	ldr_l		w4, sha256_ce_offsetof_finalize, x4
-	ldr		w4, [x19, x4]
+	ldr		w4, [x0, x4]
 
 	/* load input */
-1:	ld1		{v16.4s-v19.4s}, [x20], #64
-	sub		w21, w21, #1
+0:	ld1		{v16.4s-v19.4s}, [x1], #64
+	sub		w2, w2, #1
 
 CPU_LE(	rev32		v16.16b, v16.16b	)
 CPU_LE(	rev32		v17.16b, v17.16b	)
 CPU_LE(	rev32		v18.16b, v18.16b	)
 CPU_LE(	rev32		v19.16b, v19.16b	)
 
-2:	add		t0.4s, v16.4s, v0.4s
+1:	add		t0.4s, v16.4s, v0.4s
 	mov		dg0v.16b, dgav.16b
 	mov		dg1v.16b, dgbv.16b
 
@@ -134,24 +128,18 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	add		dgbv.4s, dgbv.4s, dg1v.4s
 
 	/* handled all input blocks? */
-	cbz		w21, 3f
-
-	if_will_cond_yield_neon
-	st1		{dgav.4s, dgbv.4s}, [x19]
-	do_cond_yield_neon
+	cbz		w2, 2f
+	cond_yield	3f, x5
 	b		0b
-	endif_yield_neon
-
-	b		1b
 
 	/*
 	 * Final block: add padding and total bit count.
 	 * Skip if the input size was not a round multiple of the block size,
 	 * the padding is handled by the C code in that case.
 	 */
-3:	cbz		x4, 4f
+2:	cbz		x4, 3f
 	ldr_l		w4, sha256_ce_offsetof_count, x4
-	ldr		x4, [x19, x4]
+	ldr		x4, [x0, x4]
 	movi		v17.2d, #0
 	mov		x8, #0x80000000
 	movi		v18.2d, #0
@@ -160,10 +148,10 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	mov		x4, #0
 	mov		v19.d[0], xzr
 	mov		v19.d[1], x7
-	b		2b
+	b		1b
 
 	/* store new state */
-4:	st1		{dgav.4s, dgbv.4s}, [x19]
-	frame_pop
+3:	st1		{dgav.4s, dgbv.4s}, [x0]
+	mov		w0, w2
 	ret
 SYM_FUNC_END(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index ded3a6488f81..c57a6119fefc 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -30,14 +30,22 @@ struct sha256_ce_state {
 extern const u32 sha256_ce_offsetof_count;
 extern const u32 sha256_ce_offsetof_finalize;
 
-asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
-				  int blocks);
+asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
+				 int blocks);
 
 static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src,
 				int blocks)
 {
-	sha2_ce_transform(container_of(sst, struct sha256_ce_state, sst), src,
-			  blocks);
+	while (blocks) {
+		int rem;
+
+		kernel_neon_begin();
+		rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state,
+						     sst), src, blocks);
+		kernel_neon_end();
+		src += (blocks - rem) * SHA256_BLOCK_SIZE;
+		blocks = rem;
+	}
 }
 
 const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
@@ -63,9 +71,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
 				__sha256_block_data_order);
 
 	sctx->finalize = 0;
-	kernel_neon_begin();
 	sha256_base_do_update(desc, data, len, __sha2_ce_transform);
-	kernel_neon_end();
 
 	return 0;
 }
@@ -90,11 +96,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
 	 */
 	sctx->finalize = finalize;
 
-	kernel_neon_begin();
 	sha256_base_do_update(desc, data, len, __sha2_ce_transform);
 	if (!finalize)
 		sha256_base_do_finalize(desc, __sha2_ce_transform);
-	kernel_neon_end();
 	return sha256_base_finish(desc, out);
 }
 
@@ -108,9 +112,7 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
 	}
 
 	sctx->finalize = 0;
-	kernel_neon_begin();
 	sha256_base_do_finalize(desc, __sha2_ce_transform);
-	kernel_neon_end();
 	return sha256_base_finish(desc, out);
 }
 
-- 
2.30.0


WARNING: multiple messages have this Message-ID (diff)
From: Ard Biesheuvel <ardb@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: mark.rutland@arm.com, herbert@gondor.apana.org.au,
	Eric Biggers <ebiggers@google.com>,
	catalin.marinas@arm.com, Dave Martin <dave.martin@arm.com>,
	will@kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	linux-arm-kernel@lists.infradead.org
Subject: [PATCH v2 3/9] crypto: arm64/sha2-ce - simplify NEON yield
Date: Wed,  3 Feb 2021 12:36:20 +0100	[thread overview]
Message-ID: <20210203113626.220151-4-ardb@kernel.org> (raw)
In-Reply-To: <20210203113626.220151-1-ardb@kernel.org>

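Instead of calling into kernel_neon_end() and kernel_neon_begin() (and
potentially into schedule()) from the assembler code when running in
task mode and a reschedule is pending, perform only the preempt count
check in assembler. If a yield is required, store the current state and
return early with the number of blocks that remain unprocessed, and let
the C code end and restart the kernel mode NEON section before calling
back into the assembler routine with the remaining input.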

This reverts commit d82f37ab5e2426287013eba38b1212e8b71e5be3.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/crypto/sha2-ce-core.S | 38 +++++++-------------
 arch/arm64/crypto/sha2-ce-glue.c | 22 ++++++------
 2 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 3f9d0f326987..6cdea7d56059 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -76,36 +76,30 @@
 	 */
 	.text
 SYM_FUNC_START(sha2_ce_transform)
-	frame_push	3
-
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-
 	/* load round constants */
-0:	adr_l		x8, .Lsha2_rcon
+	adr_l		x8, .Lsha2_rcon
 	ld1		{ v0.4s- v3.4s}, [x8], #64
 	ld1		{ v4.4s- v7.4s}, [x8], #64
 	ld1		{ v8.4s-v11.4s}, [x8], #64
 	ld1		{v12.4s-v15.4s}, [x8]
 
 	/* load state */
-	ld1		{dgav.4s, dgbv.4s}, [x19]
+	ld1		{dgav.4s, dgbv.4s}, [x0]
 
 	/* load sha256_ce_state::finalize */
 	ldr_l		w4, sha256_ce_offsetof_finalize, x4
-	ldr		w4, [x19, x4]
+	ldr		w4, [x0, x4]
 
 	/* load input */
-1:	ld1		{v16.4s-v19.4s}, [x20], #64
-	sub		w21, w21, #1
+0:	ld1		{v16.4s-v19.4s}, [x1], #64
+	sub		w2, w2, #1
 
 CPU_LE(	rev32		v16.16b, v16.16b	)
 CPU_LE(	rev32		v17.16b, v17.16b	)
 CPU_LE(	rev32		v18.16b, v18.16b	)
 CPU_LE(	rev32		v19.16b, v19.16b	)
 
-2:	add		t0.4s, v16.4s, v0.4s
+1:	add		t0.4s, v16.4s, v0.4s
 	mov		dg0v.16b, dgav.16b
 	mov		dg1v.16b, dgbv.16b
 
@@ -134,24 +128,18 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	add		dgbv.4s, dgbv.4s, dg1v.4s
 
 	/* handled all input blocks? */
-	cbz		w21, 3f
-
-	if_will_cond_yield_neon
-	st1		{dgav.4s, dgbv.4s}, [x19]
-	do_cond_yield_neon
+	cbz		w2, 2f
+	cond_yield	3f, x5
 	b		0b
-	endif_yield_neon
-
-	b		1b
 
 	/*
 	 * Final block: add padding and total bit count.
 	 * Skip if the input size was not a round multiple of the block size,
 	 * the padding is handled by the C code in that case.
 	 */
-3:	cbz		x4, 4f
+2:	cbz		x4, 3f
 	ldr_l		w4, sha256_ce_offsetof_count, x4
-	ldr		x4, [x19, x4]
+	ldr		x4, [x0, x4]
 	movi		v17.2d, #0
 	mov		x8, #0x80000000
 	movi		v18.2d, #0
@@ -160,10 +148,10 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	mov		x4, #0
 	mov		v19.d[0], xzr
 	mov		v19.d[1], x7
-	b		2b
+	b		1b
 
 	/* store new state */
-4:	st1		{dgav.4s, dgbv.4s}, [x19]
-	frame_pop
+3:	st1		{dgav.4s, dgbv.4s}, [x0]
+	mov		w0, w2
 	ret
 SYM_FUNC_END(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index ded3a6488f81..c57a6119fefc 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -30,14 +30,22 @@ struct sha256_ce_state {
 extern const u32 sha256_ce_offsetof_count;
 extern const u32 sha256_ce_offsetof_finalize;
 
-asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
-				  int blocks);
+asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
+				 int blocks);
 
 static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src,
 				int blocks)
 {
-	sha2_ce_transform(container_of(sst, struct sha256_ce_state, sst), src,
-			  blocks);
+	while (blocks) {
+		int rem;
+
+		kernel_neon_begin();
+		rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state,
+						     sst), src, blocks);
+		kernel_neon_end();
+		src += (blocks - rem) * SHA256_BLOCK_SIZE;
+		blocks = rem;
+	}
 }
 
 const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
@@ -63,9 +71,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
 				__sha256_block_data_order);
 
 	sctx->finalize = 0;
-	kernel_neon_begin();
 	sha256_base_do_update(desc, data, len, __sha2_ce_transform);
-	kernel_neon_end();
 
 	return 0;
 }
@@ -90,11 +96,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
 	 */
 	sctx->finalize = finalize;
 
-	kernel_neon_begin();
 	sha256_base_do_update(desc, data, len, __sha2_ce_transform);
 	if (!finalize)
 		sha256_base_do_finalize(desc, __sha2_ce_transform);
-	kernel_neon_end();
 	return sha256_base_finish(desc, out);
 }
 
@@ -108,9 +112,7 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
 	}
 
 	sctx->finalize = 0;
-	kernel_neon_begin();
 	sha256_base_do_finalize(desc, __sha2_ce_transform);
-	kernel_neon_end();
 	return sha256_base_finish(desc, out);
 }
 
-- 
2.30.0


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

  parent reply	other threads:[~2021-02-03 11:37 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-03 11:36 [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code Ard Biesheuvel
2021-02-03 11:36 ` Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 1/9] arm64: assembler: add cond_yield macro Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 2/9] crypto: arm64/sha1-ce - simplify NEON yield Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 11:36 ` Ard Biesheuvel [this message]
2021-02-03 11:36   ` [PATCH v2 3/9] crypto: arm64/sha2-ce " Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 4/9] crypto: arm64/sha3-ce " Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 5/9] crypto: arm64/sha512-ce " Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 6/9] crypto: arm64/aes-neonbs - remove NEON yield calls Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 7/9] crypto: arm64/aes-ce-mac - simplify NEON yield Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 8/9] crypto: arm64/crc-t10dif - move NEON yield to C code Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 11:36 ` [PATCH v2 9/9] arm64: assembler: remove conditional NEON yield macros Ard Biesheuvel
2021-02-03 11:36   ` Ard Biesheuvel
2021-02-03 21:31 ` (subset) Re: [PATCH v2 0/9] arm64: rework NEON yielding to avoid scheduling from asm code Will Deacon
2021-02-03 21:31   ` Will Deacon
2021-02-04  2:44   ` Herbert Xu
2021-02-04  2:44     ` Herbert Xu
2021-02-04  8:29     ` Ard Biesheuvel
2021-02-04  8:29       ` Ard Biesheuvel
2021-02-04 11:10       ` Herbert Xu
2021-02-04 11:10         ` Herbert Xu
2021-02-04 13:03         ` Will Deacon
2021-02-04 13:03           ` Will Deacon
2021-02-04 19:45           ` Herbert Xu
2021-02-04 19:45             ` Herbert Xu
2021-02-04 10:33   ` Will Deacon
2021-02-04 10:33     ` Will Deacon
2021-02-10  7:23 ` Herbert Xu
2021-02-10  7:23   ` Herbert Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210203113626.220151-4-ardb@kernel.org \
    --to=ardb@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=dave.martin@arm.com \
    --cc=ebiggers@google.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.