* [PATCH 0/2] crypto: fix crct10dif for ARM and arm64
@ 2019-01-24 18:27 ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2019-01-24 18:27 UTC (permalink / raw)
  To: linux-crypto; +Cc: linux-arm-kernel, herbert, ebiggers, Ard Biesheuvel

Fix the issues in both NEON implementations of the CRC-T10DIF routines
that were reported by Eric's new testing code.
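
Both patches converge on the same shape for the shash update hook; after
the change, the glue code ends up as follows (shown here for ARM, the
arm64 version is identical):

	static int crct10dif_update(struct shash_desc *desc, const u8 *data,
				    unsigned int length)
	{
		u16 *crc = shash_desc_ctx(desc);

		/* only inputs of at least 16 bytes go to the PMULL/NEON routine */
		if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
			kernel_neon_begin();
			*crc = crc_t10dif_pmull(*crc, data, length);
			kernel_neon_end();
		} else {
			*crc = crc_t10dif_generic(*crc, data, length);
		}

		return 0;
	}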

Ard Biesheuvel (2):
  crypto: arm/crct10dif - revert to C code for short inputs
  crypto: arm64/crct10dif - revert to C code for short inputs

 arch/arm/crypto/crct10dif-ce-core.S   | 20 ++++++++--------
 arch/arm/crypto/crct10dif-ce-glue.c   | 23 +++++-------------
 arch/arm64/crypto/crct10dif-ce-glue.c | 25 +++++---------------
 3 files changed, 22 insertions(+), 46 deletions(-)

-- 
2.17.1



* [PATCH 1/2] crypto: arm/crct10dif - revert to C code for short inputs
  2019-01-24 18:27 ` Ard Biesheuvel
@ 2019-01-24 18:27   ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2019-01-24 18:27 UTC (permalink / raw)
  To: linux-crypto; +Cc: linux-arm-kernel, herbert, ebiggers, Ard Biesheuvel

The SIMD routine ported from x86 used to have a special code path
for inputs of fewer than 16 bytes, which got lost somewhere along
the way. Instead, the current glue code aligns the input pointer so
that the NEON routine can use the versions of the vld1 instructions
that assume 16-byte alignment, but this can result in inputs of
fewer than 16 bytes being passed in. This not only fails the new
extended tests that Eric has implemented, it also makes the code
read before the input pointer, which could crash when fewer than
16 bytes of input sit at the start of a page that is preceded by an
unmapped page.

So update the glue code to invoke the NEON routine only if the
input is at least 16 bytes long.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/crct10dif-ce-core.S | 20 ++++++++---------
 arch/arm/crypto/crct10dif-ce-glue.c | 23 +++++---------------
 2 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index ce45ba0c0687..3fd13d7c842c 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -124,10 +124,10 @@ ENTRY(crc_t10dif_pmull)
 	vext.8		q10, qzr, q0, #4
 
 	// receive the initial 64B data, xor the initial crc value
-	vld1.64		{q0-q1}, [arg2, :128]!
-	vld1.64		{q2-q3}, [arg2, :128]!
-	vld1.64		{q4-q5}, [arg2, :128]!
-	vld1.64		{q6-q7}, [arg2, :128]!
+	vld1.64		{q0-q1}, [arg2]!
+	vld1.64		{q2-q3}, [arg2]!
+	vld1.64		{q4-q5}, [arg2]!
+	vld1.64		{q6-q7}, [arg2]!
 CPU_LE(	vrev64.8	q0, q0			)
 CPU_LE(	vrev64.8	q1, q1			)
 CPU_LE(	vrev64.8	q2, q2			)
@@ -150,7 +150,7 @@ CPU_LE(	vrev64.8	q7, q7			)
 	veor.8		q0, q0, q10
 
 	adr		ip, rk3
-	vld1.64		{q10}, [ip, :128]	// xmm10 has rk3 and rk4
+	vld1.64		{q10}, [ip]	// xmm10 has rk3 and rk4
 
 	//
 	// we subtract 256 instead of 128 to save one instruction from the loop
@@ -167,7 +167,7 @@ CPU_LE(	vrev64.8	q7, q7			)
 _fold_64_B_loop:
 
 	.macro		fold64, reg1, reg2
-	vld1.64		{q11-q12}, [arg2, :128]!
+	vld1.64		{q11-q12}, [arg2]!
 
 	vmull.p64	q8, \reg1\()h, d21
 	vmull.p64	\reg1, \reg1\()l, d20
@@ -203,13 +203,13 @@ CPU_LE(	vrev64.8	q12, q12		)
 	// constants
 
 	adr		ip, rk9
-	vld1.64		{q10}, [ip, :128]!
+	vld1.64		{q10}, [ip]!
 
 	.macro		fold16, reg, rk
 	vmull.p64	q8, \reg\()l, d20
 	vmull.p64	\reg, \reg\()h, d21
 	.ifnb		\rk
-	vld1.64		{q10}, [ip, :128]!
+	vld1.64		{q10}, [ip]!
 	.endif
 	veor.8		q7, q7, q8
 	veor.8		q7, q7, \reg
@@ -238,7 +238,7 @@ _16B_reduction_loop:
 	vmull.p64	q7, d15, d21
 	veor.8		q7, q7, q8
 
-	vld1.64		{q0}, [arg2, :128]!
+	vld1.64		{q0}, [arg2]!
 CPU_LE(	vrev64.8	q0, q0		)
 	vswp		d0, d1
 	veor.8		q7, q7, q0
@@ -335,7 +335,7 @@ _less_than_128:
 	vmov.i8		q0, #0
 	vmov		s3, arg1_low32		// get the initial crc value
 
-	vld1.64		{q7}, [arg2, :128]!
+	vld1.64		{q7}, [arg2]!
 CPU_LE(	vrev64.8	q7, q7		)
 	vswp		d14, d15
 	veor.8		q7, q7, q0
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index d428355cf38d..14c19c70a841 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -35,26 +35,15 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
 			    unsigned int length)
 {
 	u16 *crc = shash_desc_ctx(desc);
-	unsigned int l;
 
-	if (!may_use_simd()) {
-		*crc = crc_t10dif_generic(*crc, data, length);
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
+		kernel_neon_begin();
+		*crc = crc_t10dif_pmull(*crc, data, length);
+		kernel_neon_end();
 	} else {
-		if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
-			l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
-				  ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
-
-			*crc = crc_t10dif_generic(*crc, data, l);
-
-			length -= l;
-			data += l;
-		}
-		if (length > 0) {
-			kernel_neon_begin();
-			*crc = crc_t10dif_pmull(*crc, data, length);
-			kernel_neon_end();
-		}
+		*crc = crc_t10dif_generic(*crc, data, length);
 	}
+
 	return 0;
 }
 
-- 
2.17.1



* [PATCH 2/2] crypto: arm64/crct10dif - revert to C code for short inputs
  2019-01-24 18:27 ` Ard Biesheuvel
@ 2019-01-24 18:27   ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2019-01-24 18:27 UTC (permalink / raw)
  To: linux-crypto; +Cc: linux-arm-kernel, herbert, ebiggers, Ard Biesheuvel

The SIMD routine ported from x86 used to have a special code path
for inputs of fewer than 16 bytes, which got lost somewhere along
the way. Instead, the current glue code aligns the input pointer so
that the NEON routine can use the versions of the vld1 instructions
that assume 16-byte alignment, but this can result in inputs of
fewer than 16 bytes being passed in. This not only fails the new
extended tests that Eric has implemented, it also makes the code
read before the input pointer, which could crash when fewer than
16 bytes of input sit at the start of a page that is preceded by an
unmapped page.

So update the glue code to invoke the NEON routine only if the
input is at least 16 bytes long.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/crct10dif-ce-glue.c | 25 +++++---------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index b461d62023f2..567c24f3d224 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -39,26 +39,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
 			    unsigned int length)
 {
 	u16 *crc = shash_desc_ctx(desc);
-	unsigned int l;
 
-	if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
-		l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
-			  ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
-
-		*crc = crc_t10dif_generic(*crc, data, l);
-
-		length -= l;
-		data += l;
-	}
-
-	if (length > 0) {
-		if (may_use_simd()) {
-			kernel_neon_begin();
-			*crc = crc_t10dif_pmull(*crc, data, length);
-			kernel_neon_end();
-		} else {
-			*crc = crc_t10dif_generic(*crc, data, length);
-		}
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
+		kernel_neon_begin();
+		*crc = crc_t10dif_pmull(*crc, data, length);
+		kernel_neon_end();
+	} else {
+		*crc = crc_t10dif_generic(*crc, data, length);
 	}
 
 	return 0;
-- 
2.17.1



* Re: [PATCH 1/2] crypto: arm/crct10dif - revert to C code for short inputs
  2019-01-24 18:27   ` Ard Biesheuvel
@ 2019-01-25  7:22     ` Eric Biggers
  -1 siblings, 0 replies; 14+ messages in thread
From: Eric Biggers @ 2019-01-25  7:22 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, linux-arm-kernel, herbert

On Thu, Jan 24, 2019 at 07:27:11PM +0100, Ard Biesheuvel wrote:
> The SIMD routine ported from x86 used to have a special code path
> for inputs < 16 bytes, which got lost somewhere along the way.
> Instead, the current glue code aligns the input pointer to permit
> the NEON routine to use special versions of the vld1 instructions
> that assume 16 byte alignment, but this could result in inputs of
> less than 16 bytes to be passed in. This not only fails the new
> extended tests that Eric has implemented, it also results in the
> code reading before the input pointer, which could potentially
> result in crashes when dealing with less than 16 bytes of input
> at the start of a page which is preceded by an unmapped page.
> 
> So update the glue code to only invoke the NEON routine if the
> input is more than 16 bytes.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Can you add proper tags?

Fixes: 1d481f1cd892 ("crypto: arm/crct10dif - port x86 SSE implementation to ARM")
Cc: <stable@vger.kernel.org> # v4.10+

Just double checking as I don't have a system immediately available to run this
one on -- I assume it passes with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y now?
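
For reference, reproducing this should only need something like the
following (config and driver names assumed from the respective Kconfig
and glue files, so double-check your tree):

	CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y
	CONFIG_CRYPTO_CRCT10DIF_ARM_CE=y

The extra tests run when the "crct10dif-arm-ce" algorithm is registered,
and any mismatch against the generic C implementation shows up as an alg
self-test failure in dmesg.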

Another comment below:

> ---
>  arch/arm/crypto/crct10dif-ce-core.S | 20 ++++++++---------
>  arch/arm/crypto/crct10dif-ce-glue.c | 23 +++++---------------
>  2 files changed, 16 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
> index ce45ba0c0687..3fd13d7c842c 100644
> --- a/arch/arm/crypto/crct10dif-ce-core.S
> +++ b/arch/arm/crypto/crct10dif-ce-core.S
> @@ -124,10 +124,10 @@ ENTRY(crc_t10dif_pmull)
>  	vext.8		q10, qzr, q0, #4
>  
>  	// receive the initial 64B data, xor the initial crc value
> -	vld1.64		{q0-q1}, [arg2, :128]!
> -	vld1.64		{q2-q3}, [arg2, :128]!
> -	vld1.64		{q4-q5}, [arg2, :128]!
> -	vld1.64		{q6-q7}, [arg2, :128]!
> +	vld1.64		{q0-q1}, [arg2]!
> +	vld1.64		{q2-q3}, [arg2]!
> +	vld1.64		{q4-q5}, [arg2]!
> +	vld1.64		{q6-q7}, [arg2]!
>  CPU_LE(	vrev64.8	q0, q0			)
>  CPU_LE(	vrev64.8	q1, q1			)
>  CPU_LE(	vrev64.8	q2, q2			)
> @@ -150,7 +150,7 @@ CPU_LE(	vrev64.8	q7, q7			)
>  	veor.8		q0, q0, q10
>  
>  	adr		ip, rk3
> -	vld1.64		{q10}, [ip, :128]	// xmm10 has rk3 and rk4
> +	vld1.64		{q10}, [ip]	// xmm10 has rk3 and rk4

This one is loading static data that is 16 byte aligned, so the :128 can be kept
here.  Same in the two other places below that load from [ip].
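
I.e. roughly the following (sketch only, using the labels and registers
from the current file):

	adr		ip, rk3
	vld1.64		{q10}, [ip, :128]	// constant table: keep the :128 hint
	...
	vld1.64		{q0-q1}, [arg2]!	// input data: no alignment assumed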

>  
>  	//
>  	// we subtract 256 instead of 128 to save one instruction from the loop
> @@ -167,7 +167,7 @@ CPU_LE(	vrev64.8	q7, q7			)
>  _fold_64_B_loop:
>  
>  	.macro		fold64, reg1, reg2
> -	vld1.64		{q11-q12}, [arg2, :128]!
> +	vld1.64		{q11-q12}, [arg2]!
>  
>  	vmull.p64	q8, \reg1\()h, d21
>  	vmull.p64	\reg1, \reg1\()l, d20
> @@ -203,13 +203,13 @@ CPU_LE(	vrev64.8	q12, q12		)
>  	// constants
>  
>  	adr		ip, rk9
> -	vld1.64		{q10}, [ip, :128]!
> +	vld1.64		{q10}, [ip]!
>  
>  	.macro		fold16, reg, rk
>  	vmull.p64	q8, \reg\()l, d20
>  	vmull.p64	\reg, \reg\()h, d21
>  	.ifnb		\rk
> -	vld1.64		{q10}, [ip, :128]!
> +	vld1.64		{q10}, [ip]!
>  	.endif
>  	veor.8		q7, q7, q8
>  	veor.8		q7, q7, \reg
> @@ -238,7 +238,7 @@ _16B_reduction_loop:
>  	vmull.p64	q7, d15, d21
>  	veor.8		q7, q7, q8
>  
> -	vld1.64		{q0}, [arg2, :128]!
> +	vld1.64		{q0}, [arg2]!
>  CPU_LE(	vrev64.8	q0, q0		)
>  	vswp		d0, d1
>  	veor.8		q7, q7, q0
> @@ -335,7 +335,7 @@ _less_than_128:
>  	vmov.i8		q0, #0
>  	vmov		s3, arg1_low32		// get the initial crc value
>  
> -	vld1.64		{q7}, [arg2, :128]!
> +	vld1.64		{q7}, [arg2]!
>  CPU_LE(	vrev64.8	q7, q7		)
>  	vswp		d14, d15
>  	veor.8		q7, q7, q0
> diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
> index d428355cf38d..14c19c70a841 100644
> --- a/arch/arm/crypto/crct10dif-ce-glue.c
> +++ b/arch/arm/crypto/crct10dif-ce-glue.c
> @@ -35,26 +35,15 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
>  			    unsigned int length)
>  {
>  	u16 *crc = shash_desc_ctx(desc);
> -	unsigned int l;
>  
> -	if (!may_use_simd()) {
> -		*crc = crc_t10dif_generic(*crc, data, length);
> +	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
> +		kernel_neon_begin();
> +		*crc = crc_t10dif_pmull(*crc, data, length);
> +		kernel_neon_end();
>  	} else {
> -		if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
> -			l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
> -				  ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
> -
> -			*crc = crc_t10dif_generic(*crc, data, l);
> -
> -			length -= l;
> -			data += l;
> -		}
> -		if (length > 0) {
> -			kernel_neon_begin();
> -			*crc = crc_t10dif_pmull(*crc, data, length);
> -			kernel_neon_end();
> -		}
> +		*crc = crc_t10dif_generic(*crc, data, length);
>  	}
> +
>  	return 0;
>  }
>  
> -- 
> 2.17.1
> 


* Re: [PATCH 2/2] crypto: arm64/crct10dif - revert to C code for short inputs
  2019-01-24 18:27   ` Ard Biesheuvel
@ 2019-01-25  7:29     ` Eric Biggers
  -1 siblings, 0 replies; 14+ messages in thread
From: Eric Biggers @ 2019-01-25  7:29 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, linux-arm-kernel, herbert

On Thu, Jan 24, 2019 at 07:27:12PM +0100, Ard Biesheuvel wrote:
> The SIMD routine ported from x86 used to have a special code path
> for inputs < 16 bytes, which got lost somewhere along the way.
> Instead, the current glue code aligns the input pointer to permit
> the NEON routine to use special versions of the vld1 instructions
> that assume 16 byte alignment, but this could result in inputs of
> less than 16 bytes to be passed in. 

This description doesn't quite match the patch since the arm64 version of the
assembly doesn't use any alignment specifiers.  I take it that actually means
the alignment in the glue code wasn't necessary in the first place?
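
For comparison, a minimal illustration of the two load forms (register
choices arbitrary):

	vld1.64	{q0-q1}, [r1, :128]!		@ 32-bit NEON: :128 asserts 16-byte alignment
	ld1	{v0.16b, v1.16b}, [x1], #32	// arm64 ld1: no alignment qualifier available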

> This not only fails the new
> extended tests that Eric has implemented, it also results in the
> code reading before the input pointer, which could potentially
> result in crashes when dealing with less than 16 bytes of input
> at the start of a page which is preceded by an unmapped page.
> 
> So update the glue code to only invoke the NEON routine if the
> input is more than 16 bytes.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Can you add:

Fixes: 6ef5737f3931 ("crypto: arm64/crct10dif - port x86 SSE implementation to arm64")
Cc: stable@vger.kernel.org

> ---
>  arch/arm64/crypto/crct10dif-ce-glue.c | 25 +++++---------------
>  1 file changed, 6 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
> index b461d62023f2..567c24f3d224 100644
> --- a/arch/arm64/crypto/crct10dif-ce-glue.c
> +++ b/arch/arm64/crypto/crct10dif-ce-glue.c
> @@ -39,26 +39,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
>  			    unsigned int length)
>  {
>  	u16 *crc = shash_desc_ctx(desc);
> -	unsigned int l;
>  
> -	if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
> -		l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
> -			  ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
> -
> -		*crc = crc_t10dif_generic(*crc, data, l);
> -
> -		length -= l;
> -		data += l;
> -	}
> -
> -	if (length > 0) {
> -		if (may_use_simd()) {
> -			kernel_neon_begin();
> -			*crc = crc_t10dif_pmull(*crc, data, length);
> -			kernel_neon_end();
> -		} else {
> -			*crc = crc_t10dif_generic(*crc, data, length);
> -		}
> +	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
> +		kernel_neon_begin();
> +		*crc = crc_t10dif_pmull(*crc, data, length);
> +		kernel_neon_end();
> +	} else {
> +		*crc = crc_t10dif_generic(*crc, data, length);
>  	}
>  
>  	return 0;
> -- 
> 2.17.1
> 


* Re: [PATCH 1/2] crypto: arm/crct10dif - revert to C code for short inputs
  2019-01-25  7:22     ` Eric Biggers
@ 2019-01-25  7:48       ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2019-01-25  7:48 UTC (permalink / raw)
  To: Eric Biggers
  Cc: open list:HARDWARE RANDOM NUMBER GENERATOR CORE,
	linux-arm-kernel, Herbert Xu

On Fri, 25 Jan 2019 at 08:22, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Thu, Jan 24, 2019 at 07:27:11PM +0100, Ard Biesheuvel wrote:
> > The SIMD routine ported from x86 used to have a special code path
> > for inputs < 16 bytes, which got lost somewhere along the way.
> > Instead, the current glue code aligns the input pointer to permit
> > the NEON routine to use special versions of the vld1 instructions
> > that assume 16 byte alignment, but this could result in inputs of
> > less than 16 bytes to be passed in. This not only fails the new
> > extended tests that Eric has implemented, it also results in the
> > code reading before the input pointer, which could potentially
> > result in crashes when dealing with less than 16 bytes of input
> > at the start of a page which is preceded by an unmapped page.
> >
> > So update the glue code to only invoke the NEON routine if the
> > input is more than 16 bytes.
> >
> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>
> Can you add proper tags?
>
> Fixes: 1d481f1cd892 ("crypto: arm/crct10dif - port x86 SSE implementation to ARM")
> Cc: <stable@vger.kernel.org> # v4.10+
>

Ah yes, thanks for digging that up.

> Just double checking as I don't have a system immediately available to run this
> one on -- I assume it passes with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y now?
>

Yes.

> Another comment below:
>
> > ---
> >  arch/arm/crypto/crct10dif-ce-core.S | 20 ++++++++---------
> >  arch/arm/crypto/crct10dif-ce-glue.c | 23 +++++---------------
> >  2 files changed, 16 insertions(+), 27 deletions(-)
> >
> > diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
> > index ce45ba0c0687..3fd13d7c842c 100644
> > --- a/arch/arm/crypto/crct10dif-ce-core.S
> > +++ b/arch/arm/crypto/crct10dif-ce-core.S
> > @@ -124,10 +124,10 @@ ENTRY(crc_t10dif_pmull)
> >       vext.8          q10, qzr, q0, #4
> >
> >       // receive the initial 64B data, xor the initial crc value
> > -     vld1.64         {q0-q1}, [arg2, :128]!
> > -     vld1.64         {q2-q3}, [arg2, :128]!
> > -     vld1.64         {q4-q5}, [arg2, :128]!
> > -     vld1.64         {q6-q7}, [arg2, :128]!
> > +     vld1.64         {q0-q1}, [arg2]!
> > +     vld1.64         {q2-q3}, [arg2]!
> > +     vld1.64         {q4-q5}, [arg2]!
> > +     vld1.64         {q6-q7}, [arg2]!
> >  CPU_LE(      vrev64.8        q0, q0                  )
> >  CPU_LE(      vrev64.8        q1, q1                  )
> >  CPU_LE(      vrev64.8        q2, q2                  )
> > @@ -150,7 +150,7 @@ CPU_LE(   vrev64.8        q7, q7                  )
> >       veor.8          q0, q0, q10
> >
> >       adr             ip, rk3
> > -     vld1.64         {q10}, [ip, :128]       // xmm10 has rk3 and rk4
> > +     vld1.64         {q10}, [ip]     // xmm10 has rk3 and rk4
>
> This one is loading static data that is 16 byte aligned, so the :128 can be kept
> here.  Same in the two other places below that load from [ip].
>

OK, will change that back.

> >
> >       //
> >       // we subtract 256 instead of 128 to save one instruction from the loop
> > @@ -167,7 +167,7 @@ CPU_LE(   vrev64.8        q7, q7                  )
> >  _fold_64_B_loop:
> >
> >       .macro          fold64, reg1, reg2
> > -     vld1.64         {q11-q12}, [arg2, :128]!
> > +     vld1.64         {q11-q12}, [arg2]!
> >
> >       vmull.p64       q8, \reg1\()h, d21
> >       vmull.p64       \reg1, \reg1\()l, d20
> > @@ -203,13 +203,13 @@ CPU_LE( vrev64.8        q12, q12                )
> >       // constants
> >
> >       adr             ip, rk9
> > -     vld1.64         {q10}, [ip, :128]!
> > +     vld1.64         {q10}, [ip]!
> >
> >       .macro          fold16, reg, rk
> >       vmull.p64       q8, \reg\()l, d20
> >       vmull.p64       \reg, \reg\()h, d21
> >       .ifnb           \rk
> > -     vld1.64         {q10}, [ip, :128]!
> > +     vld1.64         {q10}, [ip]!
> >       .endif
> >       veor.8          q7, q7, q8
> >       veor.8          q7, q7, \reg
> > @@ -238,7 +238,7 @@ _16B_reduction_loop:
> >       vmull.p64       q7, d15, d21
> >       veor.8          q7, q7, q8
> >
> > -     vld1.64         {q0}, [arg2, :128]!
> > +     vld1.64         {q0}, [arg2]!
> >  CPU_LE(      vrev64.8        q0, q0          )
> >       vswp            d0, d1
> >       veor.8          q7, q7, q0
> > @@ -335,7 +335,7 @@ _less_than_128:
> >       vmov.i8         q0, #0
> >       vmov            s3, arg1_low32          // get the initial crc value
> >
> > -     vld1.64         {q7}, [arg2, :128]!
> > +     vld1.64         {q7}, [arg2]!
> >  CPU_LE(      vrev64.8        q7, q7          )
> >       vswp            d14, d15
> >       veor.8          q7, q7, q0
> > diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
> > index d428355cf38d..14c19c70a841 100644
> > --- a/arch/arm/crypto/crct10dif-ce-glue.c
> > +++ b/arch/arm/crypto/crct10dif-ce-glue.c
> > @@ -35,26 +35,15 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
> >                           unsigned int length)
> >  {
> >       u16 *crc = shash_desc_ctx(desc);
> > -     unsigned int l;
> >
> > -     if (!may_use_simd()) {
> > -             *crc = crc_t10dif_generic(*crc, data, length);
> > +     if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
> > +             kernel_neon_begin();
> > +             *crc = crc_t10dif_pmull(*crc, data, length);
> > +             kernel_neon_end();
> >       } else {
> > -             if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
> > -                     l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
> > -                               ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
> > -
> > -                     *crc = crc_t10dif_generic(*crc, data, l);
> > -
> > -                     length -= l;
> > -                     data += l;
> > -             }
> > -             if (length > 0) {
> > -                     kernel_neon_begin();
> > -                     *crc = crc_t10dif_pmull(*crc, data, length);
> > -                     kernel_neon_end();
> > -             }
> > +             *crc = crc_t10dif_generic(*crc, data, length);
> >       }
> > +
> >       return 0;
> >  }
> >
> > --
> > 2.17.1
> >


* Re: [PATCH 2/2] crypto: arm64/crct10dif - revert to C code for short inputs
  2019-01-25  7:29     ` Eric Biggers
@ 2019-01-25  7:49       ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2019-01-25  7:49 UTC (permalink / raw)
  To: Eric Biggers
  Cc: open list:HARDWARE RANDOM NUMBER GENERATOR CORE,
	linux-arm-kernel, Herbert Xu

On Fri, 25 Jan 2019 at 08:29, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Thu, Jan 24, 2019 at 07:27:12PM +0100, Ard Biesheuvel wrote:
> > The SIMD routine ported from x86 used to have a special code path
> > for inputs < 16 bytes, which got lost somewhere along the way.
> > Instead, the current glue code aligns the input pointer to permit
> > the NEON routine to use special versions of the vld1 instructions
> > that assume 16 byte alignment, but this could result in inputs of
> > less than 16 bytes to be passed in.
>
> This description doesn't quite match the patch since the arm64 version of the
> assembly doesn't use any alignment specifiers.  I take it that actually means
> the alignment in the glue code wasn't necessary in the first place?
>

Not for correctness, but it could affect performance.

> > This not only fails the new
> > extended tests that Eric has implemented, it also results in the
> > code reading before the input pointer, which could potentially
> > result in crashes when dealing with less than 16 bytes of input
> > at the start of a page which is preceded by an unmapped page.
> >
> > So update the glue code to only invoke the NEON routine if the
> > input is more than 16 bytes.
> >
> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>
> Can you add:
>
> Fixes: 6ef5737f3931 ("crypto: arm64/crct10dif - port x86 SSE implementation to arm64")
> Cc: stable@vger.kernel.org
>

Will do

> > ---
> >  arch/arm64/crypto/crct10dif-ce-glue.c | 25 +++++---------------
> >  1 file changed, 6 insertions(+), 19 deletions(-)
> >
> > diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
> > index b461d62023f2..567c24f3d224 100644
> > --- a/arch/arm64/crypto/crct10dif-ce-glue.c
> > +++ b/arch/arm64/crypto/crct10dif-ce-glue.c
> > @@ -39,26 +39,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
> >                           unsigned int length)
> >  {
> >       u16 *crc = shash_desc_ctx(desc);
> > -     unsigned int l;
> >
> > -     if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
> > -             l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
> > -                       ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
> > -
> > -             *crc = crc_t10dif_generic(*crc, data, l);
> > -
> > -             length -= l;
> > -             data += l;
> > -     }
> > -
> > -     if (length > 0) {
> > -             if (may_use_simd()) {
> > -                     kernel_neon_begin();
> > -                     *crc = crc_t10dif_pmull(*crc, data, length);
> > -                     kernel_neon_end();
> > -             } else {
> > -                     *crc = crc_t10dif_generic(*crc, data, length);
> > -             }
> > +     if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
> > +             kernel_neon_begin();
> > +             *crc = crc_t10dif_pmull(*crc, data, length);
> > +             kernel_neon_end();
> > +     } else {
> > +             *crc = crc_t10dif_generic(*crc, data, length);
> >       }
> >
> >       return 0;
> > --
> > 2.17.1
> >

