linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
@ 2016-12-09 13:47 Ard Biesheuvel
  2016-12-27  8:57 ` Herbert Xu
  0 siblings, 1 reply; 9+ messages in thread
From: Ard Biesheuvel @ 2016-12-09 13:47 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: linux-arm-kernel, Ard Biesheuvel

The bit-sliced NEON implementation of AES only performs optimally if
it can process 8 blocks of input in parallel. This is due to the nature
of bit slicing, where the n-th bit of each byte of AES state of each input
block is collected into NEON register 'n', for registers q0 - q7.

This implies that the amount of work for the transform is fixed,
regardless of whether we are handling just one block or 8 in parallel.

So let's try a bit harder to iterate over the input in suitably sized
chunks, by increasing the chunksize to 8 * AES_BLOCK_SIZE, and tweaking
the loops to only process multiples of the chunk size, unless we are
handling the last chunk in the input stream.

Note that the skcipher walk API guarantees that a step in the walk never
returns less than 'chunksize' bytes if there are at least that many bytes
of input still available. However, it does *not* guarantee that those steps
produce an exact multiple of the chunk size.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/aesbs-glue.c | 68 +++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
index d8e06de72ef3..938d1e1bf9a3 100644
--- a/arch/arm/crypto/aesbs-glue.c
+++ b/arch/arm/crypto/aesbs-glue.c
@@ -121,39 +121,26 @@ static int aesbs_cbc_encrypt(struct skcipher_request *req)
 	return crypto_cbc_encrypt_walk(req, aesbs_encrypt_one);
 }
 
-static inline void aesbs_decrypt_one(struct crypto_skcipher *tfm,
-				     const u8 *src, u8 *dst)
-{
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	AES_decrypt(src, dst, &ctx->dec.rk);
-}
-
 static int aesbs_cbc_decrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
 	struct skcipher_walk walk;
-	unsigned int nbytes;
 	int err;
 
-	for (err = skcipher_walk_virt(&walk, req, false);
-	     (nbytes = walk.nbytes); err = skcipher_walk_done(&walk, nbytes)) {
-		u32 blocks = nbytes / AES_BLOCK_SIZE;
-		u8 *dst = walk.dst.virt.addr;
-		u8 *src = walk.src.virt.addr;
-		u8 *iv = walk.iv;
-
-		if (blocks >= 8) {
-			kernel_neon_begin();
-			bsaes_cbc_encrypt(src, dst, nbytes, &ctx->dec, iv);
-			kernel_neon_end();
-			nbytes %= AES_BLOCK_SIZE;
-			continue;
-		}
+	err = skcipher_walk_virt(&walk, req, false);
+
+	while (walk.nbytes) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.chunksize);
 
-		nbytes = crypto_cbc_decrypt_blocks(&walk, tfm,
-						   aesbs_decrypt_one);
+		kernel_neon_begin();
+		bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+				  nbytes, &ctx->dec, walk.iv);
+		kernel_neon_end();
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 	return err;
 }
@@ -186,6 +173,12 @@ static int aesbs_ctr_encrypt(struct skcipher_request *req)
 		__be32 *ctr = (__be32 *)walk.iv;
 		u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
 
+		if (walk.nbytes < walk.total) {
+			blocks = round_down(blocks,
+					    walk.chunksize / AES_BLOCK_SIZE);
+			tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
+		}
+
 		/* avoid 32 bit counter overflow in the NEON code */
 		if (unlikely(headroom < blocks)) {
 			blocks = headroom + 1;
@@ -198,6 +191,9 @@ static int aesbs_ctr_encrypt(struct skcipher_request *req)
 		kernel_neon_end();
 		inc_be128_ctr(ctr, blocks);
 
+		if (tail > 0 && tail < AES_BLOCK_SIZE)
+			break;
+
 		err = skcipher_walk_done(&walk, tail);
 	}
 	if (walk.nbytes) {
@@ -227,11 +223,16 @@ static int aesbs_xts_encrypt(struct skcipher_request *req)
 	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
 
 	while (walk.nbytes) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.chunksize);
+
 		kernel_neon_begin();
 		bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
-				  walk.nbytes, &ctx->enc, walk.iv);
+				  nbytes, &ctx->enc, walk.iv);
 		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 	return err;
 }
@@ -249,11 +250,16 @@ static int aesbs_xts_decrypt(struct skcipher_request *req)
 	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
 
 	while (walk.nbytes) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.chunksize);
+
 		kernel_neon_begin();
 		bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
-				  walk.nbytes, &ctx->dec, walk.iv);
+				  nbytes, &ctx->dec, walk.iv);
 		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 	return err;
 }
@@ -272,6 +278,7 @@ static struct skcipher_alg aesbs_algs[] = { {
 	.min_keysize	= AES_MIN_KEY_SIZE,
 	.max_keysize	= AES_MAX_KEY_SIZE,
 	.ivsize		= AES_BLOCK_SIZE,
+	.chunksize	= 8 * AES_BLOCK_SIZE,
 	.setkey		= aesbs_cbc_set_key,
 	.encrypt	= aesbs_cbc_encrypt,
 	.decrypt	= aesbs_cbc_decrypt,
@@ -289,7 +296,7 @@ static struct skcipher_alg aesbs_algs[] = { {
 	.min_keysize	= AES_MIN_KEY_SIZE,
 	.max_keysize	= AES_MAX_KEY_SIZE,
 	.ivsize		= AES_BLOCK_SIZE,
-	.chunksize	= AES_BLOCK_SIZE,
+	.chunksize	= 8 * AES_BLOCK_SIZE,
 	.setkey		= aesbs_ctr_set_key,
 	.encrypt	= aesbs_ctr_encrypt,
 	.decrypt	= aesbs_ctr_encrypt,
@@ -307,6 +314,7 @@ static struct skcipher_alg aesbs_algs[] = { {
 	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
 	.max_keysize	= 2 * AES_MAX_KEY_SIZE,
 	.ivsize		= AES_BLOCK_SIZE,
+	.chunksize	= 8 * AES_BLOCK_SIZE,
 	.setkey		= aesbs_xts_set_key,
 	.encrypt	= aesbs_xts_encrypt,
 	.decrypt	= aesbs_xts_decrypt,
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-09 13:47 [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can Ard Biesheuvel
@ 2016-12-27  8:57 ` Herbert Xu
  2016-12-27 18:35   ` Ard Biesheuvel
  0 siblings, 1 reply; 9+ messages in thread
From: Herbert Xu @ 2016-12-27  8:57 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, linux-arm-kernel

On Fri, Dec 09, 2016 at 01:47:26PM +0000, Ard Biesheuvel wrote:
> The bit-sliced NEON implementation of AES only performs optimally if
> it can process 8 blocks of input in parallel. This is due to the nature
> of bit slicing, where the n-th bit of each byte of AES state of each input
> block is collected into NEON register 'n', for registers q0 - q7.
> 
> This implies that the amount of work for the transform is fixed,
> regardless of whether we are handling just one block or 8 in parallel.
> 
> So let's try a bit harder to iterate over the input in suitably sized
> chunks, by increasing the chunksize to 8 * AES_BLOCK_SIZE, and tweaking
> the loops to only process multiples of the chunk size, unless we are
> handling the last chunk in the input stream.
> 
> Note that the skcipher walk API guarantees that a step in the walk never
> returns less that 'chunksize' bytes if there are at least that many bytes
> of input still available. However, it does *not* guarantee that those steps
> produce an exact multiple of the chunk size.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

I like this patch.  However, I had different plans for the chunksize
attribute.  It's primarily meant to be a hint to the upper layer
in case it does partial updates.  It's meant to provide the minimum
number of bytes a partial update can carry without screwing up
subsequent updates.

It just happens to be the same value that we were using during
an skcipher walk.

So I think for your case we should add a new attribute, perhaps
walk_chunksize or walksize, which doesn't need to be exported to
the outside at all and can then be used by the walk interface.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-27  8:57 ` Herbert Xu
@ 2016-12-27 18:35   ` Ard Biesheuvel
  2016-12-28  9:10     ` Herbert Xu
  0 siblings, 1 reply; 9+ messages in thread
From: Ard Biesheuvel @ 2016-12-27 18:35 UTC (permalink / raw)
  To: Herbert Xu; +Cc: linux-crypto, linux-arm-kernel

On 27 December 2016 at 08:57, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Fri, Dec 09, 2016 at 01:47:26PM +0000, Ard Biesheuvel wrote:
>> The bit-sliced NEON implementation of AES only performs optimally if
>> it can process 8 blocks of input in parallel. This is due to the nature
>> of bit slicing, where the n-th bit of each byte of AES state of each input
>> block is collected into NEON register 'n', for registers q0 - q7.
>>
>> This implies that the amount of work for the transform is fixed,
>> regardless of whether we are handling just one block or 8 in parallel.
>>
>> So let's try a bit harder to iterate over the input in suitably sized
>> chunks, by increasing the chunksize to 8 * AES_BLOCK_SIZE, and tweaking
>> the loops to only process multiples of the chunk size, unless we are
>> handling the last chunk in the input stream.
>>
>> Note that the skcipher walk API guarantees that a step in the walk never
>> returns less that 'chunksize' bytes if there are at least that many bytes
>> of input still available. However, it does *not* guarantee that those steps
>> produce an exact multiple of the chunk size.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>
> I like this patch.  However, I had different plans for the chunksize
> attribute.  It's primarily meant to be a hint to the upper layer
> in case it does partial updates.  It's meant to provide the minimum
> number of bytes a partial update can carry without screwing up
> subsequent updates.
>
> It just happens to be the same value that we were using during
> an skcipher walk.
>
> So I think for your case we should add a new attribute, perhaps
> walk_chunksize or walksize, which doesn't need to be exported to
> the outside at all and can then be used by the walk interface.
>

OK, I will try to hack something up.

One thing to keep in mind though is that stacked chaining modes should
present the data with the same granularity for optimal performance.
E.g., xts(ecb(aes)) should pass 8 blocks at a time. How should this
requirement be incorporated according to you?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-27 18:35   ` Ard Biesheuvel
@ 2016-12-28  9:10     ` Herbert Xu
  2016-12-28  9:19       ` Ard Biesheuvel
  0 siblings, 1 reply; 9+ messages in thread
From: Herbert Xu @ 2016-12-28  9:10 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, linux-arm-kernel

On Tue, Dec 27, 2016 at 06:35:46PM +0000, Ard Biesheuvel wrote:
>
> OK, I will try to hack something up.
> 
> One thing to keep in mind though is that stacked chaining modes should
> present the data with the same granularity for optimal performance.
> E.g., xts(ecb(aes)) should pass 8 blocks at a time. How should this
> requirement be incorporated according to you?

xts tries to do a page at a time, which should be good enough, no?

In general this parameter should be visible to internal API users
such as xts so they could use it to determine how it wants to
structure its walks.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-28  9:10     ` Herbert Xu
@ 2016-12-28  9:19       ` Ard Biesheuvel
  2016-12-28  9:23         ` Herbert Xu
  0 siblings, 1 reply; 9+ messages in thread
From: Ard Biesheuvel @ 2016-12-28  9:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: linux-crypto, linux-arm-kernel


> On 28 Dec 2016, at 09:10, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
>> On Tue, Dec 27, 2016 at 06:35:46PM +0000, Ard Biesheuvel wrote:
>> 
>> OK, I will try to hack something up.
>> 
>> One thing to keep in mind though is that stacked chaining modes should
>> present the data with the same granularity for optimal performance.
>> E.g., xts(ecb(aes)) should pass 8 blocks at a time. How should this
>> requirement be incorporated according to you?
> 
> xts tries to do a page at a time, which should be good enough, no?
> 

Yes, if the xts chaining mode driver invokes the ecb transform with that granularity, it should be fine. Note that this is a theoretical
concern for this mode in particular, given that the bit-sliced AES code implements the xts bits as well.

> In general this parameter should be visible to internal API users
> such as xts so they could use it to determine how it wants to
> structure its walks.
> 

Ok, so that implies a field in the skcipher algo struct then, rather than some definition internal to the driver?

> Cheers,
> -- 
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-28  9:19       ` Ard Biesheuvel
@ 2016-12-28  9:23         ` Herbert Xu
  2016-12-28 19:50           ` Ard Biesheuvel
  0 siblings, 1 reply; 9+ messages in thread
From: Herbert Xu @ 2016-12-28  9:23 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, linux-arm-kernel

On Wed, Dec 28, 2016 at 09:19:32AM +0000, Ard Biesheuvel wrote:
>
> Ok, so that implies a field in the skcipher algo struct then, rather than some definition internal to the driver?

Oh yes it should definitely be visible to other crypto API drivers
and algorithms.  It's just that we don't want to export it outside
of the crypto API, e.g., to IPsec or algif.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-28  9:23         ` Herbert Xu
@ 2016-12-28 19:50           ` Ard Biesheuvel
  2016-12-29  2:23             ` Herbert Xu
  0 siblings, 1 reply; 9+ messages in thread
From: Ard Biesheuvel @ 2016-12-28 19:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: linux-crypto, linux-arm-kernel

On 28 December 2016 at 09:23, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Wed, Dec 28, 2016 at 09:19:32AM +0000, Ard Biesheuvel wrote:
>>
>> Ok, so that implies a field in the skcipher algo struct then, rather than some definition internal to the driver?
>
> Oh yes it should definitely be visible to other crypto API drivers
> and algorithms.  It's just that we don't want to export it outside
> of the crypto API, e.g., to IPsec or algif.
>

Understood.

So about this chunksize, is it ever expected to assume other values
than 1 (for stream ciphers) or the block size (for block ciphers)?
Having block size, IV size *and* chunk size fields may be confusing to
some already, so if the purpose of chunk size can be fulfilled by a
single 'stream cipher' flag, perhaps we should change that first.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-28 19:50           ` Ard Biesheuvel
@ 2016-12-29  2:23             ` Herbert Xu
  2016-12-29 12:13               ` Ard Biesheuvel
  0 siblings, 1 reply; 9+ messages in thread
From: Herbert Xu @ 2016-12-29  2:23 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, linux-arm-kernel

On Wed, Dec 28, 2016 at 07:50:44PM +0000, Ard Biesheuvel wrote:
> 
> So about this chunksize, is it ever expected to assume other values
> than 1 (for stream ciphers) or the block size (for block ciphers)?
> Having block size, IV size *and* chunk size fields may be confusing to
> some already, so if the purpose of chunk size can be fulfilled by a
> single 'stream cipher' flag, perhaps we should change that first.

For users (such as algif) it's much more convenient to have a size
rather than a flag because that's what they need to determine the
minimum size for partial updates.

For implementors you don't need to specify the chunksize at all
unless you're a stream cipher (or some other case in future where
the minimum partial update size is not equal to your block size).

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can
  2016-12-29  2:23             ` Herbert Xu
@ 2016-12-29 12:13               ` Ard Biesheuvel
  0 siblings, 0 replies; 9+ messages in thread
From: Ard Biesheuvel @ 2016-12-29 12:13 UTC (permalink / raw)
  To: Herbert Xu; +Cc: linux-crypto, linux-arm-kernel

On 29 December 2016 at 02:23, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Wed, Dec 28, 2016 at 07:50:44PM +0000, Ard Biesheuvel wrote:
>>
>> So about this chunksize, is it ever expected to assume other values
>> than 1 (for stream ciphers) or the block size (for block ciphers)?
>> Having block size, IV size *and* chunk size fields may be confusing to
>> some already, so if the purpose of chunk size can be fulfilled by a
>> single 'stream cipher' flag, perhaps we should change that first.
>
> For users (such as algif) it's much more convenient to have a size
> rather than a flag because that's what they need to determine the
> minimum size for partial updates.
>
> For implementors you don't need to specify the chunksize at all
> unless you're a stream cipher (or some other case in future where
> the minimum partial update size is not equal to your block size).
>

OK, fair enough. So I will add a field 'walksize' to the skcipher_alg
struct in my proposal. I think the walk logic itself needs to change
very little, though: we can simply set the walk's chunksize to the
skcipher's walksize if it exceeds its chunksize (and walksize %
chunksize should be 0 in any case, and walksize should default to the
chunksize if not supplied)

If this sounds reasonable to you, I will hack something up next week.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2016-12-29 12:13 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-09 13:47 [PATCH] crypto: arm/aes-neonbs - process 8 blocks in parallel if we can Ard Biesheuvel
2016-12-27  8:57 ` Herbert Xu
2016-12-27 18:35   ` Ard Biesheuvel
2016-12-28  9:10     ` Herbert Xu
2016-12-28  9:19       ` Ard Biesheuvel
2016-12-28  9:23         ` Herbert Xu
2016-12-28 19:50           ` Ard Biesheuvel
2016-12-29  2:23             ` Herbert Xu
2016-12-29 12:13               ` Ard Biesheuvel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).