linux-crypto.vger.kernel.org archive mirror
* [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
@ 2020-08-02  9:06 Ard Biesheuvel
  2020-08-03 19:11 ` Ben Greear
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
  0 siblings, 2 replies; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-02  9:06 UTC (permalink / raw)
  To: linux-crypto; +Cc: herbert, ebiggers, Ard Biesheuvel, Ben Greear

Ben reports that CCM using AES-NI instructions performs pathologically
poorly, which is due to the overhead of preserving/restoring the SIMD
state, which is repeated after every 16 bytes of input when executing
the CBCMAC portion of the algorithm.

So let's clone the arm64 implementation of cbcmac(aes), which takes
care to only preserve/restore the SIMD state after processing the
whole input. Since cmac(aes) and xcbc(aes) can reuse most of the code,
let's expose those as well.
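
Roughly speaking, the change boils down to hoisting the SIMD state
handling out of the per-block loop. A simplified before/after sketch
(illustrative only; the real loop is mac_do_update() in this patch):

  /* before: the cbcmac template invokes the AES-NI cipher one block
   * at a time, paying for a SIMD save/restore on every block
   */
  while (blocks--) {
          crypto_xor(dg, in, AES_BLOCK_SIZE);
          kernel_fpu_begin();
          aesni_enc(ctx, dg, dg);
          kernel_fpu_end();
          in += AES_BLOCK_SIZE;
  }

  /* after: a single SIMD save/restore around the whole input */
  kernel_fpu_begin();
  while (blocks--) {
          aesni_cbc_enc(ctx, dg, in, AES_BLOCK_SIZE, dg);
          in += AES_BLOCK_SIZE;
  }
  kernel_fpu_end();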

Cc: Ben Greear <greearb@candelatech.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/crypto/Makefile           |   2 +-
 arch/x86/crypto/aesni-intel.h      |  39 +++
 arch/x86/crypto/aesni-intel_glue.c |  42 +---
 arch/x86/crypto/aesni-intel_mac.c  | 257 ++++++++++++++++++++
 4 files changed, 306 insertions(+), 34 deletions(-)

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a31de0c6ccde..f83e162f87ad 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -51,7 +51,7 @@ chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o
 chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
 
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel_mac.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
diff --git a/arch/x86/crypto/aesni-intel.h b/arch/x86/crypto/aesni-intel.h
new file mode 100644
index 000000000000..d9204c043184
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel.h
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+#include <crypto/internal/hash.h>
+
+#define AESNI_ALIGN	16
+#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
+#define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE - 1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
+#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
+#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
+#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
+
+extern struct shash_alg aesni_mac_algs[];
+extern const int aesni_num_mac_algs;
+
+asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+			     unsigned int key_len);
+asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
+asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
+asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len);
+asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len);
+asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
+
+static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
+{
+	unsigned long addr = (unsigned long)raw_ctx;
+	unsigned long align = AESNI_ALIGN;
+
+	if (align <= crypto_tfm_ctx_alignment())
+		align = 1;
+	return (struct crypto_aes_ctx *)ALIGN(addr, align);
+}
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ad8a7188a2bf..bc51c5f80858 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -19,8 +19,6 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/aes.h>
 #include <crypto/ctr.h>
 #include <crypto/b128ops.h>
 #include <crypto/gcm.h>
@@ -37,14 +35,7 @@
 #include <asm/crypto/glue_helper.h>
 #endif
 
-
-#define AESNI_ALIGN	16
-#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
-#define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE - 1))
-#define RFC4106_HASH_SUBKEY_SIZE 16
-#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
-#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
+#include "aesni-intel.h"
 
 /* This data is stored at the end of the crypto_tfm struct.
  * It's a type of per "session" data storage location.
@@ -81,19 +72,6 @@ struct gcm_context_data {
 	u8 hash_keys[GCM_BLOCK_LEN * 16];
 };
 
-asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
-			     unsigned int key_len);
-asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
-asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
-asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
-			      const u8 *in, unsigned int len);
-asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
-			      const u8 *in, unsigned int len);
-asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
-			      const u8 *in, unsigned int len, u8 *iv);
-asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
-			      const u8 *in, unsigned int len, u8 *iv);
-
 #define AVX_GEN2_OPTSIZE 640
 #define AVX_GEN4_OPTSIZE 4096
 
@@ -296,16 +274,6 @@ generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
 }
 #endif
 
-static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
-{
-	unsigned long addr = (unsigned long)raw_ctx;
-	unsigned long align = AESNI_ALIGN;
-
-	if (align <= crypto_tfm_ctx_alignment())
-		align = 1;
-	return (struct crypto_aes_ctx *)ALIGN(addr, align);
-}
-
 static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
 			      const u8 *in_key, unsigned int key_len)
 {
@@ -1098,8 +1066,15 @@ static int __init aesni_init(void)
 	if (err)
 		goto unregister_skciphers;
 
+	err = crypto_register_shashes(aesni_mac_algs, aesni_num_mac_algs);
+	if (err)
+		goto unregister_aeads;
+
 	return 0;
 
+unregister_aeads:
+	simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+			      aesni_simd_aeads);
 unregister_skciphers:
 	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
 				  aesni_simd_skciphers);
@@ -1110,6 +1085,7 @@ static int __init aesni_init(void)
 
 static void __exit aesni_exit(void)
 {
+	crypto_unregister_shashes(aesni_mac_algs, aesni_num_mac_algs);
 	simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
 			      aesni_simd_aeads);
 	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
diff --git a/arch/x86/crypto/aesni-intel_mac.c b/arch/x86/crypto/aesni-intel_mac.c
new file mode 100644
index 000000000000..7d546bbf21e5
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_mac.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2020 Arm Ltd <ard.biesheuvel@arm.com>
+ */
+
+#include <asm/simd.h>
+#include <crypto/b128ops.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/module.h>
+
+#include "aesni-intel.h"
+
+MODULE_ALIAS_CRYPTO("cmac(aes)");
+MODULE_ALIAS_CRYPTO("xcbc(aes)");
+MODULE_ALIAS_CRYPTO("cbcmac(aes)");
+
+struct mac_tfm_ctx {
+	u8 key[CRYPTO_AES_CTX_SIZE];
+	u8 __aligned(8) consts[];
+};
+
+struct mac_desc_ctx {
+	u8 dg[AES_BLOCK_SIZE];
+	unsigned int len;
+};
+
+asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
+asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len);
+asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
+
+static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+			 unsigned int key_len)
+{
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+
+	return aes_expandkey(aes_ctx(&ctx->key), in_key, key_len);
+}
+
+static void cmac_gf128_mul_by_x(be128 *y, const be128 *x)
+{
+	u64 a = be64_to_cpu(x->a);
+	u64 b = be64_to_cpu(x->b);
+
+	y->a = cpu_to_be64((a << 1) | (b >> 63));
+	y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0));
+}
+
+static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	be128 *consts = (be128 *)ctx->consts;
+	int err;
+
+	err = cbcmac_setkey(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	/* encrypt the zero vector */
+	kernel_fpu_begin();
+	aesni_ecb_enc(aes_ctx(&ctx->key), ctx->consts, (u8[AES_BLOCK_SIZE]){},
+		      AES_BLOCK_SIZE);
+	kernel_fpu_end();
+
+	cmac_gf128_mul_by_x(consts, consts);
+	cmac_gf128_mul_by_x(consts + 1, consts);
+
+	return 0;
+}
+
+static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	static u8 const ks[3][AES_BLOCK_SIZE] = {
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x1 },
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x2 },
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x3 },
+	};
+
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	u8 key[AES_BLOCK_SIZE];
+	int err;
+
+	err = cbcmac_setkey(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	kernel_fpu_begin();
+	aesni_ecb_enc(aes_ctx(&ctx->key), key, ks[0], AES_BLOCK_SIZE);
+	aesni_ecb_enc(aes_ctx(&ctx->key), ctx->consts, ks[1], 2 * AES_BLOCK_SIZE);
+	kernel_fpu_end();
+
+	return cbcmac_setkey(tfm, key, sizeof(key));
+}
+
+static int mac_init(struct shash_desc *desc)
+{
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	memset(ctx->dg, 0, AES_BLOCK_SIZE);
+	ctx->len = 0;
+
+	return 0;
+}
+
+static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
+			  u8 dg[], int enc_before, int enc_after)
+{
+	if (likely(crypto_simd_usable())) {
+		kernel_fpu_begin();
+		if (enc_before)
+			aesni_enc(ctx, dg, dg);
+
+		while (blocks--) {
+			if (!blocks && !enc_after) {
+				crypto_xor(dg, in, AES_BLOCK_SIZE);
+				break;
+			}
+			aesni_cbc_enc(ctx, dg, in, AES_BLOCK_SIZE, dg);
+			in += AES_BLOCK_SIZE;
+		}
+		kernel_fpu_end();
+	} else {
+		if (enc_before)
+			aes_encrypt(ctx, dg, dg);
+
+		while (blocks--) {
+			crypto_xor(dg, in, AES_BLOCK_SIZE);
+			in += AES_BLOCK_SIZE;
+
+			if (blocks || enc_after)
+				aes_encrypt(ctx, dg, dg);
+		}
+	}
+}
+
+static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	while (len > 0) {
+		unsigned int l;
+
+		if ((ctx->len % AES_BLOCK_SIZE) == 0 &&
+		    (ctx->len + len) > AES_BLOCK_SIZE) {
+
+			int blocks = len / AES_BLOCK_SIZE;
+
+			len %= AES_BLOCK_SIZE;
+
+			mac_do_update(aes_ctx(&tctx->key), p, blocks, ctx->dg,
+				      (ctx->len != 0), (len != 0));
+
+			p += blocks * AES_BLOCK_SIZE;
+
+			if (!len) {
+				ctx->len = AES_BLOCK_SIZE;
+				break;
+			}
+			ctx->len = 0;
+		}
+
+		l = min(len, AES_BLOCK_SIZE - ctx->len);
+
+		if (l <= AES_BLOCK_SIZE) {
+			crypto_xor(ctx->dg + ctx->len, p, l);
+			ctx->len += l;
+			len -= l;
+			p += l;
+		}
+	}
+
+	return 0;
+}
+
+static int cbcmac_final(struct shash_desc *desc, u8 *out)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	mac_do_update(aes_ctx(&tctx->key), NULL, 0, ctx->dg, (ctx->len != 0), 0);
+
+	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+	return 0;
+}
+
+static int cmac_final(struct shash_desc *desc, u8 *out)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	u8 *consts = tctx->consts;
+
+	if (ctx->len != AES_BLOCK_SIZE) {
+		ctx->dg[ctx->len] ^= 0x80;
+		consts += AES_BLOCK_SIZE;
+	}
+
+	mac_do_update(aes_ctx(&tctx->key), consts, 1, ctx->dg, 0, 1);
+
+	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+	return 0;
+}
+
+struct shash_alg aesni_mac_algs[] = { {
+	.base.cra_name		= "cmac(aes)",
+	.base.cra_driver_name	= "cmac-aes-aesni",
+	.base.cra_priority	= 400,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
+				  2 * AES_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cmac_final,
+	.setkey			= cmac_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+}, {
+	.base.cra_name		= "xcbc(aes)",
+	.base.cra_driver_name	= "xcbc-aes-aesni",
+	.base.cra_priority	= 400,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
+				  2 * AES_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cmac_final,
+	.setkey			= xcbc_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+}, {
+	.base.cra_name		= "cbcmac(aes)",
+	.base.cra_driver_name	= "cbcmac-aes-aesni",
+	.base.cra_priority	= 400,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cbcmac_final,
+	.setkey			= cbcmac_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+} };
+
+const int aesni_num_mac_algs = ARRAY_SIZE(aesni_mac_algs);
-- 
2.17.1



* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-02  9:06 [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes Ard Biesheuvel
@ 2020-08-03 19:11 ` Ben Greear
  2020-08-04 12:55   ` Ard Biesheuvel
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
  1 sibling, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-03 19:11 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-crypto; +Cc: herbert, ebiggers

Hello,

This helps a bit...now download sw-crypt performance is about 150Mbps,
but still not as good as with my patch on 5.4 kernel, and fpu is still
high in perf top:

    13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
      6.62%  [kernel]       [k] kernel_fpu_begin
      4.14%  [kernel]       [k] _aesni_enc1
      2.06%  [kernel]       [k] __crypto_xor
      1.95%  [kernel]       [k] copy_user_generic_string
      1.93%  libjvm.so      [.] SpinPause
      1.01%  [kernel]       [k] aesni_encrypt
      0.98%  [kernel]       [k] crypto_ctr_crypt
      0.93%  [kernel]       [k] udp_sendmsg
      0.78%  [kernel]       [k] crypto_inc
      0.74%  [kernel]       [k] __ip_append_data.isra.53
      0.65%  [kernel]       [k] aesni_cbc_enc
      0.64%  [kernel]       [k] __dev_queue_xmit
      0.62%  [kernel]       [k] ipt_do_table
      0.62%  [kernel]       [k] igb_xmit_frame_ring
      0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
      0.57%  [kernel]       [k] memcpy
      0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
      0.56%  [kernel]       [k] irq_fpu_usable
      0.56%  [kernel]       [k] mac_do_update

If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
then I can help.  Possibly hwsim would also be a good test case, but I have not tried
that.

Not sure whether it is related, but the system was throwing this during
the test.  We'll try to verify whether or not it is related to that
patch.

...
Message from syslogd@lf0350-35c0 at Aug  3 09:55:05 ...
  kernel:[Hardware Error]: CPU:0 (16:30:1) MC2_STATUS[Over|CE|-|AddrV|-|CECC|-|-]: 0xd45840900012017a

Message from syslogd@lf0350-35c0 at Aug  3 09:55:05 ...
  kernel:[Hardware Error]: Error Addr: 0x00000000a656a810

Message from syslogd@lf0350-35c0 at Aug  3 09:55:05 ...
  kernel:[Hardware Error]: MC2 Error: ECC error in L2 data array (Vict).

Message from syslogd@lf0350-35c0 at Aug  3 09:55:05 ...
  kernel:[Hardware Error]: cache level: L2, tx: GEN, mem-tx: EV

Message from syslogd@lf0350-35c0 at Aug  3 10:00:33 ...
  kernel:[Hardware Error]: Corrected error, no action required.

Message from syslogd@lf0350-35c0 at Aug  3 10:00:33 ...
  kernel:[Hardware Error]: CPU:0 (16:30:1) MC2_STATUS[Over|CE|-|AddrV|-|CECC|-|-]: 0xd55840920010011a


Thanks,
Ben


On 8/2/20 2:06 AM, Ard Biesheuvel wrote:
> Ben reports that CCM using AES-NI instructions performs pathologically
> poorly, which is due to the overhead of preserving/restoring the SIMD
> state, which is repeated after every 16 bytes of input when executing
> the CBCMAC portion of the algorithm.
> 
> So let's clone the arm64 implementation of cbcmac(aes), which takes
> care to only preserve/restore the SIMD state after processing the
> whole input. Since cmac(aes) and xcbc(aes) can reuse most of the code,
> let's expose those as well.
> 
> Cc: Ben Greear <greearb@candelatech.com>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>   arch/x86/crypto/Makefile           |   2 +-
>   arch/x86/crypto/aesni-intel.h      |  39 +++
>   arch/x86/crypto/aesni-intel_glue.c |  42 +---
>   arch/x86/crypto/aesni-intel_mac.c  | 257 ++++++++++++++++++++
>   4 files changed, 306 insertions(+), 34 deletions(-)
> 
> [...]


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-03 19:11 ` Ben Greear
@ 2020-08-04 12:55   ` Ard Biesheuvel
  2020-08-04 13:01     ` Ben Greear
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-04 12:55 UTC (permalink / raw)
  To: Ben Greear; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On Mon, 3 Aug 2020 at 21:11, Ben Greear <greearb@candelatech.com> wrote:
>
> Hello,
>
> This helps a bit...now download sw-crypt performance is about 150Mbps,
> but still not as good as with my patch on 5.4 kernel, and fpu is still
> high in perf top:
>
>     13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
>       6.62%  [kernel]       [k] kernel_fpu_begin
>       4.14%  [kernel]       [k] _aesni_enc1
>       2.06%  [kernel]       [k] __crypto_xor
>       1.95%  [kernel]       [k] copy_user_generic_string
>       1.93%  libjvm.so      [.] SpinPause
>       1.01%  [kernel]       [k] aesni_encrypt
>       0.98%  [kernel]       [k] crypto_ctr_crypt
>       0.93%  [kernel]       [k] udp_sendmsg
>       0.78%  [kernel]       [k] crypto_inc
>       0.74%  [kernel]       [k] __ip_append_data.isra.53
>       0.65%  [kernel]       [k] aesni_cbc_enc
>       0.64%  [kernel]       [k] __dev_queue_xmit
>       0.62%  [kernel]       [k] ipt_do_table
>       0.62%  [kernel]       [k] igb_xmit_frame_ring
>       0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
>       0.57%  [kernel]       [k] memcpy
>       0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
>       0.56%  [kernel]       [k] irq_fpu_usable
>       0.56%  [kernel]       [k] mac_do_update
>
> If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
> then I can help.  Possibly hwsim would also be a good test case, but I have not tried
> that.
>

I don't think this is likely to be reproducible on other
micro-architectures, so setting up a test rig is unlikely to help.

I'll send out a v2 which implements an ahash instead of a shash (and
implements some other tweaks) so that kernel_fpu_begin() is only
called twice for each packet on the cbcmac path.

Do you have any numbers for the old kernel without your patch? This
pathological FPU preserve/restore behavior could be caused by the
optimizations, or by other changes that landed in the meantime, so I
would like to know if kernel_fpu_begin() is as prominent in those
traces as well.


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-04 12:55   ` Ard Biesheuvel
@ 2020-08-04 13:01     ` Ben Greear
  2020-08-04 13:08       ` Ard Biesheuvel
  0 siblings, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-04 13:01 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On 8/4/20 5:55 AM, Ard Biesheuvel wrote:
> On Mon, 3 Aug 2020 at 21:11, Ben Greear <greearb@candelatech.com> wrote:
>>
>> Hello,
>>
>> This helps a bit...now download sw-crypt performance is about 150Mbps,
>> but still not as good as with my patch on 5.4 kernel, and fpu is still
>> high in perf top:
>>
>>      13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
>>        6.62%  [kernel]       [k] kernel_fpu_begin
>>        4.14%  [kernel]       [k] _aesni_enc1
>>        2.06%  [kernel]       [k] __crypto_xor
>>        1.95%  [kernel]       [k] copy_user_generic_string
>>        1.93%  libjvm.so      [.] SpinPause
>>        1.01%  [kernel]       [k] aesni_encrypt
>>        0.98%  [kernel]       [k] crypto_ctr_crypt
>>        0.93%  [kernel]       [k] udp_sendmsg
>>        0.78%  [kernel]       [k] crypto_inc
>>        0.74%  [kernel]       [k] __ip_append_data.isra.53
>>        0.65%  [kernel]       [k] aesni_cbc_enc
>>        0.64%  [kernel]       [k] __dev_queue_xmit
>>        0.62%  [kernel]       [k] ipt_do_table
>>        0.62%  [kernel]       [k] igb_xmit_frame_ring
>>        0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
>>        0.57%  [kernel]       [k] memcpy
>>        0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
>>        0.56%  [kernel]       [k] irq_fpu_usable
>>        0.56%  [kernel]       [k] mac_do_update
>>
>> If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
>> then I can help.  Possibly hwsim would also be a good test case, but I have not tried
>> that.
>>
> 
> I don't think this is likely to be reproducible on other
> micro-architectures, so setting up a test rig is unlikely to help.
> 
> I'll send out a v2 which implements an ahash instead of a shash (and
> implements some other tweaks) so that kernel_fpu_begin() is only
> called twice for each packet on the cbcmac path.
> 
> Do you have any numbers for the old kernel without your patch? This
> pathological FPU preserve/restore behavior could be caused by the
> optimizations, or by other changes that landed in the meantime, so I
> would like to know if kernel_fpu_begin() is as prominent in those
> traces as well.
> 

This same patch makes i7 mobile processors able to handle 1Gbps+ software
decrypt rates, where without the patch, the rate was badly constrained and CPU
load was much higher, so it is definitely noticeable on other processors too.
The weak processor on the current test rig is convenient because the problem
is so noticeable even at slower wifi speeds.

We can do some tests on 5.4 with our patch reverted.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-04 13:01     ` Ben Greear
@ 2020-08-04 13:08       ` Ard Biesheuvel
  2020-08-04 13:22         ` Ben Greear
  2020-08-04 19:45         ` Ben Greear
  0 siblings, 2 replies; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-04 13:08 UTC (permalink / raw)
  To: Ben Greear; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On Tue, 4 Aug 2020 at 15:01, Ben Greear <greearb@candelatech.com> wrote:
>
> On 8/4/20 5:55 AM, Ard Biesheuvel wrote:
> > On Mon, 3 Aug 2020 at 21:11, Ben Greear <greearb@candelatech.com> wrote:
> >>
> >> Hello,
> >>
> >> This helps a bit...now download sw-crypt performance is about 150Mbps,
> >> but still not as good as with my patch on 5.4 kernel, and fpu is still
> >> high in perf top:
> >>
> >>      13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
> >>        6.62%  [kernel]       [k] kernel_fpu_begin
> >>        4.14%  [kernel]       [k] _aesni_enc1
> >>        2.06%  [kernel]       [k] __crypto_xor
> >>        1.95%  [kernel]       [k] copy_user_generic_string
> >>        1.93%  libjvm.so      [.] SpinPause
> >>        1.01%  [kernel]       [k] aesni_encrypt
> >>        0.98%  [kernel]       [k] crypto_ctr_crypt
> >>        0.93%  [kernel]       [k] udp_sendmsg
> >>        0.78%  [kernel]       [k] crypto_inc
> >>        0.74%  [kernel]       [k] __ip_append_data.isra.53
> >>        0.65%  [kernel]       [k] aesni_cbc_enc
> >>        0.64%  [kernel]       [k] __dev_queue_xmit
> >>        0.62%  [kernel]       [k] ipt_do_table
> >>        0.62%  [kernel]       [k] igb_xmit_frame_ring
> >>        0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
> >>        0.57%  [kernel]       [k] memcpy
> >>        0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
> >>        0.56%  [kernel]       [k] irq_fpu_usable
> >>        0.56%  [kernel]       [k] mac_do_update
> >>
> >> If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
> >> then I can help.  Possibly hwsim would also be a good test case, but I have not tried
> >> that.
> >>
> >
> > I don't think this is likely to be reproducible on other
> > micro-architectures, so setting up a test rig is unlikely to help.
> >
> > I'll send out a v2 which implements an ahash instead of a shash (and
> > implements some other tweaks) so that kernel_fpu_begin() is only
> > called twice for each packet on the cbcmac path.
> >
> > Do you have any numbers for the old kernel without your patch? This
> > pathological FPU preserve/restore behavior could be caused by the
> > optimizations, or by other changes that landed in the meantime, so I
> > would like to know if kernel_fpu_begin() is as prominent in those
> > traces as well.
> >
>
> This same patch makes i7 mobile processors able to handle 1Gbps+ software
> decrypt rates, where without the patch, the rate was badly constrained and CPU
> load was much higher, so it is definitely noticeable on other processors too.

OK

> The weak processor on the current test rig is convenient because the problem
> is so noticeable even at slower wifi speeds.
>
> We can do some tests on 5.4 with our patch reverted.
>

The issue with your CCM patch is that it keeps the FPU enabled for the
entire input, which also means that preemption is disabled, which
makes the -rt people grumpy. (Of course, it also uses APIs that no
longer exist, but that should be easy to fix)
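
(FWIW, the usual way to bound that window is to drop and re-take the
FPU every N blocks, roughly like the sketch below; the batch size of 32
is arbitrary and none of this is code from either patch:)

        unsigned int done = 0;

        kernel_fpu_begin();
        while (blocks--) {
                aesni_cbc_enc(ctx, dg, in, AES_BLOCK_SIZE, dg);
                in += AES_BLOCK_SIZE;

                /* periodically give preemption a chance on long inputs */
                if (blocks && (++done % 32) == 0) {
                        kernel_fpu_end();
                        kernel_fpu_begin();
                }
        }
        kernel_fpu_end();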

Do you happen to have any ballpark figures for the packet sizes and
the time spent doing encryption?


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-04 13:08       ` Ard Biesheuvel
@ 2020-08-04 13:22         ` Ben Greear
  2020-08-04 19:45         ` Ben Greear
  1 sibling, 0 replies; 46+ messages in thread
From: Ben Greear @ 2020-08-04 13:22 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On 8/4/20 6:08 AM, Ard Biesheuvel wrote:
> On Tue, 4 Aug 2020 at 15:01, Ben Greear <greearb@candelatech.com> wrote:
>>
>> On 8/4/20 5:55 AM, Ard Biesheuvel wrote:
>>> On Mon, 3 Aug 2020 at 21:11, Ben Greear <greearb@candelatech.com> wrote:
>>>>
>>>> Hello,
>>>>
>>>> This helps a bit...now download sw-crypt performance is about 150Mbps,
>>>> but still not as good as with my patch on 5.4 kernel, and fpu is still
>>>> high in perf top:
>>>>
>>>>       13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
>>>>         6.62%  [kernel]       [k] kernel_fpu_begin
>>>>         4.14%  [kernel]       [k] _aesni_enc1
>>>>         2.06%  [kernel]       [k] __crypto_xor
>>>>         1.95%  [kernel]       [k] copy_user_generic_string
>>>>         1.93%  libjvm.so      [.] SpinPause
>>>>         1.01%  [kernel]       [k] aesni_encrypt
>>>>         0.98%  [kernel]       [k] crypto_ctr_crypt
>>>>         0.93%  [kernel]       [k] udp_sendmsg
>>>>         0.78%  [kernel]       [k] crypto_inc
>>>>         0.74%  [kernel]       [k] __ip_append_data.isra.53
>>>>         0.65%  [kernel]       [k] aesni_cbc_enc
>>>>         0.64%  [kernel]       [k] __dev_queue_xmit
>>>>         0.62%  [kernel]       [k] ipt_do_table
>>>>         0.62%  [kernel]       [k] igb_xmit_frame_ring
>>>>         0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
>>>>         0.57%  [kernel]       [k] memcpy
>>>>         0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
>>>>         0.56%  [kernel]       [k] irq_fpu_usable
>>>>         0.56%  [kernel]       [k] mac_do_update
>>>>
>>>> If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
>>>> then I can help.  Possibly hwsim would also be a good test case, but I have not tried
>>>> that.
>>>>
>>>
>>> I don't think this is likely to be reproducible on other
>>> micro-architectures, so setting up a test rig is unlikely to help.
>>>
>>> I'll send out a v2 which implements an ahash instead of a shash (and
>>> implements some other tweaks) so that kernel_fpu_begin() is only
>>> called twice for each packet on the cbcmac path.
>>>
>>> Do you have any numbers for the old kernel without your patch? This
>>> pathological FPU preserve/restore behavior could be caused by the
>>> optimizations, or by other changes that landed in the meantime, so I
>>> would like to know if kernel_fpu_begin() is as prominent in those
>>> traces as well.
>>>
>>
>> This same patch makes i7 mobile processors able to handle 1Gbps+ software
>> decrypt rates, where without the patch, the rate was badly constrained and CPU
>> load was much higher, so it is definitely noticeable on other processors too.
> 
> OK
> 
>> The weak processor on the current test rig is convenient because the problem
>> is so noticeable even at slower wifi speeds.
>>
>> We can do some tests on 5.4 with our patch reverted.
>>
> 
> The issue with your CCM patch is that it keeps the FPU enabled for the
> entire input, which also means that preemption is disabled, which
> makes the -rt people grumpy. (Of course, it also uses APIs that no
> longer exist, but that should be easy to fix)

So, if there is no other way to get back the performance, can it be a compile
or runtime option (disabled by default for -RT type folks) to re-enable the feature
that helps our CPU usage?

Or, can you do an add-on patch to enable keeping fpu enabled so that I can test
how that affects our performance?

> 
> Do you happen to have any ballpark figures for the packet sizes and
> the time spent doing encryption?

This test was using MTU UDP frames I think, and mostly it is just sending
and receiving frames.  perf top output gives you as much detail as I have about
what the kernel is spending time doing.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-04 13:08       ` Ard Biesheuvel
  2020-08-04 13:22         ` Ben Greear
@ 2020-08-04 19:45         ` Ben Greear
  2020-08-04 20:12           ` Ard Biesheuvel
  2020-09-23 11:03           ` Ben Greear
  1 sibling, 2 replies; 46+ messages in thread
From: Ben Greear @ 2020-08-04 19:45 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On 8/4/20 6:08 AM, Ard Biesheuvel wrote:
> On Tue, 4 Aug 2020 at 15:01, Ben Greear <greearb@candelatech.com> wrote:
>>
>> On 8/4/20 5:55 AM, Ard Biesheuvel wrote:
>>> On Mon, 3 Aug 2020 at 21:11, Ben Greear <greearb@candelatech.com> wrote:
>>>>
>>>> Hello,
>>>>
>>>> This helps a bit...now download sw-crypt performance is about 150Mbps,
>>>> but still not as good as with my patch on 5.4 kernel, and fpu is still
>>>> high in perf top:
>>>>
>>>>       13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
>>>>         6.62%  [kernel]       [k] kernel_fpu_begin
>>>>         4.14%  [kernel]       [k] _aesni_enc1
>>>>         2.06%  [kernel]       [k] __crypto_xor
>>>>         1.95%  [kernel]       [k] copy_user_generic_string
>>>>         1.93%  libjvm.so      [.] SpinPause
>>>>         1.01%  [kernel]       [k] aesni_encrypt
>>>>         0.98%  [kernel]       [k] crypto_ctr_crypt
>>>>         0.93%  [kernel]       [k] udp_sendmsg
>>>>         0.78%  [kernel]       [k] crypto_inc
>>>>         0.74%  [kernel]       [k] __ip_append_data.isra.53
>>>>         0.65%  [kernel]       [k] aesni_cbc_enc
>>>>         0.64%  [kernel]       [k] __dev_queue_xmit
>>>>         0.62%  [kernel]       [k] ipt_do_table
>>>>         0.62%  [kernel]       [k] igb_xmit_frame_ring
>>>>         0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
>>>>         0.57%  [kernel]       [k] memcpy
>>>>         0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
>>>>         0.56%  [kernel]       [k] irq_fpu_usable
>>>>         0.56%  [kernel]       [k] mac_do_update
>>>>
>>>> If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
>>>> then I can help.  Possibly hwsim would also be a good test case, but I have not tried
>>>> that.
>>>>
>>>
>>> I don't think this is likely to be reproducible on other
>>> micro-architectures, so setting up a test rig is unlikely to help.
>>>
>>> I'll send out a v2 which implements an ahash instead of a shash (and
>>> implements some other tweaks) so that kernel_fpu_begin() is only
>>> called twice for each packet on the cbcmac path.
>>>
>>> Do you have any numbers for the old kernel without your patch? This
>>> pathological FPU preserve/restore behavior could be caused by the
>>> optimizations, or by other changes that landed in the meantime, so I
>>> would like to know if kernel_fpu_begin() is as prominent in those
>>> traces as well.
>>>
>>
>> This same patch makes i7 mobile processors able to handle 1Gbps+ software
>> decrypt rates, where without the patch, the rate was badly constrained and CPU
>> load was much higher, so it is definitely noticeable on other processors too.
> 
> OK
> 
>> The weak processor on the current test rig is convenient because the problem
>> is so noticeable even at slower wifi speeds.
>>
>> We can do some tests on 5.4 with our patch reverted.
>>
> 
> The issue with your CCM patch is that it keeps the FPU enabled for the
> entire input, which also means that preemption is disabled, which
> makes the -rt people grumpy. (Of course, it also uses APIs that no
> longer exist, but that should be easy to fix)
> 
> Do you happen to have any ballpark figures for the packet sizes and
> the time spent doing encryption?
> 

My tester reports this last patch appears to break wpa-2 entirely, so we
cannot test it as is.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-04 19:45         ` Ben Greear
@ 2020-08-04 20:12           ` Ard Biesheuvel
  2020-09-23 11:03           ` Ben Greear
  1 sibling, 0 replies; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-04 20:12 UTC (permalink / raw)
  To: Ben Greear; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On Tue, 4 Aug 2020 at 21:45, Ben Greear <greearb@candelatech.com> wrote:
>
> On 8/4/20 6:08 AM, Ard Biesheuvel wrote:
> > On Tue, 4 Aug 2020 at 15:01, Ben Greear <greearb@candelatech.com> wrote:
> >>
> >> On 8/4/20 5:55 AM, Ard Biesheuvel wrote:
> >>> On Mon, 3 Aug 2020 at 21:11, Ben Greear <greearb@candelatech.com> wrote:
> >>>>
> >>>> Hello,
> >>>>
> >>>> This helps a bit...now download sw-crypt performance is about 150Mbps,
> >>>> but still not as good as with my patch on 5.4 kernel, and fpu is still
> >>>> high in perf top:
> >>>>
> >>>>       13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
> >>>>         6.62%  [kernel]       [k] kernel_fpu_begin
> >>>>         4.14%  [kernel]       [k] _aesni_enc1
> >>>>         2.06%  [kernel]       [k] __crypto_xor
> >>>>         1.95%  [kernel]       [k] copy_user_generic_string
> >>>>         1.93%  libjvm.so      [.] SpinPause
> >>>>         1.01%  [kernel]       [k] aesni_encrypt
> >>>>         0.98%  [kernel]       [k] crypto_ctr_crypt
> >>>>         0.93%  [kernel]       [k] udp_sendmsg
> >>>>         0.78%  [kernel]       [k] crypto_inc
> >>>>         0.74%  [kernel]       [k] __ip_append_data.isra.53
> >>>>         0.65%  [kernel]       [k] aesni_cbc_enc
> >>>>         0.64%  [kernel]       [k] __dev_queue_xmit
> >>>>         0.62%  [kernel]       [k] ipt_do_table
> >>>>         0.62%  [kernel]       [k] igb_xmit_frame_ring
> >>>>         0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
> >>>>         0.57%  [kernel]       [k] memcpy
> >>>>         0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
> >>>>         0.56%  [kernel]       [k] irq_fpu_usable
> >>>>         0.56%  [kernel]       [k] mac_do_update
> >>>>
> >>>> If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
> >>>> then I can help.  Possibly hwsim would also be a good test case, but I have not tried
> >>>> that.
> >>>>
> >>>
> >>> I don't think this is likely to be reproducible on other
> >>> micro-architectures, so setting up a test rig is unlikely to help.
> >>>
> >>> I'll send out a v2 which implements an ahash instead of a shash (and
> >>> implements some other tweaks) so that kernel_fpu_begin() is only
> >>> called twice for each packet on the cbcmac path.
> >>>
> >>> Do you have any numbers for the old kernel without your patch? This
> >>> pathological FPU preserve/restore behavior could be caused by the
> >>> optimizations, or by other changes that landed in the meantime, so I
> >>> would like to know if kernel_fpu_begin() is as prominent in those
> >>> traces as well.
> >>>
> >>
> >> This same patch makes i7 mobile processors able to handle 1Gbps+ software
> >> decrypt rates, where without the patch, the rate was badly constrained and CPU
> >> load was much higher, so it is definitely noticeable on other processors too.
> >
> > OK
> >
> >> The weak processor on the current test rig is convenient because the problem
> >> is so noticeable even at slower wifi speeds.
> >>
> >> We can do some tests on 5.4 with our patch reverted.
> >>
> >
> > The issue with your CCM patch is that it keeps the FPU enabled for the
> > entire input, which also means that preemption is disabled, which
> > makes the -rt people grumpy. (Of course, it also uses APIs that no
> > longer exist, but that should be easy to fix)
> >
> > Do you happen to have any ballpark figures for the packet sizes and
> > the time spent doing encryption?
> >
>
> My tester reports this last patch appears to break wpa-2 entirely, so we
> cannot test it as is.
>

Ah, that is unfortunate. It passed the internal selftests we have in
the kernel, but apparently, the coverage is not 100%.

I will take another look into this next week.


* [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-02  9:06 [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes Ard Biesheuvel
  2020-08-03 19:11 ` Ben Greear
@ 2020-08-18  8:24 ` Herbert Xu
  2020-08-18  8:25   ` [PATCH 1/6] crypto: skcipher - Add helpers for sync skcipher spawn Herbert Xu
                     ` (6 more replies)
  1 sibling, 7 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-18  8:24 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, ebiggers, Ben Greear

On Sun, Aug 02, 2020 at 12:06:16PM +0300, Ard Biesheuvel wrote:
> Ben reports that CCM using AES-NI instructions performs pathologically
> poorly, which is due to the overhead of preserving/restoring the SIMD
> state, which is repeated after every 16 bytes of input when executing
> the CBCMAC portion of the algorithm.
> 
> So let's clone the arm64 implementation of cbcmac(aes), which takes
> care to only preserve/restore the SIMD state after processing the
> whole input. Since cmac(aes) and xcbc(aes) can reuse most of the code,
> let's expose those as well.
> 
> Cc: Ben Greear <greearb@candelatech.com>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/x86/crypto/Makefile           |   2 +-
>  arch/x86/crypto/aesni-intel.h      |  39 +++
>  arch/x86/crypto/aesni-intel_glue.c |  42 +---
>  arch/x86/crypto/aesni-intel_mac.c  | 257 ++++++++++++++++++++
>  4 files changed, 306 insertions(+), 34 deletions(-)

We should just use the accelerated cbc skcipher.
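
The point being that, over a whole number of blocks, CBC-MAC is just
the final ciphertext block of a CBC encryption with a zero IV, so one
call into an accelerated CBC implementation can replace the per-block
loop. Very roughly (a sketch only, not code from this series; scratch
is assumed to be a throwaway buffer of blocks * AES_BLOCK_SIZE bytes):

static void cbcmac_via_cbc(struct crypto_aes_ctx *key, const u8 *msg,
                           u8 *scratch, unsigned int blocks,
                           u8 mac[AES_BLOCK_SIZE])
{
        u8 iv[AES_BLOCK_SIZE] = {};     /* CBC-MAC == CBC encryption, IV = 0 */

        /* assumes blocks >= 1 */
        kernel_fpu_begin();
        aesni_cbc_enc(key, scratch, msg, blocks * AES_BLOCK_SIZE, iv);
        kernel_fpu_end();

        /* the MAC is simply the last ciphertext block */
        memcpy(mac, scratch + (blocks - 1) * AES_BLOCK_SIZE, AES_BLOCK_SIZE);
}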

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* [PATCH 1/6] crypto: skcipher - Add helpers for sync skcipher spawn
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
@ 2020-08-18  8:25   ` Herbert Xu
  2020-08-18  8:25   ` [PATCH 2/6] crypto: ahash - Add helper to free single spawn instance Herbert Xu
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-18  8:25 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-crypto, ebiggers, Ben Greear

This patch adds helpers for using sync skciphers as spawns from
crypto instances.

In doing so it also changes the error returned when an algorithm
exceeds the maximum sync skcipher request size from EINVAL to E2BIG,
as the former is used for way too many things.

Also the CRYPTO_ALG_ASYNC bit is now masked off just in case someone
tries to be clever.
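
For context, rough fragments of how a template could consume these
helpers (my_inst_ctx, ictx, ctx and cbc_name below are placeholders,
not names taken from this series):

        struct my_inst_ctx {
                struct crypto_sync_skcipher_spawn spawn; /* e.g. "cbc(aes)" */
        };

        /* in ->create(): bind the spawn to the underlying sync skcipher */
        err = crypto_grab_sync_skcipher(&ictx->spawn,
                                        ahash_crypto_instance(inst),
                                        cbc_name, 0, 0);
        if (err)
                goto err_free_inst;

        /* in ->init_tfm(): an oversized reqsize now yields -E2BIG */
        ctx->child = crypto_spawn_sync_skcipher(&ictx->spawn);
        if (IS_ERR(ctx->child))
                return PTR_ERR(ctx->child);

        /* in the instance ->free() (and on error paths): */
        crypto_drop_sync_skcipher(&ictx->spawn);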

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 crypto/skcipher.c                  |   38 ++++++++++++++++++++++++++-----------
 include/crypto/internal/skcipher.h |   28 +++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 467af525848a1..bc9fc0c07a659 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -14,7 +14,7 @@
 #include <crypto/scatterwalk.h>
 #include <linux/bug.h>
 #include <linux/cryptouser.h>
-#include <linux/compiler.h>
+#include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/rtnetlink.h>
@@ -762,16 +762,9 @@ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_skcipher);
 
-struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(
-				const char *alg_name, u32 type, u32 mask)
+static struct crypto_sync_skcipher *crypto_verify_sync_skcipher(
+	struct crypto_skcipher *tfm)
 {
-	struct crypto_skcipher *tfm;
-
-	/* Only sync algorithms allowed. */
-	mask |= CRYPTO_ALG_ASYNC;
-
-	tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask);
-
 	/*
 	 * Make sure we do not allocate something that might get used with
 	 * an on-stack request: check the request size.
@@ -779,13 +772,36 @@ struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(
 	if (!IS_ERR(tfm) && WARN_ON(crypto_skcipher_reqsize(tfm) >
 				    MAX_SYNC_SKCIPHER_REQSIZE)) {
 		crypto_free_skcipher(tfm);
-		return ERR_PTR(-EINVAL);
+		tfm = ERR_PTR(-E2BIG);
 	}
 
 	return (struct crypto_sync_skcipher *)tfm;
 }
+
+struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(
+				const char *alg_name, u32 type, u32 mask)
+{
+	struct crypto_skcipher *tfm;
+
+	/* Only sync algorithms allowed. */
+	type &= ~CRYPTO_ALG_ASYNC;
+	mask |= CRYPTO_ALG_ASYNC;
+
+	tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask);
+	return crypto_verify_sync_skcipher(tfm);
+}
 EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher);
 
+struct crypto_sync_skcipher *crypto_spawn_sync_skcipher(
+	struct crypto_sync_skcipher_spawn *spawn)
+{
+	struct crypto_skcipher *tfm;
+
+	tfm = crypto_spawn_skcipher(&spawn->base);
+	return crypto_verify_sync_skcipher(tfm);
+}
+EXPORT_SYMBOL_GPL(crypto_spawn_sync_skcipher);
+
 int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask)
 {
 	return crypto_type_has_alg(alg_name, &crypto_skcipher_type, type, mask);
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index 10226c12c5df0..19f99643c0fe2 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -31,6 +31,10 @@ struct crypto_skcipher_spawn {
 	struct crypto_spawn base;
 };
 
+struct crypto_sync_skcipher_spawn {
+	struct crypto_skcipher_spawn base;
+};
+
 struct skcipher_walk {
 	union {
 		struct {
@@ -92,11 +96,26 @@ int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn,
 			 struct crypto_instance *inst,
 			 const char *name, u32 type, u32 mask);
 
+static inline int crypto_grab_sync_skcipher(
+	struct crypto_sync_skcipher_spawn *spawn,
+	struct crypto_instance *inst, const char *name, u32 type, u32 mask)
+{
+	return crypto_grab_skcipher(&spawn->base, inst, name,
+				    type & ~CRYPTO_ALG_ASYNC,
+				    mask | CRYPTO_ALG_ASYNC);
+}
+
 static inline void crypto_drop_skcipher(struct crypto_skcipher_spawn *spawn)
 {
 	crypto_drop_spawn(&spawn->base);
 }
 
+static inline void crypto_drop_sync_skcipher(
+	struct crypto_sync_skcipher_spawn *spawn)
+{
+	crypto_drop_skcipher(&spawn->base);
+}
+
 static inline struct skcipher_alg *crypto_skcipher_spawn_alg(
 	struct crypto_skcipher_spawn *spawn)
 {
@@ -109,12 +128,21 @@ static inline struct skcipher_alg *crypto_spawn_skcipher_alg(
 	return crypto_skcipher_spawn_alg(spawn);
 }
 
+static inline struct skcipher_alg *crypto_sync_spawn_skcipher_alg(
+	struct crypto_sync_skcipher_spawn *spawn)
+{
+	return crypto_spawn_skcipher_alg(&spawn->base);
+}
+
 static inline struct crypto_skcipher *crypto_spawn_skcipher(
 	struct crypto_skcipher_spawn *spawn)
 {
 	return crypto_spawn_tfm2(&spawn->base);
 }
 
+struct crypto_sync_skcipher *crypto_spawn_sync_skcipher(
+	struct crypto_sync_skcipher_spawn *spawn);
+
 static inline void crypto_skcipher_set_reqsize(
 	struct crypto_skcipher *skcipher, unsigned int reqsize)
 {
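
For context (not part of this patch), a minimal sketch of how a template
would use these helpers, modeled on the cmac rework later in this series;
the "foo" names and context layout are illustrative only.  The template's
->create() grabs the spawn with crypto_grab_sync_skcipher() and drops it
with crypto_drop_sync_skcipher() on teardown, while the per-tfm hooks
instantiate and free the child:

struct foo_inst_ctx {
	struct crypto_sync_skcipher_spawn spawn;
};

struct foo_tfm_ctx {
	struct crypto_sync_skcipher *child;
};

static int foo_init_tfm(struct crypto_shash *tfm)
{
	struct foo_inst_ctx *ictx = shash_instance_ctx(shash_alg_instance(tfm));
	struct foo_tfm_ctx *ctx = crypto_shash_ctx(tfm);
	struct crypto_sync_skcipher *child;

	/* turn the spawn grabbed in ->create() into a usable child tfm */
	child = crypto_spawn_sync_skcipher(&ictx->spawn);
	if (IS_ERR(child))
		return PTR_ERR(child);

	ctx->child = child;
	return 0;
}

static void foo_exit_tfm(struct crypto_shash *tfm)
{
	struct foo_tfm_ctx *ctx = crypto_shash_ctx(tfm);

	crypto_free_sync_skcipher(ctx->child);
}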

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 2/6] crypto: ahash - Add helper to free single spawn instance
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
  2020-08-18  8:25   ` [PATCH 1/6] crypto: skcipher - Add helpers for sync skcipher spawn Herbert Xu
@ 2020-08-18  8:25   ` Herbert Xu
  2020-08-18  8:25   ` [PATCH 3/6] crypto: ahash - Add init_tfm/exit_tfm Herbert Xu
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-18  8:25 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-crypto, ebiggers, Ben Greear

This patch adds ahash_free_singlespawn_instance which is the
ahash counterpart to the shash helper.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 crypto/ahash.c                 |    7 +++++++
 include/crypto/internal/hash.h |    1 +
 2 files changed, 8 insertions(+)

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 68a0f0cb75c4c..3398e43d66f01 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -678,5 +678,12 @@ bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg)
 }
 EXPORT_SYMBOL_GPL(crypto_hash_alg_has_setkey);
 
+void ahash_free_singlespawn_instance(struct ahash_instance *inst)
+{
+	crypto_drop_spawn(ahash_instance_ctx(inst));
+	kfree(inst);
+}
+EXPORT_SYMBOL_GPL(ahash_free_singlespawn_instance);
+
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Asynchronous cryptographic hash type");
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 89f6f46ab2b8b..12665b72672d3 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -87,6 +87,7 @@ int crypto_register_ahashes(struct ahash_alg *algs, int count);
 void crypto_unregister_ahashes(struct ahash_alg *algs, int count);
 int ahash_register_instance(struct crypto_template *tmpl,
 			    struct ahash_instance *inst);
+void ahash_free_singlespawn_instance(struct ahash_instance *inst);
 
 int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
 		    unsigned int keylen);

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 3/6] crypto: ahash - Add init_tfm/exit_tfm
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
  2020-08-18  8:25   ` [PATCH 1/6] crypto: skcipher - Add helpers for sync skcipher spawn Herbert Xu
  2020-08-18  8:25   ` [PATCH 2/6] crypto: ahash - Add helper to free single spawn instance Herbert Xu
@ 2020-08-18  8:25   ` Herbert Xu
  2020-08-18  8:25   ` [PATCH 4/6] crypto: ahash - Add ahash_alg_instance Herbert Xu
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-18  8:25 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-crypto, ebiggers, Ben Greear

This patch adds the type-safe init_tfm/exit_tfm functions to the
ahash interface.  This is meant to replace the unsafe cra_init and
cra_exit interface.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 crypto/ahash.c        |   13 ++++++++++++-
 include/crypto/hash.h |   13 +++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 3398e43d66f01..c55b10e6c825d 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -477,6 +477,14 @@ static int ahash_def_finup(struct ahash_request *req)
 	return ahash_def_finup_finish1(req, err);
 }
 
+static void crypto_ahash_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_ahash *hash = __crypto_ahash_cast(tfm);
+	struct ahash_alg *alg = crypto_ahash_alg(hash);
+
+	alg->exit_tfm(hash);
+}
+
 static int crypto_ahash_init_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_ahash *hash = __crypto_ahash_cast(tfm);
@@ -500,7 +508,10 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm)
 		ahash_set_needkey(hash);
 	}
 
-	return 0;
+	if (alg->exit_tfm)
+		tfm->exit = crypto_ahash_exit_tfm;
+
+	return alg->init_tfm ? alg->init_tfm(hash) : 0;
 }
 
 static unsigned int crypto_ahash_extsize(struct crypto_alg *alg)
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 19ce91f2359f5..c9d3fd3efa1b0 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -123,6 +123,17 @@ struct ahash_request {
  *	    data so the transformation can continue from this point onward. No
  *	    data processing happens at this point. Driver must not use
  *	    req->result.
+ * @init_tfm: Initialize the cryptographic transformation object.
+ *	      This function is called only once at the instantiation
+ *	      time, right after the transformation context was
+ *	      allocated. In case the cryptographic hardware has
+ *	      some special requirements which need to be handled
+ *	      by software, this function shall check for the precise
+ *	      requirement of the transformation and put any software
+ *	      fallbacks in place.
+ * @exit_tfm: Deinitialize the cryptographic transformation object.
+ *	      This is a counterpart to @init_tfm, used to remove
+ *	      various changes set in @init_tfm.
  * @halg: see struct hash_alg_common
  */
 struct ahash_alg {
@@ -135,6 +146,8 @@ struct ahash_alg {
 	int (*import)(struct ahash_request *req, const void *in);
 	int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
 		      unsigned int keylen);
+	int (*init_tfm)(struct crypto_ahash *tfm);
+	void (*exit_tfm)(struct crypto_ahash *tfm);
 
 	struct hash_alg_common halg;
 };
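
For illustration (not part of this patch; the "foo" names are made up),
an ahash implementation would wire the new type-safe hooks into its
ahash_alg instead of using cra_init/cra_exit:

static int foo_init_tfm(struct crypto_ahash *tfm)
{
	/* e.g. allocate a software fallback or per-tfm resources */
	return 0;
}

static void foo_exit_tfm(struct crypto_ahash *tfm)
{
	/* release whatever ->init_tfm set up */
}

static struct ahash_alg foo_alg = {
	.init_tfm	= foo_init_tfm,
	.exit_tfm	= foo_exit_tfm,
	/* .init, .update, .final, .setkey, .halg, ... as before */
};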

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 4/6] crypto: ahash - Add ahash_alg_instance
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
                     ` (2 preceding siblings ...)
  2020-08-18  8:25   ` [PATCH 3/6] crypto: ahash - Add init_tfm/exit_tfm Herbert Xu
@ 2020-08-18  8:25   ` Herbert Xu
  2020-08-18  8:25   ` [PATCH 5/6] crypto: ahash - Remove AHASH_REQUEST_ON_STACK Herbert Xu
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-18  8:25 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-crypto, ebiggers, Ben Greear

This patch adds the helper ahash_alg_instance which is used to
convert a crypto_ahash object into its corresponding ahash_instance.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/crypto/internal/hash.h |    6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 12665b72672d3..a6fbc4363e5e1 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -178,6 +178,12 @@ static inline struct ahash_instance *ahash_instance(
 	return container_of(inst, struct ahash_instance, s.base);
 }
 
+static inline struct ahash_instance *ahash_alg_instance(
+	struct crypto_ahash *ahash)
+{
+	return ahash_instance(crypto_tfm_alg_instance(&ahash->base));
+}
+
 static inline void *ahash_instance_ctx(struct ahash_instance *inst)
 {
 	return crypto_instance_ctx(ahash_crypto_instance(inst));

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 5/6] crypto: ahash - Remove AHASH_REQUEST_ON_STACK
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
                     ` (3 preceding siblings ...)
  2020-08-18  8:25   ` [PATCH 4/6] crypto: ahash - Add ahash_alg_instance Herbert Xu
@ 2020-08-18  8:25   ` Herbert Xu
  2020-08-26 10:55     ` Ard Biesheuvel
  2020-08-18  8:25   ` [PATCH 6/6] crypto: cmac - Use cbc skcipher instead of raw cipher Herbert Xu
  2020-08-18  8:31   ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Ard Biesheuvel
  6 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-18  8:25 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-crypto, ebiggers, Ben Greear

This patch removes AHASH_REQUEST_ON_STACK which is unused.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/crypto/hash.h |    5 -----
 1 file changed, 5 deletions(-)

diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index c9d3fd3efa1b0..f16f5d4afc102 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -59,11 +59,6 @@ struct ahash_request {
 	void *__ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-#define AHASH_REQUEST_ON_STACK(name, ahash) \
-	char __##name##_desc[sizeof(struct ahash_request) + \
-		crypto_ahash_reqsize(ahash)] CRYPTO_MINALIGN_ATTR; \
-	struct ahash_request *name = (void *)__##name##_desc
-
 /**
  * struct ahash_alg - asynchronous message digest definition
  * @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 6/6] crypto: cmac - Use cbc skcipher instead of raw cipher
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
                     ` (4 preceding siblings ...)
  2020-08-18  8:25   ` [PATCH 5/6] crypto: ahash - Remove AHASH_REQUEST_ON_STACK Herbert Xu
@ 2020-08-18  8:25   ` Herbert Xu
  2020-08-24  9:47     ` Ard Biesheuvel
  2020-08-18  8:31   ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Ard Biesheuvel
  6 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-18  8:25 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-crypto, ebiggers, Ben Greear

Many architectures provide an accelerated implementation of cbc(aes)
skcipher that is far superior to using the standard cbc template
over an accelerated aes cipher.  As cmac uses the raw cipher, it
was not able to benefit from the accelerated cbc(aes) skcipher.

This patch changes cmac to use an underlying cbc skcipher.  In order
to do so safely, cmac has been split into an ahash version as well
as an shash version.  If the underlying cbc(aes) skcipher is async,
then only the ahash version would provide the full acceleration.
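
The observation this relies on is that, with a zero IV, the final
ciphertext block of a CBC encryption over a block-aligned message is
exactly its CBC-MAC, so the CBC output only ever needs a small bounce
buffer rather than a full-size destination.  CMAC then differs from plain
CBC-MAC only in XORing a derived constant into the final block before the
last encryption, which is what the consts computed at setkey time are for.
A minimal user-space sketch of the CBC-MAC property (OpenSSL is assumed
here purely for illustration; this is not code from the patch):

#include <openssl/evp.h>
#include <string.h>

static int cbcmac_aes128(const unsigned char key[16],
			 const unsigned char *msg, size_t len,
			 unsigned char mac[16])
{
	unsigned char iv[16] = { 0 };
	unsigned char block[16];	/* small bounce buffer, reused per block */
	EVP_CIPHER_CTX *ctx;
	int outl, err = -1;

	if (!len || len % 16)		/* whole blocks only in this sketch */
		return -1;

	ctx = EVP_CIPHER_CTX_new();
	if (!ctx)
		return -1;
	if (EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv) != 1)
		goto out;
	EVP_CIPHER_CTX_set_padding(ctx, 0);

	while (len) {
		/* every ciphertext block lands in the same 16-byte buffer */
		if (EVP_EncryptUpdate(ctx, block, &outl, msg, 16) != 1)
			goto out;
		msg += 16;
		len -= 16;
	}

	memcpy(mac, block, 16);		/* final CBC block == CBC-MAC */
	err = 0;
out:
	EVP_CIPHER_CTX_free(ctx);
	return err;
}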

Here are the numbers on x86:

1. Old cmac:
testing speed of async cmac(aes) (cmac(aes-aesni))
tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    753 cycles/operation,   47 cycles/byte
tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2305 cycles/operation,   36 cycles/byte
tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   1398 cycles/operation,   21 cycles/byte
tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):  10996 cycles/operation,   42 cycles/byte
tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   4808 cycles/operation,   18 cycles/byte
tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   3819 cycles/operation,   14 cycles/byte
tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  30528 cycles/operation,   29 cycles/byte
tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):  14471 cycles/operation,   14 cycles/byte
tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  13469 cycles/operation,   13 cycles/byte
tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  62646 cycles/operation,   30 cycles/byte
tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  28492 cycles/operation,   13 cycles/byte
tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  25935 cycles/operation,   12 cycles/byte
tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  25545 cycles/operation,   12 cycles/byte
tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 126308 cycles/operation,   30 cycles/byte
tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  56565 cycles/operation,   13 cycles/byte
tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  64675 cycles/operation,   15 cycles/byte
tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  59264 cycles/operation,   14 cycles/byte
tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 244882 cycles/operation,   29 cycles/byte
tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 117842 cycles/operation,   14 cycles/byte
tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 106637 cycles/operation,   13 cycles/byte
tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 103895 cycles/operation,   12 cycles/byte
tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 103742 cycles/operation,   12 cycles/byte

2. New shash cmac:
testing speed of async cmac(cbc(aes-aesni)) (cmac(cbc(aes-aesni)))
tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    959 cycles/operation,   59 cycles/byte
tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2375 cycles/operation,   37 cycles/byte
tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   3528 cycles/operation,   55 cycles/byte
tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):   6904 cycles/operation,   26 cycles/byte
tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   8130 cycles/operation,   31 cycles/byte
tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   8873 cycles/operation,   34 cycles/byte
tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  25937 cycles/operation,   25 cycles/byte
tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):  16445 cycles/operation,   16 cycles/byte
tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  19753 cycles/operation,   19 cycles/byte
tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  52589 cycles/operation,   25 cycles/byte
tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  40020 cycles/operation,   19 cycles/byte
tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  40315 cycles/operation,   19 cycles/byte
tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  31988 cycles/operation,   15 cycles/byte
tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 123648 cycles/operation,   30 cycles/byte
tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  65420 cycles/operation,   15 cycles/byte
tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  65512 cycles/operation,   15 cycles/byte
tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  62763 cycles/operation,   15 cycles/byte
tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 242429 cycles/operation,   29 cycles/byte
tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 126182 cycles/operation,   15 cycles/byte
tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 130364 cycles/operation,   15 cycles/byte
tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 121450 cycles/operation,   14 cycles/byte
tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 124334 cycles/operation,   15 cycles/byte

3. New ahash cmac:
testing speed of async cmac(aes) (cmac(cbc-aes-aesni))
tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):   1026 cycles/operation,   64 cycles/byte
tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2080 cycles/operation,   32 cycles/byte
tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   1250 cycles/operation,   19 cycles/byte
tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):   5348 cycles/operation,   20 cycles/byte
tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   2998 cycles/operation,   11 cycles/byte
tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   2233 cycles/operation,    8 cycles/byte
tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  18879 cycles/operation,   18 cycles/byte
tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):   7964 cycles/operation,    7 cycles/byte
tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   6826 cycles/operation,    6 cycles/byte
tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  51125 cycles/operation,   24 cycles/byte
tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  14921 cycles/operation,    7 cycles/byte
tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  13174 cycles/operation,    6 cycles/byte
tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  11911 cycles/operation,    5 cycles/byte
tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates):  74883 cycles/operation,   18 cycles/byte
tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  34169 cycles/operation,    8 cycles/byte
tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  24703 cycles/operation,    6 cycles/byte
tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  22081 cycles/operation,    5 cycles/byte
tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 157086 cycles/operation,   19 cycles/byte
tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates):  56920 cycles/operation,    6 cycles/byte
tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates):  50063 cycles/operation,    6 cycles/byte
tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates):  43677 cycles/operation,    5 cycles/byte
tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates):  42464 cycles/operation,    5 cycles/byte

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 crypto/cmac.c |  864 ++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 745 insertions(+), 119 deletions(-)

diff --git a/crypto/cmac.c b/crypto/cmac.c
index df36be1efb817..78cbc16818b66 100644
--- a/crypto/cmac.c
+++ b/crypto/cmac.c
@@ -12,9 +12,13 @@
  */
 
 #include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 
 /*
  * +------------------------
@@ -26,8 +30,13 @@
  * +------------------------
  */
 struct cmac_tfm_ctx {
-	struct crypto_cipher *child;
-	u8 ctx[];
+	struct crypto_sync_skcipher *child;
+	__be64 consts[];
+};
+
+struct cmac_atfm_ctx {
+	struct crypto_skcipher *child;
+	__be64 consts[];
 };
 
 /*
@@ -36,9 +45,9 @@ struct cmac_tfm_ctx {
  * +------------------------
  * | cmac_desc_ctx
  * +------------------------
- * | odds (block size)
+ * | prev (block size, alignmask aligned)
  * +------------------------
- * | prev (block size)
+ * | odds (alignmask aligned)
  * +------------------------
  */
 struct cmac_desc_ctx {
@@ -46,25 +55,93 @@ struct cmac_desc_ctx {
 	u8 ctx[];
 };
 
-static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
-				     const u8 *inkey, unsigned int keylen)
+/*
+ * +------------------------
+ * | <ahash request>
+ * +------------------------
+ * | cmac_req_ctx
+ * +------------------------
+ * | skcipher_request_ctx
+ * +------------------------
+ * | prev (block size, alignmask aligned)
+ * +------------------------
+ * | odds (alignmask aligned)
+ * +------------------------
+ */
+struct cmac_req_ctx {
+	struct page *page;
+	struct scatterlist src[2];
+	bool more;
+	bool final;
+	struct scatterlist dst[3];
+	unsigned int len;
+	struct skcipher_request req;
+};
+
+struct cmac_inst_ctx {
+	struct crypto_sync_skcipher_spawn spawn;
+	char name[CRYPTO_MAX_ALG_NAME];
+};
+
+struct cmac_ainst_ctx {
+	struct crypto_skcipher_spawn spawn;
+	char name[CRYPTO_MAX_ALG_NAME];
+};
+
+static inline void *cmac_ctx_consts(struct crypto_shash *parent)
 {
 	unsigned long alignmask = crypto_shash_alignmask(parent);
 	struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
-	unsigned int bs = crypto_shash_blocksize(parent);
-	__be64 *consts = PTR_ALIGN((void *)ctx->ctx,
-				   (alignmask | (__alignof__(__be64) - 1)) + 1);
-	u64 _const[2];
-	int i, err = 0;
-	u8 msb_mask, gfmask;
 
-	err = crypto_cipher_setkey(ctx->child, inkey, keylen);
-	if (err)
-		return err;
+	alignmask |= __alignof__(__be64) - 1;
+
+	return PTR_ALIGN(&ctx->consts[0], alignmask + 1);
+}
+
+static inline void *cmac_async_consts(struct crypto_ahash *parent)
+{
+	unsigned long alignmask = crypto_ahash_alignmask(parent);
+	struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(parent);
+
+	alignmask |= __alignof__(__be64) - 1;
+
+	return PTR_ALIGN(&ctx->consts[0], alignmask + 1);
+}
+
+static inline unsigned int cmac_ctx_size(unsigned int base,
+					 unsigned int alignmask,
+					 unsigned int bs)
+{
+	const unsigned int minalign = crypto_tfm_ctx_alignment();
+
+	return ALIGN(base, minalign) +
+	       ((alignmask | (__alignof__(__be64) - 1)) & ~(minalign - 1)) +
+	       bs * 2;
+}
+
+static int cmac_compute_consts(__be64 *consts, unsigned int bs, u32 flags,
+			       struct skcipher_request *req)
+{
+	u64 _const[2] __attribute__((aligned(16)));
+	DECLARE_CRYPTO_WAIT(wait);
+	struct scatterlist sg;
+	u8 msb_mask, gfmask;
+	int i, err;
 
 	/* encrypt the zero block */
+	memset(_const, 0, bs);
 	memset(consts, 0, bs);
-	crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);
+	sg_init_one(&sg, consts, bs);
+
+	flags &= CRYPTO_TFM_REQ_MAY_SLEEP;
+	flags |= CRYPTO_TFM_REQ_MAY_BACKLOG;
+
+	skcipher_request_set_callback(req, flags, crypto_req_done, &wait);
+	skcipher_request_set_crypt(req, &sg, &sg, bs, _const);
+
+	err = crypto_skcipher_encrypt(req);
+	if (err)
+		return err;
 
 	switch (bs) {
 	case 16:
@@ -101,129 +178,245 @@ static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
 	return 0;
 }
 
-static int crypto_cmac_digest_init(struct shash_desc *pdesc)
+static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
+				     const u8 *inkey, unsigned int keylen)
 {
-	unsigned long alignmask = crypto_shash_alignmask(pdesc->tfm);
-	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
-	int bs = crypto_shash_blocksize(pdesc->tfm);
-	u8 *prev = PTR_ALIGN((void *)ctx->ctx, alignmask + 1) + bs;
+	struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
+	unsigned int bs = crypto_shash_blocksize(parent);
+	struct crypto_sync_skcipher *child = ctx->child;
+	SYNC_SKCIPHER_REQUEST_ON_STACK(req, child);
+	__be64 *consts = cmac_ctx_consts(parent);
+	u32 flags;
+	int err;
 
-	ctx->len = 0;
-	memset(prev, 0, bs);
+	crypto_sync_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	flags = crypto_shash_get_flags(parent) & CRYPTO_TFM_REQ_MASK;
+	crypto_sync_skcipher_set_flags(child, flags);
+					      
+	err = crypto_sync_skcipher_setkey(child, inkey, keylen);
+	if (err)
+		return err;
+
+	skcipher_request_set_sync_tfm(req, child);
+
+	return cmac_compute_consts(consts, bs, flags, req);
+}
 
+static int crypto_cmac_digest_init(struct shash_desc *pdesc)
+{
+	memset(shash_desc_ctx(pdesc), 0, HASH_MAX_DESCSIZE);
 	return 0;
 }
 
-static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
-				     unsigned int len)
+static unsigned int fill_final_block(char *odds, unsigned int len,
+				     unsigned int bs)
+{
+	u8 *p = odds + len;
+	unsigned int rlen;
+
+	*p = 0x80;
+	p++;
+
+	rlen = -(len + 1) & (bs - 1);
+	memset(p, 0, rlen);
+
+	return len + rlen + 1;
+}
+
+static int crypto_cmac_digest_finup(struct shash_desc *pdesc, const u8 *p,
+				    unsigned int len, u8 *out)
 {
 	struct crypto_shash *parent = pdesc->tfm;
 	unsigned long alignmask = crypto_shash_alignmask(parent);
 	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
 	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
-	struct crypto_cipher *tfm = tctx->child;
+	u8 *prev = PTR_ALIGN(&ctx->ctx[0], alignmask + 1);
+	struct crypto_sync_skcipher *tfm = tctx->child;
+	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	int bs = crypto_shash_blocksize(parent);
-	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
-	u8 *prev = odds + bs;
+	u8 *consts = cmac_ctx_consts(parent);
+	unsigned int clen = ctx->len;
+	bool empty = !(clen + len);
+	struct scatterlist sg;
+	u8 *odds = prev + bs;
+	unsigned int cs, ds;
+	void *page = NULL;
+	u8 *buf;
+	int err;
 
-	/* checking the data can fill the block */
-	if ((ctx->len + len) <= bs) {
-		memcpy(odds + ctx->len, p, len);
+	odds = PTR_ALIGN(odds, alignmask + 1);
+	cs = HASH_MAX_DESCSIZE - (odds - (u8 *)pdesc);
+	cs &= ~(bs - 1);
+	BUILD_BUG_ON(4 + 60 + 16 + 48 + 16 > HASH_MAX_DESCSIZE);
+
+	/* checking the data can fill the blocks */
+	if ((clen + len) <= cs && !out) {
+		memcpy(odds + clen, p, len);
 		ctx->len += len;
 		return 0;
 	}
 
-	/* filling odds with new data and encrypting it */
-	memcpy(odds + ctx->len, p, bs - ctx->len);
-	len -= bs - ctx->len;
-	p += bs - ctx->len;
-
-	crypto_xor(prev, odds, bs);
-	crypto_cipher_encrypt_one(tfm, prev, prev);
-
 	/* clearing the length */
 	ctx->len = 0;
 
-	/* encrypting the rest of data */
-	while (len > bs) {
-		crypto_xor(prev, p, bs);
-		crypto_cipher_encrypt_one(tfm, prev, prev);
-		p += bs;
-		len -= bs;
-	}
+	buf = odds;
+	ds = cs;
 
-	/* keeping the surplus of blocksize */
-	if (len) {
-		memcpy(odds, p, len);
-		ctx->len = len;
+	if (clen + len > cs * 2 &&
+	    ((page = (void *)__get_free_page(GFP_ATOMIC)))) {
+		buf = page;
+		ds = PAGE_SIZE;
+		memcpy(buf, odds, clen);
 	}
 
-	return 0;
-}
+	sg_init_one(&sg, buf, ds);
 
-static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
-{
-	struct crypto_shash *parent = pdesc->tfm;
-	unsigned long alignmask = crypto_shash_alignmask(parent);
-	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
-	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
-	struct crypto_cipher *tfm = tctx->child;
-	int bs = crypto_shash_blocksize(parent);
-	u8 *consts = PTR_ALIGN((void *)tctx->ctx,
-			       (alignmask | (__alignof__(__be64) - 1)) + 1);
-	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
-	u8 *prev = odds + bs;
-	unsigned int offset = 0;
+	/* encrypting the rest of data */
+	do {
+		unsigned int es = ds - clen;
+		bool final = false;
+
+		if (len <= es) {
+			if (out) {
+				ds = len + clen;
+				es = len;
+
+				if (ds & (bs - 1) || empty) {
+					ds = fill_final_block(buf, ds, bs);
+					consts += bs;
+				}
+
+				final = true;
+			} else {
+				/* Leave at least one byte for final. */
+				ds = (len + clen - 1) & ~(bs - 1);
+				es = ds - clen;
+			}
+		}
 
-	if (ctx->len != bs) {
-		unsigned int rlen;
-		u8 *p = odds + ctx->len;
+		memcpy(buf + clen, p, es);
 
-		*p = 0x80;
-		p++;
+		if (final)
+			crypto_xor(buf + ds - bs, consts, bs);
 
-		rlen = bs - ctx->len - 1;
-		if (rlen)
-			memset(p, 0, rlen);
+		clen = 0;
+		empty = false;
 
-		offset += bs;
-	}
+		skcipher_request_set_sync_tfm(req, tfm);
+		skcipher_request_set_callback(req, 0, NULL, NULL);
+		skcipher_request_set_crypt(req, &sg, &sg, ds, prev);
+
+		err = crypto_skcipher_encrypt(req);
+		if (err)
+			return err;
+
+		p += es;
+		len -= es;
+	} while (len > (out ? 0 : cs));
+
+	if (page)
+		free_page((unsigned long)page);
 
-	crypto_xor(prev, odds, bs);
-	crypto_xor(prev, consts + offset, bs);
+	/* keeping the surplus */
+	memcpy(odds, p, len);
+	ctx->len = len;
 
-	crypto_cipher_encrypt_one(tfm, out, prev);
+	if (out)
+		memcpy(out, prev, bs);
 
 	return 0;
 }
 
-static int cmac_init_tfm(struct crypto_tfm *tfm)
+static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
+				     unsigned int len)
+{
+	return crypto_cmac_digest_finup(pdesc, p, len, NULL);
+}
+
+static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
+{
+	return crypto_cmac_digest_finup(pdesc, NULL, 0, out);
+}
+
+static int cmac_init_tfm(struct crypto_shash *tfm)
 {
-	struct crypto_cipher *cipher;
-	struct crypto_instance *inst = (void *)tfm->__crt_alg;
-	struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst);
-	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct shash_instance *inst = shash_alg_instance(tfm);
+	struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	struct crypto_sync_skcipher_spawn *spawn;
+	struct crypto_sync_skcipher *cipher;
+	struct cmac_inst_ctx *ictx;
 
-	cipher = crypto_spawn_cipher(spawn);
+	ictx = shash_instance_ctx(inst);
+	spawn = &ictx->spawn;
+
+	cipher = crypto_spawn_sync_skcipher(spawn);
 	if (IS_ERR(cipher))
 		return PTR_ERR(cipher);
 
 	ctx->child = cipher;
 
 	return 0;
-};
+}
 
-static void cmac_exit_tfm(struct crypto_tfm *tfm)
+static void cmac_exit_tfm(struct crypto_shash *tfm)
 {
-	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
-	crypto_free_cipher(ctx->child);
+	struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	crypto_free_sync_skcipher(ctx->child);
 }
 
-static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
+static int cmac_set_cbc(char *name, const char *cipher_name)
+{
+	if (snprintf(name, CRYPTO_MAX_ALG_NAME, "cbc(%s)", cipher_name) >=
+	    CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
+	return 0;
+}
+
+static int cmac_check_blocksize(struct skcipher_alg *alg)
+{
+	switch (alg->base.cra_blocksize) {
+	case 16:
+	case 8:
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int cmac_set_name(char *cra_name, char *name, const char *cipher_name)
+{
+	unsigned len;
+
+	/* Alas we screwed up the naming so we have to mangle the
+	 * cipher name.
+	 */
+	if (strncmp(cipher_name, "cbc(", 4))
+		return -EINVAL;
+
+	len = strlcpy(name, cipher_name + 4, CRYPTO_MAX_ALG_NAME);
+	if (len < 2 || len >= CRYPTO_MAX_ALG_NAME)
+		return -EINVAL;
+
+	if (name[len - 1] != ')')
+		return -EINVAL;
+
+	name[len - 1] = 0;
+
+	if (snprintf(cra_name, CRYPTO_MAX_ALG_NAME, "cmac(%s)", name) >=
+	    CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
+	return 0;
+}
+
+static int cmac_create_sync(struct crypto_template *tmpl, struct rtattr **tb)
 {
+	struct crypto_sync_skcipher_spawn *spawn;
 	struct shash_instance *inst;
-	struct crypto_cipher_spawn *spawn;
-	struct crypto_alg *alg;
+	struct cmac_inst_ctx *ctx;
+	struct skcipher_alg *alg;
+	const char *cipher_name;
 	unsigned long alignmask;
 	u32 mask;
 	int err;
@@ -232,53 +425,60 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
 	if (err)
 		return err;
 
+	cipher_name = crypto_attr_alg_name(tb[1]);
+	if (IS_ERR(cipher_name))
+		return PTR_ERR(cipher_name);
+
 	inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
 	if (!inst)
 		return -ENOMEM;
-	spawn = shash_instance_ctx(inst);
 
-	err = crypto_grab_cipher(spawn, shash_crypto_instance(inst),
-				 crypto_attr_alg_name(tb[1]), 0, mask);
+	ctx = shash_instance_ctx(inst);
+	spawn = &ctx->spawn;
+
+	err = crypto_grab_sync_skcipher(spawn, shash_crypto_instance(inst),
+					cipher_name, 0, mask);
+	if (err == -ENOENT)
+		err = cmac_set_cbc(ctx->name, cipher_name) ?:
+		      crypto_grab_sync_skcipher(spawn,
+						shash_crypto_instance(inst),
+						ctx->name, 0, mask);
 	if (err)
 		goto err_free_inst;
-	alg = crypto_spawn_cipher_alg(spawn);
 
-	switch (alg->cra_blocksize) {
-	case 16:
-	case 8:
-		break;
-	default:
-		err = -EINVAL;
-		goto err_free_inst;
-	}
+	alg = crypto_sync_spawn_skcipher_alg(spawn);
 
-	err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg);
+	err = cmac_check_blocksize(alg) ?:
+	      crypto_inst_setname(shash_crypto_instance(inst), "cmac",
+				  &alg->base) ?:
+	      cmac_set_name(inst->alg.base.cra_name, ctx->name,
+			    alg->base.cra_name);
 	if (err)
 		goto err_free_inst;
 
-	alignmask = alg->cra_alignmask;
+	err = -EINVAL;
+	alignmask = alg->base.cra_alignmask;
+	if (alignmask > alg->base.cra_blocksize)
+		goto err_free_inst;
+
 	inst->alg.base.cra_alignmask = alignmask;
-	inst->alg.base.cra_priority = alg->cra_priority;
-	inst->alg.base.cra_blocksize = alg->cra_blocksize;
+	inst->alg.base.cra_priority = alg->base.cra_priority;
+	inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
 
-	inst->alg.digestsize = alg->cra_blocksize;
-	inst->alg.descsize =
-		ALIGN(sizeof(struct cmac_desc_ctx), crypto_tfm_ctx_alignment())
-		+ (alignmask & ~(crypto_tfm_ctx_alignment() - 1))
-		+ alg->cra_blocksize * 2;
+	inst->alg.digestsize = alg->base.cra_blocksize;
+	inst->alg.descsize = HASH_MAX_DESCSIZE;
 
-	inst->alg.base.cra_ctxsize =
-		ALIGN(sizeof(struct cmac_tfm_ctx), crypto_tfm_ctx_alignment())
-		+ ((alignmask | (__alignof__(__be64) - 1)) &
-		   ~(crypto_tfm_ctx_alignment() - 1))
-		+ alg->cra_blocksize * 2;
+	inst->alg.base.cra_ctxsize = cmac_ctx_size(sizeof(struct cmac_tfm_ctx),
+						   alignmask,
+						   alg->base.cra_blocksize);
 
-	inst->alg.base.cra_init = cmac_init_tfm;
-	inst->alg.base.cra_exit = cmac_exit_tfm;
+	inst->alg.init_tfm = cmac_init_tfm;
+	inst->alg.exit_tfm = cmac_exit_tfm;
 
 	inst->alg.init = crypto_cmac_digest_init;
 	inst->alg.update = crypto_cmac_digest_update;
 	inst->alg.final = crypto_cmac_digest_final;
+	inst->alg.finup = crypto_cmac_digest_finup;
 	inst->alg.setkey = crypto_cmac_digest_setkey;
 
 	inst->free = shash_free_singlespawn_instance;
@@ -291,6 +491,432 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
 	return err;
 }
 
+static int crypto_cmac_ahash_setkey(struct crypto_ahash *parent,
+				    const u8 *inkey, unsigned int keylen)
+{
+	struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(parent);
+	unsigned int bs = crypto_ahash_blocksize(parent);
+	struct crypto_skcipher *child = ctx->child;
+	__be64 *consts = cmac_async_consts(parent);
+	struct skcipher_request *req;
+	u32 flags;
+	int err;
+
+	crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	flags = crypto_ahash_get_flags(parent) & CRYPTO_TFM_REQ_MASK;
+	crypto_skcipher_set_flags(child, flags);
+					      
+	err = crypto_skcipher_setkey(child, inkey, keylen);
+	if (err)
+		return err;
+
+	flags &= CRYPTO_TFM_REQ_MAY_SLEEP;
+	req = skcipher_request_alloc(child, flags ? GFP_KERNEL : GFP_ATOMIC);
+	if (!req)
+		return -ENOMEM;
+
+	err = cmac_compute_consts(consts, bs, flags, req);
+	skcipher_request_free(req);
+	return err;
+}
+
+static int crypto_cmac_ahash_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+
+	memset(ahash_request_ctx(req), 0, crypto_ahash_reqsize(tfm));
+	return 0;
+}
+
+static int cmac_ahash_final(struct ahash_request *req, u32 flags)
+{
+	struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
+	struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
+	struct cmac_req_ctx *ctx = ahash_request_ctx(req);
+	unsigned int len = ctx->final ? 0 : req->nbytes;
+	struct crypto_skcipher *tfm = tctx->child;
+	struct skcipher_request *creq = &ctx->req;
+	u8 *out = ctx->more ? NULL : req->result;
+	int bs = crypto_ahash_blocksize(parent);
+	u8 *consts = cmac_async_consts(parent);
+	unsigned int done = creq->cryptlen;
+	unsigned int clen = ctx->len;
+	unsigned long alignmask;
+	unsigned int ds;
+	int err = 0;
+	u8 *prev;
+	u8 *odds;
+
+	if (ctx->page) {
+		free_page((unsigned long)ctx->page);
+		ctx->page = NULL;
+	}
+
+	alignmask = crypto_ahash_alignmask(parent);
+
+	prev = skcipher_request_ctx(creq);
+	prev += crypto_skcipher_reqsize(tfm);
+	prev = PTR_ALIGN(prev, alignmask + 1);
+
+	odds = PTR_ALIGN(prev + bs, alignmask + 1);
+
+	ds = clen + len - done;
+
+	if (done >= clen) {
+		done -= clen;
+		len -= done;
+		clen = 0;
+	} else {
+		odds += done;
+		clen -= done;
+		done = 0;
+	}
+
+	if (len)
+		scatterwalk_map_and_copy(odds + clen, req->src, done, len, 0);
+
+	if (!out)
+		goto done;
+
+	if (ds & (bs - 1) || !ds) {
+		ds = fill_final_block(odds, ds, bs);
+		consts += bs;
+	}
+
+	memcpy(out, prev, bs);
+	crypto_xor(odds + ds - bs, consts, bs);
+	sg_init_one(ctx->src, odds, ds);
+
+	skcipher_request_set_tfm(creq, tfm);
+	skcipher_request_set_callback(creq, flags, req->base.complete,
+				      req->base.data);
+	skcipher_request_set_crypt(creq, ctx->src, ctx->src, ds, out);
+
+	err = crypto_skcipher_encrypt(creq);
+	ds = 0;
+
+done:
+	ctx->len = ds;
+	return err;
+}
+
+static void cmac_encrypt_done(struct crypto_async_request *areq, int err)
+{
+	struct ahash_request *req = areq->data;
+	u32 flags;
+
+	if (err)
+		goto out;
+
+	flags = req->base.flags & ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	err = cmac_ahash_final(req, flags);
+
+out:
+	ahash_request_complete(req, err);
+}
+
+static int cmac_ahash_finup(struct ahash_request *req)
+{
+	struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
+	struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
+	struct cmac_req_ctx *ctx = ahash_request_ctx(req);
+	unsigned int len = ctx->final ? 0 : req->nbytes;
+	struct crypto_skcipher *tfm = tctx->child;
+	struct skcipher_request *creq = &ctx->req;
+	int bs = crypto_ahash_blocksize(parent);
+	struct scatterlist *src = req->src;
+	struct scatterlist *dst = ctx->dst;
+	unsigned int clen = ctx->len;
+	u32 flags = req->base.flags;
+	unsigned long alignmask;
+	unsigned int cs, ds, es;
+	u8 *prev;
+	u8 *odds;
+	int err;
+
+	flags &= CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG;
+	alignmask = crypto_ahash_alignmask(parent);
+
+	prev = skcipher_request_ctx(creq);
+	prev += crypto_skcipher_reqsize(tfm);
+	prev = PTR_ALIGN(prev, alignmask + 1);
+
+	odds = PTR_ALIGN(prev + bs, alignmask + 1);
+
+	cs = crypto_ahash_reqsize(parent) - (odds - (u8 *)ctx);
+	cs &= ~(bs - 1);
+	if (cs > PAGE_SIZE)
+		cs = PAGE_SIZE;
+
+	ds = clen + len;
+	es = HASH_MAX_STATESIZE - 4 - bs;
+
+	creq->cryptlen = 0;
+
+	/* checking the data can fill the blocks */
+	if (ds <= es)
+		return cmac_ahash_final(req, flags);
+
+	/* Leave at least one byte for final. */
+	if (!ds || !(ds = (ds - 1) & ~(bs - 1)))
+		return cmac_ahash_final(req, flags);
+
+	if (clen) {
+		sg_chain(ctx->src, 2, src);
+		src = ctx->src;
+		sg_set_buf(src, odds, clen);
+	}
+
+	sg_set_buf(dst, odds, cs);
+	sg_chain(dst, 2, dst);
+
+	if (ds > cs && cs < PAGE_SIZE &&
+	    ((ctx->page = (void *)__get_free_page(GFP_ATOMIC))))
+		sg_set_buf(dst, ctx->page, PAGE_SIZE);
+
+	skcipher_request_set_tfm(creq, tfm);
+	skcipher_request_set_callback(creq, flags, cmac_encrypt_done, req);
+	skcipher_request_set_crypt(creq, src, dst, ds, prev);
+
+	err = crypto_skcipher_encrypt(creq);
+	if (err == -EINPROGRESS || err == -EBUSY)
+		return err;
+
+	return cmac_ahash_final(req, flags);
+}
+
+static int crypto_cmac_ahash_update(struct ahash_request *req)
+{
+	struct cmac_req_ctx *ctx = ahash_request_ctx(req);
+
+	ctx->more = true;
+	return cmac_ahash_finup(req);
+}
+
+static int crypto_cmac_ahash_finup(struct ahash_request *req)
+{
+	struct cmac_req_ctx *ctx = ahash_request_ctx(req);
+
+	ctx->more = false;
+	return cmac_ahash_finup(req);
+}
+
+static int crypto_cmac_ahash_final(struct ahash_request *req)
+{
+	struct cmac_req_ctx *ctx = ahash_request_ctx(req);
+
+	ctx->more = false;
+	ctx->final = true;
+	return cmac_ahash_finup(req);
+}
+
+static int crypto_cmac_ahash_digest(struct ahash_request *req)
+{
+	return crypto_cmac_ahash_init(req) ?: cmac_ahash_finup(req);
+}
+
+static int crypto_cmac_ahash_export(struct ahash_request *req, void *out)
+{
+	struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
+	struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
+	struct cmac_req_ctx *ctx = ahash_request_ctx(req);
+	unsigned int bs = crypto_ahash_blocksize(parent);
+	struct crypto_skcipher *tfm = tctx->child;
+	struct skcipher_request *creq = &ctx->req;
+	unsigned long alignmask;
+	u8 *p = out;
+	u8 *prev;
+	u8 *odds;
+
+	alignmask = crypto_ahash_alignmask(parent);
+
+	prev = skcipher_request_ctx(creq);
+	prev += crypto_skcipher_reqsize(tfm);
+	prev = PTR_ALIGN(prev, alignmask + 1);
+
+	odds = PTR_ALIGN(prev + bs, alignmask + 1);
+
+	*(u32 *)p = ctx->len;
+
+	p += 4;
+	memcpy(p, prev, bs);
+
+	p += bs;
+	memcpy(p, odds, ctx->len);
+
+	return 0;
+}
+
+static int crypto_cmac_ahash_import(struct ahash_request *req, const void *in)
+{
+	struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
+	struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
+	struct cmac_req_ctx *ctx = ahash_request_ctx(req);
+	unsigned int bs = crypto_ahash_blocksize(parent);
+	struct crypto_skcipher *tfm = tctx->child;
+	struct skcipher_request *creq = &ctx->req;
+	unsigned long alignmask;
+	const u8 *p = in;
+	u8 *prev;
+	u8 *odds;
+	int err;
+
+	err = crypto_cmac_ahash_init(req);
+	if (err)
+		return err;
+
+	alignmask = crypto_ahash_alignmask(parent);
+
+	prev = skcipher_request_ctx(creq);
+	prev += crypto_skcipher_reqsize(tfm);
+	prev = PTR_ALIGN(prev, alignmask + 1);
+
+	odds = PTR_ALIGN(prev + bs, alignmask + 1);
+
+	ctx->len = *(const u32 *)p;
+	if (ctx->len > HASH_MAX_STATESIZE - 4 - bs)
+		return -EINVAL;
+
+	p += 4;
+	memcpy(prev, p, bs);
+
+	p += bs;
+	memcpy(odds, p, ctx->len);
+
+	return 0;
+}
+
+static int cmac_init_ahash(struct crypto_ahash *tfm)
+{
+	struct ahash_instance *inst = ahash_alg_instance(tfm);
+	unsigned long alignmask = crypto_ahash_alignmask(tfm);
+	struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(tfm);
+	unsigned int bs = crypto_ahash_blocksize(tfm);
+	struct crypto_skcipher_spawn *spawn;
+	struct crypto_skcipher *cipher;
+	struct cmac_ainst_ctx *ictx;
+	unsigned int reqsize;
+	unsigned int head;
+
+	ictx = ahash_instance_ctx(inst);
+	spawn = &ictx->spawn;
+
+	cipher = crypto_spawn_skcipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+
+	reqsize = sizeof(struct cmac_req_ctx);
+	reqsize += ALIGN(crypto_skcipher_reqsize(cipher), CRYPTO_MINALIGN);
+	if (alignmask > CRYPTO_MINALIGN)
+		reqsize += alignmask + 1 - CRYPTO_MINALIGN;
+	reqsize += ALIGN(bs, alignmask + 1);
+	reqsize += HASH_MAX_STATESIZE;
+
+	head = sizeof(struct ahash_request);
+	reqsize = roundup_pow_of_two(reqsize + head) - head;
+
+	crypto_ahash_set_reqsize(tfm, reqsize);
+
+	return 0;
+}
+
+static void cmac_exit_ahash(struct crypto_ahash *tfm)
+{
+	struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(tfm);
+	crypto_free_skcipher(ctx->child);
+}
+
+static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+
+	struct crypto_skcipher_spawn *spawn;
+	struct crypto_attr_type *algt;
+	struct cmac_ainst_ctx *ctx;
+	struct ahash_instance *inst;
+	struct skcipher_alg *alg;
+	const char *cipher_name;
+	unsigned int alignmask;
+	u32 mask;
+	int err;
+
+	algt = crypto_get_attr_type(tb);
+	if (IS_ERR(algt))
+		return PTR_ERR(algt);
+
+	if ((algt->type ^ (CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC)) &
+	    algt->mask)
+		return cmac_create_sync(tmpl, tb);
+
+	mask = crypto_algt_inherited_mask(algt);
+
+	cipher_name = crypto_attr_alg_name(tb[1]);
+	if (IS_ERR(cipher_name))
+		return PTR_ERR(cipher_name);
+
+	inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+	if (!inst)
+		return -ENOMEM;
+
+	ctx = ahash_instance_ctx(inst);
+	spawn = &ctx->spawn;
+
+	err = crypto_grab_skcipher(spawn, ahash_crypto_instance(inst),
+				   cipher_name, 0, mask);
+	if (err == -ENOENT)
+		err = cmac_set_cbc(ctx->name, cipher_name) ?:
+		      crypto_grab_skcipher(spawn, ahash_crypto_instance(inst),
+					   ctx->name, 0, mask);
+	if (err)
+		goto err_free_inst;
+
+	alg = crypto_spawn_skcipher_alg(spawn);
+
+	err = cmac_check_blocksize(alg) ?:
+	      crypto_inst_setname(ahash_crypto_instance(inst), "cmac",
+				  &alg->base) ?:
+	      cmac_set_name(inst->alg.halg.base.cra_name, ctx->name,
+			    alg->base.cra_name);
+	if (err)
+		goto err_free_inst;
+
+	err = -EINVAL;
+
+	alignmask = alg->base.cra_alignmask;
+	inst->alg.halg.base.cra_alignmask = alignmask;
+	inst->alg.halg.base.cra_priority = alg->base.cra_priority;
+	inst->alg.halg.base.cra_blocksize = alg->base.cra_blocksize;
+
+	inst->alg.halg.digestsize = alg->base.cra_blocksize;
+	inst->alg.halg.statesize = HASH_MAX_STATESIZE;
+
+	inst->alg.halg.base.cra_ctxsize =
+		cmac_ctx_size(sizeof(struct cmac_atfm_ctx), alignmask,
+			      alg->base.cra_blocksize);
+
+	inst->alg.init_tfm = cmac_init_ahash;
+	inst->alg.exit_tfm = cmac_exit_ahash;
+
+	inst->alg.init = crypto_cmac_ahash_init;
+	inst->alg.update = crypto_cmac_ahash_update;
+	inst->alg.final = crypto_cmac_ahash_final;
+	inst->alg.finup = crypto_cmac_ahash_finup;
+	inst->alg.digest = crypto_cmac_ahash_digest;
+	inst->alg.export = crypto_cmac_ahash_export;
+	inst->alg.import = crypto_cmac_ahash_import;
+	inst->alg.setkey = crypto_cmac_ahash_setkey;
+
+	inst->free = ahash_free_singlespawn_instance;
+
+	err = ahash_register_instance(tmpl, inst);
+	if (err) {
+err_free_inst:
+		ahash_free_singlespawn_instance(inst);
+	}
+	return err;
+}
+
 static struct crypto_template crypto_cmac_tmpl = {
 	.name = "cmac",
 	.create = cmac_create,
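
For reference (not part of the patch), a sketch of how a kernel caller
would drive the resulting "cmac(aes)" through the ahash interface.  With
an async cbc(aes) underneath, a real user would pass its own completion
callback instead of waiting synchronously as this sketch does; the helper
name is made up and error handling is kept minimal:

#include <crypto/hash.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int cmac_aes_digest_once(const u8 *key, unsigned int keylen,
				const u8 *data, unsigned int len, u8 *mac)
{
	DECLARE_CRYPTO_WAIT(wait);
	struct crypto_ahash *tfm;
	struct ahash_request *req;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_ahash("cmac(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* mac must have room for crypto_ahash_digestsize(tfm) bytes (16 here) */
	err = crypto_ahash_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* data must be linearly mapped (not on the stack) for sg_init_one() */
	sg_init_one(&sg, data, len);
	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &wait);
	ahash_request_set_crypt(req, &sg, mac, len);

	err = crypto_wait_req(crypto_ahash_digest(req), &wait);

	ahash_request_free(req);
out_free_tfm:
	crypto_free_ahash(tfm);
	return err;
}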

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18  8:24 ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Herbert Xu
                     ` (5 preceding siblings ...)
  2020-08-18  8:25   ` [PATCH 6/6] crypto: cmac - Use cbc skcipher instead of raw cipher Herbert Xu
@ 2020-08-18  8:31   ` Ard Biesheuvel
  2020-08-18 13:51     ` Herbert Xu
  6 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-18  8:31 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Linux Crypto Mailing List, Eric Biggers, Ben Greear

On Tue, 18 Aug 2020 at 10:24, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Sun, Aug 02, 2020 at 12:06:16PM +0300, Ard Biesheuvel wrote:
> > Ben reports that CCM using AES-NI instructions performs pathologically
> > poorly, which is due to the overhead of preserving/restoring the SIMD
> > state, which is repeated after every 16 bytes of input when executing
> > the CBCMAC portion of the algorithm.
> >
> > So let's clone the arm64 implementation of cbcmac(aes), which takes
> > care to only preserve/restore the SIMD state after processing the
> > whole input. Since cmac(aes) and xcbc(aes) can reuse most of the code,
> > let's expose those as well.
> >
> > Cc: Ben Greear <greearb@candelatech.com>
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  arch/x86/crypto/Makefile           |   2 +-
> >  arch/x86/crypto/aesni-intel.h      |  39 +++
> >  arch/x86/crypto/aesni-intel_glue.c |  42 +---
> >  arch/x86/crypto/aesni-intel_mac.c  | 257 ++++++++++++++++++++
> >  4 files changed, 306 insertions(+), 34 deletions(-)
>
> We should just use the accelerated cbc skcipher.
>

What do you mean? You cannot implement cbcmac using a cbc skcipher
unless you provide a scratch buffer of arbitrary size as the
destination, in order to capture the skcipher output IV as the MAC.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18  8:31   ` [PATCH 0/5] crypto: Implement cmac based on cbc skcipher Ard Biesheuvel
@ 2020-08-18 13:51     ` Herbert Xu
  2020-08-18 13:56       ` Ben Greear
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-18 13:51 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Eric Biggers, Ben Greear

On Tue, Aug 18, 2020 at 10:31:39AM +0200, Ard Biesheuvel wrote:
>
> What do you mean? You cannot implement cbcmac using a cbc skcipher
> unless you provide a scratch buffer of arbitrary size as the
> destination, in order to capture the skcipher output IV as the MAC.

Please have a look at patch 6.  The trick is to use an SG list
chained to itself.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 13:51     ` Herbert Xu
@ 2020-08-18 13:56       ` Ben Greear
  2020-08-18 14:05         ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-18 13:56 UTC (permalink / raw)
  To: Herbert Xu, Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Eric Biggers

On 8/18/20 6:51 AM, Herbert Xu wrote:
> On Tue, Aug 18, 2020 at 10:31:39AM +0200, Ard Biesheuvel wrote:
>>
>> What do you mean? You cannot implement cbcmac using a cbc skcipher
>> unless you provide a scratch buffer of arbitrary size as the
>> destination, in order to capture the skcipher output IV as the MAC.
> 
> Please have a look at patch 6.  The trick is to use an SG list
> chained to itself.

Herbert, thanks for working on this.  If I apply the patches you posted,
that is expected to provide wifi aes decryption speedup similar to what
the original patch I sent does?  Or, are additional patches needed?

Thanks,
Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 13:56       ` Ben Greear
@ 2020-08-18 14:05         ` Herbert Xu
  2020-08-18 14:17           ` Ben Greear
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-18 14:05 UTC (permalink / raw)
  To: Ben Greear; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On Tue, Aug 18, 2020 at 06:56:12AM -0700, Ben Greear wrote:
>
> Herbert, thanks for working on this.  If I apply the patches you posted,
> that is expected to provide wifi aes decryption speedup similar to what
> the original patch I sent does?  Or, are additional patches needed?

It depends on whether the wifi code uses the async ahash interface;
if not, it probably won't make much of an impact at all.

I just checked and if the code you're using is net/mac80211/aes_cmac.c
then it's using shash so it won't really benefit from this.  However,
there's no reason why mac80211 can't be converted over to async as
we did for IPsec.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 14:05         ` Herbert Xu
@ 2020-08-18 14:17           ` Ben Greear
  2020-08-18 22:15             ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-18 14:17 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On 8/18/20 7:05 AM, Herbert Xu wrote:
> On Tue, Aug 18, 2020 at 06:56:12AM -0700, Ben Greear wrote:
>>
>> Herbert, thanks for working on this.  If I apply the patches you posted,
>> that is expected to provide wifi aes decryption speedup similar to what
>> the original patch I sent does?  Or, are additional patches needed?
> 
> It depends on whether the wifi code uses the async ahash interface;
> if not, it probably won't make much of an impact at all.
> 
> I just checked and if the code you're using is net/mac80211/aes_cmac.c
> then it's using shash so it won't really benefit from this.  However,
> there's no reason why mac80211 can't be converted over to async as
> we did for IPsec.

I think converting it to async is beyond what I have time to work on,
and I'm not sure it would be accepted upstream anyway.

Is there any easy way to use your work to make shash fast for aesni?  I
basically just want it to perform as well as it used to with my patch.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 14:17           ` Ben Greear
@ 2020-08-18 22:15             ` Herbert Xu
  2020-08-18 22:27               ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-18 22:15 UTC (permalink / raw)
  To: Ben Greear; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On Tue, Aug 18, 2020 at 07:17:35AM -0700, Ben Greear wrote:
>
> Is there any easy way to use your work to make shash fast for aesni?  I
> basically just want it to perform as well as it used to with my patch.

Yes.  We could add a sync version of aesni that simply falls back
to aes-generic when simd is unavailable.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 22:15             ` Herbert Xu
@ 2020-08-18 22:27               ` Herbert Xu
  2020-08-18 22:31                 ` Ben Greear
  2020-08-22 22:35                 ` Christian Lamparter
  0 siblings, 2 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-18 22:27 UTC (permalink / raw)
  To: Ben Greear; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On Wed, Aug 19, 2020 at 08:15:50AM +1000, Herbert Xu wrote:
> On Tue, Aug 18, 2020 at 07:17:35AM -0700, Ben Greear wrote:
> >
> > Is there any easy way to use your work to make shash fast for aesni?  I
> > basically just want it to perform as well as it used to with my patch.
> 
> Yes.  We could add a sync version of aesni that simply falls back
> to aes-generic when simd is unavailable.

But I think before anyone attempts this we should explore making
mac80211 async like IPsec.  Is there any fundamental reason why
that is not possible? Have the wireless people expressed any
objections to making this async before?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 22:27               ` Herbert Xu
@ 2020-08-18 22:31                 ` Ben Greear
  2020-08-18 22:33                   ` Herbert Xu
  2020-08-22 22:35                 ` Christian Lamparter
  1 sibling, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-18 22:31 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On 8/18/20 3:27 PM, Herbert Xu wrote:
> On Wed, Aug 19, 2020 at 08:15:50AM +1000, Herbert Xu wrote:
>> On Tue, Aug 18, 2020 at 07:17:35AM -0700, Ben Greear wrote:
>>>
>>> Is there any easy way to use your work to make shash fast for aesni?  I
>>> basically just want it to perform as well as it used to with my patch.
>>
>> Yes.  We could add a sync version of aesni that simply falls back
>> to aes-generic when simd is unavailable.
> 
> But I think before anyone attempts this we should explore making
> mac80211 async like IPsec.  Is there any fundamental reason why
> that is not possible? Have the wireless people expressed any
> objections to making this async before?

I don't think it has been discussed recently, but mac80211 is already
a complicated beast, so if this added any significant complexity
it might not be worth it.

Truth is though, I know very little about what changes would be
needed to make it do async decrypt, so maybe it would be a simple
matter?

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 22:31                 ` Ben Greear
@ 2020-08-18 22:33                   ` Herbert Xu
  2020-08-18 22:39                     ` Ben Greear
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-18 22:33 UTC (permalink / raw)
  To: Ben Greear; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On Tue, Aug 18, 2020 at 03:31:10PM -0700, Ben Greear wrote:
>
> I don't think it has been discussed recently, but mac80211 is already
> a complicated beast, so if this added any significant complexity
> it might not be worth it.

Any bulk data path should be using the async interface, otherwise
performance will seriously suffer should SIMD be unavailable.  I
think someone should look at converting wireless to async like IPsec.

> Truth is though, I know very little about what changes would be
> needed to make it do async decrypt, so maybe it would be a simple
> matter?

IPsec was actually quite straightforward.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 22:33                   ` Herbert Xu
@ 2020-08-18 22:39                     ` Ben Greear
  2020-08-20  6:58                       ` Ard Biesheuvel
  0 siblings, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-18 22:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On 8/18/20 3:33 PM, Herbert Xu wrote:
> On Tue, Aug 18, 2020 at 03:31:10PM -0700, Ben Greear wrote:
>>
>> I don't think it has been discussed recently, but mac80211 is already
>> a complicated beast, so if this added any significant complexity
>> it might not be worth it.
> 
> Any bulk data path should be using the async interface, otherwise
> performance will seriously suffer should SIMD be unavailable.  I
> think someone should look at converting wireless to async like IPsec.

Most users in most cases are using hw crypt, so that is likely why
it hasn't gotten a huge amount of effort to optimize the software
crypt path.

If someone wants to give this async api a try for mac80211, I can
test, and I can sponsor the work, but I don't have time to try
to implement it myself.

Thanks,
Ben

> 
>> Truth is though, I know very little about what changes would be
>> needed to make it do async decrypt, so maybe it would be a simple
>> matter?
> 
> IPsec was actually quite straightforward.
> 
> Cheers,
> 


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 22:39                     ` Ben Greear
@ 2020-08-20  6:58                       ` Ard Biesheuvel
  2020-08-20  7:01                         ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-20  6:58 UTC (permalink / raw)
  To: Ben Greear; +Cc: Herbert Xu, Linux Crypto Mailing List, Eric Biggers

On Wed, 19 Aug 2020 at 00:39, Ben Greear <greearb@candelatech.com> wrote:
>
> On 8/18/20 3:33 PM, Herbert Xu wrote:
> > On Tue, Aug 18, 2020 at 03:31:10PM -0700, Ben Greear wrote:
> >>
> >> I don't think it has been discussed recently, but mac80211 is already
> >> a complicated beast, so if this added any significant complexity
> >> it might not be worth it.
> >
> > Any bulk data path should be using the async interface, otherwise
> > performance will seriously suffer should SIMD be unavailable.  I
> > think someone should look at converting wireless to async like IPsec.
>
> Most users in most cases are using hw crypt, so that is likely why
> it hasn't gotten a huge amount of effort to optimize the software
> crypt path.
>

As I understand it, it is highly unusual for these code paths to be
exercised in the first place. All mac80211 hardware anyone still cares
about supports CCMP offload, and so only in special cases like Ben's
do we need the software implementation. Also, in Ben's case, it is on
a hot path whereas obsolete hardware that does not implement CCMP
offload does not support anything over 11g speeds to begin with.

Then, there is the additional issue where the FPU preserve/restore
appears to be disproportionately expensive on the actual SoC in
question.

My preferred approach for cbcmac(aes-ni) would be to mirror the arm64
implementation exactly, which means going through the data only a
single time and interleaving the CTR and CBCMAC operations at the AES
round level. My cbcmac ahash approach (v2) is plan B as far as I am
concerned, but it turns out to be flawed and I haven't had time to
look into this.

But if we look at the actual issue at hand, we might also look into
amortizing the FPU preserve/restore over multiple invocations of a
cipher. I proposed a patch a while ago that makes cipher an internal
crypto API abstraction, and we could easily add pre/post hooks that
preserve/restore the FPU in this case, in which case we would not need
any changes at higher levels.
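
To make the amortization idea concrete (a sketch with a made-up helper,
not the pre/post hook patch itself): the point is to pay for
kernel_fpu_begin()/end() once per run of blocks rather than once per
16-byte block.

/*
 * Sketch: one FPU preserve/restore for a whole buffer of blocks
 * instead of one per crypto_cipher_encrypt_one() call.
 */
static void cbcmac_blocks(struct crypto_aes_ctx *ctx,
			  u8 mac[AES_BLOCK_SIZE],
			  const u8 *data, unsigned int blocks)
{
	kernel_fpu_begin();
	while (blocks--) {
		crypto_xor(mac, data, AES_BLOCK_SIZE);
		aesni_enc(ctx, mac, mac);
		data += AES_BLOCK_SIZE;
	}
	kernel_fpu_end();
}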


> If someone wants to give this async api a try for mac80211, I can
> test, and I can sponsor the work, but I don't have time to try
> to implement it myself.
>
> Thanks,
> Ben
>
> >
> >> Truth is though, I know very little about what changes would be
> >> needed to make it do async decrypt, so maybe it would be a simple
> >> matter?
> >
> > IPsec was actually quite straightforward.
> >
> > Cheers,
> >
>
>
> --
> Ben Greear <greearb@candelatech.com>
> Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  6:58                       ` Ard Biesheuvel
@ 2020-08-20  7:01                         ` Herbert Xu
  2020-08-20  7:04                           ` Ard Biesheuvel
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-20  7:01 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, Aug 20, 2020 at 08:58:15AM +0200, Ard Biesheuvel wrote:
>
> But if we look at the actual issue at hand, we might also look into
> amortizing the FPU preserve/restore over multiple invocations of a
> cipher. I proposed a patch a while ago that makes cipher an internal
> crypto API abstraction, and we could easily add pre/post hooks that
> preserve/restore the FPU in this case, in which case we would not need
> any changes at higher levels.

I think any use of SIMD crypto_cipher on bulk data is just wrong.
Because the performance degradation when SIMD cannot be used is
too great for this to make sense.

So optimising the FPU overhead is attacking the wrong problem.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:01                         ` Herbert Xu
@ 2020-08-20  7:04                           ` Ard Biesheuvel
  2020-08-20  7:06                             ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-20  7:04 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, 20 Aug 2020 at 09:01, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Aug 20, 2020 at 08:58:15AM +0200, Ard Biesheuvel wrote:
> >
> > But if we look at the actual issue at hand, we might also look into
> > amortizing the FPU preserve/restore over multiple invocations of a
> > cipher. I proposed a patch a while ago that makes cipher an internal
> > crypto API abstraction, and we could easily add pre/post hooks that
> > preserve/restore the FPU in this case, in which case we would not need
> > any changes at higher levels.
>
> I think any use of SIMD crypto_cipher on bulk data is just wrong.
> Because the performance degradation when SIMD cannot be used is
> too great for this to make sense.
>
> So optimising the FPU overhead is attacking the wrong problem.
>

I don't disagree with that, especially given all the effort that went
into optimizing FPU preserve/restore on both arm64 and x86. But the
bottom line is that this is what is causing the degradation in Ben's
case, so we cannot disregard it.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:04                           ` Ard Biesheuvel
@ 2020-08-20  7:06                             ` Herbert Xu
  2020-08-20  7:19                               ` Ard Biesheuvel
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-20  7:06 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, Aug 20, 2020 at 09:04:26AM +0200, Ard Biesheuvel wrote:
>
> I don't disagree with that, especially given all the effort that went
> into optimizing FPU preserve/restore on both arm64 and x86. But the
> bottom line is that this is what is causing the degradation in Ben's
> case, so we cannot disregard it.

If he's having problems with the performance when SIMD is in use
due to preserve/restore, I'd hate to see his numbers when SIMD is
not available.

IOW if this really matters to him, then wireless code needs to switch
over to ahash.

Solving half of the problem simply makes no sense.
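
(To be clear, "switch over to ahash" means driving the MAC through the
async interface, roughly along these lines; a sketch only, with the
completion handling reduced to its bare bones:)

/* Sketch: submit a MAC computation through the async ahash interface. */
static int mac_digest_async(struct crypto_ahash *tfm,
			    struct scatterlist *sg, unsigned int len,
			    u8 *out, crypto_completion_t done, void *data)
{
	struct ahash_request *req;
	int err;

	req = ahash_request_alloc(tfm, GFP_ATOMIC);
	if (!req)
		return -ENOMEM;

	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   done, data);
	ahash_request_set_crypt(req, sg, out, len);

	err = crypto_ahash_digest(req);
	if (err == -EINPROGRESS || err == -EBUSY)
		return 0;	/* 'done' runs (and frees req) on completion */

	ahash_request_free(req);
	return err;
}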

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:06                             ` Herbert Xu
@ 2020-08-20  7:19                               ` Ard Biesheuvel
  2020-08-20  7:29                                 ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-20  7:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, 20 Aug 2020 at 09:06, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Aug 20, 2020 at 09:04:26AM +0200, Ard Biesheuvel wrote:
> >
> > I don't disagree with that, especially given all the effort that went
> > into optimizing FPU preserve/restore on both arm64 and x86. But the
> > bottom line is that this is what is causing the degradation in Ben's
> > case, so we cannot disregard it.
>
> If he's having problems with the performance when SIMD is in use
> due to preserve/restore, I'd hate to see his numbers when SIMD is
> not available.
>

Actually, I'm not so sure that they will be so much worse. The
expensive FPU preserve/restore occurs for every 16 bytes of data
processed by the AES cipher, which I'd estimate to take ~10 cycles per
byte for an unaccelerated implementation. But table based AES should
be avoided, especially for MAC algorithms where the plaintext may be
known to an attacker who is after the key.

However, the CCMP handling is invoked from softirq context or from
task context, and so SIMD is generally available unless the softirq
happens to be taken over the back of a hardirq that interrupted a task
running in the kernel that was using the SIMD already. IOW, this
happens so rarely in practice that I would not expect it to be
noticeable in the performance stats.

> IOW if this really matters to him, then wireless code needs to switch
> over to ahash.
>
> Solving half of the problem simply makes no sense.
>

My v2 attempt at cbcmac(aesni) implements an ahash, but a synchronous
one. This means we can amortize the FPU preserve/restore over the
entire scatterlist, instead of relying on the ahash walk to present
the data in virtually mapped chunks.

I'd still like to explore this approach, but I simply haven't had the
spare cycles to spend on this.
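
For reference, the update path of that synchronous ahash looks roughly
like the following (a sketch only: partial and final blocks and the
tfm/request context layout are glossed over, and it assumes the hash
walk never sleeps while the FPU is held):

/* Needs <crypto/internal/hash.h>, <crypto/algapi.h>, <asm/fpu/api.h>. */
static int cbcmac_walk_request(struct ahash_request *req,
			       struct crypto_aes_ctx *ctx,
			       u8 mac[AES_BLOCK_SIZE])
{
	struct crypto_hash_walk walk;
	int nbytes;

	kernel_fpu_begin();	/* one preserve/restore for the whole scatterlist */
	for (nbytes = crypto_hash_walk_first(req, &walk); nbytes > 0;
	     nbytes = crypto_hash_walk_done(&walk, 0)) {
		const u8 *src = walk.data;

		while (nbytes >= AES_BLOCK_SIZE) {
			crypto_xor(mac, src, AES_BLOCK_SIZE);
			aesni_enc(ctx, mac, mac);
			src += AES_BLOCK_SIZE;
			nbytes -= AES_BLOCK_SIZE;
		}
	}
	kernel_fpu_end();

	return 0;
}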

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:19                               ` Ard Biesheuvel
@ 2020-08-20  7:29                                 ` Herbert Xu
  2020-08-20  7:33                                   ` Ard Biesheuvel
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-20  7:29 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, Aug 20, 2020 at 09:19:16AM +0200, Ard Biesheuvel wrote:
>
> Actually, I'm not so sure that they will be so much worse. The
> expensive FPU preserve/restore occurs for every 16 bytes of data
> processed by the AES cipher, which I'd estimate to take ~10 cycles per
> byte for an unaccelerated implementation. But table based AES should
> be avoided, especially for MAC algorithms where the plaintext may be
> known to an attacker who is after the key.

On my machine the performance difference on a 1472-byte request
between SIMD and generic is 2161 vs. 7558 (cycles).

> 
> However, the CCMP handling is invoked from softirq context or from
> task context, and so SIMD is generally available unless the softirq
> happens to be taken over the back of a hardirq that interrupted a task
> running in the kernel that was using the SIMD already. IOW, this
> happens so rarely in practice that I would not expect it to be
> noticeable in the performance stats.

What if the same machine was doing TLS/IPsec sends at full throttle?
That would be exactly the wrong time to slow down softirqs four-fold,
no?

> My v2 attempt at cbcmac(aesni) implements an ahash, but a synchronous
> one. This means we can amortize the FPU preserve/restore over the
> entire scatterlist, instead of relying on the ahash walk to present
> the data in virtually mapped chunks.
> 
> I'd still like to explore this approach, but I simply haven't had the
> spare cycles to spend on this.

I don't have an issue with your patch per se.  But please make it so that
it has the async path like everything else.  Also wireless uses shash
so it can't use an ahash anyway even if it is sync.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:29                                 ` Herbert Xu
@ 2020-08-20  7:33                                   ` Ard Biesheuvel
  2020-08-20  7:44                                     ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-20  7:33 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, 20 Aug 2020 at 09:29, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Aug 20, 2020 at 09:19:16AM +0200, Ard Biesheuvel wrote:
> >
> > Actually, I'm not so sure that they will be so much worse. The
> > expensive FPU preserve/restore occurs for every 16 bytes of data
> > processed by the AES cipher, which I'd estimate to take ~10 cycles per
> > byte for an unaccelerated implementation. But table based AES should
> > be avoided, especially for MAC algorithms where the plaintext may be
> > known to an attacker who is after the key.
>
> On my machine the performance difference on a 1472-byte request
> between SIMD and generic is 2161 vs. 7558 (cycles).

Sure. But your machine does not have the pathological FPU
preserve/restore performance.

> >
> > However, the CCMP handling is invoked from softirq context or from
> > task context, and so SIMD is generally available unless the softirq
> > happens to be taken over the back of a hardirq that interrupted a task
> > running in the kernel that was using the SIMD already. IOW, this
> > happens so rarely in practice that I would not expect it to be
> > noticeable in the performance stats.
>
> What if the same machine was doing TLS/IPsec sends at full throttle?
> That would be exactly the wrong time to slow down softirqs four-fold,
> no?
>

Fair point.

> > My v2 attempt at cbcmac(aesni) implements an ahash, but a synchronous
> > one. This means we can amortize the FPU preserve/restore over the
> > entire scatterlist, instead of relying on the ahash walk to present
> > the data in virtually mapped chunks.
> >
> > I'd still like to explore this approach, but I simply haven't had the
> > spare cycles to spend on this.
>
> I don't have an issue your patch per se.  But please make it so that
> it has the async path like everything else.  Also wireless uses shash
> so it can't use an ahash anyway even if it is sync.
>

The mac80211 CCMP code uses a synchronous ccm aead, which gets backed
by a skcipher+ahash combo by the ccm template. So a synchronous ahash
is fine for this particular case.
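
(For reference, the allocation in mac80211's aes_ccm.c looks roughly
like this; masking with CRYPTO_ALG_ASYNC is what keeps the whole
construction synchronous.  Sketch, with error handling trimmed:)

static struct crypto_aead *ccmp_alloc_sketch(const u8 *key, size_t key_len,
					     size_t mic_len)
{
	struct crypto_aead *tfm;
	int err;

	/* CRYPTO_ALG_ASYNC in the mask: only sync implementations match */
	tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return tfm;

	err = crypto_aead_setkey(tfm, key, key_len);
	if (!err)
		err = crypto_aead_setauthsize(tfm, mic_len);
	if (err) {
		crypto_free_aead(tfm);
		return ERR_PTR(err);
	}

	return tfm;
}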

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:33                                   ` Ard Biesheuvel
@ 2020-08-20  7:44                                     ` Herbert Xu
  2020-08-20  7:48                                       ` Ard Biesheuvel
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-20  7:44 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, Aug 20, 2020 at 09:33:21AM +0200, Ard Biesheuvel wrote:
>
> > On my machine the performance difference on a 1472-byte request
> > between SIMD and generic is 2161 vs. 7558 (cycles).
> 
> Sure. But your machine does not have the pathological FPU
> preserve/restore performance.

Why does that matter? These are numbers for cbc-aesni which means
just a single preserve/restore for the whole request.

Or are you saying on Ben's machine cbc-aesni would have worse
performance vs. aes-generic?
 
> The mac80211 CCMP code uses a synchronous ccm aead, which gets backed
> by a skcipher+ahash combo by the ccm template. So a synchronous ahash
> is fine for this particular case.

OK I was just grepping for cmac so didn't see this.

For this case, I think it's even more important that it be converted
over to async because its sending path is also in user context just
like IPsec.

So simply by sending wireless packets you can hog the CPU while
doing SIMD in kernel context which would then kill the receive
path if you're using the generic fallback.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:44                                     ` Herbert Xu
@ 2020-08-20  7:48                                       ` Ard Biesheuvel
  2020-08-20  7:53                                         ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-20  7:48 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, 20 Aug 2020 at 09:44, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Aug 20, 2020 at 09:33:21AM +0200, Ard Biesheuvel wrote:
> >
> > > On my machine the performance difference on a 1472-byte request
> > > between SIMD and generic is 2161 vs. 7558 (cycles).
> >
> > Sure. But your machine does not have the pathological FPU
> > preserve/restore performance.
>
> Why does that matter? These are numbers for cbc-aesni which means
> just a single preserve/restore for the whole request.
>

No, that is the whole problem. The CCM template has a CBCMAC
implementation that wraps the bare cipher, which means it invokes
crypto_cipher_encrypt_one() for each 16 bytes of input, and each of
those calls involves a FPU preserve/restore.
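
Schematically, the cbcmac part of the ccm template does something like
this (paraphrased and simplified from crypto/ccm.c, with partial-block
buffering omitted), and with aes-aesni as the underlying cipher every
crypto_cipher_encrypt_one() call pays its own kernel_fpu_begin()/end():

static void cbcmac_update_sketch(struct crypto_cipher *tfm, u8 *dg,
				 const u8 *p, unsigned int len)
{
	const unsigned int bs = crypto_cipher_blocksize(tfm);

	while (len >= bs) {
		crypto_xor(dg, p, bs);
		crypto_cipher_encrypt_one(tfm, dg, dg); /* FPU save/restore */
		p += bs;
		len -= bs;
	}
}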

> Or are you saying on Ben's machine cbc-aesni would have worse
> performance vs. aes-generic?
>

Yes, given the pathological overhead of FPU preserve/restore for every
block of 16 bytes processed by the cbcmac wrapper.

> > The mac80211 CCMP code uses a synchronous ccm aead, which gets backed
> > by a skcipher+ahash combo by the ccm template. So a synchronous ahash
> > is fine for this particular case.
>
> OK I was just grepping for cmac so didn't see this.
>
> For this case, I think it's even more important that it be converted
> over to async because its sending path is also in user context just
> like IPsec.
>

Indeed.

cmac() is not really relevant for performance, afaict. Only cbcmac()
is used for bulk data.

> So simply by sending wireless packets you can hog the CPU while
> doing SIMD in kernel context which would then kill the receive
> path if you're using the generic fallback.
>
> Cheers,
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:48                                       ` Ard Biesheuvel
@ 2020-08-20  7:53                                         ` Herbert Xu
  2020-08-20  7:56                                           ` Ard Biesheuvel
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-20  7:53 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, Aug 20, 2020 at 09:48:02AM +0200, Ard Biesheuvel wrote:
>
> > Or are you saying on Ben's machine cbc-aesni would have worse
> > performance vs. aes-generic?
> >
> 
> Yes, given the pathological overhead of FPU preserve/restore for every
> block of 16 bytes processed by the cbcmac wrapper.

I'm sceptical.  Do we have numbers showing this? You can get them
from tcrypt with my patch:

	https://patchwork.kernel.org/patch/11701343/

Just do

	modprobe tcrypt mode=400 alg='cmac(aes-aesni)' klen=16
	modprobe tcrypt mode=400 alg='cmac(aes-generic)' klen=16

> cmac() is not really relevant for performance, afaict. Only cbcmac()
> is used for bulk data.

Sure but it's trivial to extend my cmac patch to support cbcmac.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:53                                         ` Herbert Xu
@ 2020-08-20  7:56                                           ` Ard Biesheuvel
  2020-08-20 13:54                                             ` Ben Greear
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-20  7:56 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ben Greear, Linux Crypto Mailing List, Eric Biggers

On Thu, 20 Aug 2020 at 09:54, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Aug 20, 2020 at 09:48:02AM +0200, Ard Biesheuvel wrote:
> >
> > > Or are you saying on Ben's machine cbc-aesni would have worse
> > > performance vs. aes-generic?
> > >
> >
> > Yes, given the pathological overhead of FPU preserve/restore for every
> > block of 16 bytes processed by the cbcmac wrapper.
>
> I'm sceptical.  Do we have numbers showing this? You can get them
> from tcrypt with my patch:
>
>         https://patchwork.kernel.org/patch/11701343/
>
> Just do
>
>         modprobe tcrypt mode=400 alg='cmac(aes-aesni)' klen=16
>         modprobe tcrypt mode=400 alg='cmac(aes-generic)' klen=16
>
> > cmac() is not really relevant for performance, afaict. Only cbcmac()
> > is used for bulk data.
>
> Sure but it's trivial to extend my cmac patch to support cbcmac.
>


Sure.

Ben, care to have a go at the above on your hardware? It would help us
get to the bottom of this issue.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20  7:56                                           ` Ard Biesheuvel
@ 2020-08-20 13:54                                             ` Ben Greear
  2020-08-20 20:10                                               ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-20 13:54 UTC (permalink / raw)
  To: Ard Biesheuvel, Herbert Xu; +Cc: Linux Crypto Mailing List, Eric Biggers

On 8/20/20 12:56 AM, Ard Biesheuvel wrote:
> On Thu, 20 Aug 2020 at 09:54, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>>
>> On Thu, Aug 20, 2020 at 09:48:02AM +0200, Ard Biesheuvel wrote:
>>>
>>>> Or are you saying on Ben's machine cbc-aesni would have worse
>>>> performance vs. aes-generic?
>>>>
>>>
>>> Yes, given the pathological overhead of FPU preserve/restore for every
>>> block of 16 bytes processed by the cbcmac wrapper.
>>
>> I'm sceptical.  Do we have numbers showing this? You can get them
>> from tcrypt with my patch:
>>
>>          https://patchwork.kernel.org/patch/11701343/
>>
>> Just do
>>
>>          modprobe tcrypt mode=400 alg='cmac(aes-aesni)' klen=16
>>          modprobe tcrypt mode=400 alg='cmac(aes-generic)' klen=16
>>
>>> cmac() is not really relevant for performance, afaict. Only cbcmac()
>>> is used for bulk data.
>>
>> Sure but it's trivial to extend my cmac patch to support cbcmac.
>>
> 
> 
> Sure.
> 
> Ben, care to have a go at the above on your hardware? It would help us
> get to the bottom of this issue.

Here's a run on an:  Intel(R) Core(TM) i7-7700T CPU @ 2.90GHz

                testing speed of async cmac(aes-aesni) (cmac(aes-aesni))
[  259.397756] tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    244 cycles/operation,   15 cycles/byte
[  259.397759] tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   1052 cycles/operation,   16 cycles/byte
[  259.397765] tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):    641 cycles/operation,   10 cycles/byte
[  259.397768] tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):   3909 cycles/operation,   15 cycles/byte
[  259.397786] tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   2602 cycles/operation,   10 cycles/byte
[  259.397797] tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   2211 cycles/operation,    8 cycles/byte
[  259.397807] tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  15453 cycles/operation,   15 cycles/byte
[  259.397872] tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):   8863 cycles/operation,    8 cycles/byte
[  259.397910] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   8442 cycles/operation,    8 cycles/byte
[  259.397946] tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  43542 cycles/operation,   21 cycles/byte
[  259.398110] tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  17649 cycles/operation,    8 cycles/byte
[  259.398184] tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  21255 cycles/operation,   10 cycles/byte
[  259.398267] tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  16322 cycles/operation,    7 cycles/byte
[  259.398335] tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates):  60301 cycles/operation,   14 cycles/byte
[  259.398585] tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  34413 cycles/operation,    8 cycles/byte
[  259.398728] tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  32894 cycles/operation,    8 cycles/byte
[  259.398865] tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  32521 cycles/operation,    7 cycles/byte
[  259.399000] tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 120415 cycles/operation,   14 cycles/byte
[  259.399550] tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates):  68635 cycles/operation,    8 cycles/byte
[  259.399834] tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates):  83770 cycles/operation,   10 cycles/byte
[  259.400157] tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates):  65075 cycles/operation,    7 cycles/byte
[  259.400427] tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates):  65085 cycles/operation,    7 cycles/byte
[  294.171336]
                testing speed of async cmac(aes-generic) (cmac(aes-generic))
[  294.171340] tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    275 cycles/operation,   17 cycles/byte
[  294.171343] tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   1191 cycles/operation,   18 cycles/byte
[  294.171350] tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):    738 cycles/operation,   11 cycles/byte
[  294.171354] tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):   4386 cycles/operation,   17 cycles/byte
[  294.171374] tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   2915 cycles/operation,   11 cycles/byte
[  294.171387] tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   2464 cycles/operation,    9 cycles/byte
[  294.171398] tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  17558 cycles/operation,   17 cycles/byte
[  294.171472] tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):  14022 cycles/operation,   13 cycles/byte
[  294.171530] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   9022 cycles/operation,    8 cycles/byte
[  294.171569] tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  38107 cycles/operation,   18 cycles/byte
[  294.171722] tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  18083 cycles/operation,    8 cycles/byte
[  294.171798] tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  17260 cycles/operation,    8 cycles/byte
[  294.171870] tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  17415 cycles/operation,    8 cycles/byte
[  294.171943] tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates):  66005 cycles/operation,   16 cycles/byte
[  294.172217] tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  36035 cycles/operation,    8 cycles/byte
[  294.172366] tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  42812 cycles/operation,   10 cycles/byte
[  294.172533] tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  53415 cycles/operation,   13 cycles/byte
[  294.172745] tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 133326 cycles/operation,   16 cycles/byte
[  294.173297] tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates):  90271 cycles/operation,   11 cycles/byte
[  294.173646] tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates):  68703 cycles/operation,    8 cycles/byte
[  294.173931] tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates):  67951 cycles/operation,    8 cycles/byte
[  294.174213] tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates):  68370 cycles/operation,    8 cycles/byte


On my slow apu2 board with processor: AMD GX-412TC SOC

               testing speed of async cmac(aes-aesni) (cmac(aes-aesni))
[   51.750514] tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    600 cycles/operation,   37 cycle
[   51.750532] tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2063 cycles/operation,   32 cycle
[   51.750582] tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   1326 cycles/operation,   20 cycle
[   51.750619] tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):  11190 cycles/operation,   43 cycle
[   51.750775] tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   4935 cycles/operation,   19 cycle
[   51.750840] tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   8652 cycles/operation,   33 cycle
[   51.750948] tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  43430 cycles/operation,   42 cycle
[   51.751488] tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):  23589 cycles/operation,   23 cycle
[   51.751810] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  18759 cycles/operation,   18 cycle
[   51.752027] tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  79699 cycles/operation,   38 cycle
[   51.753035] tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  39900 cycles/operation,   19 cycle
[   51.753559] tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  38390 cycles/operation,   18 cycle
[   51.754057] tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  40888 cycles/operation,   19 cycle
[   51.754615] tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 143019 cycles/operation,   34 cycle
[   51.756369] tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  89046 cycles/operation,   21 cycle
[   51.757527] tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  77992 cycles/operation,   19 cycle
[   51.758526] tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  76021 cycles/operation,   18 cycle
[   51.759442] tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 312260 cycles/operation,   38 cycle
[   51.763195] tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 176472 cycles/operation,   21 cycle
[   51.765255] tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 169565 cycles/operation,   20 cycle
[   51.767321] tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 164968 cycles/operation,   20 cycle
[   51.769256] tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 165096 cycles/operation,   20 cycle

               testing speed of async cmac(aes-generic) (cmac(aes-generic))
[   97.835925] tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    665 cycles/operation,   41 cycle
[   97.835945] tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2430 cycles/operation,   37 cycle
[   97.836016] tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   1656 cycles/operation,   25 cycle
[   97.836044] tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):   9014 cycles/operation,   35 cycle
[   97.836259] tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):  13444 cycles/operation,   52 cycle
[   97.836399] tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   8960 cycles/operation,   35 cycle
[   97.836515] tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  51594 cycles/operation,   50 cycle
[   97.837151] tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):  28105 cycles/operation,   27 cycle
[   97.837497] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  31365 cycles/operation,   30 cycle
[   97.837865] tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  86111 cycles/operation,   42 cycle
[   97.838927] tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  60021 cycles/operation,   29 cycle
[   97.839628] tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  56311 cycles/operation,   27 cycle
[   97.840308] tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  50877 cycles/operation,   24 cycle
[   97.840943] tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 174028 cycles/operation,   42 cycle
[   97.843205] tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates): 103243 cycles/operation,   25 cycle
[   97.844524] tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  99960 cycles/operation,   24 cycle
[   97.845865] tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates): 121735 cycles/operation,   29 cycle
[   97.847355] tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 387559 cycles/operation,   47 cycle
[   97.851930] tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 223662 cycles/operation,   27 cycle
[   97.854617] tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 226131 cycles/operation,   27 cycle
[   97.857385] tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 203840 cycles/operation,   24 cycle
[   97.859888] tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 220232 cycles/operation,   26 cycle

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20 13:54                                             ` Ben Greear
@ 2020-08-20 20:10                                               ` Herbert Xu
  2020-08-20 22:09                                                 ` Ben Greear
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2020-08-20 20:10 UTC (permalink / raw)
  To: Ben Greear; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On Thu, Aug 20, 2020 at 06:54:58AM -0700, Ben Greear wrote:
>
> Here's a run on an:  Intel(R) Core(TM) i7-7700T CPU @ 2.90GHz
> 
>                testing speed of async cmac(aes-aesni) (cmac(aes-aesni))
>
> [  259.397910] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   8442 cycles/operation,    8 cycles/byte
>
>                testing speed of async cmac(aes-generic) (cmac(aes-generic))
>
> [  294.171530] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   9022 cycles/operation,    8 cycles/byte
> 
> On my slow apu2 board with processor: AMD GX-412TC SOC
> 
>               testing speed of async cmac(aes-aesni) (cmac(aes-aesni))
>
> [   51.751810] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  18759 cycles/operation,   18 cycle
>
>               testing speed of async cmac(aes-generic) (cmac(aes-generic))
>
> [   97.837497] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  31365 cycles/operation,   30 cycle

So clearly aes-generic is slower than aes-aesni even with saving and
restoring for each block.  Therefore improving the performance of
the latter per se does not make sense.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20 20:10                                               ` Herbert Xu
@ 2020-08-20 22:09                                                 ` Ben Greear
  2020-08-20 22:12                                                   ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-08-20 22:09 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On 8/20/20 1:10 PM, Herbert Xu wrote:
> On Thu, Aug 20, 2020 at 06:54:58AM -0700, Ben Greear wrote:
>>
>> Here's a run on an:  Intel(R) Core(TM) i7-7700T CPU @ 2.90GHz
>>
>>                 testing speed of async cmac(aes-aesni) (cmac(aes-aesni))
>>
>> [  259.397910] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   8442 cycles/operation,    8 cycles/byte
>>
>>                 testing speed of async cmac(aes-generic) (cmac(aes-generic))
>>
>> [  294.171530] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   9022 cycles/operation,    8 cycles/byte
>>
>> On my slow apu2 board with processor: AMD GX-412TC SOC
>>
>>                testing speed of async cmac(aes-aesni) (cmac(aes-aesni))
>>
>> [   51.751810] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  18759 cycles/operation,   18 cycle
>>
>>                testing speed of async cmac(aes-generic) (cmac(aes-generic))
>>
>> [   97.837497] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  31365 cycles/operation,   30 cycle
> 
> So clearly aes-generic is slower than aes-aesni even with saving and
> restoring for each block.  Therefore improving the performance of
> the latter per se does not make sense.

I have always assumed that I need aesni instructions to have any chance at this performing well,
but there are certainly chips out there that don't have aesni, so possibly it is still worth improving
if it is relatively easy to do so.

I am currently using x86-64 CPUs with aesni, and also some AP platforms running QCA ARM chips.
I am not sure if ARM is using aesni or not...it is certainly not that fast, but maybe for other
reasons.

Thanks,
Ben

> 
> Cheers,
> 


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-20 22:09                                                 ` Ben Greear
@ 2020-08-20 22:12                                                   ` Herbert Xu
  0 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-20 22:12 UTC (permalink / raw)
  To: Ben Greear; +Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On Thu, Aug 20, 2020 at 03:09:55PM -0700, Ben Greear wrote:
> 
> I have always assumed that I need aesni instructions to have any chance at this performing well,
> but there are certainly chips out there that don't have aesni, so possibly it is still worth improving
> if it is relatively easy to do so.

What we were discussing is the merit of improving aesni only
while still being exposed to aes-generic on the softirq path.

This is clearly not acceptable.

Improving aes-generic obviously would make sense.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 0/5] crypto: Implement cmac based on cbc skcipher
  2020-08-18 22:27               ` Herbert Xu
  2020-08-18 22:31                 ` Ben Greear
@ 2020-08-22 22:35                 ` Christian Lamparter
  1 sibling, 0 replies; 46+ messages in thread
From: Christian Lamparter @ 2020-08-22 22:35 UTC (permalink / raw)
  To: Herbert Xu, Ben Greear
  Cc: Ard Biesheuvel, Linux Crypto Mailing List, Eric Biggers

On 2020-08-19 00:27, Herbert Xu wrote:
> On Wed, Aug 19, 2020 at 08:15:50AM +1000, Herbert Xu wrote:
>> On Tue, Aug 18, 2020 at 07:17:35AM -0700, Ben Greear wrote:
>>>
>>> Is there any easy way to use your work to make shash fast for aesni?  I
>>> basically just want it to perform as well as it used to with my patch.
>>
>> Yes.  We could add a sync version of aesni that simply falls back
>> to aes-generic when simd is unavailable.
> 
> But I think before anyone attempts this we should explore making
> mac80211 async like IPsec.  Is there any fundamental reason why
> that is not possible? Have the wireless people expressed any
> objections to making this async before?

Ohh, is this still about a heavily-updated and rewritten version
of my old initial patch from 2014 for 3.16-wl?
<https://lore.kernel.org/linux-wireless/1518134.xFh23iA8q1@blech/>

Because back in 2016, I asked this on linux-wireless:

| It would be great if mac80211 would do the encryption and
| decryption asynchronously. This would work for other ciphers
| and also allow crypto offload to dedicated crypto hardware.

And the answer back then (same as now) was:
<https://lore.kernel.org/linux-wireless/1477168300.4123.8.camel@sipsolutions.net/>

 >The only problem with that is that we'd essentially need a software
 >queue for *all* frames, and release them from there in RX order after
 >decrypt. That's surely doable, but so far nobody has thought it
 >important enough since mostly HW crypto is used ...

Ben, keep up the good work!

Cheers,
Christian

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/6] crypto: cmac - Use cbc skcipher instead of raw cipher
  2020-08-18  8:25   ` [PATCH 6/6] crypto: cmac - Use cbc skcipher instead of raw cipher Herbert Xu
@ 2020-08-24  9:47     ` Ard Biesheuvel
  2020-08-24 11:20       ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-24  9:47 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Linux Crypto Mailing List, Eric Biggers, Ben Greear

On Tue, 18 Aug 2020 at 10:25, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> Many architectures provide an accelerated implementation of cbc(aes)
> skcipher that is far superior to using the standard cbc template
> over an accelerated aes cipher.  As cmac uses the raw cipher, it
> was not able to benefit from the accelerated cbc(aes) skcipher.
>
> This patch changes cmac to use an underlying cbc skcipher.  In order
> to do so safely, cmac has been split into an ahash version as well
> as an shash version.  If the underlying cbc(aes) skcipher is async,
> then only the ahash version would provide the full acceleration.
>
> Here are the numbers on x86:
>
> 1. Old cmac:
> testing speed of async cmac(aes) (cmac(aes-aesni))
> tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    753 cycles/operation,   47 cycles/byte
> tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2305 cycles/operation,   36 cycles/byte
> tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   1398 cycles/operation,   21 cycles/byte
> tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):  10996 cycles/operation,   42 cycles/byte
> tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   4808 cycles/operation,   18 cycles/byte
> tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   3819 cycles/operation,   14 cycles/byte
> tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  30528 cycles/operation,   29 cycles/byte
> tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):  14471 cycles/operation,   14 cycles/byte
> tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  13469 cycles/operation,   13 cycles/byte
> tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  62646 cycles/operation,   30 cycles/byte
> tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  28492 cycles/operation,   13 cycles/byte
> tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  25935 cycles/operation,   12 cycles/byte
> tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  25545 cycles/operation,   12 cycles/byte
> tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 126308 cycles/operation,   30 cycles/byte
> tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  56565 cycles/operation,   13 cycles/byte
> tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  64675 cycles/operation,   15 cycles/byte
> tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  59264 cycles/operation,   14 cycles/byte
> tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 244882 cycles/operation,   29 cycles/byte
> tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 117842 cycles/operation,   14 cycles/byte
> tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 106637 cycles/operation,   13 cycles/byte
> tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 103895 cycles/operation,   12 cycles/byte
> tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 103742 cycles/operation,   12 cycles/byte
>
> 2. New shash cmac:
> testing speed of async cmac(cbc(aes-aesni)) (cmac(cbc(aes-aesni)))
> tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):    959 cycles/operation,   59 cycles/byte
> tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2375 cycles/operation,   37 cycles/byte
> tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   3528 cycles/operation,   55 cycles/byte
> tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):   6904 cycles/operation,   26 cycles/byte
> tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   8130 cycles/operation,   31 cycles/byte
> tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   8873 cycles/operation,   34 cycles/byte
> tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  25937 cycles/operation,   25 cycles/byte
> tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):  16445 cycles/operation,   16 cycles/byte
> tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):  19753 cycles/operation,   19 cycles/byte
> tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  52589 cycles/operation,   25 cycles/byte
> tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  40020 cycles/operation,   19 cycles/byte
> tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  40315 cycles/operation,   19 cycles/byte
> tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  31988 cycles/operation,   15 cycles/byte
> tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates): 123648 cycles/operation,   30 cycles/byte
> tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  65420 cycles/operation,   15 cycles/byte
> tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  65512 cycles/operation,   15 cycles/byte
> tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  62763 cycles/operation,   15 cycles/byte
> tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 242429 cycles/operation,   29 cycles/byte
> tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates): 126182 cycles/operation,   15 cycles/byte
> tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates): 130364 cycles/operation,   15 cycles/byte
> tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates): 121450 cycles/operation,   14 cycles/byte
> tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates): 124334 cycles/operation,   15 cycles/byte
>
> 3. New ahash cmac:
> testing speed of async cmac(aes) (cmac(cbc-aes-aesni))
> tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):   1026 cycles/operation,   64 cycles/byte
> tcrypt: test  1 (   64 byte blocks,   16 bytes per update,   4 updates):   2080 cycles/operation,   32 cycles/byte
> tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):   1250 cycles/operation,   19 cycles/byte
> tcrypt: test  3 (  256 byte blocks,   16 bytes per update,  16 updates):   5348 cycles/operation,   20 cycles/byte
> tcrypt: test  4 (  256 byte blocks,   64 bytes per update,   4 updates):   2998 cycles/operation,   11 cycles/byte
> tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):   2233 cycles/operation,    8 cycles/byte
> tcrypt: test  6 ( 1024 byte blocks,   16 bytes per update,  64 updates):  18879 cycles/operation,   18 cycles/byte
> tcrypt: test  7 ( 1024 byte blocks,  256 bytes per update,   4 updates):   7964 cycles/operation,    7 cycles/byte
> tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):   6826 cycles/operation,    6 cycles/byte
> tcrypt: test  9 ( 2048 byte blocks,   16 bytes per update, 128 updates):  51125 cycles/operation,   24 cycles/byte
> tcrypt: test 10 ( 2048 byte blocks,  256 bytes per update,   8 updates):  14921 cycles/operation,    7 cycles/byte
> tcrypt: test 11 ( 2048 byte blocks, 1024 bytes per update,   2 updates):  13174 cycles/operation,    6 cycles/byte
> tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):  11911 cycles/operation,    5 cycles/byte
> tcrypt: test 13 ( 4096 byte blocks,   16 bytes per update, 256 updates):  74883 cycles/operation,   18 cycles/byte
> tcrypt: test 14 ( 4096 byte blocks,  256 bytes per update,  16 updates):  34169 cycles/operation,    8 cycles/byte
> tcrypt: test 15 ( 4096 byte blocks, 1024 bytes per update,   4 updates):  24703 cycles/operation,    6 cycles/byte
> tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):  22081 cycles/operation,    5 cycles/byte
> tcrypt: test 17 ( 8192 byte blocks,   16 bytes per update, 512 updates): 157086 cycles/operation,   19 cycles/byte
> tcrypt: test 18 ( 8192 byte blocks,  256 bytes per update,  32 updates):  56920 cycles/operation,    6 cycles/byte
> tcrypt: test 19 ( 8192 byte blocks, 1024 bytes per update,   8 updates):  50063 cycles/operation,    6 cycles/byte
> tcrypt: test 20 ( 8192 byte blocks, 4096 bytes per update,   2 updates):  43677 cycles/operation,    5 cycles/byte
> tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates):  42464 cycles/operation,    5 cycles/byte
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>


OK, so you are using a page size buffer for every request in flight,
and using that as a scratch buffer for the destination of the cbc()
transform?

I am not a fan of this approach, tbh. High latency/high bandwidth
users will cause lots of GFP_ATOMIC allocations, and synchronous CPU
implementations will cause lots of writes polluting the D-cache for no
good reason. Non-cache coherent accelerators will cause unnecessary
traffic on the memory bus in addition to the undesirable D-cache
behavior.

What we could do instead is to have a certain flag/behavior in skcipher
where writes are omitted entirely, and cbcmac/cmac could opt into
that. But imho, the best way to improve this is to have a new AES-NI
asm helper (which I already implemented in my v2) that wraps the
AES-NI primitives in the right way to implement cbcmac.



>
>  crypto/cmac.c |  864 ++++++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 745 insertions(+), 119 deletions(-)
>
> diff --git a/crypto/cmac.c b/crypto/cmac.c
> index df36be1efb817..78cbc16818b66 100644
> --- a/crypto/cmac.c
> +++ b/crypto/cmac.c
> @@ -12,9 +12,13 @@
>   */
>
>  #include <crypto/internal/hash.h>
> +#include <crypto/internal/skcipher.h>
> +#include <crypto/scatterwalk.h>
>  #include <linux/err.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
>
>  /*
>   * +------------------------
> @@ -26,8 +30,13 @@
>   * +------------------------
>   */
>  struct cmac_tfm_ctx {
> -       struct crypto_cipher *child;
> -       u8 ctx[];
> +       struct crypto_sync_skcipher *child;
> +       __be64 consts[];
> +};
> +
> +struct cmac_atfm_ctx {
> +       struct crypto_skcipher *child;
> +       __be64 consts[];
>  };
>
>  /*
> @@ -36,9 +45,9 @@ struct cmac_tfm_ctx {
>   * +------------------------
>   * | cmac_desc_ctx
>   * +------------------------
> - * | odds (block size)
> + * | prev (block size, alignmask aligned)
>   * +------------------------
> - * | prev (block size)
> + * | odds (alignmask aligned)
>   * +------------------------
>   */
>  struct cmac_desc_ctx {
> @@ -46,25 +55,93 @@ struct cmac_desc_ctx {
>         u8 ctx[];
>  };
>
> -static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
> -                                    const u8 *inkey, unsigned int keylen)
> +/*
> + * +------------------------
> + * | <ahash request>
> + * +------------------------
> + * | cmac_req_ctx
> + * +------------------------
> + * | skcipher_request_ctx
> + * +------------------------
> + * | prev (block size, alignmask aligned)
> + * +------------------------
> + * | odds (alignmask aligned)
> + * +------------------------
> + */
> +struct cmac_req_ctx {
> +       struct page *page;
> +       struct scatterlist src[2];
> +       bool more;
> +       bool final;
> +       struct scatterlist dst[3];
> +       unsigned int len;
> +       struct skcipher_request req;
> +};
> +
> +struct cmac_inst_ctx {
> +       struct crypto_sync_skcipher_spawn spawn;
> +       char name[CRYPTO_MAX_ALG_NAME];
> +};
> +
> +struct cmac_ainst_ctx {
> +       struct crypto_skcipher_spawn spawn;
> +       char name[CRYPTO_MAX_ALG_NAME];
> +};
> +
> +static inline void *cmac_ctx_consts(struct crypto_shash *parent)
>  {
>         unsigned long alignmask = crypto_shash_alignmask(parent);
>         struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
> -       unsigned int bs = crypto_shash_blocksize(parent);
> -       __be64 *consts = PTR_ALIGN((void *)ctx->ctx,
> -                                  (alignmask | (__alignof__(__be64) - 1)) + 1);
> -       u64 _const[2];
> -       int i, err = 0;
> -       u8 msb_mask, gfmask;
>
> -       err = crypto_cipher_setkey(ctx->child, inkey, keylen);
> -       if (err)
> -               return err;
> +       alignmask |= __alignof__(__be64) - 1;
> +
> +       return PTR_ALIGN(&ctx->consts[0], alignmask + 1);
> +}
> +
> +static inline void *cmac_async_consts(struct crypto_ahash *parent)
> +{
> +       unsigned long alignmask = crypto_ahash_alignmask(parent);
> +       struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(parent);
> +
> +       alignmask |= __alignof__(__be64) - 1;
> +
> +       return PTR_ALIGN(&ctx->consts[0], alignmask + 1);
> +}
> +
> +static inline unsigned int cmac_ctx_size(unsigned int base,
> +                                        unsigned int alignmask,
> +                                        unsigned int bs)
> +{
> +       const unsigned int minalign = crypto_tfm_ctx_alignment();
> +
> +       return ALIGN(base, minalign) +
> +              ((alignmask | (__alignof__(__be64) - 1)) & ~(minalign - 1)) +
> +              bs * 2;
> +}
> +
> +static int cmac_compute_consts(__be64 *consts, unsigned int bs, u32 flags,
> +                              struct skcipher_request *req)
> +{
> +       u64 _const[2] __attribute__((aligned(16)));
> +       DECLARE_CRYPTO_WAIT(wait);
> +       struct scatterlist sg;
> +       u8 msb_mask, gfmask;
> +       int i, err;
>
>         /* encrypt the zero block */
> +       memset(_const, 0, bs);
>         memset(consts, 0, bs);
> -       crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);
> +       sg_init_one(&sg, consts, bs);
> +
> +       flags &= CRYPTO_TFM_REQ_MAY_SLEEP;
> +       flags |= CRYPTO_TFM_REQ_MAY_BACKLOG;
> +
> +       skcipher_request_set_callback(req, flags, crypto_req_done, &wait);
> +       skcipher_request_set_crypt(req, &sg, &sg, bs, _const);
> +
> +       err = crypto_skcipher_encrypt(req);
> +       if (err)
> +               return err;
>
>         switch (bs) {
>         case 16:
> @@ -101,129 +178,245 @@ static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
>         return 0;
>  }
>
> -static int crypto_cmac_digest_init(struct shash_desc *pdesc)
> +static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
> +                                    const u8 *inkey, unsigned int keylen)
>  {
> -       unsigned long alignmask = crypto_shash_alignmask(pdesc->tfm);
> -       struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
> -       int bs = crypto_shash_blocksize(pdesc->tfm);
> -       u8 *prev = PTR_ALIGN((void *)ctx->ctx, alignmask + 1) + bs;
> +       struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
> +       unsigned int bs = crypto_shash_blocksize(parent);
> +       struct crypto_sync_skcipher *child = ctx->child;
> +       SYNC_SKCIPHER_REQUEST_ON_STACK(req, child);
> +       __be64 *consts = cmac_ctx_consts(parent);
> +       u32 flags;
> +       int err;
>
> -       ctx->len = 0;
> -       memset(prev, 0, bs);
> +       crypto_sync_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
> +       flags = crypto_shash_get_flags(parent) & CRYPTO_TFM_REQ_MASK;
> +       crypto_sync_skcipher_set_flags(child, flags);
> +
> +       err = crypto_sync_skcipher_setkey(child, inkey, keylen);
> +       if (err)
> +               return err;
> +
> +       skcipher_request_set_sync_tfm(req, child);
> +
> +       return cmac_compute_consts(consts, bs, flags, req);
> +}
>
> +static int crypto_cmac_digest_init(struct shash_desc *pdesc)
> +{
> +       memset(shash_desc_ctx(pdesc), 0, HASH_MAX_DESCSIZE);
>         return 0;
>  }
>
> -static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
> -                                    unsigned int len)
> +static unsigned int fill_final_block(char *odds, unsigned int len,
> +                                    unsigned int bs)
> +{
> +       u8 *p = odds + len;
> +       unsigned int rlen;
> +
> +       *p = 0x80;
> +       p++;
> +
> +       rlen = -(len + 1) & (bs - 1);
> +       memset(p, 0, rlen);
> +
> +       return len + rlen + 1;
> +}
> +
> +static int crypto_cmac_digest_finup(struct shash_desc *pdesc, const u8 *p,
> +                                   unsigned int len, u8 *out)
>  {
>         struct crypto_shash *parent = pdesc->tfm;
>         unsigned long alignmask = crypto_shash_alignmask(parent);
>         struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
>         struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
> -       struct crypto_cipher *tfm = tctx->child;
> +       u8 *prev = PTR_ALIGN(&ctx->ctx[0], alignmask + 1);
> +       struct crypto_sync_skcipher *tfm = tctx->child;
> +       SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
>         int bs = crypto_shash_blocksize(parent);
> -       u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
> -       u8 *prev = odds + bs;
> +       u8 *consts = cmac_ctx_consts(parent);
> +       unsigned int clen = ctx->len;
> +       bool empty = !(clen + len);
> +       struct scatterlist sg;
> +       u8 *odds = prev + bs;
> +       unsigned int cs, ds;
> +       void *page = NULL;
> +       u8 *buf;
> +       int err;
>
> -       /* checking the data can fill the block */
> -       if ((ctx->len + len) <= bs) {
> -               memcpy(odds + ctx->len, p, len);
> +       odds = PTR_ALIGN(odds, alignmask + 1);
> +       cs = HASH_MAX_DESCSIZE - (odds - (u8 *)pdesc);
> +       cs &= ~(bs - 1);
> +       BUILD_BUG_ON(4 + 60 + 16 + 48 + 16 > HASH_MAX_DESCSIZE);
> +
> +       /* checking the data can fill the blocks */
> +       if ((clen + len) <= cs && !out) {
> +               memcpy(odds + clen, p, len);
>                 ctx->len += len;
>                 return 0;
>         }
>
> -       /* filling odds with new data and encrypting it */
> -       memcpy(odds + ctx->len, p, bs - ctx->len);
> -       len -= bs - ctx->len;
> -       p += bs - ctx->len;
> -
> -       crypto_xor(prev, odds, bs);
> -       crypto_cipher_encrypt_one(tfm, prev, prev);
> -
>         /* clearing the length */
>         ctx->len = 0;
>
> -       /* encrypting the rest of data */
> -       while (len > bs) {
> -               crypto_xor(prev, p, bs);
> -               crypto_cipher_encrypt_one(tfm, prev, prev);
> -               p += bs;
> -               len -= bs;
> -       }
> +       buf = odds;
> +       ds = cs;
>
> -       /* keeping the surplus of blocksize */
> -       if (len) {
> -               memcpy(odds, p, len);
> -               ctx->len = len;
> +       if (clen + len > cs * 2 &&
> +           ((page = (void *)__get_free_page(GFP_ATOMIC)))) {
> +               buf = page;
> +               ds = PAGE_SIZE;
> +               memcpy(buf, odds, clen);
>         }
>
> -       return 0;
> -}
> +       sg_init_one(&sg, buf, ds);
>
> -static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
> -{
> -       struct crypto_shash *parent = pdesc->tfm;
> -       unsigned long alignmask = crypto_shash_alignmask(parent);
> -       struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
> -       struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
> -       struct crypto_cipher *tfm = tctx->child;
> -       int bs = crypto_shash_blocksize(parent);
> -       u8 *consts = PTR_ALIGN((void *)tctx->ctx,
> -                              (alignmask | (__alignof__(__be64) - 1)) + 1);
> -       u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
> -       u8 *prev = odds + bs;
> -       unsigned int offset = 0;
> +       /* encrypting the rest of data */
> +       do {
> +               unsigned int es = ds - clen;
> +               bool final = false;
> +
> +               if (len <= es) {
> +                       if (out) {
> +                               ds = len + clen;
> +                               es = len;
> +
> +                               if (ds & (bs - 1) || empty) {
> +                                       ds = fill_final_block(buf, ds, bs);
> +                                       consts += bs;
> +                               }
> +
> +                               final = true;
> +                       } else {
> +                               /* Leave at least one byte for final. */
> +                               ds = (len + clen - 1) & ~(bs - 1);
> +                               es = ds - clen;
> +                       }
> +               }
>
> -       if (ctx->len != bs) {
> -               unsigned int rlen;
> -               u8 *p = odds + ctx->len;
> +               memcpy(buf + clen, p, es);
>
> -               *p = 0x80;
> -               p++;
> +               if (final)
> +                       crypto_xor(buf + ds - bs, consts, bs);
>
> -               rlen = bs - ctx->len - 1;
> -               if (rlen)
> -                       memset(p, 0, rlen);
> +               clen = 0;
> +               empty = false;
>
> -               offset += bs;
> -       }
> +               skcipher_request_set_sync_tfm(req, tfm);
> +               skcipher_request_set_callback(req, 0, NULL, NULL);
> +               skcipher_request_set_crypt(req, &sg, &sg, ds, prev);
> +
> +               err = crypto_skcipher_encrypt(req);
> +               if (err)
> +                       return err;
> +
> +               p += es;
> +               len -= es;
> +       } while (len > (out ? 0 : cs));
> +
> +       if (page)
> +               free_page((unsigned long)page);
>
> -       crypto_xor(prev, odds, bs);
> -       crypto_xor(prev, consts + offset, bs);
> +       /* keeping the surplus */
> +       memcpy(odds, p, len);
> +       ctx->len = len;
>
> -       crypto_cipher_encrypt_one(tfm, out, prev);
> +       if (out)
> +               memcpy(out, prev, bs);
>
>         return 0;
>  }
>
> -static int cmac_init_tfm(struct crypto_tfm *tfm)
> +static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
> +                                    unsigned int len)
> +{
> +       return crypto_cmac_digest_finup(pdesc, p, len, NULL);
> +}
> +
> +static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
> +{
> +       return crypto_cmac_digest_finup(pdesc, NULL, 0, out);
> +}
> +
> +static int cmac_init_tfm(struct crypto_shash *tfm)
>  {
> -       struct crypto_cipher *cipher;
> -       struct crypto_instance *inst = (void *)tfm->__crt_alg;
> -       struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst);
> -       struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
> +       struct shash_instance *inst = shash_alg_instance(tfm);
> +       struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
> +       struct crypto_sync_skcipher_spawn *spawn;
> +       struct crypto_sync_skcipher *cipher;
> +       struct cmac_inst_ctx *ictx;
>
> -       cipher = crypto_spawn_cipher(spawn);
> +       ictx = shash_instance_ctx(inst);
> +       spawn = &ictx->spawn;
> +
> +       cipher = crypto_spawn_sync_skcipher(spawn);
>         if (IS_ERR(cipher))
>                 return PTR_ERR(cipher);
>
>         ctx->child = cipher;
>
>         return 0;
> -};
> +}
>
> -static void cmac_exit_tfm(struct crypto_tfm *tfm)
> +static void cmac_exit_tfm(struct crypto_shash *tfm)
>  {
> -       struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
> -       crypto_free_cipher(ctx->child);
> +       struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
> +       crypto_free_sync_skcipher(ctx->child);
>  }
>
> -static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
> +static int cmac_set_cbc(char *name, const char *cipher_name)
> +{
> +       if (snprintf(name, CRYPTO_MAX_ALG_NAME, "cbc(%s)", cipher_name) >=
> +           CRYPTO_MAX_ALG_NAME)
> +               return -ENAMETOOLONG;
> +
> +       return 0;
> +}
> +
> +static int cmac_check_blocksize(struct skcipher_alg *alg)
> +{
> +       switch (alg->base.cra_blocksize) {
> +       case 16:
> +       case 8:
> +               return 0;
> +       }
> +
> +       return -EINVAL;
> +}
> +
> +static int cmac_set_name(char *cra_name, char *name, const char *cipher_name)
> +{
> +       unsigned len;
> +
> +       /* Alas we screwed up the naming so we have to mangle the
> +        * cipher name.
> +        */
> +       if (strncmp(cipher_name, "cbc(", 4))
> +               return -EINVAL;
> +
> +       len = strlcpy(name, cipher_name + 4, CRYPTO_MAX_ALG_NAME);
> +       if (len < 2 || len >= CRYPTO_MAX_ALG_NAME)
> +               return -EINVAL;
> +
> +       if (name[len - 1] != ')')
> +               return -EINVAL;
> +
> +       name[len - 1] = 0;
> +
> +       if (snprintf(cra_name, CRYPTO_MAX_ALG_NAME, "cmac(%s)", name) >=
> +           CRYPTO_MAX_ALG_NAME)
> +               return -ENAMETOOLONG;
> +
> +       return 0;
> +}
> +
> +static int cmac_create_sync(struct crypto_template *tmpl, struct rtattr **tb)
>  {
> +       struct crypto_sync_skcipher_spawn *spawn;
>         struct shash_instance *inst;
> -       struct crypto_cipher_spawn *spawn;
> -       struct crypto_alg *alg;
> +       struct cmac_inst_ctx *ctx;
> +       struct skcipher_alg *alg;
> +       const char *cipher_name;
>         unsigned long alignmask;
>         u32 mask;
>         int err;
> @@ -232,53 +425,60 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
>         if (err)
>                 return err;
>
> +       cipher_name = crypto_attr_alg_name(tb[1]);
> +       if (IS_ERR(cipher_name))
> +               return PTR_ERR(cipher_name);
> +
>         inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
>         if (!inst)
>                 return -ENOMEM;
> -       spawn = shash_instance_ctx(inst);
>
> -       err = crypto_grab_cipher(spawn, shash_crypto_instance(inst),
> -                                crypto_attr_alg_name(tb[1]), 0, mask);
> +       ctx = shash_instance_ctx(inst);
> +       spawn = &ctx->spawn;
> +
> +       err = crypto_grab_sync_skcipher(spawn, shash_crypto_instance(inst),
> +                                       cipher_name, 0, mask);
> +       if (err == -ENOENT)
> +               err = cmac_set_cbc(ctx->name, cipher_name) ?:
> +                     crypto_grab_sync_skcipher(spawn,
> +                                               shash_crypto_instance(inst),
> +                                               ctx->name, 0, mask);
>         if (err)
>                 goto err_free_inst;
> -       alg = crypto_spawn_cipher_alg(spawn);
>
> -       switch (alg->cra_blocksize) {
> -       case 16:
> -       case 8:
> -               break;
> -       default:
> -               err = -EINVAL;
> -               goto err_free_inst;
> -       }
> +       alg = crypto_sync_spawn_skcipher_alg(spawn);
>
> -       err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg);
> +       err = cmac_check_blocksize(alg) ?:
> +             crypto_inst_setname(shash_crypto_instance(inst), "cmac",
> +                                 &alg->base) ?:
> +             cmac_set_name(inst->alg.base.cra_name, ctx->name,
> +                           alg->base.cra_name);
>         if (err)
>                 goto err_free_inst;
>
> -       alignmask = alg->cra_alignmask;
> +       err = -EINVAL;
> +       alignmask = alg->base.cra_alignmask;
> +       if (alignmask > alg->base.cra_blocksize)
> +               goto err_free_inst;
> +
>         inst->alg.base.cra_alignmask = alignmask;
> -       inst->alg.base.cra_priority = alg->cra_priority;
> -       inst->alg.base.cra_blocksize = alg->cra_blocksize;
> +       inst->alg.base.cra_priority = alg->base.cra_priority;
> +       inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
>
> -       inst->alg.digestsize = alg->cra_blocksize;
> -       inst->alg.descsize =
> -               ALIGN(sizeof(struct cmac_desc_ctx), crypto_tfm_ctx_alignment())
> -               + (alignmask & ~(crypto_tfm_ctx_alignment() - 1))
> -               + alg->cra_blocksize * 2;
> +       inst->alg.digestsize = alg->base.cra_blocksize;
> +       inst->alg.descsize = HASH_MAX_DESCSIZE;
>
> -       inst->alg.base.cra_ctxsize =
> -               ALIGN(sizeof(struct cmac_tfm_ctx), crypto_tfm_ctx_alignment())
> -               + ((alignmask | (__alignof__(__be64) - 1)) &
> -                  ~(crypto_tfm_ctx_alignment() - 1))
> -               + alg->cra_blocksize * 2;
> +       inst->alg.base.cra_ctxsize = cmac_ctx_size(sizeof(struct cmac_tfm_ctx),
> +                                                  alignmask,
> +                                                  alg->base.cra_blocksize);
>
> -       inst->alg.base.cra_init = cmac_init_tfm;
> -       inst->alg.base.cra_exit = cmac_exit_tfm;
> +       inst->alg.init_tfm = cmac_init_tfm;
> +       inst->alg.exit_tfm = cmac_exit_tfm;
>
>         inst->alg.init = crypto_cmac_digest_init;
>         inst->alg.update = crypto_cmac_digest_update;
>         inst->alg.final = crypto_cmac_digest_final;
> +       inst->alg.finup = crypto_cmac_digest_finup;
>         inst->alg.setkey = crypto_cmac_digest_setkey;
>
>         inst->free = shash_free_singlespawn_instance;
> @@ -291,6 +491,432 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
>         return err;
>  }
>
> +static int crypto_cmac_ahash_setkey(struct crypto_ahash *parent,
> +                                   const u8 *inkey, unsigned int keylen)
> +{
> +       struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(parent);
> +       unsigned int bs = crypto_ahash_blocksize(parent);
> +       struct crypto_skcipher *child = ctx->child;
> +       __be64 *consts = cmac_async_consts(parent);
> +       struct skcipher_request *req;
> +       u32 flags;
> +       int err;
> +
> +       crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
> +       flags = crypto_ahash_get_flags(parent) & CRYPTO_TFM_REQ_MASK;
> +       crypto_skcipher_set_flags(child, flags);
> +
> +       err = crypto_skcipher_setkey(child, inkey, keylen);
> +       if (err)
> +               return err;
> +
> +       flags &= CRYPTO_TFM_REQ_MAY_SLEEP;
> +       req = skcipher_request_alloc(child, flags ? GFP_KERNEL : GFP_ATOMIC);
> +       if (!req)
> +               return -ENOMEM;
> +
> +       err = cmac_compute_consts(consts, bs, flags, req);
> +       skcipher_request_free(req);
> +       return err;
> +}
> +
> +static int crypto_cmac_ahash_init(struct ahash_request *req)
> +{
> +       struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
> +
> +       memset(ahash_request_ctx(req), 0, crypto_ahash_reqsize(tfm));
> +       return 0;
> +}
> +
> +static int cmac_ahash_final(struct ahash_request *req, u32 flags)
> +{
> +       struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
> +       struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
> +       struct cmac_req_ctx *ctx = ahash_request_ctx(req);
> +       unsigned int len = ctx->final ? 0 : req->nbytes;
> +       struct crypto_skcipher *tfm = tctx->child;
> +       struct skcipher_request *creq = &ctx->req;
> +       u8 *out = ctx->more ? NULL : req->result;
> +       int bs = crypto_ahash_blocksize(parent);
> +       u8 *consts = cmac_async_consts(parent);
> +       unsigned int done = creq->cryptlen;
> +       unsigned int clen = ctx->len;
> +       unsigned long alignmask;
> +       unsigned int ds;
> +       int err = 0;
> +       u8 *prev;
> +       u8 *odds;
> +
> +       if (ctx->page) {
> +               free_page((unsigned long)ctx->page);
> +               ctx->page = NULL;
> +       }
> +
> +       alignmask = crypto_ahash_alignmask(parent);
> +
> +       prev = skcipher_request_ctx(creq);
> +       prev += crypto_skcipher_reqsize(tfm);
> +       prev = PTR_ALIGN(prev, alignmask + 1);
> +
> +       odds = PTR_ALIGN(prev + bs, alignmask + 1);
> +
> +       ds = clen + len - done;
> +
> +       if (done >= clen) {
> +               done -= clen;
> +               len -= done;
> +               clen = 0;
> +       } else {
> +               odds += done;
> +               clen -= done;
> +               done = 0;
> +       }
> +
> +       if (len)
> +               scatterwalk_map_and_copy(odds + clen, req->src, done, len, 0);
> +
> +       if (!out)
> +               goto done;
> +
> +       if (ds & (bs - 1) || !ds) {
> +               ds = fill_final_block(odds, ds, bs);
> +               consts += bs;
> +       }
> +
> +       memcpy(out, prev, bs);
> +       crypto_xor(odds + ds - bs, consts, bs);
> +       sg_init_one(ctx->src, odds, ds);
> +
> +       skcipher_request_set_tfm(creq, tfm);
> +       skcipher_request_set_callback(creq, flags, req->base.complete,
> +                                     req->base.data);
> +       skcipher_request_set_crypt(creq, ctx->src, ctx->src, ds, out);
> +
> +       err = crypto_skcipher_encrypt(creq);
> +       ds = 0;
> +
> +done:
> +       ctx->len = ds;
> +       return err;
> +}
> +
> +static void cmac_encrypt_done(struct crypto_async_request *areq, int err)
> +{
> +       struct ahash_request *req = areq->data;
> +       u32 flags;
> +
> +       if (err)
> +               goto out;
> +
> +       flags = req->base.flags & ~CRYPTO_TFM_REQ_MAY_SLEEP;
> +       err = cmac_ahash_final(req, flags);
> +
> +out:
> +       ahash_request_complete(req, err);
> +}
> +
> +static int cmac_ahash_finup(struct ahash_request *req)
> +{
> +       struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
> +       struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
> +       struct cmac_req_ctx *ctx = ahash_request_ctx(req);
> +       unsigned int len = ctx->final ? 0 : req->nbytes;
> +       struct crypto_skcipher *tfm = tctx->child;
> +       struct skcipher_request *creq = &ctx->req;
> +       int bs = crypto_ahash_blocksize(parent);
> +       struct scatterlist *src = req->src;
> +       struct scatterlist *dst = ctx->dst;
> +       unsigned int clen = ctx->len;
> +       u32 flags = req->base.flags;
> +       unsigned long alignmask;
> +       unsigned int cs, ds, es;
> +       u8 *prev;
> +       u8 *odds;
> +       int err;
> +
> +       flags &= CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG;
> +       alignmask = crypto_ahash_alignmask(parent);
> +
> +       prev = skcipher_request_ctx(creq);
> +       prev += crypto_skcipher_reqsize(tfm);
> +       prev = PTR_ALIGN(prev, alignmask + 1);
> +
> +       odds = PTR_ALIGN(prev + bs, alignmask + 1);
> +
> +       cs = crypto_ahash_reqsize(parent) - (odds - (u8 *)ctx);
> +       cs &= ~(bs - 1);
> +       if (cs > PAGE_SIZE)
> +               cs = PAGE_SIZE;
> +
> +       ds = clen + len;
> +       es = HASH_MAX_STATESIZE - 4 - bs;
> +
> +       creq->cryptlen = 0;
> +
> +       /* checking the data can fill the blocks */
> +       if (ds <= es)
> +               return cmac_ahash_final(req, flags);
> +
> +       /* Leave at least one byte for final. */
> +       if (!ds || !(ds = (ds - 1) & ~(bs - 1)))
> +               return cmac_ahash_final(req, flags);
> +
> +       if (clen) {
> +               sg_chain(ctx->src, 2, src);
> +               src = ctx->src;
> +               sg_set_buf(src, odds, clen);
> +       }
> +
> +       sg_set_buf(dst, odds, cs);
> +       sg_chain(dst, 2, dst);
> +
> +       if (ds > cs && cs < PAGE_SIZE &&
> +           ((ctx->page = (void *)__get_free_page(GFP_ATOMIC))))
> +               sg_set_buf(dst, ctx->page, PAGE_SIZE);
> +
> +       skcipher_request_set_tfm(creq, tfm);
> +       skcipher_request_set_callback(creq, flags, cmac_encrypt_done, req);
> +       skcipher_request_set_crypt(creq, src, dst, ds, prev);
> +
> +       err = crypto_skcipher_encrypt(creq);
> +       if (err == -EINPROGRESS || err == -EBUSY)
> +               return err;
> +
> +       return cmac_ahash_final(req, flags);
> +}
> +
> +static int crypto_cmac_ahash_update(struct ahash_request *req)
> +{
> +       struct cmac_req_ctx *ctx = ahash_request_ctx(req);
> +
> +       ctx->more = true;
> +       return cmac_ahash_finup(req);
> +}
> +
> +static int crypto_cmac_ahash_finup(struct ahash_request *req)
> +{
> +       struct cmac_req_ctx *ctx = ahash_request_ctx(req);
> +
> +       ctx->more = false;
> +       return cmac_ahash_finup(req);
> +}
> +
> +static int crypto_cmac_ahash_final(struct ahash_request *req)
> +{
> +       struct cmac_req_ctx *ctx = ahash_request_ctx(req);
> +
> +       ctx->more = false;
> +       ctx->final = true;
> +       return cmac_ahash_finup(req);
> +}
> +
> +static int crypto_cmac_ahash_digest(struct ahash_request *req)
> +{
> +       return crypto_cmac_ahash_init(req) ?: cmac_ahash_finup(req);
> +}
> +
> +static int crypto_cmac_ahash_export(struct ahash_request *req, void *out)
> +{
> +       struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
> +       struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
> +       struct cmac_req_ctx *ctx = ahash_request_ctx(req);
> +       unsigned int bs = crypto_ahash_blocksize(parent);
> +       struct crypto_skcipher *tfm = tctx->child;
> +       struct skcipher_request *creq = &ctx->req;
> +       unsigned long alignmask;
> +       u8 *p = out;
> +       u8 *prev;
> +       u8 *odds;
> +
> +       alignmask = crypto_ahash_alignmask(parent);
> +
> +       prev = skcipher_request_ctx(creq);
> +       prev += crypto_skcipher_reqsize(tfm);
> +       prev = PTR_ALIGN(prev, alignmask + 1);
> +
> +       odds = PTR_ALIGN(prev + bs, alignmask + 1);
> +
> +       *(u32 *)p = ctx->len;
> +
> +       p += 4;
> +       memcpy(p, prev, bs);
> +
> +       p += bs;
> +       memcpy(p, odds, ctx->len);
> +
> +       return 0;
> +}
> +
> +static int crypto_cmac_ahash_import(struct ahash_request *req, const void *in)
> +{
> +       struct crypto_ahash *parent = crypto_ahash_reqtfm(req);
> +       struct cmac_atfm_ctx *tctx = crypto_ahash_ctx(parent);
> +       struct cmac_req_ctx *ctx = ahash_request_ctx(req);
> +       unsigned int bs = crypto_ahash_blocksize(parent);
> +       struct crypto_skcipher *tfm = tctx->child;
> +       struct skcipher_request *creq = &ctx->req;
> +       unsigned long alignmask;
> +       const u8 *p = in;
> +       u8 *prev;
> +       u8 *odds;
> +       int err;
> +
> +       err = crypto_cmac_ahash_init(req);
> +       if (err)
> +               return err;
> +
> +       alignmask = crypto_ahash_alignmask(parent);
> +
> +       prev = skcipher_request_ctx(creq);
> +       prev += crypto_skcipher_reqsize(tfm);
> +       prev = PTR_ALIGN(prev, alignmask + 1);
> +
> +       odds = PTR_ALIGN(prev + bs, alignmask + 1);
> +
> +       ctx->len = *(const u32 *)p;
> +       if (ctx->len > HASH_MAX_STATESIZE - 4 - bs)
> +               return -EINVAL;
> +
> +       p += 4;
> +       memcpy(prev, p, bs);
> +
> +       p += bs;
> +       memcpy(odds, p, ctx->len);
> +
> +       return 0;
> +}
> +
> +static int cmac_init_ahash(struct crypto_ahash *tfm)
> +{
> +       struct ahash_instance *inst = ahash_alg_instance(tfm);
> +       unsigned long alignmask = crypto_ahash_alignmask(tfm);
> +       struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(tfm);
> +       unsigned int bs = crypto_ahash_blocksize(tfm);
> +       struct crypto_skcipher_spawn *spawn;
> +       struct crypto_skcipher *cipher;
> +       struct cmac_ainst_ctx *ictx;
> +       unsigned int reqsize;
> +       unsigned int head;
> +
> +       ictx = ahash_instance_ctx(inst);
> +       spawn = &ictx->spawn;
> +
> +       cipher = crypto_spawn_skcipher(spawn);
> +       if (IS_ERR(cipher))
> +               return PTR_ERR(cipher);
> +
> +       ctx->child = cipher;
> +
> +       reqsize = sizeof(struct cmac_req_ctx);
> +       reqsize += ALIGN(crypto_skcipher_reqsize(cipher), CRYPTO_MINALIGN);
> +       if (alignmask > CRYPTO_MINALIGN)
> +               reqsize += alignmask + 1 - CRYPTO_MINALIGN;
> +       reqsize += ALIGN(bs, alignmask + 1);
> +       reqsize += HASH_MAX_STATESIZE;
> +
> +       head = sizeof(struct ahash_request);
> +       reqsize = roundup_pow_of_two(reqsize + head) - head;
> +
> +       crypto_ahash_set_reqsize(tfm, reqsize);
> +
> +       return 0;
> +}
> +
> +static void cmac_exit_ahash(struct crypto_ahash *tfm)
> +{
> +       struct cmac_atfm_ctx *ctx = crypto_ahash_ctx(tfm);
> +       crypto_free_skcipher(ctx->child);
> +}
> +
> +static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
> +{
> +
> +       struct crypto_skcipher_spawn *spawn;
> +       struct crypto_attr_type *algt;
> +       struct cmac_ainst_ctx *ctx;
> +       struct ahash_instance *inst;
> +       struct skcipher_alg *alg;
> +       const char *cipher_name;
> +       unsigned int alignmask;
> +       u32 mask;
> +       int err;
> +
> +       algt = crypto_get_attr_type(tb);
> +       if (IS_ERR(algt))
> +               return PTR_ERR(algt);
> +
> +       if ((algt->type ^ (CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC)) &
> +           algt->mask)
> +               return cmac_create_sync(tmpl, tb);
> +
> +       mask = crypto_algt_inherited_mask(algt);
> +
> +       cipher_name = crypto_attr_alg_name(tb[1]);
> +       if (IS_ERR(cipher_name))
> +               return PTR_ERR(cipher_name);
> +
> +       inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
> +       if (!inst)
> +               return -ENOMEM;
> +
> +       ctx = ahash_instance_ctx(inst);
> +       spawn = &ctx->spawn;
> +
> +       err = crypto_grab_skcipher(spawn, ahash_crypto_instance(inst),
> +                                  cipher_name, 0, mask);
> +       if (err == -ENOENT)
> +               err = cmac_set_cbc(ctx->name, cipher_name) ?:
> +                     crypto_grab_skcipher(spawn, ahash_crypto_instance(inst),
> +                                          ctx->name, 0, mask);
> +       if (err)
> +               goto err_free_inst;
> +
> +       alg = crypto_spawn_skcipher_alg(spawn);
> +
> +       err = cmac_check_blocksize(alg) ?:
> +             crypto_inst_setname(ahash_crypto_instance(inst), "cmac",
> +                                 &alg->base) ?:
> +             cmac_set_name(inst->alg.halg.base.cra_name, ctx->name,
> +                           alg->base.cra_name);
> +       if (err)
> +               goto err_free_inst;
> +
> +       err = -EINVAL;
> +
> +       alignmask = alg->base.cra_alignmask;
> +       inst->alg.halg.base.cra_alignmask = alignmask;
> +       inst->alg.halg.base.cra_priority = alg->base.cra_priority;
> +       inst->alg.halg.base.cra_blocksize = alg->base.cra_blocksize;
> +
> +       inst->alg.halg.digestsize = alg->base.cra_blocksize;
> +       inst->alg.halg.statesize = HASH_MAX_STATESIZE;
> +
> +       inst->alg.halg.base.cra_ctxsize =
> +               cmac_ctx_size(sizeof(struct cmac_atfm_ctx), alignmask,
> +                             alg->base.cra_blocksize);
> +
> +       inst->alg.init_tfm = cmac_init_ahash;
> +       inst->alg.exit_tfm = cmac_exit_ahash;
> +
> +       inst->alg.init = crypto_cmac_ahash_init;
> +       inst->alg.update = crypto_cmac_ahash_update;
> +       inst->alg.final = crypto_cmac_ahash_final;
> +       inst->alg.finup = crypto_cmac_ahash_finup;
> +       inst->alg.digest = crypto_cmac_ahash_digest;
> +       inst->alg.export = crypto_cmac_ahash_export;
> +       inst->alg.import = crypto_cmac_ahash_import;
> +       inst->alg.setkey = crypto_cmac_ahash_setkey;
> +
> +       inst->free = ahash_free_singlespawn_instance;
> +
> +       err = ahash_register_instance(tmpl, inst);
> +       if (err) {
> +err_free_inst:
> +               ahash_free_singlespawn_instance(inst);
> +       }
> +       return err;
> +}
> +
>  static struct crypto_template crypto_cmac_tmpl = {
>         .name = "cmac",
>         .create = cmac_create,


* Re: [PATCH 6/6] crypto: cmac - Use cbc skcipher instead of raw cipher
  2020-08-24  9:47     ` Ard Biesheuvel
@ 2020-08-24 11:20       ` Herbert Xu
  0 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2020-08-24 11:20 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Eric Biggers, Ben Greear

On Mon, Aug 24, 2020 at 11:47:30AM +0200, Ard Biesheuvel wrote:
>
> OK, so you are using a page size buffer for every request in flight,
> and using that as a scratch buffer for the destination of the cbc()
> transform?

Not necessarily.  It'll only allocate the page if the request size
exceeds the buffer we already have in the request context.  The
request context size depends on that of the underlying CBC, but it
should be at least 512 bytes long.
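
Something along these lines, i.e. the page is purely an optimisation
on top of the fixed per-request buffer (sketch only; names such as
cmac_pick_buffer/fixed/fixed_len are made up, not the patch code):

/* illustration only; kernel includes omitted */
static u8 *cmac_pick_buffer(u8 *fixed, unsigned int fixed_len,
			    unsigned int total, void **pagep,
			    unsigned int *buf_len)
{
	void *page = NULL;

	/* only bother with a bounce page for large requests */
	if (total > fixed_len * 2)
		page = (void *)__get_free_page(GFP_ATOMIC);

	*pagep = page;			/* caller frees it afterwards */
	*buf_len = page ? PAGE_SIZE : fixed_len;
	return page ? (u8 *)page : fixed;
}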

I should probably test always using the 512-byte buffer, as it might
well be good enough.

Note that the numbers I'm getting with aesni are already very close
to the numbers of the underlying cbc-aesni encryption, so there is not
much room for improvement.

> I am not a fan of this approach, tbh. High latency/high bandwidth
> users will cause lots of GFP_ATOMIC allocations, and synchronous CPU

We could make the request context size be at least 2048 bytes long
and that would obviate the need to allocate the page buffer.

In any case, the cost of the page allocation is going to be drowned
out by the crypto since this would only happen for large requests.

> implementations will cause lots of writes polluting the D-cache for no
> good reason. Non-cache coherent accelerators will cause unnecessary
> traffic on the memory bus in addition to the undesirable D-cache
> behavior.

Perhaps, but would this be significant compared to the crypto cost?
Only numbers can tell.

> What we could do instead is to have a certain flag/behavior in skcipher
> where writes are omitted entirely, and cbcmac/cmac could opt into
> that. But imho, the best way to improve this is to have a new AES-NI
> asm helper (which I already implemented in my v2) that wraps the
> AES-NI primitives in the right way to implement cbcmac.

As I said before, I'm totally fine with a native aesni implementation
for ccm/cbcmac as long as it's fully async like everything else.  But
a ccm/cbcmac based on cbc still makes sense since we're not going to
reimplement the same thing in every crypto driver out there.
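
The identity being exploited here is that the CBC-MAC of a message is
just the final ciphertext block of cbc() encryption with a zero IV, so
any cbc(aes) provider (including async hardware) can produce it.  A
minimal sketch, assuming a sync skcipher and whole blocks only:

/* needs <crypto/skcipher.h> and <linux/scatterlist.h> */
static int cbcmac_via_cbc(struct crypto_sync_skcipher *cbc, u8 *data,
			  unsigned int len, u8 mac[16])
{
	SYNC_SKCIPHER_REQUEST_ON_STACK(req, cbc);
	struct scatterlist sg;

	memset(mac, 0, 16);		/* zero IV == initial MAC state */
	sg_init_one(&sg, data, len);

	skcipher_request_set_sync_tfm(req, cbc);
	skcipher_request_set_callback(req, 0, NULL, NULL);
	skcipher_request_set_crypt(req, &sg, &sg, len, mac);

	/* cbc() leaves the last ciphertext block in the IV buffer,
	 * so mac[] ends up holding the CBC-MAC of data[] */
	return crypto_skcipher_encrypt(req);
}

The ciphertext written over data[] is simply discarded, which is where
the scratch-destination and D-cache discussion above comes from.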

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH 5/6] crypto: ahash - Remove AHASH_REQUEST_ON_STACK
  2020-08-18  8:25   ` [PATCH 5/6] crypto: ahash - Remove AHASH_REQUEST_ON_STACK Herbert Xu
@ 2020-08-26 10:55     ` Ard Biesheuvel
  0 siblings, 0 replies; 46+ messages in thread
From: Ard Biesheuvel @ 2020-08-26 10:55 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Linux Crypto Mailing List, Eric Biggers, Ben Greear

On Tue, 18 Aug 2020 at 10:25, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> This patch removes AHASH_REQUEST_ON_STACK which is unused.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Acked-by: Ard Biesheuvel <ardb@kernel.org>

and given that any new uses that creep in will trigger -Wvla warnings,
I suggest this be broken out from the series and merged as a fix
instead.
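
(For reference, the macro quoted below expands to a variable-length
array, since crypto_ahash_reqsize() is a runtime value.  A hypothetical
user, purely to show where -Wvla fires:)

#include <crypto/hash.h>

static int hypothetical_digest(struct crypto_ahash *tfm,
			       struct scatterlist *sg,
			       unsigned int len, u8 *out)
{
	AHASH_REQUEST_ON_STACK(req, tfm);
	/* ^ expands to roughly:
	 *	char __req_desc[sizeof(struct ahash_request) +
	 *			crypto_ahash_reqsize(tfm)];	<- VLA
	 *	struct ahash_request *req = (void *)__req_desc;
	 */

	ahash_request_set_tfm(req, tfm);
	ahash_request_set_callback(req, 0, NULL, NULL);
	ahash_request_set_crypt(req, sg, out, len);

	return crypto_ahash_digest(req);
}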


> ---
>
>  include/crypto/hash.h |    5 -----
>  1 file changed, 5 deletions(-)
>
> diff --git a/include/crypto/hash.h b/include/crypto/hash.h
> index c9d3fd3efa1b0..f16f5d4afc102 100644
> --- a/include/crypto/hash.h
> +++ b/include/crypto/hash.h
> @@ -59,11 +59,6 @@ struct ahash_request {
>         void *__ctx[] CRYPTO_MINALIGN_ATTR;
>  };
>
> -#define AHASH_REQUEST_ON_STACK(name, ahash) \
> -       char __##name##_desc[sizeof(struct ahash_request) + \
> -               crypto_ahash_reqsize(ahash)] CRYPTO_MINALIGN_ATTR; \
> -       struct ahash_request *name = (void *)__##name##_desc
> -
>  /**
>   * struct ahash_alg - asynchronous message digest definition
>   * @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-08-04 19:45         ` Ben Greear
  2020-08-04 20:12           ` Ard Biesheuvel
@ 2020-09-23 11:03           ` Ben Greear
  2020-10-29 16:58             ` Ard Biesheuvel
  1 sibling, 1 reply; 46+ messages in thread
From: Ben Greear @ 2020-09-23 11:03 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On 8/4/20 12:45 PM, Ben Greear wrote:
> On 8/4/20 6:08 AM, Ard Biesheuvel wrote:
>> On Tue, 4 Aug 2020 at 15:01, Ben Greear <greearb@candelatech.com> wrote:
>>>
>>> On 8/4/20 5:55 AM, Ard Biesheuvel wrote:
>>>> On Mon, 3 Aug 2020 at 21:11, Ben Greear <greearb@candelatech.com> wrote:
>>>>>
>>>>> Hello,
>>>>>
>>>>> This helps a bit...now download sw-crypt performance is about 150Mbps,
>>>>> but still not as good as with my patch on 5.4 kernel, and fpu is still
>>>>> high in perf top:
>>>>>
>>>>>       13.89%  libc-2.29.so   [.] __memset_sse2_unaligned_erms
>>>>>         6.62%  [kernel]       [k] kernel_fpu_begin
>>>>>         4.14%  [kernel]       [k] _aesni_enc1
>>>>>         2.06%  [kernel]       [k] __crypto_xor
>>>>>         1.95%  [kernel]       [k] copy_user_generic_string
>>>>>         1.93%  libjvm.so      [.] SpinPause
>>>>>         1.01%  [kernel]       [k] aesni_encrypt
>>>>>         0.98%  [kernel]       [k] crypto_ctr_crypt
>>>>>         0.93%  [kernel]       [k] udp_sendmsg
>>>>>         0.78%  [kernel]       [k] crypto_inc
>>>>>         0.74%  [kernel]       [k] __ip_append_data.isra.53
>>>>>         0.65%  [kernel]       [k] aesni_cbc_enc
>>>>>         0.64%  [kernel]       [k] __dev_queue_xmit
>>>>>         0.62%  [kernel]       [k] ipt_do_table
>>>>>         0.62%  [kernel]       [k] igb_xmit_frame_ring
>>>>>         0.59%  [kernel]       [k] ip_route_output_key_hash_rcu
>>>>>         0.57%  [kernel]       [k] memcpy
>>>>>         0.57%  libjvm.so      [.] InstanceKlass::oop_follow_contents
>>>>>         0.56%  [kernel]       [k] irq_fpu_usable
>>>>>         0.56%  [kernel]       [k] mac_do_update
>>>>>
>>>>> If you'd like help setting up a test rig and have an ath10k pcie NIC or ath9k pcie NIC,
>>>>> then I can help.  Possibly hwsim would also be a good test case, but I have not tried
>>>>> that.
>>>>>
>>>>
>>>> I don't think this is likely to be reproducible on other
>>>> micro-architectures, so setting up a test rig is unlikely to help.
>>>>
>>>> I'll send out a v2 which implements an ahash instead of a shash (and
>>>> implements some other tweaks) so that kernel_fpu_begin() is only
>>>> called twice for each packet on the cbcmac path.
>>>>
>>>> Do you have any numbers for the old kernel without your patch? This
>>>> pathological FPU preserve/restore behavior could be caused by the
>>>> optimizations, or by other changes that landed in the meantime, so I
>>>> would like to know if kernel_fpu_begin() is as prominent in those
>>>> traces as well.
>>>>
>>>
>>> This same patch makes i7 mobile processors able to handle 1Gbps+ software
>>> decrypt rates, whereas without the patch the rate was badly constrained and CPU
>>> load was much higher, so it is definitely noticeable on other processors too.
>>
>> OK
>>
>>> The weak processor on the current test rig is convenient because the problem
>>> is so noticeable even at slower wifi speeds.
>>>
>>> We can do some tests on 5.4 with our patch reverted.
>>>
>>
>> The issue with your CCM patch is that it keeps the FPU enabled for the
>> entire input, which also means that preemption is disabled, which
>> makes the -rt people grumpy. (Of course, it also uses APIs that no
>> longer exist, but that should be easy to fix)
>>
>> Do you happen to have any ballpark figures for the packet sizes and
>> the time spent doing encryption?
>>
> 
> My tester reports this last patch appears to break wpa-2 entirely, so we
> cannot test it as is.

Hello,

I'm still hoping that I can find someone to help enable this feature again
on 5.9-ish kernels.

I don't care about preemption being disabled for long-ish times; that is no worse than what I had
before, so even if an official in-kernel patch is not possible, I can carry an
out-of-tree patch.
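
(For reference, the usual middle ground is to bound how long the FPU is
held by chunking the input, so -rt latency stays bounded while the
per-block kernel_fpu_begin() overhead still disappears.  A sketch only,
with an arbitrary 4 KiB chunk size, assuming whole blocks:)

static void cbcmac_mac_chunked(const struct crypto_aes_ctx *key, u8 *mac,
			       const u8 *in, unsigned int len)
{
	while (len) {
		unsigned int todo = min_t(unsigned int, len, 4096);

		kernel_fpu_begin();
		for (; todo; todo -= AES_BLOCK_SIZE, len -= AES_BLOCK_SIZE) {
			crypto_xor(mac, in, AES_BLOCK_SIZE);
			aesni_enc(key, mac, mac);
			in += AES_BLOCK_SIZE;
		}
		kernel_fpu_end();	/* preemption point between chunks */
	}
}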

Thanks,
Ben


* Re: [PATCH] crypto: x86/aesni - implement accelerated CBCMAC, CMAC and XCBC shashes
  2020-09-23 11:03           ` Ben Greear
@ 2020-10-29 16:58             ` Ard Biesheuvel
  0 siblings, 0 replies; 46+ messages in thread
From: Ard Biesheuvel @ 2020-10-29 16:58 UTC (permalink / raw)
  To: Ben Greear; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On Wed, 23 Sep 2020 at 13:03, Ben Greear <greearb@candelatech.com> wrote:
>
> [...]
>
> Hello,
>
> I'm still hoping that I can find someone to help enable this feature again
> on 5.9-ish kernels.
>
> I don't care about preemption being disabled for long-ish times; that is no worse than what I had
> before, so even if an official in-kernel patch is not possible, I can carry an
> out-of-tree patch.
>

Just a note that this is still on my stack, just not at the very top :-)


