* [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
@ 2009-09-16  1:35 Huang Ying
  2009-10-19  2:53 ` Herbert Xu
  0 siblings, 1 reply; 13+ messages in thread
From: Huang Ying @ 2009-09-16  1:35 UTC
  To: Herbert Xu; +Cc: linux-kernel, linux-crypto, Daniel Walker

PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
carry-less multiplication. More information about PCLMULQDQ can be
found at:

http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
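
Carry-less multiplication is ordinary long multiplication with the
partial products combined by XOR instead of addition, i.e.
multiplication of polynomials over GF(2). A minimal C sketch of what a
single PCLMULQDQ computes (one 64x64 -> 128 bit product; purely
illustrative, not part of the patch):

#include <stdint.h>

/* XOR together copies of a, shifted by each set bit position of b;
 * no carries propagate between bit positions. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;
	unsigned int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}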

Because PCLMULQDQ changes XMM state, its usage must be enclosed in
kernel_fpu_begin/end, which can be used only in process context. The
acceleration is therefore implemented as a crypto_ahash: requests
arriving in soft IRQ context are deferred to the cryptd kernel thread.
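
Schematically, the dispatch looks like the sketch below (a condensed,
illustrative rendering of the ghash_async_*() functions in this patch;
the function name is made up, only the update step is shown, and error
handling is elided):

static int ghash_async_do_update(struct ahash_request *req)
{
	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
	struct ahash_request *cryptd_req = ahash_request_ctx(req);

	if (!irq_fpu_usable()) {
		/* XMM state must not be touched here (e.g. soft IRQ):
		 * re-target the request at the cryptd ahash, which will
		 * run it later from its kernel thread */
		memcpy(cryptd_req, req, sizeof(*req));
		ahash_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
		return crypto_ahash_update(cryptd_req);
	}

	/* FPU usable: run the PCLMULQDQ shash synchronously; it wraps
	 * its XMM usage in kernel_fpu_begin()/kernel_fpu_end() */
	return shash_ahash_update(req, cryptd_shash_desc(cryptd_req));
}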

v4:
 - Fix some style issues.

v3:
 - Revise GHASH implementation, performance increase about 2x.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/x86/crypto/Makefile                   |    3 
 arch/x86/crypto/ghash-clmulni-intel_asm.S  |  157 +++++++++++++
 arch/x86/crypto/ghash-clmulni-intel_glue.c |  333 +++++++++++++++++++++++++++++
 arch/x86/include/asm/cpufeature.h          |    1 
 crypto/Kconfig                             |    8 
 crypto/cryptd.c                            |    7 
 include/crypto/cryptd.h                    |    1 
 7 files changed, 510 insertions(+)
 create mode 100644 arch/x86/crypto/ghash-clmulni-intel_asm.S
 create mode 100644 arch/x86/crypto/ghash-clmulni-intel_glue.c

--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_6
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains the accelerated part of the GHASH
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ * Copyright (c) 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *	     Vinodh Gopal
+ *	     Erdinc Ozturk
+ *	     Deniz Karakoyunlu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+.align 16
+.Lbswap_mask:
+	.octa 0x000102030405060708090a0b0c0d0e0f
+.Lpoly:
+	.octa 0xc2000000000000000000000000000001
+.Ltwo_one:
+	.octa 0x00000001000000000000000000000001
+
+#define DATA	%xmm0
+#define SHASH	%xmm1
+#define T1	%xmm2
+#define T2	%xmm3
+#define T3	%xmm4
+#define BSWAP	%xmm5
+#define IN1	%xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble:	internal ABI
+ * input:
+ *	DATA:			operand1
+ *	SHASH:			operand2, hash_key << 1 mod poly
+ * output:
+ *	DATA:			operand1 * operand2 mod poly
+ * changed:
+ *	T1
+ *	T2
+ *	T3
+ */
+__clmul_gf128mul_ble:
+	movaps DATA, T1
+	pshufd $0b01001110, DATA, T2
+	pshufd $0b01001110, SHASH, T3
+	pxor DATA, T2
+	pxor SHASH, T3
+
+	# pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
+	.byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00
+	# pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
+	.byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11
+	# pclmulqdq $0x00, T3, T2	# T2 = (a1 + a0) * (b1 + b0)
+	.byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00
+	pxor DATA, T2
+	pxor T1, T2			# T2 = a0 * b1 + a1 * b0
+
+	movaps T2, T3
+	pslldq $8, T3
+	psrldq $8, T2
+	pxor T3, DATA
+	pxor T2, T1			# <T1:DATA> is result of
+					# carry-less multiplication
+
+	# first phase of the reduction
+	movaps DATA, T3
+	psllq $1, T3
+	pxor DATA, T3
+	psllq $5, T3
+	pxor DATA, T3
+	psllq $57, T3
+	movaps T3, T2
+	pslldq $8, T2
+	psrldq $8, T3
+	pxor T2, DATA
+	pxor T3, T1
+
+	# second phase of the reduction
+	movaps DATA, T2
+	psrlq $5, T2
+	pxor DATA, T2
+	psrlq $1, T2
+	pxor DATA, T2
+	psrlq $1, T2
+	pxor T2, T1
+	pxor T1, DATA
+	ret
+
+/* void clmul_ghash_mul(char *dst, const be128 *shash) */
+ENTRY(clmul_ghash_mul)
+	movups (%rdi), DATA
+	movups (%rsi), SHASH
+	movaps .Lbswap_mask, BSWAP
+	pshufb BSWAP, DATA
+	call __clmul_gf128mul_ble
+	pshufb BSWAP, DATA
+	movups DATA, (%rdi)
+	ret
+
+/*
+ * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ *			   const be128 *shash);
+ */
+ENTRY(clmul_ghash_update)
+	cmp $16, %rdx
+	jb .Lupdate_just_ret	# check length
+	movaps .Lbswap_mask, BSWAP
+	movups (%rdi), DATA
+	movups (%rcx), SHASH
+	pshufb BSWAP, DATA
+.align 4
+.Lupdate_loop:
+	movups (%rsi), IN1
+	pshufb BSWAP, IN1
+	pxor IN1, DATA
+	call __clmul_gf128mul_ble
+	sub $16, %rdx
+	add $16, %rsi
+	cmp $16, %rdx
+	jge .Lupdate_loop
+	pshufb BSWAP, DATA
+	movups DATA, (%rdi)
+.Lupdate_just_ret:
+	ret
+
+/*
+ * void clmul_ghash_setkey(be128 *shash, const u8 *key);
+ *
+ * Calculate hash_key << 1 mod poly
+ */
+ENTRY(clmul_ghash_setkey)
+	movaps .Lbswap_mask, BSWAP
+	movups (%rsi), %xmm0
+	pshufb BSWAP, %xmm0
+	movaps %xmm0, %xmm1
+	psllq $1, %xmm0
+	psrlq $63, %xmm1
+	movaps %xmm1, %xmm2
+	pslldq $8, %xmm1
+	psrldq $8, %xmm2
+	por %xmm1, %xmm0
+	# reduction
+	pshufd $0b00100100, %xmm2, %xmm1
+	pcmpeqd .Ltwo_one, %xmm1
+	pand .Lpoly, %xmm1
+	pxor %xmm1, %xmm0
+	movups %xmm0, (%rdi)
+	ret
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains glue code.
+ *
+ * Copyright (c) 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/hash.h>
+#include <asm/i387.h>
+
+#define GHASH_BLOCK_SIZE	16
+#define GHASH_DIGEST_SIZE	16
+
+void clmul_ghash_mul(char *dst, const be128 *shash);
+
+void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+			const be128 *shash);
+
+void clmul_ghash_setkey(be128 *shash, const u8 *key);
+
+struct ghash_async_ctx {
+	struct cryptd_ahash *cryptd_tfm;
+};
+
+struct ghash_ctx {
+	be128 shash;
+};
+
+struct ghash_desc_ctx {
+	u8 buffer[GHASH_BLOCK_SIZE];
+	u32 bytes;
+};
+
+static int ghash_init(struct shash_desc *desc)
+{
+	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	memset(dctx, 0, sizeof(*dctx));
+
+	return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+			const u8 *key, unsigned int keylen)
+{
+	struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+
+	if (keylen != GHASH_BLOCK_SIZE) {
+		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	clmul_ghash_setkey(&ctx->shash, key);
+
+	return 0;
+}
+
+static int ghash_update(struct shash_desc *desc,
+			 const u8 *src, unsigned int srclen)
+{
+	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	u8 *dst = dctx->buffer;
+
+	kernel_fpu_begin();
+	if (dctx->bytes) {
+		int n = min(srclen, dctx->bytes);
+		u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+		dctx->bytes -= n;
+		srclen -= n;
+
+		while (n--)
+			*pos++ ^= *src++;
+
+		if (!dctx->bytes)
+			clmul_ghash_mul(dst, &ctx->shash);
+	}
+
+	clmul_ghash_update(dst, src, srclen, &ctx->shash);
+	kernel_fpu_end();
+
+	if (srclen & 0xf) {
+		src += srclen - (srclen & 0xf);
+		srclen &= 0xf;
+		dctx->bytes = GHASH_BLOCK_SIZE - srclen;
+		while (srclen--)
+			*dst++ ^= *src++;
+	}
+
+	return 0;
+}
+
+static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+{
+	u8 *dst = dctx->buffer;
+
+	if (dctx->bytes) {
+		u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+		while (dctx->bytes--)
+			*tmp++ ^= 0;
+
+		kernel_fpu_begin();
+		clmul_ghash_mul(dst, &ctx->shash);
+		kernel_fpu_end();
+	}
+
+	dctx->bytes = 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *dst)
+{
+	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	u8 *buf = dctx->buffer;
+
+	ghash_flush(ctx, dctx);
+	memcpy(dst, buf, GHASH_BLOCK_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg ghash_alg = {
+	.digestsize	= GHASH_DIGEST_SIZE,
+	.init		= ghash_init,
+	.update		= ghash_update,
+	.final		= ghash_final,
+	.setkey		= ghash_setkey,
+	.descsize	= sizeof(struct ghash_desc_ctx),
+	.base		= {
+		.cra_name		= "__ghash",
+		.cra_driver_name	= "__ghash-pclmulqdqni",
+		.cra_priority		= 0,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= GHASH_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct ghash_ctx),
+		.cra_module		= THIS_MODULE,
+		.cra_list		= LIST_HEAD_INIT(ghash_alg.base.cra_list),
+	},
+};
+
+static int ghash_async_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+	if (!irq_fpu_usable()) {
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_init(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+		desc->tfm = child;
+		desc->flags = req->base.flags;
+		return crypto_shash_init(desc);
+	}
+}
+
+static int ghash_async_update(struct ahash_request *req)
+{
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+	if (!irq_fpu_usable()) {
+		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_update(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		return shash_ahash_update(req, desc);
+	}
+}
+
+static int ghash_async_final(struct ahash_request *req)
+{
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+	if (!irq_fpu_usable()) {
+		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_final(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		return crypto_shash_final(desc, req->result);
+	}
+}
+
+static int ghash_async_digest(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+	if (!irq_fpu_usable()) {
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_digest(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+		desc->tfm = child;
+		desc->flags = req->base.flags;
+		return shash_ahash_digest(req, desc);
+	}
+}
+
+static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct crypto_ahash *child = &ctx->cryptd_tfm->base;
+	int err;
+
+	crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
+			       & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ahash_setkey(child, key, keylen);
+	crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
+			       & CRYPTO_TFM_RES_MASK);
+
+	return err;
+}
+
+static int ghash_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct cryptd_ahash *cryptd_tfm;
+	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ctx->cryptd_tfm = cryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				 sizeof(struct ahash_request) +
+				 crypto_ahash_reqsize(&cryptd_tfm->base));
+
+	return 0;
+}
+
+static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ahash(ctx->cryptd_tfm);
+}
+
+static struct ahash_alg ghash_async_alg = {
+	.init		= ghash_async_init,
+	.update		= ghash_async_update,
+	.final		= ghash_async_final,
+	.setkey		= ghash_async_setkey,
+	.digest		= ghash_async_digest,
+	.halg = {
+		.digestsize	= GHASH_DIGEST_SIZE,
+		.base = {
+			.cra_name		= "ghash",
+			.cra_driver_name	= "ghash-clmulni",
+			.cra_priority		= 400,
+			.cra_flags		= CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+			.cra_blocksize		= GHASH_BLOCK_SIZE,
+			.cra_type		= &crypto_ahash_type,
+			.cra_module		= THIS_MODULE,
+			.cra_list		= LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
+			.cra_init		= ghash_async_init_tfm,
+			.cra_exit		= ghash_async_exit_tfm,
+		},
+	},
+};
+
+static int __init ghash_pclmulqdqni_mod_init(void)
+{
+	int err;
+
+	if (!cpu_has_pclmulqdq) {
+		printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
+		       " detected.\n");
+		return -ENODEV;
+	}
+
+	err = crypto_register_shash(&ghash_alg);
+	if (err)
+		goto err_out;
+	err = crypto_register_ahash(&ghash_async_alg);
+	if (err)
+		goto err_shash;
+
+	return 0;
+
+err_shash:
+	crypto_unregister_shash(&ghash_alg);
+err_out:
+	return err;
+}
+
+static void __exit ghash_pclmulqdqni_mod_exit(void)
+{
+	crypto_unregister_ahash(&ghash_async_alg);
+	crypto_unregister_shash(&ghash_alg);
+}
+
+module_init(ghash_pclmulqdqni_mod_init);
+module_exit(ghash_pclmulqdqni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
+		   "acclerated by PCLMULQDQ-NI");
+MODULE_ALIAS("ghash");
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -247,6 +247,7 @@ extern const char * const x86_power_flag
 #define cpu_has_x2apic		boot_cpu_has(X86_FEATURE_X2APIC)
 #define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
+#define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -440,6 +440,14 @@ config CRYPTO_WP512
 	  See also:
 	  <http://planeta.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html>
 
+config CRYPTO_GHASH_CLMUL_NI_INTEL
+	tristate "GHASH digest algorithm (CLMUL-NI accelerated)"
+	select CRYPTO_SHASH
+	select CRYPTO_CRYPTD
+	help
+	  GHASH is a message digest algorithm for GCM (Galois/Counter Mode).
+	  The implementation is accelerated by Intel's CLMUL-NI instructions.
+
 comment "Ciphers"
 
 config CRYPTO_AES
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -711,6 +711,13 @@ struct crypto_shash *cryptd_ahash_child(
 }
 EXPORT_SYMBOL_GPL(cryptd_ahash_child);
 
+struct shash_desc *cryptd_shash_desc(struct ahash_request *req)
+{
+	struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
+	return &rctx->desc;
+}
+EXPORT_SYMBOL_GPL(cryptd_shash_desc);
+
 void cryptd_free_ahash(struct cryptd_ahash *tfm)
 {
 	crypto_free_ahash(&tfm->base);
--- a/include/crypto/cryptd.h
+++ b/include/crypto/cryptd.h
@@ -39,6 +39,7 @@ static inline struct cryptd_ahash *__cry
 struct cryptd_ahash *cryptd_alloc_ahash(const char *alg_name,
 					u32 type, u32 mask);
 struct crypto_shash *cryptd_ahash_child(struct cryptd_ahash *tfm);
+struct shash_desc *cryptd_shash_desc(struct ahash_request *req);
 void cryptd_free_ahash(struct cryptd_ahash *tfm);
 
 #endif




* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-09-16  1:35 [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation Huang Ying
@ 2009-10-19  2:53 ` Herbert Xu
  2009-11-01  0:30   ` Andrew Morton
  0 siblings, 1 reply; 13+ messages in thread
From: Herbert Xu @ 2009-10-19  2:53 UTC
  To: Huang Ying; +Cc: linux-kernel, linux-crypto, Daniel Walker

On Wed, Sep 16, 2009 at 09:35:46AM +0800, Huang Ying wrote:
> PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
> carry-less multiplication. More information about PCLMULQDQ can be
> found at:
> 
> http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
> 
> Because PCLMULQDQ changes XMM state, its usage must be enclosed in
> kernel_fpu_begin/end, which can be used only in process context. The
> acceleration is therefore implemented as a crypto_ahash: requests
> arriving in soft IRQ context are deferred to the cryptd kernel thread.
> 
> v4:
>  - Fix some style issues.
> 
> v3:
>  - Revise GHASH implementation, performance increase about 2x.
> 
> Signed-off-by: Huang Ying <ying.huang@intel.com>

Patch applied to cryptodev.  Thanks!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-10-19  2:53 ` Herbert Xu
@ 2009-11-01  0:30   ` Andrew Morton
  2009-11-01 17:50     ` Herbert Xu
  0 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2009-11-01  0:30 UTC
  To: Herbert Xu; +Cc: Huang Ying, linux-kernel, linux-crypto, Daniel Walker

On Mon, 19 Oct 2009 11:53:33 +0900 Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Wed, Sep 16, 2009 at 09:35:46AM +0800, Huang Ying wrote:
> > PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
> > carry-less multiplication. More information about PCLMULQDQ can be
> > found at:
> > 
> > http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
> > 
> > Because PCLMULQDQ changes XMM state, its usage must be enclosed in
> > kernel_fpu_begin/end, which can be used only in process context. The
> > acceleration is therefore implemented as a crypto_ahash: requests
> > arriving in soft IRQ context are deferred to the cryptd kernel thread.
> > 
> > v4:
> >  - Fix some style issues.
> > 
> > v3:
> >  - Revise GHASH implementation, performance increase about 2x.
> > 
> > Signed-off-by: Huang Ying <ying.huang@intel.com>
> 
> Patch applied to cryptodev.  Thanks!

x86_64 allmodconfig, GNU assembler 2.16.1:

arch/x86/crypto/ghash-clmulni-intel_asm.S: Assembler messages:
arch/x86/crypto/ghash-clmulni-intel_asm.S:103: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:105: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:119: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:123: Error: no such instruction: `pshufb %xmm5,%xmm6'
arch/x86/crypto/ghash-clmulni-intel_asm.S:130: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:143: Error: no such instruction: `pshufb %xmm5,%xmm0'



* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-01  0:30   ` Andrew Morton
@ 2009-11-01 17:50     ` Herbert Xu
  2009-11-02  7:50       ` Ingo Molnar
  0 siblings, 1 reply; 13+ messages in thread
From: Herbert Xu @ 2009-11-01 17:50 UTC
  To: Andrew Morton; +Cc: Huang Ying, linux-kernel, linux-crypto, Daniel Walker

On Sat, Oct 31, 2009 at 05:30:15PM -0700, Andrew Morton wrote:
> 
> x86_64 allmodconfig, GNU assembler 2.16.1:
> 
> arch/x86/crypto/ghash-clmulni-intel_asm.S: Assembler messages:
> arch/x86/crypto/ghash-clmulni-intel_asm.S:103: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:105: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:119: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:123: Error: no such instruction: `pshufb %xmm5,%xmm6'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:130: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:143: Error: no such instruction: `pshufb %xmm5,%xmm0'

This patch should fix it.

commit 2d06ef7f42ed8c9969c9aa84e95df5d5c6378327
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Sun Nov 1 12:49:44 2009 -0500

    crypto: ghash-intel - Hard-code pshufb
    
    Old gases don't have a clue what pshufb stands for so we have
    to hard-code it for now.
    
    Reported-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
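
For reference, the hard-coded .byte sequences below are the fixed
opcode 66 0f 38 00 followed by a ModRM byte. A sketch of how that
ModRM byte is formed for "pshufb %xmmSRC, %xmmDST" (assuming both
registers are in the xmm0-xmm7 range, so no REX prefix is needed):

/* mod=11 (register direct), reg field = DST, rm field = SRC */
static unsigned char pshufb_modrm(unsigned int dst, unsigned int src)
{
	return 0xc0 | (dst << 3) | src;
}

Hence 0xc5 encodes pshufb %xmm5,%xmm0 and 0xf5 encodes
pshufb %xmm5,%xmm6.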

diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index b9e787a..71768d5 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -100,9 +100,11 @@ ENTRY(clmul_ghash_mul)
 	movups (%rdi), DATA
 	movups (%rsi), SHASH
 	movaps .Lbswap_mask, BSWAP
-	pshufb BSWAP, DATA
+	# pshufb BSWAP, DATA
+	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
 	call __clmul_gf128mul_ble
-	pshufb BSWAP, DATA
+	# pshufb BSWAP, DATA
+	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
 	movups DATA, (%rdi)
 	ret
 
@@ -116,18 +118,21 @@ ENTRY(clmul_ghash_update)
 	movaps .Lbswap_mask, BSWAP
 	movups (%rdi), DATA
 	movups (%rcx), SHASH
-	pshufb BSWAP, DATA
+	# pshufb BSWAP, DATA
+	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
 .align 4
 .Lupdate_loop:
 	movups (%rsi), IN1
-	pshufb BSWAP, IN1
+	# pshufb BSWAP, IN1
+	.byte 0x66, 0x0f, 0x38, 0x00, 0xf5
 	pxor IN1, DATA
 	call __clmul_gf128mul_ble
 	sub $16, %rdx
 	add $16, %rsi
 	cmp $16, %rdx
 	jge .Lupdate_loop
-	pshufb BSWAP, DATA
+	# pshufb BSWAP, DATA
+	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
 	movups DATA, (%rdi)
 .Lupdate_just_ret:
 	ret
@@ -140,7 +145,8 @@ ENTRY(clmul_ghash_update)
 ENTRY(clmul_ghash_setkey)
 	movaps .Lbswap_mask, BSWAP
 	movups (%rsi), %xmm0
-	pshufb BSWAP, %xmm0
+	# pshufb BSWAP, %xmm0
+	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
 	movaps %xmm0, %xmm1
 	psllq $1, %xmm0
 	psrlq $63, %xmm1
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-01 17:50     ` Herbert Xu
@ 2009-11-02  7:50       ` Ingo Molnar
  2009-11-02 14:28         ` Herbert Xu
  0 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2009-11-02  7:50 UTC
  To: Herbert Xu
  Cc: Andrew Morton, Huang Ying, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin


* Herbert Xu <herbert@gondor.apana.org.au> wrote:

> -	pshufb BSWAP, DATA
> +	# pshufb BSWAP, DATA
> +	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5

A cleanup request: mind creating two macros for this PSHUFB MMX/SSE 
instruction in arch/x86/include/asm/i387.h, instead of open-coding the 
.byte sequences in ~6 places?

( After the .33 merge window we'll collect such instruction format 
  knowledge in arch/x86/include/asm/inst.h. That file is not upstream 
  yet so i387.h will do for now for FPU/SSE instructions. )

Thanks,

	Ingo


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-02  7:50       ` Ingo Molnar
@ 2009-11-02 14:28         ` Herbert Xu
  2009-11-02 14:32           ` Ingo Molnar
  0 siblings, 1 reply; 13+ messages in thread
From: Herbert Xu @ 2009-11-02 14:28 UTC
  To: Ingo Molnar
  Cc: Andrew Morton, Huang Ying, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin

On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
> 
> A cleanup request: mind creating two macros for this PSHUFB MMX/SSE 
> instruction in arch/x86/include/asm/i387.h, instead of open-coding the 
> .byte sequences in ~6 places?

I had a go at doing that, but it seems that i387.h isn't really
meant to be included in an asm file at this point :)

> ( After the .33 merge window we'll collect such instruction format 
>   knowledge in arch/x86/include/asm/inst.h. That file is not upstream 
>   yet so i387.h will do for now for FPU/SSE instructions. )

I'm happy to revisit this once inst.h exists.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-02 14:28         ` Herbert Xu
@ 2009-11-02 14:32           ` Ingo Molnar
  2009-11-02 14:46             ` Herbert Xu
  2009-11-03  5:47             ` Huang Ying
  0 siblings, 2 replies; 13+ messages in thread
From: Ingo Molnar @ 2009-11-02 14:32 UTC
  To: Herbert Xu
  Cc: Andrew Morton, Huang Ying, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin


* Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
> > 
> > A cleanup request: mind creating two macros for this PSHUFB MMX/SSE 
> > instruction in arch/x86/include/asm/i387.h, instead of open-coding the 
> > .byte sequences in ~6 places?
> 
> I had a go at doing that, but it seems that i387.h isn't really meant 
> to be included in an asm file at this point :)

Please use the standard construct and put an #ifndef __ASSEMBLY__ around 
it.

> > ( After the .33 merge window we'll collect such instruction format 
> >   knowledge in arch/x86/include/asm/inst.h. That file is not upstream 
> >   yet so i387.h will do for now for FPU/SSE instructions. )
> 
> I'm happy to revisit this once inst.h exists.

No reason to not do most of the change first though, the way i suggested 
it.

	Ingo


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-02 14:32           ` Ingo Molnar
@ 2009-11-02 14:46             ` Herbert Xu
  2009-11-02 15:46               ` Ingo Molnar
  2009-11-03  5:47             ` Huang Ying
  1 sibling, 1 reply; 13+ messages in thread
From: Herbert Xu @ 2009-11-02 14:46 UTC
  To: Ingo Molnar
  Cc: Andrew Morton, Huang Ying, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin

On Mon, Nov 02, 2009 at 03:32:58PM +0100, Ingo Molnar wrote:
> 
> Please use the standard construct and put an #ifndef __ASSEMBLY__ around 
> it.

You mean like this?

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb..e22d237 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,13 @@
 #ifndef _ASM_X86_I387_H
 #define _ASM_X86_I387_H
 
+#ifdef __ASSEMBLY__
+
+#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+
+#else
+
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/regset.h>
@@ -411,4 +418,5 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
 	}
 }
 
+#endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_I387_H */

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-02 14:46             ` Herbert Xu
@ 2009-11-02 15:46               ` Ingo Molnar
  2009-11-03 14:12                 ` Herbert Xu
  0 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2009-11-02 15:46 UTC
  To: Herbert Xu
  Cc: Andrew Morton, Huang Ying, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin


* Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Mon, Nov 02, 2009 at 03:32:58PM +0100, Ingo Molnar wrote:
> > 
> > Please use the standard construct and put an #ifndef __ASSEMBLY__ around 
> > it.
> 
> You mean like this?
> 
> diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
> index 0b20bbb..e22d237 100644
> --- a/arch/x86/include/asm/i387.h
> +++ b/arch/x86/include/asm/i387.h
> @@ -10,6 +10,13 @@
>  #ifndef _ASM_X86_I387_H
>  #define _ASM_X86_I387_H
>  
> +#ifdef __ASSEMBLY__
> +
> +#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
> +#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
> +
> +#else
> +
>  #include <linux/sched.h>
>  #include <linux/kernel_stat.h>
>  #include <linux/regset.h>
> @@ -411,4 +418,5 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
>  	}
>  }
>  
> +#endif /* __ASSEMBLY__ */
>  #endif /* _ASM_X86_I387_H */

Yeah. Or just a single block of:


  #ifndef __ASSEMBLY__
  ...
  #endif /* __ASSEMBLY__ */

around the C bits - anything outside that is good for assembly as well. 

	Ingo


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-02 14:32           ` Ingo Molnar
  2009-11-02 14:46             ` Herbert Xu
@ 2009-11-03  5:47             ` Huang Ying
  2009-11-03  9:03               ` Ingo Molnar
  1 sibling, 1 reply; 13+ messages in thread
From: Huang Ying @ 2009-11-03  5:47 UTC
  To: Ingo Molnar
  Cc: Herbert Xu, Andrew Morton, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin

On Mon, 2009-11-02 at 22:32 +0800, Ingo Molnar wrote: 
> * Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
> > > 
> > > A cleanup request: mind creating two macros for this PSHUFB MMX/SSE 
> > > instruction in arch/x86/include/asm/i387.h, instead of open-coding the 
> > > .byte sequences in ~6 places?
> > 
> > I had a go at doing that, but it seems that i387.h isn't really meant 
> > to be included in an asm file at this point :)
> 
> Please use the standard construct and put an #ifndef __ASSEMBLY__ around 
> it.
> 
> > > ( After the .33 merge window we'll collect such instruction format 
> > >   knowledge in arch/x86/include/asm/inst.h. That file is not upstream 
> > >   yet so i387.h will do for now for FPU/SSE instructions. )
> > 
> > I'm happy to revisit this once inst.h exists.
> 
> No reason to not do most of the change first though, the way i suggested 
> it.

How about something like the below? But it seems inappropriate to put
these bits into i387.h, that is, to combine C and gas syntax in one
header.

Best Regards,
Huang Ying

.macro xmm_num opd xmm
.ifc \xmm,%xmm0
\opd = 0
.endif
.ifc \xmm,%xmm1
\opd = 1
.endif
.ifc \xmm,%xmm2
\opd = 2
.endif
.ifc \xmm,%xmm3
\opd = 3
.endif
.ifc \xmm,%xmm4
\opd = 4
.endif
.ifc \xmm,%xmm5
\opd = 5
.endif
.ifc \xmm,%xmm6
\opd = 6
.endif
.ifc \xmm,%xmm7
\opd = 7
.endif
.ifc \xmm,%xmm8
\opd = 8
.endif
.ifc \xmm,%xmm9
\opd = 9
.endif
.ifc \xmm,%xmm10
\opd = 10
.endif
.ifc \xmm,%xmm11
\opd = 11
.endif
.ifc \xmm,%xmm12
\opd = 12
.endif
.ifc \xmm,%xmm13
\opd = 13
.endif
.ifc \xmm,%xmm14
\opd = 14
.endif
.ifc \xmm,%xmm15
\opd = 15
.endif
.endm

.macro PSHUFB_XMM xmm1 xmm2
xmm_num pshufb_opd1 \xmm1
xmm_num pshufb_opd2 \xmm2
.if (pshufb_opd1 < 8) && (pshufb_opd2 < 8)
.byte 0x66, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | (pshufb_opd2<<3)
.elseif (pshufb_opd1 >= 8) && (pshufb_opd2 < 8)
.byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | (pshufb_opd2<<3)
.elseif (pshufb_opd1 < 8) && (pshufb_opd2 >= 8)
.byte 0x66, 0x44, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | ((pshufb_opd2-8)<<3)
.else
.byte 0x66, 0x45, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | ((pshufb_opd2-8)<<3)
.endif
.endm
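
(With these macros, "PSHUFB_XMM %xmm5 %xmm0", for example, should
expand to the same 0x66, 0x0f, 0x38, 0x00, 0xc5 sequence that the
earlier fix hard-codes.)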




* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-03  5:47             ` Huang Ying
@ 2009-11-03  9:03               ` Ingo Molnar
  2009-11-04  0:59                 ` H. Peter Anvin
  0 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2009-11-03  9:03 UTC
  To: Huang Ying
  Cc: Herbert Xu, Andrew Morton, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin


* Huang Ying <ying.huang@intel.com> wrote:

> On Mon, 2009-11-02 at 22:32 +0800, Ingo Molnar wrote: 
> > * Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > 
> > > On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
> > > > 
> > > > A cleanup request: mind creating two macros for this PSHUFB MMX/SSE 
> > > > instruction in arch/x86/include/asm/i387.h, instead of open-coding the 
> > > > .byte sequences in ~6 places?
> > > 
> > > I had a go at doing that, but it seems that i387.h isn't really meant 
> > > to be included in an asm file at this point :)
> > 
> > Please use the standard construct and put an #ifndef __ASSEMBLY__ around 
> > it.
> > 
> > > > ( After the .33 merge window we'll collect such instruction format 
> > > >   knowledge in arch/x86/include/asm/inst.h. That file is not upstream 
> > > >   yet so i387.h will do for now for FPU/SSE instructions. )
> > > 
> > > I'm happy to revisit this once inst.h exists.
> > 
> > No reason to not do most of the change first though, the way i suggested 
> > it.
> 
> How about something as below? But it seems not appropriate to put these
> bits into i387.h, that is, to combine C and gas syntax.
> 
> Best Regards,
> Huang Ying
> 
> .macro xmm_num opd xmm
> .ifc \xmm,%xmm0
> \opd = 0
> .endif
> .ifc \xmm,%xmm1
> \opd = 1
> .endif
> .ifc \xmm,%xmm2
> \opd = 2
> .endif
> .ifc \xmm,%xmm3
> \opd = 3
> .endif
> .ifc \xmm,%xmm4
> \opd = 4
> .endif
> .ifc \xmm,%xmm5
> \opd = 5
> .endif
> .ifc \xmm,%xmm6
> \opd = 6
> .endif
> .ifc \xmm,%xmm7
> \opd = 7
> .endif
> .ifc \xmm,%xmm8
> \opd = 8
> .endif
> .ifc \xmm,%xmm9
> \opd = 9
> .endif
> .ifc \xmm,%xmm10
> \opd = 10
> .endif
> .ifc \xmm,%xmm11
> \opd = 11
> .endif
> .ifc \xmm,%xmm12
> \opd = 12
> .endif
> .ifc \xmm,%xmm13
> \opd = 13
> .endif
> .ifc \xmm,%xmm14
> \opd = 14
> .endif
> .ifc \xmm,%xmm15
> \opd = 15
> .endif
> .endm
> 
> .macro PSHUFB_XMM xmm1 xmm2
> xmm_num pshufb_opd1 \xmm1
> xmm_num pshufb_opd2 \xmm2
> .if (pshufb_opd1 < 8) && (pshufb_opd2 < 8)
> .byte 0x66, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | (pshufb_opd2<<3)
> .elseif (pshufb_opd1 >= 8) && (pshufb_opd2 < 8)
> .byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | (pshufb_opd2<<3)
> .elseif (pshufb_opd1 < 8) && (pshufb_opd2 >= 8)
> .byte 0x66, 0x44, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | ((pshufb_opd2-8)<<3)
> .else
> .byte 0x66, 0x45, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | ((pshufb_opd2-8)<<3)
> .endif
> .endm

Looks far too clever, i like it :-) We have quite a few assembly macros 
in arch/x86/include/asm/. The above one could be put into calling.h for 
example.

But the simpler .byte solution in i387.h would be fine too.

If you guys want to put the helper defines under arch/x86/include/asm/ 
via the crypto tree, feel free:

  Acked-by: Ingo Molnar <mingo@elte.hu>

it would be clumsy to keep it separately in the x86 tree. Just don't 
spread raw .byte sequences in .S files please ...

	Ingo


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-02 15:46               ` Ingo Molnar
@ 2009-11-03 14:12                 ` Herbert Xu
  0 siblings, 0 replies; 13+ messages in thread
From: Herbert Xu @ 2009-11-03 14:12 UTC
  To: Ingo Molnar
  Cc: Andrew Morton, Huang Ying, linux-kernel, linux-crypto,
	Daniel Walker, H. Peter Anvin

On Mon, Nov 02, 2009 at 04:46:04PM +0100, Ingo Molnar wrote:
> 
> Yeah. Or just a single block of:
> 
> 
>   #ifndef __ASSEMBLY__
>   ...
>   #endif /* __ASSEMBLY__ */
> 
> around the C bits - anything outside that is good for assembly as well. 

OK I'll throw this into cryptodev:

commit 3b0d65969b549b796abc6f0230f6142fed365d49
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Tue Nov 3 09:11:15 2009 -0500

    crypto: ghash-intel - Add PSHUFB macros
    
    Add PSHUFB macros instead of repeating byte sequences, suggested
    by Ingo.
    
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
    Acked-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 71768d5..5958498 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -17,6 +17,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/i387.h>
 
 .align 16
 .Lbswap_mask:
@@ -101,7 +102,7 @@ ENTRY(clmul_ghash_mul)
 	movups (%rsi), SHASH
 	movaps .Lbswap_mask, BSWAP
 	# pshufb BSWAP, DATA
-	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+	PSHUFB_XMM5_XMM0
 	call __clmul_gf128mul_ble
 	# pshufb BSWAP, DATA
 	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
@@ -119,12 +120,12 @@ ENTRY(clmul_ghash_update)
 	movups (%rdi), DATA
 	movups (%rcx), SHASH
 	# pshufb BSWAP, DATA
-	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+	PSHUFB_XMM5_XMM0
 .align 4
 .Lupdate_loop:
 	movups (%rsi), IN1
 	# pshufb BSWAP, IN1
-	.byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+	PSHUFB_XMM5_XMM6
 	pxor IN1, DATA
 	call __clmul_gf128mul_ble
 	sub $16, %rdx
@@ -132,7 +133,7 @@ ENTRY(clmul_ghash_update)
 	cmp $16, %rdx
 	jge .Lupdate_loop
 	# pshufb BSWAP, DATA
-	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+	PSHUFB_XMM5_XMM0
 	movups DATA, (%rdi)
 .Lupdate_just_ret:
 	ret
@@ -146,7 +147,7 @@ ENTRY(clmul_ghash_setkey)
 	movaps .Lbswap_mask, BSWAP
 	movups (%rsi), %xmm0
 	# pshufb BSWAP, %xmm0
-	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+	PSHUFB_XMM5_XMM0
 	movaps %xmm0, %xmm1
 	psllq $1, %xmm0
 	psrlq $63, %xmm1
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb..ebfb8a9 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
 #ifndef _ASM_X86_I387_H
 #define _ASM_X86_I387_H
 
+#ifndef __ASSEMBLY__
+
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/regset.h>
@@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
 	}
 }
 
+#endif /* __ASSEMBLY__ */
+
+#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+
 #endif /* _ASM_X86_I387_H */

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation
  2009-11-03  9:03               ` Ingo Molnar
@ 2009-11-04  0:59                 ` H. Peter Anvin
  0 siblings, 0 replies; 13+ messages in thread
From: H. Peter Anvin @ 2009-11-04  0:59 UTC
  To: Ingo Molnar
  Cc: Huang Ying, Herbert Xu, Andrew Morton, linux-kernel,
	linux-crypto, Daniel Walker

On 11/03/2009 01:03 AM, Ingo Molnar wrote:
>>
>> .macro xmm_num opd xmm
>> .ifc \xmm,%xmm0
>> \opd = 0
>> .endif
>> .ifc \xmm,%xmm1
>> \opd = 1
>> .endif
>> .ifc \xmm,%xmm2
>> \opd = 2
>> .endif
>> .ifc \xmm,%xmm3
>> \opd = 3
>> .endif
>> .ifc \xmm,%xmm4
>> \opd = 4
>> .endif
>> .ifc \xmm,%xmm5
>> \opd = 5
>> .endif
>> .ifc \xmm,%xmm6
>> \opd = 6
>> .endif
>> .ifc \xmm,%xmm7
>> \opd = 7
>> .endif
>> .ifc \xmm,%xmm8
>> \opd = 8
>> .endif
>> .ifc \xmm,%xmm9
>> \opd = 9
>> .endif
>> .ifc \xmm,%xmm10
>> \opd = 10
>> .endif
>> .ifc \xmm,%xmm11
>> \opd = 11
>> .endif
>> .ifc \xmm,%xmm12
>> \opd = 12
>> .endif
>> .ifc \xmm,%xmm13
>> \opd = 13
>> .endif
>> .ifc \xmm,%xmm14
>> \opd = 14
>> .endif
>> .ifc \xmm,%xmm15
>> \opd = 15
>> .endif
>> .endm
>>
>> .macro PSHUFB_XMM xmm1 xmm2
>> xmm_num pshufb_opd1 \xmm1
>> xmm_num pshufb_opd2 \xmm2
>> .if (pshufb_opd1 < 8) && (pshufb_opd2 < 8)
>> .byte 0x66, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | (pshufb_opd2<<3)
>> .elseif (pshufb_opd1 >= 8) && (pshufb_opd2 < 8)
>> .byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | (pshufb_opd2<<3)
>> .elseif (pshufb_opd1 < 8) && (pshufb_opd2 >= 8)
>> .byte 0x66, 0x44, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | ((pshufb_opd2-8)<<3)
>> .else
>> .byte 0x66, 0x45, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | ((pshufb_opd2-8)<<3)
>> .endif
>> .endm
> 
> Looks far too clever, i like it :-) We have quite a few assembly macros 
> in arch/x86/include/asm/. The above one could be put into calling.h for 
> example.
> 

I would really like to see something like that, with only one minor
tweak: please use submacros to generate the REX and MODRM bytes, since
we are *guaranteed* to want to do the same thing again.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.
