From: Keith Busch <kbusch@kernel.org>
To: linux-nvme@lists.infradead.org, linux-block@vger.kernel.org,
	linux-crypto@vger.kernel.org, x86@kernel.org,
	linux-kernel@vger.kernel.org
Cc: axboe@kernel.dk, hch@lst.de, martin.petersen@oracle.com,
	colyli@suse.de, Keith Busch <kbusch@kernel.org>
Subject: [PATCHv3 10/10] x86/crypto: add pclmul acceleration for crc64
Date: Tue, 22 Feb 2022 08:31:44 -0800
Message-ID: <20220222163144.1782447-11-kbusch@kernel.org>
In-Reply-To: <20220222163144.1782447-1-kbusch@kernel.org>

The crc64 table lookup method is inefficient, consuming a significant
number of CPU cycles in the block stack per IO. If the processor
supports PCLMULQDQ, use a carry-less multiplication implementation to
accelerate the calculation.

The assembly in this patch was mostly generated by gcc from a C program
using x86 intrinsics, and measures ~20x faster than the table lookup.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
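For reference, the folding step at the heart of the generated assembly
corresponds to C intrinsics along the lines of the sketch below. This is
a minimal illustration only, not the exact program the assembly was
generated from; fold_consts stands in for the precomputed constants
stored at .LC0 in the .S file (build with gcc -O2 -mpclmul):

    #include <immintrin.h>	/* _mm_clmulepi64_si128 (PCLMULQDQ) */

    /*
     * One folding step, matching the .L7 loop in the assembly: the
     * remainder absorbs the next 16-byte block (pxor), then both
     * 64-bit halves are multiplied forward by the fold constants
     * (pclmulqdq $0x00 / $0x11) and recombined (pxor).
     */
    static __m128i fold_16(__m128i rem, __m128i block, __m128i fold_consts)
    {
    	__m128i x  = _mm_xor_si128(rem, block);
    	__m128i lo = _mm_clmulepi64_si128(x, fold_consts, 0x00);
    	__m128i hi = _mm_clmulepi64_si128(x, fold_consts, 0x11);
    	return _mm_xor_si128(lo, hi);
    }
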
 arch/x86/crypto/Makefile                  |   3 +
 arch/x86/crypto/crc64-rocksoft-pcl-asm.S  | 215 ++++++++++++++++++++++
 arch/x86/crypto/crc64-rocksoft-pcl_glue.c | 117 ++++++++++++
 crypto/Kconfig                            |  11 ++
 4 files changed, 346 insertions(+)
 create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl-asm.S
 create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index c3af959648e6..036520c59f0e 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -79,6 +79,9 @@ crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
 
+obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT_PCLMUL) += crc64-rocksoft-pclmul.o
+crc64-rocksoft-pclmul-y := crc64-rocksoft-pcl-asm.o crc64-rocksoft-pcl_glue.o
+
 obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
 targets += poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/crypto/crc64-rocksoft-pcl-asm.S b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S
new file mode 100644
index 000000000000..e3b633a776a9
--- /dev/null
+++ b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S
@@ -0,0 +1,215 @@
+########################################################################
+# Implement fast Rocksoft CRC-64 computation with SSE and PCLMULQDQ instructions
+#
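+# The running 128-bit remainder absorbs 16 input bytes per iteration
+# (pxor) and is folded forward with a pair of PCLMULQDQ multiplies
+# against the fold constants at .LC0.  The constants at .LC1 perform
+# the final Barrett reduction from 128 to 64 bits, and shuffleMasks
+# supplies pshufb masks for unaligned head and tail handling.
+########################################################################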
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(crc_rocksoft_pcl)
+	leaq	(%rsi,%rdx), %rcx
+	movq	%rsi, %r10
+	andl	$15, %esi
+	movq	%rdi, %xmm3
+	leaq	15(%rcx), %rax
+	andq	$-16, %r10
+	pxor	%xmm1, %xmm1
+	andq	$-16, %rax
+	movdqa	%xmm1, %xmm5
+	movq	%rax, %r8
+	subq	%r10, %rax
+	subq	%rcx, %r8
+	movl	$16, %ecx
+	movq	%rax, %r11
+	movq	%rcx, %r9
+	sarq	$4, %r11
+	subq	%rsi, %r9
+	movdqu	shuffleMasks(%r9), %xmm4
+	movdqa	%xmm4, %xmm0
+	pblendvb	%xmm0, (%r10), %xmm5
+	cmpq	$16, %rax
+	je	.L12
+	movdqa	16(%r10), %xmm2
+	cmpq	$2, %r11
+	je	.L13
+	pcmpeqd	%xmm1, %xmm1
+	leaq	-16(%rsi,%rdx), %rdi
+	leaq	16(%r10), %r9
+	pxor	%xmm1, %xmm4
+	movdqa	%xmm3, %xmm1
+	pshufb	%xmm0, %xmm3
+	pshufb	%xmm4, %xmm1
+	movdqa	%xmm3, %xmm0
+	movdqa	.LC0(%rip), %xmm3
+	pxor	%xmm5, %xmm1
+	movdqa	%xmm1, %xmm4
+	pclmulqdq	$0, %xmm3, %xmm1
+	pclmulqdq	$17, %xmm3, %xmm4
+	pxor	%xmm4, %xmm1
+	pxor	%xmm1, %xmm0
+	cmpq	$31, %rdi
+	jbe	.L6
+	leaq	-32(%rdi), %rax
+	movq	%rax, %rsi
+	andq	$-16, %rax
+	leaq	32(%r10,%rax), %rcx
+	shrq	$4, %rsi
+	movq	%r9, %rax
+	.p2align 4,,10
+	.p2align 3
+.L7:
+	pxor	%xmm2, %xmm0
+	movq	%rax, %rdx
+	addq	$16, %rax
+	movdqa	%xmm0, %xmm1
+	pclmulqdq	$0, %xmm3, %xmm0
+	movdqa	16(%rdx), %xmm2
+	pclmulqdq	$17, %xmm3, %xmm1
+	pxor	%xmm1, %xmm0
+	cmpq	%rcx, %rax
+	jne	.L7
+	movq	%rsi, %rax
+	addq	$1, %rsi
+	negq	%rax
+	salq	$4, %rsi
+	salq	$4, %rax
+	addq	%rsi, %r9
+	leaq	-16(%rdi,%rax), %rdi
+.L6:
+	pxor	%xmm2, %xmm0
+	cmpq	$16, %rdi
+	je	.L9
+	movl	$16, %eax
+	pcmpeqd	%xmm2, %xmm2
+	movdqa	%xmm0, %xmm7
+	subq	%r8, %rax
+	movdqu	shuffleMasks(%rax), %xmm4
+	pxor	%xmm4, %xmm2
+	pshufb	%xmm4, %xmm0
+	movdqa	16(%r9), %xmm4
+	pshufb	%xmm2, %xmm7
+	pshufb	%xmm2, %xmm4
+	movdqa	%xmm7, %xmm1
+	movdqa	%xmm4, %xmm2
+	movdqa	%xmm7, %xmm4
+	pclmulqdq	$0, %xmm3, %xmm1
+	pclmulqdq	$17, %xmm3, %xmm4
+	por	%xmm2, %xmm0
+	pxor	%xmm4, %xmm1
+	pxor	%xmm1, %xmm0
+.L9:
+	movdqa	%xmm0, %xmm2
+	pclmulqdq	$16, %xmm3, %xmm0
+	psrldq	$8, %xmm2
+	pxor	%xmm2, %xmm0
+.L3:
+	movdqa	.LC1(%rip), %xmm2
+	movdqa	%xmm0, %xmm1
+	pclmulqdq	$0, %xmm2, %xmm1
+	movdqa	%xmm1, %xmm3
+	pclmulqdq	$16, %xmm2, %xmm1
+	pslldq	$8, %xmm3
+	pxor	%xmm3, %xmm1
+	pxor	%xmm1, %xmm0
+	pextrd	$3, %xmm0, %eax
+	salq	$32, %rax
+	movq	%rax, %rdx
+	pextrd	$2, %xmm0, %eax
+	orq	%rdx, %rax
+	notq	%rax
+	ret
+	.p2align 4,,10
+	.p2align 3
+.L13:
+	subq	%r8, %rcx
+	pcmpeqd	%xmm1, %xmm1
+	movdqu	shuffleMasks(%rcx), %xmm7
+	movdqa	%xmm7, %xmm6
+	pxor	%xmm1, %xmm6
+	cmpq	$7, %rdx
+	ja	.L5
+	movdqa	%xmm1, %xmm4
+	pshufb	%xmm7, %xmm5
+	movdqa	%xmm3, %xmm1
+	movdqu	shuffleMasks(%rdx), %xmm8
+	pshufb	%xmm6, %xmm2
+	pxor	%xmm8, %xmm4
+	pxor	%xmm5, %xmm2
+	pshufb	%xmm8, %xmm3
+	pshufb	%xmm4, %xmm1
+	movdqa	%xmm3, %xmm0
+	pxor	%xmm1, %xmm2
+	pslldq	$8, %xmm0
+	movdqa	%xmm2, %xmm3
+	pclmulqdq	$16, .LC0(%rip), %xmm2
+	psrldq	$8, %xmm3
+	pxor	%xmm3, %xmm0
+	pxor	%xmm2, %xmm0
+	jmp	.L3
+	.p2align 4,,10
+	.p2align 3
+.L12:
+	movdqu	shuffleMasks(%rdx), %xmm2
+	subq	%r8, %rcx
+	movdqa	%xmm3, %xmm6
+	pcmpeqd	%xmm4, %xmm4
+	movdqa	%xmm2, %xmm0
+	pshufb	%xmm2, %xmm3
+	movdqu	shuffleMasks(%rcx), %xmm2
+	pxor	%xmm4, %xmm0
+	pslldq	$8, %xmm3
+	pxor	%xmm4, %xmm2
+	pshufb	%xmm0, %xmm6
+	pshufb	%xmm2, %xmm5
+	movdqa	%xmm5, %xmm1
+	pxor	%xmm6, %xmm1
+	movdqa	%xmm1, %xmm0
+	pclmulqdq	$16, .LC0(%rip), %xmm1
+	psrldq	$8, %xmm0
+	pxor	%xmm3, %xmm0
+	pxor	%xmm1, %xmm0
+	jmp	.L3
+	.p2align 4,,10
+	.p2align 3
+.L5:
+	pxor	%xmm1, %xmm4
+	movdqa	%xmm3, %xmm1
+	pshufb	%xmm0, %xmm3
+	pshufb	%xmm4, %xmm1
+	pxor	%xmm3, %xmm2
+	movdqa	.LC0(%rip), %xmm3
+	pxor	%xmm5, %xmm1
+	pshufb	%xmm6, %xmm2
+	movdqa	%xmm1, %xmm5
+	pshufb	%xmm7, %xmm1
+	pshufb	%xmm6, %xmm5
+	pxor	%xmm2, %xmm1
+	movdqa	%xmm5, %xmm4
+	movdqa	%xmm5, %xmm0
+	pclmulqdq	$17, %xmm3, %xmm0
+	pclmulqdq	$0, %xmm3, %xmm4
+	pxor	%xmm0, %xmm4
+	pxor	%xmm4, %xmm1
+	movdqa	%xmm1, %xmm0
+	pclmulqdq	$16, %xmm3, %xmm1
+	psrldq	$8, %xmm0
+	pxor	%xmm1, %xmm0
+	jmp	.L3
+SYM_FUNC_END(crc_rocksoft_pcl)
+
+.section	.rodata
+.align 32
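+# 32 bytes of pshufb masks: 0x00..0x0f followed by 0x8f..0x80.  A
+# 16-byte load at a variable offset into this table yields a mask that
+# shifts the input by that many bytes; mask bytes with the high bit set
+# cause pshufb to zero the lane, discarding bytes outside an unaligned
+# head or tail.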
+.type	shuffleMasks, @object
+.size	shuffleMasks, 32
+shuffleMasks:
+	.string	""
+	.ascii	"\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\217\216\215"
+	.ascii	"\214\213\212\211\210\207\206\205\204\203\202\201\200"
+
+.section	.rodata.cst16,"aM",@progbits,16
+.align 16
+.LC0:
+	.quad	-1523270018343381984
+	.quad	2443614144669557164
+	.align 16
+.LC1:
+	.quad	2876949357237608311
+	.quad	3808117099328934763
diff --git a/arch/x86/crypto/crc64-rocksoft-pcl_glue.c b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c
new file mode 100644
index 000000000000..996780aa3d93
--- /dev/null
+++ b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c
@@ -0,0 +1,117 @@
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc64.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
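+/*
+ * Implemented in crc64-rocksoft-pcl-asm.S: folds @buf 16 bytes at a time
+ * with PCLMULQDQ and returns the updated CRC.  Clobbers XMM registers,
+ * so callers must bracket it with kernel_fpu_begin()/kernel_fpu_end().
+ */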
+asmlinkage u64 crc_rocksoft_pcl(u64 init_crc, const u8 *buf, size_t len);
+
+struct chksum_desc_ctx {
+	u64 crc;
+};
+
+static int chksum_init(struct shash_desc *desc)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->crc = 0;
+
+	return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
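+	/*
+	 * The PCLMULQDQ path requires the FPU and needs at least one
+	 * full 16-byte block to be worthwhile; otherwise fall back to
+	 * the generic table-based implementation, e.g. for short
+	 * updates or when called from a context where the SIMD unit
+	 * is unavailable.
+	 */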
+	if (length >= 16 && crypto_simd_usable()) {
+		kernel_fpu_begin();
+		ctx->crc = crc_rocksoft_pcl(ctx->crc, data, length);
+		kernel_fpu_end();
+	} else {
+		ctx->crc = crc64_rocksoft_generic(ctx->crc, data, length);
+	}
+	return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	*(u64 *)out = ctx->crc;
+	return 0;
+}
+
+static int __chksum_finup(u64 crc, const u8 *data, unsigned int len, u8 *out)
+{
+	if (len >= 16 && crypto_simd_usable()) {
+		kernel_fpu_begin();
+		*(u64 *)out = crc_rocksoft_pcl(crc, data, len);
+		kernel_fpu_end();
+	} else {
+		*(u64 *)out = crc64_rocksoft_generic(crc, data, len);
+	}
+	return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	return __chksum_finup(ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+			 unsigned int length, u8 *out)
+{
+	return __chksum_finup(0, data, length, out);
+}
+
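+/*
+ * Registered under the same cra_name as the generic driver; the crypto
+ * API prefers the highest-priority implementation of a given cra_name,
+ * so this driver (priority 200) is selected over the generic table
+ * driver whenever the module is loaded.
+ */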
+static struct shash_alg alg = {
+	.digestsize	=	8,
+	.init		=	chksum_init,
+	.update		=	chksum_update,
+	.final		=	chksum_final,
+	.finup		=	chksum_finup,
+	.digest		=	chksum_digest,
+	.descsize	=	sizeof(struct chksum_desc_ctx),
+	.base		=	{
+		.cra_name		=	CRC64_ROCKSOFT_STRING,
+		.cra_driver_name	=	"crc64-rocksoft-pclmul",
+		.cra_priority		=	200,
+		.cra_blocksize		=	1,
+		.cra_module		=	THIS_MODULE,
+	}
+};
+
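+/*
+ * Only bind on CPUs advertising PCLMULQDQ; the device table also lets
+ * udev autoload the module on matching hardware.
+ */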
+static const struct x86_cpu_id crc64_rocksoft_cpu_id[] = {
+	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc64_rocksoft_cpu_id);
+
+static int __init crc64_rocksoft_x86_mod_init(void)
+{
+	if (!x86_match_cpu(crc64_rocksoft_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);
+}
+
+static void __exit crc64_rocksoft_x86_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crc64_rocksoft_x86_mod_init);
+module_exit(crc64_rocksoft_x86_mod_fini);
+
+MODULE_AUTHOR("Keith Busch <kbusch@kernel.org>");
+MODULE_DESCRIPTION("Rocksoft CRC64 calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crc64-rocksoft-pclmul");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e343147b9f8f..d8861138f117 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -744,6 +744,17 @@ config CRYPTO_CRC64_ROCKSOFT
 	  transform. This allows for faster crc64 transforms to be used
 	  if they are available.
 
+config CRYPTO_CRC64_ROCKSOFT_PCLMUL
+	tristate "Rocksoft model CRC64 PCLMULQDQ hardware acceleration"
+	depends on X86 && 64BIT && CRC64
+	select CRYPTO_HASH
+	help
+	  For x86_64 processors with SSE and PCLMULQDQ support, the
+	  Rocksoft model CRC64 computation can be hardware accelerated
+	  with the PCLMULQDQ instruction. This option creates the
+	  'crc64-rocksoft-pclmul' module, which computes the crc64
+	  checksum faster than the generic table-based implementation.
+
 config CRYPTO_VPMSUM_TESTER
 	tristate "Powerpc64 vpmsum hardware acceleration tester"
 	depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM
-- 
2.25.4

