* [RFC v2 PATCH 0/4] ARM[64]: kernel mode NEON in atomic contexts
@ 2013-10-09 18:50 Ard Biesheuvel
  2013-10-09 18:50 ` [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context Ard Biesheuvel
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-09 18:50 UTC (permalink / raw)
  To: linux-arm-kernel

This is another stab at enabling the use of NEON in atomic context. As
suggested by Nico, the approach is simply to stack the registers you are
about to use, and unstack them when you're done.

Patches #1 and #2 implement this for ARM and ARM64, respectively; #3 and #4
are the examples I posted earlier, adapted to use the new method.
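
For reference, using the new interface looks like this (it is exactly what
the core AES cipher in patch #3 does, with a stack area sized for the two
q registers it touches):

	DEFINE_NEON_STACK_REGS(regs, 2);

	kernel_neon_begin_atomic(regs);
	/* NEON code using at most q0/q1 (v0/v1 on arm64) */
	kernel_neon_end_atomic(regs);

Outside interrupt context this just falls through to kernel_neon_begin()
and kernel_neon_end(); in interrupt context only the declared registers
are stacked and unstacked.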

Ard Biesheuvel (4):
  ARM: add support for kernel mode NEON in atomic context
  ARM64: add support for kernel mode NEON in atomic context
  ARM64: add Crypto Extensions based synchronous core AES cipher
  ARM64: add Crypto Extensions based synchronous AES in CCM mode

 arch/arm/include/asm/fpstate.h        |  15 +-
 arch/arm/include/asm/neon.h           |  34 +++
 arch/arm/vfp/vfphw.S                  |  46 ++++
 arch/arm/vfp/vfpmodule.c              |   3 +
 arch/arm64/crypto/Makefile            |  12 +
 arch/arm64/crypto/aes-sync.c          | 453 ++++++++++++++++++++++++++++++++++
 arch/arm64/crypto/aesce-ccm.S         | 154 ++++++++++++
 arch/arm64/include/asm/fpsimd.h       |  16 ++
 arch/arm64/include/asm/fpsimdmacros.h |  37 +++
 arch/arm64/include/asm/neon.h         |  31 +++
 arch/arm64/kernel/entry-fpsimd.S      |  24 ++
 arch/arm64/kernel/fpsimd.c            |   3 +
 12 files changed, 827 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/crypto/Makefile
 create mode 100644 arch/arm64/crypto/aes-sync.c
 create mode 100644 arch/arm64/crypto/aesce-ccm.S

-- 
1.8.1.2


* [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context
  2013-10-09 18:50 [RFC v2 PATCH 0/4] ARM[64]: kernel mode NEON in atomic contexts Ard Biesheuvel
@ 2013-10-09 18:50 ` Ard Biesheuvel
  2013-10-09 19:24   ` Nicolas Pitre
  2013-10-09 18:50 ` [RFC v2 PATCH 2/4] ARM64: " Ard Biesheuvel
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-09 18:50 UTC (permalink / raw)
  To: linux-arm-kernel

Some applications, such as WPA CCMP encryption, do substantial
amounts of work in non-process context. In order to support
accelerated NEON implementations under these circumstances, we
need a way to preserve the NEON context that may
(a) belong to a completely unrelated userland process (if the
    NEON unit is turned off at the moment);
(b) belong to current userland;
(c) belong to current kernel mode in process context.

The best way to deal with this is to just stack whatever registers
we are going to use, and unstack them when we are done.

This patch adds kernel_neon_begin_atomic() and kernel_neon_end_atomic(),
which may be called from any context. In the !in_interrupt() case, they
just call their non-_atomic counterparts. In atomic context, they stack
and unstack, respectively, the number of NEON registers declared when
setting up the stack area using DEFINE_NEON_STACK_REGS().

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/include/asm/fpstate.h | 15 +++++++++++++-
 arch/arm/include/asm/neon.h    | 34 +++++++++++++++++++++++++++++++
 arch/arm/vfp/vfphw.S           | 46 ++++++++++++++++++++++++++++++++++++++++++
 arch/arm/vfp/vfpmodule.c       |  3 +++
 4 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/fpstate.h b/arch/arm/include/asm/fpstate.h
index 3ad4c10..7a6e100 100644
--- a/arch/arm/include/asm/fpstate.h
+++ b/arch/arm/include/asm/fpstate.h
@@ -19,7 +19,7 @@
  *  - FPEXC, FPSCR, FPINST and FPINST2.
  *  - 16 or 32 double precision data registers
  *  - an implementation-dependent word of state for FLDMX/FSTMX (pre-ARMv6)
- * 
+ *
  *  FPEXC will always be non-zero once the VFP has been used in this process.
  */
 
@@ -52,6 +52,19 @@ union vfp_state {
 extern void vfp_flush_thread(union vfp_state *);
 extern void vfp_release_thread(union vfp_state *);
 
+/*
+ * Variable sized struct for stacking the bottom 'n' NEON registers.
+ */
+struct vfp_partial_state {
+	const __u32	num_regs;
+	__u32		fpexc;
+	__u32		fpscr;
+	__u8		qregs[] __aligned(16);
+} __aligned(16);
+
+extern void vfp_load_partial_state(struct vfp_partial_state *);
+extern void vfp_save_partial_state(struct vfp_partial_state *);
+
 #define FP_HARD_SIZE 35
 
 struct fp_hard_struct {
diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h
index 8f730fe..1efd9fc 100644
--- a/arch/arm/include/asm/neon.h
+++ b/arch/arm/include/asm/neon.h
@@ -8,10 +8,21 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <asm/fpstate.h>
 #include <asm/hwcap.h>
 
 #define cpu_has_neon()		(!!(elf_hwcap & HWCAP_NEON))
 
+#define DEFINE_NEON_STACK_REGS(v, num)					\
+	struct {							\
+		struct vfp_partial_state regs;				\
+		u8 qregs[(num) > 16 ? 256 : 16 * (((num) + 1) & ~1U)];	\
+	} v = { .regs.num_regs = sizeof(v.qregs)/16 }
+
+#define DEFINE_NEON_STACK_REGS_ALL(name)	DEFINE_NEON_STACK_REGS(name,16)
+
 #ifdef __ARM_NEON__
 
 /*
@@ -30,7 +41,30 @@
 #define kernel_neon_begin() \
 	BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
 
+#define kernel_neon_begin_atomic(a) \
+	BUILD_BUG_ON_MSG(1, "kernel_neon_begin_atomic() called from NEON code")
+
 #else
 void kernel_neon_begin(void);
+#define kernel_neon_begin_atomic(name) __kernel_neon_begin_atomic(&(name).regs)
 #endif
+
+#define kernel_neon_end_atomic(name) __kernel_neon_end_atomic(&(name).regs)
+
 void kernel_neon_end(void);
+
+static inline void __kernel_neon_begin_atomic(struct vfp_partial_state *regs)
+{
+	if (!in_interrupt())
+		kernel_neon_begin();
+	else
+		vfp_save_partial_state(regs);
+}
+
+static inline void __kernel_neon_end_atomic(struct vfp_partial_state *regs)
+{
+	if (!in_interrupt())
+		kernel_neon_end();
+	else
+		vfp_load_partial_state(regs);
+}
diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S
index 3e5d311..747e782 100644
--- a/arch/arm/vfp/vfphw.S
+++ b/arch/arm/vfp/vfphw.S
@@ -322,3 +322,49 @@ ENTRY(vfp_put_double)
 	.endr
 #endif
 ENDPROC(vfp_put_double)
+
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+	.fpu	neon
+ENTRY(vfp_save_partial_state)
+	VFPFMRX	r2, FPEXC			@ load the control registers
+	VFPFMRX	r3, FPSCR
+	strd	r2, r3, [r0, #4]		@ and save to memory
+	tst	r2, #FPEXC_EN
+	bne	0f
+	orr	r2, r2, #FPEXC_EN		@ enable VFP if it was disabled
+	VFPFMXR	FPEXC, r2
+0:	ldr	r1, [r0]			@ load # of regs to preserve
+	rsbs	r1, r1, #16
+	add	r2, r0, #16
+	beq	1f
+	adr	r3, 1f
+	add	r3, r3, r1, lsl #1
+THUMB(	orr	r3, r3, #1)
+	bx	r3
+1:	.irp	qq,q14-q15,q12-q13,q10-q11,q8-q9,q6-q7,q4-q5,q2-q3,q0-q1
+	vst1.8	{\qq}, [r2,:128]!
+	.endr
+	bx	lr
+ENDPROC(vfp_save_partial_state)
+
+ENTRY(vfp_load_partial_state)
+	ldr	r2, [r0]			@ load # of regs to preserve
+	rsbs	r1, r2, #16
+	add	r2, r0, #16
+	beq	0f
+	adr	r3, 0f
+	add	r3, r3, r1, lsl #1
+THUMB(	orr	r3, r3, #1)
+	bx	r3
+0:	.irp	qq,q14-q15,q12-q13,q10-q11,q8-q9,q6-q7,q4-q5,q2-q3,q0-q1
+	vld1.8	{\qq}, [r2,:128]!
+	.endr
+	ldrd	r2, r3, [r0, #4]
+	VFPFMXR	FPSCR, r3
+	VFPFMXR	FPEXC, r2
+	bx	lr
+ENDPROC(vfp_load_partial_state)
+
+#endif
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 52b8f40..3dea5ba 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -713,6 +713,9 @@ void kernel_neon_end(void)
 }
 EXPORT_SYMBOL(kernel_neon_end);
 
+EXPORT_SYMBOL(vfp_save_partial_state);
+EXPORT_SYMBOL(vfp_load_partial_state);
+
 #endif /* CONFIG_KERNEL_MODE_NEON */
 
 /*
-- 
1.8.1.2


* [RFC v2 PATCH 2/4] ARM64: add support for kernel mode NEON in atomic context
  2013-10-09 18:50 [RFC v2 PATCH 0/4] ARM[64]: kernel mode NEON in atomic contexts Ard Biesheuvel
  2013-10-09 18:50 ` [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context Ard Biesheuvel
@ 2013-10-09 18:50 ` Ard Biesheuvel
  2013-10-11 17:14   ` Catalin Marinas
  2013-10-09 18:50 ` [RFC v2 PATCH 3/4] ARM64: add Crypto Extensions based synchronous core AES cipher Ard Biesheuvel
  2013-10-09 18:50 ` [RFC v2 PATCH 4/4] ARM64: add Crypto Extensions based synchronous AES in CCM mode Ard Biesheuvel
  3 siblings, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-09 18:50 UTC (permalink / raw)
  To: linux-arm-kernel

This patch adds kernel_neon_begin_atomic() and kernel_neon_end_atomic(),
which may be called from any context. In the !in_interrupt() case, they
just call their non-_atomic counterparts. In atomic context, they stack
and unstack, respectively, the number of NEON registers declared when
setting up the stack area using DEFINE_NEON_STACK_REGS().

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/fpsimd.h       | 16 +++++++++++++++
 arch/arm64/include/asm/fpsimdmacros.h | 37 +++++++++++++++++++++++++++++++++++
 arch/arm64/include/asm/neon.h         | 31 +++++++++++++++++++++++++++++
 arch/arm64/kernel/entry-fpsimd.S      | 24 +++++++++++++++++++++++
 arch/arm64/kernel/fpsimd.c            |  3 +++
 5 files changed, 111 insertions(+)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index c43b4ac..3a741b0 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -39,6 +39,19 @@ struct fpsimd_state {
 	};
 };
 
+/*
+ * Variable sized struct for stacking the bottom n FP/SIMD registers.
+ * Mainly intended for kernel use of v8 Crypto Extensions which only
+ * needs a few registers and may need to execute in atomic context.
+ */
+struct fpsimd_partial_state {
+	const u32	num_regs;
+	u32		fpsr;
+	u32		fpcr;
+	__uint128_t	vregs[] __aligned(16);
+} __aligned(16);
+
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /* Masks for extracting the FPSR and FPCR from the FPSCR */
 #define VFP_FPSCR_STAT_MASK	0xf800009f
@@ -55,6 +68,9 @@ struct task_struct;
 extern void fpsimd_save_state(struct fpsimd_state *state);
 extern void fpsimd_load_state(struct fpsimd_state *state);
 
+extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state);
+extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);
+
 extern void fpsimd_thread_switch(struct task_struct *next);
 extern void fpsimd_flush_thread(void);
 
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index bbec599..1b47587 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -62,3 +62,40 @@
 	ldr	w\tmpnr, [\state, #16 * 2 + 4]
 	msr	fpcr, x\tmpnr
 .endm
+
+.altmacro
+.macro	q2op, op, q1, q2, state
+	\op	q\q1, q\q2, [\state, #-(16 * \q1) - 16]
+.endm
+
+.macro fpsimd_save_partial state, tmpnr1, tmpnr2
+	mrs	x\tmpnr1, fpsr
+	mrs	x\tmpnr2, fpcr
+	stp	w\tmpnr1, w\tmpnr2, [\state, #4]
+	adr	x\tmpnr1, 0f
+	ldr	w\tmpnr2, [\state]
+	add	\state, \state, x\tmpnr2, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+		qb = \qa + 1
+	q2op	stp, \qa, %qb, \state
+	.endr
+0:
+.endm
+
+.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
+	ldp	w\tmpnr1, w\tmpnr2, [\state, #4]
+	msr	fpsr, x\tmpnr1
+	msr	fpcr, x\tmpnr2
+	adr	x\tmpnr1, 0f
+	ldr	w\tmpnr2, [\state]
+	add	\state, \state, x\tmpnr2, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+		qb = \qa + 1
+	q2op	ldp, \qa, %qb, \state
+	.endr
+0:
+.endm
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h
index b0cc58a9..1c8600a 100644
--- a/arch/arm64/include/asm/neon.h
+++ b/arch/arm64/include/asm/neon.h
@@ -8,7 +8,38 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <asm/fpsimd.h>
+
 #define cpu_has_neon()		(1)
 
+#define DEFINE_NEON_STACK_REGS(a, num)					\
+	struct {							\
+		struct fpsimd_partial_state regs;			\
+		__uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];	\
+	} a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
+
+#define DEFINE_NEON_STACK_REGS_ALL(name)	DEFINE_NEON_STACK_REGS(name, 32)
+
 void kernel_neon_begin(void);
 void kernel_neon_end(void);
+
+static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
+{
+	if (!in_interrupt())
+		kernel_neon_begin();
+	else
+		fpsimd_save_partial_state(regs);
+}
+
+static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
+{
+	if (!in_interrupt())
+		kernel_neon_end();
+	else
+		fpsimd_load_partial_state(regs);
+}
+
+#define kernel_neon_begin_atomic(a)	__kernel_neon_begin_atomic(&(a).regs)
+#define kernel_neon_end_atomic(a)	__kernel_neon_end_atomic(&(a).regs)
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 6a27cd6..82cf648 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state)
 	fpsimd_restore x0, 8
 	ret
 ENDPROC(fpsimd_load_state)
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+/*
+ * Save the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_save_partial_state)
+	fpsimd_save_partial x0, 8, 9
+	ret
+ENDPROC(fpsimd_save_partial_state)
+
+/*
+ * Load the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_load_partial_state)
+	fpsimd_restore_partial x0, 8, 9
+	ret
+ENDPROC(fpsimd_load_partial_state)
+
+#endif
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 1f2e4d5..69c7962 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -109,6 +109,9 @@ void kernel_neon_end(void)
 }
 EXPORT_SYMBOL(kernel_neon_end);
 
+EXPORT_SYMBOL(fpsimd_load_partial_state);
+EXPORT_SYMBOL(fpsimd_save_partial_state);
+
 #endif /* CONFIG_KERNEL_MODE_NEON */
 
 /*
-- 
1.8.1.2


* [RFC v2 PATCH 3/4] ARM64: add Crypto Extensions based synchronous core AES cipher
  2013-10-09 18:50 [RFC v2 PATCH 0/4] ARM[64]: kernel mode NEON in atomic contexts Ard Biesheuvel
  2013-10-09 18:50 ` [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context Ard Biesheuvel
  2013-10-09 18:50 ` [RFC v2 PATCH 2/4] ARM64: " Ard Biesheuvel
@ 2013-10-09 18:50 ` Ard Biesheuvel
  2013-10-09 18:50 ` [RFC v2 PATCH 4/4] ARM64: add Crypto Extensions based synchronous AES in CCM mode Ard Biesheuvel
  3 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-09 18:50 UTC (permalink / raw)
  To: linux-arm-kernel

This implements the core AES cipher using the Crypto Extensions,
using only NEON registers q0 and q1.
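
The number of rounds follows from the key length in the usual way:
rounds = 6 + key_length / 4 (key_length in bytes), i.e. 10, 12 or 14
rounds for 128-, 192- and 256-bit keys.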

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Makefile   |  12 +++++
 arch/arm64/crypto/aes-sync.c | 106 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 arch/arm64/crypto/Makefile
 create mode 100644 arch/arm64/crypto/aes-sync.c

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
new file mode 100644
index 0000000..7c636e9
--- /dev/null
+++ b/arch/arm64/crypto/Makefile
@@ -0,0 +1,12 @@
+#
+# linux/arch/arm64/crypto/Makefile
+#
+# Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+
+aesce-sync-y	:= aes-sync.o
+obj-m		+= aesce-sync.o
diff --git a/arch/arm64/crypto/aes-sync.c b/arch/arm64/crypto/aes-sync.c
new file mode 100644
index 0000000..d047d49
--- /dev/null
+++ b/arch/arm64/crypto/aes-sync.c
@@ -0,0 +1,106 @@
+/*
+ * linux/arch/arm64/crypto/aes-sync.c
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+	DEFINE_NEON_STACK_REGS(regs, 2);
+
+	kernel_neon_begin_atomic(regs);
+
+	__asm__("	.arch		armv8-a+crypto			\n\t"
+		"	ld1		{v0.16b}, [%[in]]		\n\t"
+		"	ld1		{v1.16b}, [%[key]], #16		\n\t"
+		"0:	aese		v0.16b, v1.16b			\n\t"
+		"	subs		%[rounds], %[rounds], #1	\n\t"
+		"	ld1		{v1.16b}, [%[key]], #16		\n\t"
+		"	beq		1f				\n\t"
+		"	aesmc		v0.16b, v0.16b			\n\t"
+		"	b		0b				\n\t"
+		"1:	eor		v0.16b, v0.16b, v1.16b		\n\t"
+		"	st1		{v0.16b}, [%[out]]		\n\t"
+	: :
+		[out]		"r"(dst),
+		[in]		"r"(src),
+		[rounds]	"r"(rounds),
+		[key]		"r"(ctx->key_enc));
+
+	kernel_neon_end_atomic(regs);
+}
+
+static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+	DEFINE_NEON_STACK_REGS(regs, 2);
+
+	kernel_neon_begin_atomic(regs);
+
+	__asm__("	.arch		armv8-a+crypto			\n\t"
+		"	ld1		{v0.16b}, [%[in]]		\n\t"
+		"	ld1		{v1.16b}, [%[key]], #16		\n\t"
+		"0:	aesd		v0.16b, v1.16b			\n\t"
+		"	ld1		{v1.16b}, [%[key]], #16		\n\t"
+		"	subs		%[rounds], %[rounds], #1	\n\t"
+		"	beq		1f				\n\t"
+		"	aesimc		v0.16b, v0.16b			\n\t"
+		"	b		0b				\n\t"
+		"1:	eor		v0.16b, v0.16b, v1.16b		\n\t"
+		"	st1		{v0.16b}, [%[out]]		\n\t"
+	: :
+		[out]		"r"(dst),
+		[in]		"r"(src),
+		[rounds]	"r"(rounds),
+		[key]		"r"(ctx->key_dec));
+
+	kernel_neon_end_atomic(regs);
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name		= "aes",
+	.cra_driver_name	= "aes-ce",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_cipher = {
+		.cia_min_keysize	= AES_MIN_KEY_SIZE,
+		.cia_max_keysize	= AES_MAX_KEY_SIZE,
+		.cia_setkey		= crypto_aes_set_key,
+		.cia_encrypt		= aes_cipher_encrypt,
+		.cia_decrypt		= aes_cipher_decrypt
+	}
+};
+
+static int __init aes_mod_init(void)
+{
+	if (0) // TODO check for crypto extensions
+		return -ENODEV;
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_mod_exit(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_mod_init);
+module_exit(aes_mod_exit);
+
+MODULE_DESCRIPTION("Synchronous AES using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL");
-- 
1.8.1.2


* [RFC v2 PATCH 4/4] ARM64: add Crypto Extensions based synchronous AES in CCM mode
  2013-10-09 18:50 [RFC v2 PATCH 0/4] ARM[64]: kernel mode NEON in atomic contexts Ard Biesheuvel
                   ` (2 preceding siblings ...)
  2013-10-09 18:50 ` [RFC v2 PATCH 3/4] ARM64: add Crypto Extensions based synchronous core AES cipher Ard Biesheuvel
@ 2013-10-09 18:50 ` Ard Biesheuvel
  3 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-09 18:50 UTC (permalink / raw)
  To: linux-arm-kernel

This implements the CCM AEAD chaining mode for AES using Crypto
Extensions instructions.
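
The flow follows the usual CCM construction, split over the helpers
added below:

	ccm_init_mac()            /* build the B_0 block from IV, L and authsize */
	ce_aes_ccm_auth_data()    /* MAC the length-prefixed assoc data */
	ce_aes_ccm_{en,de}crypt() /* CTR en/decrypt payload while updating the MAC */
	ce_aes_ccm_final()        /* encrypt the MAC with the initial counter block */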

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Makefile    |   2 +-
 arch/arm64/crypto/aes-sync.c  | 355 +++++++++++++++++++++++++++++++++++++++++-
 arch/arm64/crypto/aesce-ccm.S | 154 ++++++++++++++++++
 3 files changed, 506 insertions(+), 5 deletions(-)
 create mode 100644 arch/arm64/crypto/aesce-ccm.S

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 7c636e9..5708e6f 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -8,5 +8,5 @@
 # published by the Free Software Foundation.
 #
 
-aesce-sync-y	:= aes-sync.o
+aesce-sync-y	:= aes-sync.o aesce-ccm.o
 obj-m		+= aesce-sync.o
diff --git a/arch/arm64/crypto/aes-sync.c b/arch/arm64/crypto/aes-sync.c
index d047d49..0b483af 100644
--- a/arch/arm64/crypto/aes-sync.c
+++ b/arch/arm64/crypto/aes-sync.c
@@ -9,7 +9,10 @@
  */
 
 #include <asm/neon.h>
+#include <asm/unaligned.h>
 #include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/scatterwalk.h>
 #include <linux/crypto.h>
 #include <linux/module.h>
 
@@ -69,7 +72,313 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	kernel_neon_end_atomic(regs);
 }
 
-static struct crypto_alg aes_alg = {
+struct crypto_ccm_aes_ctx {
+	struct crypto_aes_ctx	*key;
+	struct crypto_blkcipher	*blk_tfm;
+};
+
+asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], long abytes,
+				     u32 const rk[], int rounds);
+
+asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], long cbytes,
+				   u32 const rk[], int rounds, u8 mac[],
+				   u8 ctr[]);
+
+asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], long cbytes,
+				   u32 const rk[], int rounds, u8 mac[],
+				   u8 ctr[]);
+
+asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
+				 long rounds);
+
+static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
+		      unsigned int key_len)
+{
+	struct crypto_ccm_aes_ctx *ctx = crypto_aead_ctx(tfm);
+	int ret;
+
+	ret = crypto_aes_expand_key(ctx->key, in_key, key_len);
+	if (!ret)
+		return 0;
+
+	tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+	return -EINVAL;
+}
+
+static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+	if ((authsize & 1) || authsize < 4)
+		return -EINVAL;
+	return 0;
+}
+
+static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	__be32 *n = (__be32 *)(&maciv[AES_BLOCK_SIZE - 8]);
+	u32 l = req->iv[0] + 1;
+
+	/* verify that CCM dimension 'L' is set correctly in the IV */
+	if (l < 2 || l > 8)
+		return -EINVAL;
+
+	/* verify that msglen can in fact be represented in L bytes */
+	if (msglen >> (8 * l))
+		return -EOVERFLOW;
+
+	/*
+	 * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi
+	 * uses a u32 type to represent msglen so the top 4 bytes are always 0.
+	 */
+	n[0] = 0;
+	n[1] = cpu_to_be32(msglen);
+
+	memcpy(maciv, req->iv, AES_BLOCK_SIZE - l);
+
+	maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2;
+	if (req->assoclen)
+		maciv[0] |= 0x40;
+
+	memset(&req->iv[AES_BLOCK_SIZE - l], 0, l);
+	return 0;
+}
+
+static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_ccm_aes_ctx *ctx = crypto_aead_ctx(aead);
+	struct __packed { __be16 l; __be32 h; } ltag;
+	int rounds = 6 + ctx->key->key_length / 4;
+	struct scatter_walk walk;
+	u32 len = req->assoclen;
+	u32 macp;
+
+	/* prepend the AAD with a length tag */
+	if (len < 0xff00) {
+		ltag.l = cpu_to_be16(len);
+		macp = 2;
+	} else  {
+		ltag.l = cpu_to_be16(0xfffe);
+		put_unaligned_be32(len, &ltag.h);
+		macp = 6;
+	}
+
+	ce_aes_ccm_auth_data(mac, (u8 *)&ltag, macp, ctx->key->key_enc, rounds);
+	scatterwalk_start(&walk, req->assoc);
+
+	do {
+		u32 n = scatterwalk_clamp(&walk, len);
+		u32 m;
+		u8 *p;
+
+		if (!n) {
+			scatterwalk_start(&walk, sg_next(walk.sg));
+			n = scatterwalk_clamp(&walk, len);
+		}
+		p = scatterwalk_map(&walk);
+		m = min(n, AES_BLOCK_SIZE - macp);
+		crypto_xor(&mac[macp], p, m);
+
+		len -= n;
+		n -= m;
+		macp += m;
+		if (macp == AES_BLOCK_SIZE && (n || len)) {
+			ce_aes_ccm_auth_data(mac, &p[m], n, ctx->key->key_enc,
+					     rounds);
+			macp = n % AES_BLOCK_SIZE;
+		}
+
+		scatterwalk_unmap(p);
+		scatterwalk_advance(&walk, n + m);
+		scatterwalk_done(&walk, 0, len);
+	} while (len);
+}
+
+struct ccm_inner_desc_info {
+	u8	ctriv[AES_BLOCK_SIZE];
+	u8	mac[AES_BLOCK_SIZE];
+} __aligned(8);
+
+static int ccm_inner_encrypt(struct blkcipher_desc *desc,
+			     struct scatterlist *dst, struct scatterlist *src,
+			     unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct ccm_inner_desc_info *descinfo = desc->info;
+	int rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+
+	while (walk.nbytes) {
+		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+
+		if (walk.nbytes == nbytes)
+			tail = 0;
+
+		ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				   walk.nbytes - tail, ctx->key_enc, rounds,
+				   descinfo->mac, descinfo->ctriv);
+
+		nbytes -= walk.nbytes - tail;
+		err = blkcipher_walk_done(desc, &walk, tail);
+	}
+	return err;
+}
+
+static int ccm_inner_decrypt(struct blkcipher_desc *desc,
+			     struct scatterlist *dst, struct scatterlist *src,
+			     unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct ccm_inner_desc_info *descinfo = desc->info;
+	int rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+
+	while (walk.nbytes) {
+		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+
+		if (walk.nbytes == nbytes)
+			tail = 0;
+
+		ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				   walk.nbytes - tail, ctx->key_enc, rounds,
+				   descinfo->mac, descinfo->ctriv);
+
+		nbytes -= walk.nbytes - tail;
+		err = blkcipher_walk_done(desc, &walk, tail);
+	}
+	return err;
+}
+
+static int ccm_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_ccm_aes_ctx *ctx = crypto_aead_ctx(aead);
+	int rounds = 6 + ctx->key->key_length / 4;
+	struct ccm_inner_desc_info descinfo;
+	DEFINE_NEON_STACK_REGS(regs, 4);
+	int err;
+
+	struct blkcipher_desc desc = {
+		.tfm	= ctx->blk_tfm,
+		.info	= &descinfo,
+		.flags = 0,
+	};
+
+	err = ccm_init_mac(req, descinfo.mac, req->cryptlen);
+	if (err)
+		return err;
+
+	kernel_neon_begin_atomic(regs);
+
+	if (req->assoclen)
+		ccm_calculate_auth_mac(req, descinfo.mac);
+
+	memcpy(descinfo.ctriv, req->iv, AES_BLOCK_SIZE);
+
+	/* call inner blkcipher to process the payload */
+	err = ccm_inner_encrypt(&desc, req->dst, req->src, req->cryptlen);
+	if (!err)
+		ce_aes_ccm_final(descinfo.mac, req->iv, ctx->key->key_enc,
+				 rounds);
+
+	kernel_neon_end_atomic(regs);
+
+	if (err)
+		return err;
+
+	/* copy authtag to end of dst */
+	scatterwalk_map_and_copy(descinfo.mac, req->dst, req->cryptlen,
+				 crypto_aead_authsize(aead), 1);
+
+	return 0;
+}
+
+static int ccm_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_ccm_aes_ctx *ctx = crypto_aead_ctx(aead);
+	int rounds = 6 + ctx->key->key_length / 4;
+	struct ccm_inner_desc_info descinfo;
+	DEFINE_NEON_STACK_REGS(regs, 4);
+	u8 atag[AES_BLOCK_SIZE];
+	u32 len;
+	int err;
+
+	struct blkcipher_desc desc = {
+		.tfm	= ctx->blk_tfm,
+		.info	= &descinfo,
+		.flags = 0,
+	};
+
+	len = req->cryptlen - crypto_aead_authsize(aead);
+	err = ccm_init_mac(req, descinfo.mac, len);
+	if (err)
+		return err;
+
+	if (req->assoclen)
+		ccm_calculate_auth_mac(req, descinfo.mac);
+
+	memcpy(descinfo.ctriv, req->iv, AES_BLOCK_SIZE);
+
+	kernel_neon_begin_atomic(regs);
+
+	/* call inner blkcipher to process the payload */
+	err = ccm_inner_decrypt(&desc, req->dst, req->src, len);
+	if (!err)
+		ce_aes_ccm_final(descinfo.mac, req->iv, ctx->key->key_enc,
+				 rounds);
+
+	kernel_neon_end_atomic(regs);
+
+	if (err)
+		return err;
+
+	/* compare calculated auth tag with the stored one */
+	scatterwalk_map_and_copy(atag, req->src, len,
+				 crypto_aead_authsize(aead), 0);
+
+	if (memcmp(descinfo.mac, atag, crypto_aead_authsize(aead)))
+		return -EBADMSG;
+	return 0;
+}
+
+static int ccm_init(struct crypto_tfm *tfm)
+{
+	struct crypto_ccm_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_blkcipher *blk_tfm;
+
+	blk_tfm = crypto_alloc_blkcipher("__driver-ccm-aesce-inner", 0, 0);
+	if (IS_ERR(blk_tfm))
+		return PTR_ERR(blk_tfm);
+
+	/* did we get the right one? (sanity check) */
+	if (crypto_blkcipher_crt(blk_tfm)->encrypt != ccm_inner_encrypt) {
+		crypto_free_blkcipher(blk_tfm);
+		return -EINVAL;
+	}
+
+	ctx->blk_tfm = blk_tfm;
+	ctx->key = crypto_blkcipher_ctx(blk_tfm);
+
+	return 0;
+}
+
+static void ccm_exit(struct crypto_tfm *tfm)
+{
+	struct crypto_ccm_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_blkcipher(ctx->blk_tfm);
+}
+
+static struct crypto_alg aes_algs[] = { {
 	.cra_name		= "aes",
 	.cra_driver_name	= "aes-ce",
 	.cra_priority		= 300,
@@ -84,18 +393,56 @@ static struct crypto_alg aes_alg = {
 		.cia_encrypt		= aes_cipher_encrypt,
 		.cia_decrypt		= aes_cipher_decrypt
 	}
-};
+}, {
+	.cra_name		= "__ccm-aesce-inner",
+	.cra_driver_name	= "__driver-ccm-aesce-inner",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= sizeof(struct ccm_inner_desc_info),
+		.setkey		= crypto_aes_set_key,
+		.encrypt	= ccm_inner_encrypt,
+		.decrypt	= ccm_inner_decrypt,
+	},
+}, {
+	.cra_name		= "ccm(aes)",
+	.cra_driver_name	= "ccm-aes-ce",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_ccm_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_aead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ccm_init,
+	.cra_exit		= ccm_exit,
+	.cra_aead = {
+		.ivsize		= AES_BLOCK_SIZE,
+		.maxauthsize	= AES_BLOCK_SIZE,
+		.setkey		= ccm_setkey,
+		.setauthsize	= ccm_setauthsize,
+		.encrypt	= ccm_encrypt,
+		.decrypt	= ccm_decrypt,
+	}
+} };
 
 static int __init aes_mod_init(void)
 {
 	if (0) // TODO check for crypto extensions
 		return -ENODEV;
-	return crypto_register_alg(&aes_alg);
+	return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
 static void __exit aes_mod_exit(void)
 {
-	crypto_unregister_alg(&aes_alg);
+	crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
 module_init(aes_mod_init);
diff --git a/arch/arm64/crypto/aesce-ccm.S b/arch/arm64/crypto/aesce-ccm.S
new file mode 100644
index 0000000..74a025b
--- /dev/null
+++ b/arch/arm64/crypto/aesce-ccm.S
@@ -0,0 +1,154 @@
+/*
+ * linux/arch/arm64/crypto/aesce-ccm.S - AES-CCM transform for ARMv8 with
+ *                                       Crypto Extensions
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.arch	armv8-a+crypto
+	.align	4
+
+	/*
+	 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], long abytes,
+	 *			     u8 const rk[], int rounds);
+	 */
+ENTRY(ce_aes_ccm_auth_data)
+	ld1	{v0.16b}, [x0]			/* load mac */
+	ld1	{v1.16b}, [x3]			/* load first round key */
+0:	mov	x7, x4
+	add	x6, x3, #16
+1:	aese	v0.16b, v1.16b
+	ld1	{v1.16b}, [x6], #16		/* load next round key */
+	subs	x7, x7, #1
+	beq	2f
+	aesmc	v0.16b, v0.16b
+	b	1b
+2:	eor	v0.16b, v0.16b, v1.16b		/* final round */
+	subs	x2, x2, #16			/* last data? */
+	bmi	3f
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
+	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
+	beq	3f
+	ld1	{v1.16b}, [x3]			/* reload first round key */
+	b	0b
+3:	st1	{v0.16b}, [x0]			/* store mac */
+	beq	5f
+	adds	x2, x2, #16
+	beq	5f
+4:	ldrb	w7, [x1], #1
+	umov	w6, v0.b[0]
+	eor	w6, w6, w7
+	strb	w6, [x0], #1
+	subs	x2, x2, #1
+	beq	5f
+	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
+	b	4b
+5:	ret
+ENDPROC(ce_aes_ccm_auth_data)
+
+	.macro	aes_ccm_do_crypt,enc
+	ld1	{v0.16b}, [x5]			/* load mac */
+	ld1	{v2.16b}, [x6]			/* load ctr */
+	ld1	{v3.16b}, [x3]			/* load first round key */
+	umov	x8, v2.d[1]
+	rev	x8, x8				/* keep swabbed ctr in reg */
+0:	add	x8, x8, #1
+	mov	x7, x4
+	rev	x9, x8
+	add	x10, x3, #16
+	ins	v2.d[1], x9			/* no carry */
+1:	aese	v0.16b, v3.16b
+	aese	v2.16b, v3.16b
+	ld1	{v3.16b}, [x10], #16		/* load next round key */
+	subs	x7, x7, #1
+	beq	2f
+	aesmc	v0.16b, v0.16b
+	aesmc	v2.16b, v2.16b
+	b	1b
+2:	eor	v2.16b, v2.16b, v3.16b		/* final round enc */
+	eor	v0.16b, v0.16b, v3.16b		/* final round mac */
+	subs	x2, x2, #16
+	bmi	3f
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
+	.if	\enc == 1
+	eor	v0.16b, v0.16b, v1.16b		/* xor mac with plaintext */
+	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
+	.else
+	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
+	eor	v0.16b, v0.16b, v1.16b		/* xor mac with plaintext */
+	.endif
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	beq	5f
+	ld1	{v2.8b}, [x6]			/* reload ctriv */
+	ld1	{v3.16b}, [x3]			/* reload first round key */
+	b	0b
+3:	st1	{v0.16b}, [x5]			/* store mac */
+	add	x2, x2, #16			/* process partial tail block */
+4:	ldrb	w9, [x1], #1			/* get 1 byte of input */
+	umov	w6, v2.b[0]			/* get top crypted ctr byte */
+	umov	w7, v0.b[0]			/* get top mac byte */
+	.if	\enc == 1
+	eor	w7, w7, w9
+	eor	w9, w9, w6
+	.else
+	eor	w9, w9, w6
+	eor	w7, w7, w9
+	.endif
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	x2, x2, #1
+	beq	6f
+	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
+	ext	v2.16b, v2.16b, v2.16b, #1	/* shift out ctr byte */
+	b	4b
+5:	rev	x8, x8
+	st1	{v0.16b}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+6:	ret
+	.endm
+
+	/*
+	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], long cbytes,
+	 * 			   u8 const rk[], int rounds, u8 mac[],
+	 * 			   u8 ctr[]);
+	 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], long cbytes,
+	 * 			   u8 const rk[], int rounds, u8 mac[],
+	 * 			   u8 ctr[]);
+	 */
+ENTRY(ce_aes_ccm_encrypt)
+	aes_ccm_do_crypt	1
+ENDPROC(ce_aes_ccm_encrypt)
+
+ENTRY(ce_aes_ccm_decrypt)
+	aes_ccm_do_crypt	0
+ENDPROC(ce_aes_ccm_decrypt)
+
+	/*
+	 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
+	 * 			 long rounds);
+	 */
+ENTRY(ce_aes_ccm_final)
+	ld1	{v0.16b}, [x0]			/* load mac */
+	ld1	{v3.16b}, [x2], #16		/* load first round key */
+	ld1	{v2.16b}, [x1]			/* load 1st ctriv */
+0:	aese	v0.16b, v3.16b
+	aese	v2.16b, v3.16b
+	ld1	{v3.16b}, [x2], #16		/* load next round key */
+	subs	x3, x3, #1
+	beq	1f
+	aesmc	v0.16b, v0.16b
+	aesmc	v2.16b, v2.16b
+	b	0b
+1:	eor	v2.16b, v2.16b, v3.16b		/* final round enc */
+	eor	v0.16b, v0.16b, v3.16b		/* final round mac */
+	eor	v0.16b, v0.16b, v2.16b		/* en-/decrypt the mac */
+	st1	{v0.16b}, [x0]			/* store result */
+	ret
+ENDPROC(ce_aes_ccm_final)
-- 
1.8.1.2


* [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context
  2013-10-09 18:50 ` [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context Ard Biesheuvel
@ 2013-10-09 19:24   ` Nicolas Pitre
  2013-10-09 19:32     ` Ard Biesheuvel
  0 siblings, 1 reply; 14+ messages in thread
From: Nicolas Pitre @ 2013-10-09 19:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 9 Oct 2013, Ard Biesheuvel wrote:

> Some applications, such as WPA CCMP encryption, do substantial
> amounts of work in non-process context. In order to support
> accelerated NEON implementations under these circumstances, we
> need a way to preserve the NEON context that may
> (a) belong to a completely unrelated userland process (if the
>     NEON unit is turned off atm);
> (b) belong to current userland;
> (c) belong to current kernel mode in process context.
> 
> The best way to deal with this is to just stack whatever registers
> we are going to use, and unstack them when we are done.
> 
> This patch adds kernel_neon_begin_atomic() and kernel_neon_end_atomic(),
> which may be called from any context. In !in_interrupt() case, they
> just call their non-_atomic counterparts. In atomic context, they
> stack resp. unstack the number of NEON registers declared when setting
> up the stack area using DEFINE_NEON_REG_STACK().
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm/include/asm/fpstate.h | 15 +++++++++++++-
>  arch/arm/include/asm/neon.h    | 34 +++++++++++++++++++++++++++++++
>  arch/arm/vfp/vfphw.S           | 46 ++++++++++++++++++++++++++++++++++++++++++
>  arch/arm/vfp/vfpmodule.c       |  3 +++
>  4 files changed, 97 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/include/asm/fpstate.h b/arch/arm/include/asm/fpstate.h
> index 3ad4c10..7a6e100 100644
> --- a/arch/arm/include/asm/fpstate.h
> +++ b/arch/arm/include/asm/fpstate.h
> @@ -19,7 +19,7 @@
>   *  - FPEXC, FPSCR, FPINST and FPINST2.
>   *  - 16 or 32 double precision data registers
>   *  - an implementation-dependent word of state for FLDMX/FSTMX (pre-ARMv6)
> - * 
> + *
>   *  FPEXC will always be non-zero once the VFP has been used in this process.
>   */
>  
> @@ -52,6 +52,19 @@ union vfp_state {
>  extern void vfp_flush_thread(union vfp_state *);
>  extern void vfp_release_thread(union vfp_state *);
>  
> +/*
> + * Variable sized struct for stacking the bottom 'n' NEON registers.
> + */
> +struct vfp_partial_state {
> +	const __u32	num_regs;
> +	__u32		fpexc;
> +	__u32		fpscr;
> +	__u8		qregs[] __aligned(16);
> +} __aligned(16);
> +
> +extern void vfp_load_partial_state(struct vfp_partial_state *);
> +extern void vfp_save_partial_state(struct vfp_partial_state *);
> +
>  #define FP_HARD_SIZE 35
>  
>  struct fp_hard_struct {
> diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h
> index 8f730fe..1efd9fc 100644
> --- a/arch/arm/include/asm/neon.h
> +++ b/arch/arm/include/asm/neon.h
> @@ -8,10 +8,21 @@
>   * published by the Free Software Foundation.
>   */
>  
> +#include <linux/types.h>
> +#include <linux/hardirq.h>
> +#include <asm/fpstate.h>
>  #include <asm/hwcap.h>
>  
>  #define cpu_has_neon()		(!!(elf_hwcap & HWCAP_NEON))
>  
> +#define DEFINE_NEON_STACK_REGS(v, num)					\
> +	struct {							\
> +		struct vfp_partial_state regs;				\
> +		u8 qregs[(num) > 16 ? 256 : 16 * (((num) + 1) & ~1U)];	\
> +	} v = { .regs.num_regs = sizeof(v.qregs)/16 }
> +
> +#define DEFINE_NEON_STACK_REGS_ALL(name)	DEFINE_NEON_STACK_REGS(name,16)
> +
>  #ifdef __ARM_NEON__
>  
>  /*
> @@ -30,7 +41,30 @@
>  #define kernel_neon_begin() \
>  	BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
>  
> +#define kernel_neon_begin_atomic(a) \
> +	BUILD_BUG_ON_MSG(1, "kernel_neon_begin_atomic() called from NEON code")
> +
>  #else
>  void kernel_neon_begin(void);
> +#define kernel_neon_begin_atomic(name) __kernel_neon_begin_atomic(&(name).regs)
>  #endif
> +
> +#define kernel_neon_end_atomic(name) __kernel_neon_end_atomic(&(name).regs)
> +
>  void kernel_neon_end(void);
> +
> +static inline void __kernel_neon_begin_atomic(struct vfp_partial_state *regs)
> +{
> +	if (!in_interrupt())
> +		kernel_neon_begin();

Surely you want "if (!in_atomic())" here?

> +	else
> +		vfp_save_partial_state(regs);
> +}
> +
> +static inline void __kernel_neon_end_atomic(struct vfp_partial_state *regs)
> +{
> +	if (!in_interrupt())
> +		kernel_neon_end();

Ditto.

> +	else
> +		vfp_load_partial_state(regs);
> +}
> diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S
> +	VFPFMXR	FPSCR, r3
> +	VFPFMXR	FPEXC, r2
> +	bx	lr
> +ENDPROC(vfp_load_partial_state)
> +
> +#endif
> diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
> index 52b8f40..3dea5ba 100644
> --- a/arch/arm/vfp/vfpmodule.c
> +++ b/arch/arm/vfp/vfpmodule.c
> @@ -713,6 +713,9 @@ void kernel_neon_end(void)
>  }
>  EXPORT_SYMBOL(kernel_neon_end);
>  
> +EXPORT_SYMBOL(vfp_save_partial_state);
> +EXPORT_SYMBOL(vfp_load_partial_state);
> +
>  #endif /* CONFIG_KERNEL_MODE_NEON */
>  
>  /*
> -- 
> 1.8.1.2
> 


* [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context
  2013-10-09 19:24   ` Nicolas Pitre
@ 2013-10-09 19:32     ` Ard Biesheuvel
  2013-10-10  3:45       ` Nicolas Pitre
  0 siblings, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-09 19:32 UTC (permalink / raw)
  To: linux-arm-kernel

On 9 October 2013 21:24, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> On Wed, 9 Oct 2013, Ard Biesheuvel wrote:
>
>> Some applications, such as WPA CCMP encryption, do substantial
>> amounts of work in non-process context. In order to support
>> accelerated NEON implementations under these circumstances, we
>> need a way to preserve the NEON context that may
>> (a) belong to a completely unrelated userland process (if the
>>     NEON unit is turned off atm);
>> (b) belong to current userland;
>> (c) belong to current kernel mode in process context.
>>
>> The best way to deal with this is to just stack whatever registers
>> we are going to use, and unstack them when we are done.
>>
>> This patch adds kernel_neon_begin_atomic() and kernel_neon_end_atomic(),
>> which may be called from any context. In !in_interrupt() case, they
>> just call their non-_atomic counterparts. In atomic context, they
>> stack resp. unstack the number of NEON registers declared when setting
>> up the stack area using DEFINE_NEON_REG_STACK().
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>>  arch/arm/include/asm/fpstate.h | 15 +++++++++++++-
>>  arch/arm/include/asm/neon.h    | 34 +++++++++++++++++++++++++++++++
>>  arch/arm/vfp/vfphw.S           | 46 ++++++++++++++++++++++++++++++++++++++++++
>>  arch/arm/vfp/vfpmodule.c       |  3 +++
>>  4 files changed, 97 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm/include/asm/fpstate.h b/arch/arm/include/asm/fpstate.h
>> index 3ad4c10..7a6e100 100644
>> --- a/arch/arm/include/asm/fpstate.h
>> +++ b/arch/arm/include/asm/fpstate.h
>> @@ -19,7 +19,7 @@
>>   *  - FPEXC, FPSCR, FPINST and FPINST2.
>>   *  - 16 or 32 double precision data registers
>>   *  - an implementation-dependent word of state for FLDMX/FSTMX (pre-ARMv6)
>> - *
>> + *
>>   *  FPEXC will always be non-zero once the VFP has been used in this process.
>>   */
>>
>> @@ -52,6 +52,19 @@ union vfp_state {
>>  extern void vfp_flush_thread(union vfp_state *);
>>  extern void vfp_release_thread(union vfp_state *);
>>
>> +/*
>> + * Variable sized struct for stacking the bottom 'n' NEON registers.
>> + */
>> +struct vfp_partial_state {
>> +     const __u32     num_regs;
>> +     __u32           fpexc;
>> +     __u32           fpscr;
>> +     __u8            qregs[] __aligned(16);
>> +} __aligned(16);
>> +
>> +extern void vfp_load_partial_state(struct vfp_partial_state *);
>> +extern void vfp_save_partial_state(struct vfp_partial_state *);
>> +
>>  #define FP_HARD_SIZE 35
>>
>>  struct fp_hard_struct {
>> diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h
>> index 8f730fe..1efd9fc 100644
>> --- a/arch/arm/include/asm/neon.h
>> +++ b/arch/arm/include/asm/neon.h
>> @@ -8,10 +8,21 @@
>>   * published by the Free Software Foundation.
>>   */
>>
>> +#include <linux/types.h>
>> +#include <linux/hardirq.h>
>> +#include <asm/fpstate.h>
>>  #include <asm/hwcap.h>
>>
>>  #define cpu_has_neon()               (!!(elf_hwcap & HWCAP_NEON))
>>
>> +#define DEFINE_NEON_STACK_REGS(v, num)                                       \
>> +     struct {                                                        \
>> +             struct vfp_partial_state regs;                          \
>> +             u8 qregs[(num) > 16 ? 256 : 16 * (((num) + 1) & ~1U)];  \
>> +     } v = { .regs.num_regs = sizeof(v.qregs)/16 }
>> +
>> +#define DEFINE_NEON_STACK_REGS_ALL(name)     DEFINE_NEON_STACK_REGS(name,16)
>> +
>>  #ifdef __ARM_NEON__
>>
>>  /*
>> @@ -30,7 +41,30 @@
>>  #define kernel_neon_begin() \
>>       BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
>>
>> +#define kernel_neon_begin_atomic(a) \
>> +     BUILD_BUG_ON_MSG(1, "kernel_neon_begin_atomic() called from NEON code")
>> +
>>  #else
>>  void kernel_neon_begin(void);
>> +#define kernel_neon_begin_atomic(name) __kernel_neon_begin_atomic(&(name).regs)
>>  #endif
>> +
>> +#define kernel_neon_end_atomic(name) __kernel_neon_end_atomic(&(name).regs)
>> +
>>  void kernel_neon_end(void);
>> +
>> +static inline void __kernel_neon_begin_atomic(struct vfp_partial_state *regs)
>> +{
>> +     if (!in_interrupt())
>> +             kernel_neon_begin();
>
> Surely you want "if (!in_atomic())" here?
>
>> +     else
>> +             vfp_save_partial_state(regs);
>> +}
>> +
>> +static inline void __kernel_neon_end_atomic(struct vfp_partial_state *regs)
>> +{
>> +     if (!in_interrupt())
>> +             kernel_neon_end();
>
> Ditto.
>

If I am reading (and understanding) the source correctly, in_atomic()
is also true when running with preemption disabled, and in that case,
you should be able to just do a normal preserve / lazy restore.

-- 
Ard.


>> +     else
>> +             vfp_load_partial_state(regs);
>> +}
>> diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S
>> +     VFPFMXR FPSCR, r3
>> +     VFPFMXR FPEXC, r2
>> +     bx      lr
>> +ENDPROC(vfp_load_partial_state)
>> +
>> +#endif
>> diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
>> index 52b8f40..3dea5ba 100644
>> --- a/arch/arm/vfp/vfpmodule.c
>> +++ b/arch/arm/vfp/vfpmodule.c
>> @@ -713,6 +713,9 @@ void kernel_neon_end(void)
>>  }
>>  EXPORT_SYMBOL(kernel_neon_end);
>>
>> +EXPORT_SYMBOL(vfp_save_partial_state);
>> +EXPORT_SYMBOL(vfp_load_partial_state);
>> +
>>  #endif /* CONFIG_KERNEL_MODE_NEON */
>>
>>  /*
>> --
>> 1.8.1.2
>>


* [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context
  2013-10-09 19:32     ` Ard Biesheuvel
@ 2013-10-10  3:45       ` Nicolas Pitre
  0 siblings, 0 replies; 14+ messages in thread
From: Nicolas Pitre @ 2013-10-10  3:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 9 Oct 2013, Ard Biesheuvel wrote:

> On 9 October 2013 21:24, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> > On Wed, 9 Oct 2013, Ard Biesheuvel wrote:
> >
> >> +static inline void __kernel_neon_begin_atomic(struct vfp_partial_state *regs)
> >> +{
> >> +     if (!in_interrupt())
> >> +             kernel_neon_begin();
> >
> > Surely you want "if (!in_atomic())" here?
> >
> >> +     else
> >> +             vfp_save_partial_state(regs);
> >> +}
> >> +
> >> +static inline void __kernel_neon_end_atomic(struct vfp_partial_state *regs)
> >> +{
> >> +     if (!in_interrupt())
> >> +             kernel_neon_end();
> >
> > Ditto.
> >
> 
> If I am reading (and understanding) the source correctly, in_atomic()
> is also true when running with preemption disabled, and in that case,
> you should be able to just do a normal preserve / lazy restore.

Hmmm...  OK I agree.


Nicolas


* [RFC v2 PATCH 2/4] ARM64: add support for kernel mode NEON in atomic context
  2013-10-09 18:50 ` [RFC v2 PATCH 2/4] ARM64: " Ard Biesheuvel
@ 2013-10-11 17:14   ` Catalin Marinas
  2013-10-11 17:30     ` Ard Biesheuvel
  0 siblings, 1 reply; 14+ messages in thread
From: Catalin Marinas @ 2013-10-11 17:14 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
> --- a/arch/arm64/include/asm/neon.h
> +++ b/arch/arm64/include/asm/neon.h
> @@ -8,7 +8,38 @@
>   * published by the Free Software Foundation.
>   */
>  
> +#include <linux/hardirq.h>
> +#include <linux/types.h>
> +#include <asm/fpsimd.h>
> +
>  #define cpu_has_neon()		(1)
>  
> +#define DEFINE_NEON_STACK_REGS(a, num)					\
> +	struct {							\
> +		struct fpsimd_partial_state regs;			\
> +		__uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];	\
> +	} a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
> +
> +#define DEFINE_NEON_STACK_REGS_ALL(name)	DEFINE_NEON_STACK_REGS(name, 32)
> +
>  void kernel_neon_begin(void);
>  void kernel_neon_end(void);
> +
> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
> +{
> +	if (!in_interrupt())
> +		kernel_neon_begin();
> +	else
> +		fpsimd_save_partial_state(regs);
> +}
> +
> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
> +{
> +	if (!in_interrupt())
> +		kernel_neon_end();
> +	else
> +		fpsimd_load_partial_state(regs);
> +}

The _atomic suffix is a bit misleading (you basically mean no user
context). I wonder whether it's better to have some _fast/_slow variants
instead. Looking at the other two patches, you only need 2 or 4
registers to do the crypto stuff but if you are not in_interrupt(), you
basically save and restore the full NEON bank. I would say for such
cases just provide a kernel_neon_begin_fast() call which is safe in all
contexts and much faster.
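
Roughly (names purely illustrative, reusing the stack area from this
patch):

	DEFINE_NEON_STACK_REGS(regs, 4);

	kernel_neon_begin_fast(regs);	/* always stack only the declared regs */
	...
	kernel_neon_end_fast(regs);

i.e. no in_interrupt() check at all for the small-register cases.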

-- 
Catalin


* [RFC v2 PATCH 2/4] ARM64: add support for kernel mode NEON in atomic context
  2013-10-11 17:14   ` Catalin Marinas
@ 2013-10-11 17:30     ` Ard Biesheuvel
  2013-10-11 19:35       ` Catalin Marinas
  0 siblings, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-11 17:30 UTC (permalink / raw)
  To: linux-arm-kernel



> On 11 okt. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
> 
>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
>> --- a/arch/arm64/include/asm/neon.h
>> +++ b/arch/arm64/include/asm/neon.h
>> @@ -8,7 +8,38 @@
>>  * published by the Free Software Foundation.
>>  */
>> 
>> +#include <linux/hardirq.h>
>> +#include <linux/types.h>
>> +#include <asm/fpsimd.h>
>> +
>> #define cpu_has_neon()        (1)
>> 
>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
>> +    struct {                            \
>> +        struct fpsimd_partial_state regs;            \
>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
>> +
>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
>> +
>> void kernel_neon_begin(void);
>> void kernel_neon_end(void);
>> +
>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
>> +{
>> +    if (!in_interrupt())
>> +        kernel_neon_begin();
>> +    else
>> +        fpsimd_save_partial_state(regs);
>> +}
>> +
>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
>> +{
>> +    if (!in_interrupt())
>> +        kernel_neon_end();
>> +    else
>> +        fpsimd_load_partial_state(regs);
>> +}
> 
> The _atomic suffix is a bit misleading (you basically mean no user
> context). I wonder whether it's better to have some _fast/_slow variants
> instead. Looking at the other two patches, you only need 2 or 4
> registers to do the crypto stuff but if you are not in_interrupt(), you
> basically save and restore the full NEON bank. I would say for such
> cases just make kernel_neon_begin_fast() call which is safe in all
> contexts and much faster.
> 

I agree that the name is a bit misleading.

Regarding fast/slow: if you take the core aes cipher as an example, it will likely be called from a loop somewhere, and (assuming lazy restore gets merged at some point) you may be stacking and unstacking 2 or 4 registers many times while kernel_neon_begin() would just stack them once and let the lazy restore unstack them only when needed.
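
For instance, with the core cipher from patch #3 being invoked once per
16-byte block by the crypto layer, something like (illustrative only)

	for (i = 0; i < nblocks; i++)
		crypto_cipher_encrypt_one(tfm, dst + 16 * i, src + 16 * i);

does a partial save/restore for every single block, whereas with lazy
restore the same sequence of kernel_neon_begin()/kernel_neon_end() calls
would save the registers once and defer the restore until it is actually
needed.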

This is probably a detail where arm and arm64 will be implemented somewhat differently. I would still like to align the API between the two, if possible, so intrinsics or gcc vectorized code can be shared easily. However, as ARM has fewer use cases where using only 2 registers makes sense (memcpy perhaps?) and already has lazy restore wired up, I personally feel hooking into the lazy restore in the !in_interrupt() case is still the best solution there.

-- 
Ard.


> -- 
> Catalin


* [RFC v2 PATCH 2/4] ARM64: add support for kernel mode NEON in atomic context
  2013-10-11 17:30     ` Ard Biesheuvel
@ 2013-10-11 19:35       ` Catalin Marinas
  2013-10-11 20:09         ` Nicolas Pitre
  0 siblings, 1 reply; 14+ messages in thread
From: Catalin Marinas @ 2013-10-11 19:35 UTC (permalink / raw)
  To: linux-arm-kernel

On 11 Oct 2013, at 18:30, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>> On 11 okt. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
>>> --- a/arch/arm64/include/asm/neon.h
>>> +++ b/arch/arm64/include/asm/neon.h
>>> @@ -8,7 +8,38 @@
>>> * published by the Free Software Foundation.
>>> */
>>> 
>>> +#include <linux/hardirq.h>
>>> +#include <linux/types.h>
>>> +#include <asm/fpsimd.h>
>>> +
>>> #define cpu_has_neon()        (1)
>>> 
>>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
>>> +    struct {                            \
>>> +        struct fpsimd_partial_state regs;            \
>>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
>>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
>>> +
>>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
>>> +
>>> void kernel_neon_begin(void);
>>> void kernel_neon_end(void);
>>> +
>>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
>>> +{
>>> +    if (!in_interrupt())
>>> +        kernel_neon_begin();
>>> +    else
>>> +        fpsimd_save_partial_state(regs);
>>> +}
>>> +
>>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
>>> +{
>>> +    if (!in_interrupt())
>>> +        kernel_neon_end();
>>> +    else
>>> +        fpsimd_load_partial_state(regs);
>>> +}
>> 
>> The _atomic suffix is a bit misleading (you basically mean no user
>> context). I wonder whether it's better to have some _fast/_slow variants
>> instead. Looking at the other two patches, you only need 2 or 4
>> registers to do the crypto stuff but if you are not in_interrupt(), you
>> basically save and restore the full NEON bank. I would say for such
>> cases just make kernel_neon_begin_fast() call which is safe in all
>> contexts and much faster.
> 
> I agree that the name is a bit misleading.
> 
> Regarding fast/slow: if you take the core aes cipher as an example, it
> will likely be called from a loop somewhere, and (assuming lazy restore
> gets merged at some point) you may be stacking and unstacking 2 or 4
> registers many times while kernel_neon_begin() would just stack them
> once and let the lazy restore unstack them only when needed.
> 
> This is probably a detail where arm and arm64 will be implemented
> somewhat differently.  I would still like to align the api between the
> two, if possible, so intrinsics or gcc vectorized code can be shared
> easily.  However, as ARM has fewer use cases where using only 2
> registers makes sense, (memcpy perhaps?) and already has lazy restore
> wired up, I personally feel hooking into the lazy restore in the
> !in_interrupt() case is still the best solution there.

Lazy saving/restoring on context switch may not be beneficial on arm64
and I don't want to enable it until I see some real user space
benchmarks (not kernel crypto API benchmarks).

The way kernel_neon_begin() is currently implemented on arm64 just saves
the whole register bank, so calling it in a loop will only make things worse.
What you could do is to save the register bank in kernel_neon_begin()
only the first time, set a TIF flag and restore them when returning to
user.  This way you can call it multiple times but only save/restore
once.
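
Something along these lines (just a sketch; TIF_NEON_SAVED is a made-up
flag name):

	if (!test_and_set_thread_flag(TIF_NEON_SAVED))
		fpsimd_save_state(&current->thread.fpsimd_state);

with the return-to-user path reloading the saved state and clearing the
flag when it is set.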

Catalin


* [RFC v2 PATCH 2/4] ARM64: add support for kernel mode NEON in atomic context
  2013-10-11 19:35       ` Catalin Marinas
@ 2013-10-11 20:09         ` Nicolas Pitre
  2013-10-13 22:48           ` Catalin Marinas
  0 siblings, 1 reply; 14+ messages in thread
From: Nicolas Pitre @ 2013-10-11 20:09 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 11 Oct 2013, Catalin Marinas wrote:

> On 11 Oct 2013, at 18:30, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> >> On 11 okt. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
> >>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
> >>> --- a/arch/arm64/include/asm/neon.h
> >>> +++ b/arch/arm64/include/asm/neon.h
> >>> @@ -8,7 +8,38 @@
> >>> * published by the Free Software Foundation.
> >>> */
> >>> 
> >>> +#include <linux/hardirq.h>
> >>> +#include <linux/types.h>
> >>> +#include <asm/fpsimd.h>
> >>> +
> >>> #define cpu_has_neon()        (1)
> >>> 
> >>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
> >>> +    struct {                            \
> >>> +        struct fpsimd_partial_state regs;            \
> >>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
> >>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
> >>> +
> >>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
> >>> +
> >>> void kernel_neon_begin(void);
> >>> void kernel_neon_end(void);
> >>> +
> >>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
> >>> +{
> >>> +    if (!in_interrupt())
> >>> +        kernel_neon_begin();
> >>> +    else
> >>> +        fpsimd_save_partial_state(regs);
> >>> +}
> >>> +
> >>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
> >>> +{
> >>> +    if (!in_interrupt())
> >>> +        kernel_neon_end();
> >>> +    else
> >>> +        fpsimd_load_partial_state(regs);
> >>> +}
> >> 
> >> The _atomic suffix is a bit misleading (you basically mean no user
> >> context). I wonder whether it's better to have some _fast/_slow variants
> >> instead. Looking at the other two patches, you only need 2 or 4
> >> registers to do the crypto stuff but if you are not in_interrupt(), you
> >> basically save and restore the full NEON bank. I would say for such
> >> cases just make a kernel_neon_begin_fast() call, which is safe in all
> >> contexts and much faster.
> > 
> > I agree that the name is a bit misleading.
> > 
> > Regarding fast/slow: if you take the core aes cipher as an example, it
> > will likely be called from a loop somewhere, and (assuming lazy restore
> > gets merged at some point) you may be stacking and unstacking 2 or 4
> > registers many times while kernel_neon_begin() would just stack them
> > once and let the lazy restore unstack them only when needed.
> > 
> > This is probably a detail where arm and arm64 will be implemented
> > somewhat differently.  I would still like to align the api between the
> > two, if possible, so intrinsics or gcc vectorized code can be shared
> > easily.  However, as ARM has fewer use cases where using only 2
> > registers makes sense, (memcpy perhaps?) and already has lazy restore
> > wired up, I personally feel hooking into the lazy restore in the
> > !in_interrupt() case is still the best solution there.
> 
> Lazy saving/restoring on context switch may not be beneficial on arm64
> and I don't want to enable it until I see some real user space
> benchmarks (not kernel crypto API benchmarks).

I think it is more important to establish the API semantics here.  
Implementation may vary afterwards.

The difference right now between kernel_neon_begin() and 
__kernel_neon_begin_atomic() is that the latter can be stacked while the 
former cannot.  And when NEON is claimed multiple times in a row, 
subsequent calls to the former are supposed to be very cheap.  So this 
is all about tradeoffs and constraints.

And I agree that the kernel_neon_begin_atomic name is potentially 
confusing.  But the fact is that this is the only version that can be 
used in atomic context.

Maybe the API should be kernel_neon_begin() and 
kernel_neon_begin_partial(nb_regs), the former being a simple alias to 
the latter with the full register set as argument.  And then the actual 
register saving method (whether it is an atomic context or not, the 
number of registers, etc.) could be handled and optimized internally 
instead of exposing such implementation constraints to users of the API.
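
As a rough sketch, with kernel_neon_begin_partial() being the name
proposed here rather than anything in the posted patches, that could
look like:

  void kernel_neon_begin_partial(u32 num_regs);

  static inline void kernel_neon_begin(void)
  {
          /* the full arm64 NEON register file: 32 x 128-bit registers */
          kernel_neon_begin_partial(32);
  }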

> The way kernel_neon_begin() is currently implemented on arm64 just saves
> the whole register bank, so being called in a loop it will be worse.
> What you could do is to save the register bank in kernel_neon_begin()
> only the first time, set a TIF flag and restore them when returning to
> user.  This way you can call it multiple times but only save/restore
> once.

This is certainly more in line with the lazy restore behavior assumed on 
ARM32.
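
A rough sketch of that save-once idea (TIF_NEON_SAVED and the
return-to-user hook are invented names, not code from these patches, and
the context switch path would also need to check the flag so it does not
overwrite the saved state):

  void kernel_neon_begin(void)
  {
          BUG_ON(in_interrupt());
          preempt_disable();
          /* save the user FPSIMD state only on the first use */
          if (!test_and_set_thread_flag(TIF_NEON_SAVED))
                  fpsimd_save_state(&current->thread.fpsimd_state);
  }

  void kernel_neon_end(void)
  {
          preempt_enable();
  }

  /* and on the return-to-user path: */
  if (test_and_clear_thread_flag(TIF_NEON_SAVED))
          fpsimd_load_state(&current->thread.fpsimd_state);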


Nicolas

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC v2 PATCH 2/4] ARM64: add support for kernel mode NEON in atomic context
  2013-10-11 20:09         ` Nicolas Pitre
@ 2013-10-13 22:48           ` Catalin Marinas
  2013-10-14  8:12             ` Ard Biesheuvel
  0 siblings, 1 reply; 14+ messages in thread
From: Catalin Marinas @ 2013-10-13 22:48 UTC (permalink / raw)
  To: linux-arm-kernel

On 11 Oct 2013, at 21:09, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> On Fri, 11 Oct 2013, Catalin Marinas wrote:
>> On 11 Oct 2013, at 18:30, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>>>> On 11 okt. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>>>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
>>>>> --- a/arch/arm64/include/asm/neon.h
>>>>> +++ b/arch/arm64/include/asm/neon.h
>>>>> @@ -8,7 +8,38 @@
>>>>> * published by the Free Software Foundation.
>>>>> */
>>>>> 
>>>>> +#include <linux/hardirq.h>
>>>>> +#include <linux/types.h>
>>>>> +#include <asm/fpsimd.h>
>>>>> +
>>>>> #define cpu_has_neon()        (1)
>>>>> 
>>>>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
>>>>> +    struct {                            \
>>>>> +        struct fpsimd_partial_state regs;            \
>>>>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
>>>>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
>>>>> +
>>>>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
>>>>> +
>>>>> void kernel_neon_begin(void);
>>>>> void kernel_neon_end(void);
>>>>> +
>>>>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
>>>>> +{
>>>>> +    if (!in_interrupt())
>>>>> +        kernel_neon_begin();
>>>>> +    else
>>>>> +        fpsimd_save_partial_state(regs);
>>>>> +}
>>>>> +
>>>>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
>>>>> +{
>>>>> +    if (!in_interrupt())
>>>>> +        kernel_neon_end();
>>>>> +    else
>>>>> +        fpsimd_load_partial_state(regs);
>>>>> +}
>>>> 
>>>> The _atomic suffix is a bit misleading (you basically mean no user
>>>> context). I wonder whether it's better to have some _fast/_slow variants
>>>> instead. Looking at the other two patches, you only need 2 or 4
>>>> registers to do the crypto stuff but if you are not in_interrupt(), you
>>>> basically save and restore the full NEON bank. I would say for such
>>>> cases just make a kernel_neon_begin_fast() call, which is safe in all
>>>> contexts and much faster.
>>> 
>>> I agree that the name is a bit misleading.
>>> 
>>> Regarding fast/slow: if you take the core aes cipher as an example, it
>>> will likely be called from a loop somewhere, and (assuming lazy restore
>>> gets merged at some point) you may be stacking and unstacking 2 or 4
>>> registers many times while kernel_neon_begin() would just stack them
>>> once and let the lazy restore unstack them only when needed.
>>> 
>>> This is probably a detail where arm and arm64 will be implemented
>>> somewhat differently.  I would still like to align the api between the
>>> two, if possible, so intrinsics or gcc vectorized code can be shared
>>> easily.  However, as ARM has fewer use cases where using only 2
>>> registers makes sense, (memcpy perhaps?) and already has lazy restore
>>> wired up, I personally feel hooking into the lazy restore in the
>>> !in_interrupt() case is still the best solution there.
>> 
>> Lazy saving/restoring on context switch may not be beneficial on arm64
>> and I don't want to enable it until I see some real user space
>> benchmarks (not kernel crypto API benchmarks).
> 
> I think it is more important to establish the API semantics here.  
> Implementation may vary afterwards.
> 
> The difference right now between kernel_neon_begin() and 
> __kernel_neon_begin_atomic() is that the latter can be stacked while the 
> former cannot.  

How much stacking do we need?  If we limit the nesting to two levels
(process and IRQ context), we could pre-allocate per-CPU
fpsimd_state structures for interrupt context and always use the same
API. About softirqs, do we need another level of nesting?
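
For illustration, the pre-allocation could be as simple as the sketch
below; the names are invented, and a softirq slot is included on the
assumption that one is needed:

  /* one spare register file per CPU for each atomic nesting level */
  static DEFINE_PER_CPU(struct fpsimd_state, hardirq_fpsimd_state);
  static DEFINE_PER_CPU(struct fpsimd_state, softirq_fpsimd_state);

  static struct fpsimd_state *atomic_fpsimd_state(void)
  {
          if (in_irq())
                  return this_cpu_ptr(&hardirq_fpsimd_state);
          return this_cpu_ptr(&softirq_fpsimd_state);
  }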

> Maybe the API should be kernel_neon_begin() and 
> kernel_neon_begin_partial(nb_regs), the former being a simple alias to 
> the latter with the full register set as argument.  And then the actual
> register saving method (whether it is an atomic context or not, the 
> number of registers, etc.) could be handled and optimized internally 
> instead of exposing such implementation constraints to users of the API.

It could be more efficient to always specify the number of registers to
be saved/restored even for kernel_neon_begin().  But I haven't paid much
attention to the register use in the actual crypto algorithms.

Catalin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC v2 PATCH 2/4] ARM64: add support for kernel mode NEON in atomic context
  2013-10-13 22:48           ` Catalin Marinas
@ 2013-10-14  8:12             ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2013-10-14  8:12 UTC (permalink / raw)
  To: linux-arm-kernel

On 14 October 2013 00:48, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On 11 Oct 2013, at 21:09, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:

[...]

>>
>> I think it is more important to establish the API semantics here.
>> Implementation may vary afterwards.
>>
>> The difference right now between kernel_neon_begin() and
>> __kernel_neon_begin_atomic() is that the later can be stacked while the
>> former cannot.
>
> How much stacking do we need?  If we limit the nesting to two levels
> (process and IRQ context), we could pre-allocate per-CPU
> fpsimd_state structures for interrupt context and always use the same
> API. About softirqs, do we need another level of nesting?
>

Softirq context is required as well, so that implies two additional
fpsimd_states of 512 bytes each. If we can afford that, then sure, why
not?

>> Maybe the API should be kernel_neon_begin() and
>> kernel_neon_begin_partial(nb_regs), the former being a simple alias to
>> the later with the full register set as argument.  And then the actual
>> register saving method (whether it is an atomic context or not, the
>> number of registers, etc.) could be handled and optimized internally
>> instead of exposing such implementation constraints to users of the API.
>
> It could be more efficient to always specify the number of registers to
> be saved/restored even for kernel_neon_begin().  But I haven't paid much
> attention to the register use in the actual crypto algorithms.
>

To elaborate a bit: WPA-CCMP uses AES in CCM mode executed in softirq
context. I have included a reference implementation using 4 NEON
registers only, which makes sense in this case as the CCM transform
itself cannot be parallelized.

On the other hand, AES in XTS mode (dm-crypt) is fully parallelizable,
always executes from a kernel thread and always operates on a sector.
In this case, using the entire register file allows an 8-way
interleaved (*) implementation with all the round keys (between 11 and
15 16-byte keys) cached in registers.

The bottom line is that even if the crypto instructions can be used in
a meaningful way with only 2 or 4 registers, it is highly likely that
using more registers will result in higher performance [at least in
the AES case].

For the plain NEON case, I have written an implementation that keeps
the entire S-box (256 bytes) in registers. This should perform quite
well [assuming 4-register-wide tbl/tbx lookups are not too costly],
but only in cases where the cost of loading the S-box can be amortized
over multiple operations. That rules out a plain NEON core AES cipher,
but doing the CCM mode that way might be feasible, even if we have to
stack the whole register file in that case.

I agree that always specifying the number of registers used is
probably a meaningful addition, and in fact this is what I have
implemented in the v3 that I sent yesterday. The only difference
between Nico's suggestion and my implementation is that the number of
registers is declared at the time that the stack area is reserved so
we don't waste a lot of space.
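
For reference, a caller of the declared-size interface, using the macro
names from the v2 patch quoted above (v3 may differ slightly), would
look roughly like this:

  /* function name and prototype made up for illustration */
  static void ccm_compute_mac(u8 mac[], const u8 in[], u32 len, const void *key)
  {
          DEFINE_NEON_STACK_REGS(q, 4);   /* room for 4 NEON registers */

          __kernel_neon_begin_atomic(&q.regs);
          /* ... NEON/Crypto Extensions code using at most 4 registers ... */
          __kernel_neon_end_atomic(&q.regs);
  }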

Regards,
Ard.


* I am assuming some level of interleaving will be required to get
optimal performance from these instructions, but whether 8 is the
sweet spot is TBD.

^ permalink raw reply	[flat|nested] 14+ messages in thread


Thread overview: 14+ messages
2013-10-09 18:50 [RFC v2 PATCH 0/4] ARM[64]: kernel mode NEON in atomic contexts Ard Biesheuvel
2013-10-09 18:50 ` [RFC v2 PATCH 1/4] ARM: add support for kernel mode NEON in atomic context Ard Biesheuvel
2013-10-09 19:24   ` Nicolas Pitre
2013-10-09 19:32     ` Ard Biesheuvel
2013-10-10  3:45       ` Nicolas Pitre
2013-10-09 18:50 ` [RFC v2 PATCH 2/4] ARM64: " Ard Biesheuvel
2013-10-11 17:14   ` Catalin Marinas
2013-10-11 17:30     ` Ard Biesheuvel
2013-10-11 19:35       ` Catalin Marinas
2013-10-11 20:09         ` Nicolas Pitre
2013-10-13 22:48           ` Catalin Marinas
2013-10-14  8:12             ` Ard Biesheuvel
2013-10-09 18:50 ` [RFC v2 PATCH 3/4] ARM64: add Crypto Extensions based synchronous core AES cipher Ard Biesheuvel
2013-10-09 18:50 ` [RFC v2 PATCH 4/4] ARM64: add Crypto Extensions based synchronous AES in CCM mode Ard Biesheuvel
