linux-kernel.vger.kernel.org archive mirror
* [PATCH RT] crypto: limit more FPU-enabled sections
@ 2017-11-30 14:22 Sebastian Andrzej Siewior
  2017-11-30 14:30 ` Sebastian Andrzej Siewior
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-11-30 14:22 UTC (permalink / raw)
  To: linux-rt-users; +Cc: linux-kernel, tglx, Peter Zijlstra, Steven Rostedt

Those crypto drivers use SSE/AVX/… for their crypto work and in order to
do so in kernel they need to enable the "FPU" in kernel mode which
disables preemption.
There are two problems with the way they are used:
- the while loop which processes X bytes may create latency spikes and
  should be avoided or limited.
- the cipher-walk-next part may allocate/free memory and may use
  kmap_atomic().

The whole kernel_fpu_begin()/end() processing probably isn't that cheap.
It most likely makes sense to prcess as much of those as possible in one
go. The new *_fpu_sched_rt() schedules only if an RT task is pending.

Probably we should measure the performance of those ciphers in pure SW
mode and with these optimisations to see if it makes sense to keep them
for RT.
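
In rough pseudo-C, the change is to go from one long FPU-enabled section to
per-chunk sections, something like this (more_work()/process_chunk() are
made-up names, just to illustrate; the real hunks are below):

	/* before: one long preempt-disabled section */
	kernel_fpu_begin();
	while (more_work())
		process_chunk();	/* the SSE/AVX work */
	kernel_fpu_end();

	/* after: begin/end (or only resched on RT) around each chunk */
	while (more_work()) {
		kernel_fpu_begin();
		process_chunk();
		kernel_fpu_end();	/* preemptible again here */
	}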

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/crypto/camellia_aesni_avx2_glue.c | 32 ++++++++++++++++++++++++++
 arch/x86/crypto/camellia_aesni_avx_glue.c  | 30 ++++++++++++++++++++++++
 arch/x86/crypto/cast6_avx_glue.c           | 21 +++++++++++++----
 arch/x86/crypto/chacha20_glue.c            |  9 ++++----
 arch/x86/crypto/serpent_avx2_glue.c        | 29 +++++++++++++++++++++++
 arch/x86/crypto/serpent_avx_glue.c         | 20 ++++++++++++----
 arch/x86/crypto/serpent_sse2_glue.c        | 20 ++++++++++++----
 arch/x86/crypto/twofish_avx_glue.c         | 37 ++++++++++++++++++++++++++++--
 8 files changed, 179 insertions(+), 19 deletions(-)

diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 60907c139c4e..d7502c023475 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -206,6 +206,32 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+       bool fpu_enabled = ctx->fpu_enabled;
+
+       if (!fpu_enabled)
+               return;
+       camellia_fpu_end(fpu_enabled);
+       ctx->fpu_enabled = false;
+#endif
+}
+
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+       bool fpu_enabled = ctx->fpu_enabled;
+
+       if (!fpu_enabled || !tif_need_resched_now())
+               return;
+       camellia_fpu_end(fpu_enabled);
+       kernel_fpu_end();
+       /* schedule due to preemptible */
+       kernel_fpu_begin();
+#endif
+}
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -221,16 +247,19 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -251,16 +280,19 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index d96429da88eb..ea98b57a4156 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -210,6 +210,32 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	camellia_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+#endif
+}
+
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	camellia_fpu_end(fpu_enabled);
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+#endif
+}
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -225,10 +251,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -249,10 +277,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 50e684768c55..b16497b81623 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -205,19 +205,30 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	cast6_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+#endif
+}
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAST6_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
-
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__cast6_encrypt(ctx->ctx, srcdst, srcdst);
 }
@@ -228,10 +239,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
 
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 1e6af1b35f7b..e7809fd2a4fd 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher_request *req)
 
 	crypto_chacha20_init(state, ctx, walk.iv);
 
-	kernel_fpu_begin();
-
 	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+		kernel_fpu_begin();
+
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk,
 					 walk.nbytes % CHACHA20_BLOCK_SIZE);
 	}
 
 	if (walk.nbytes) {
+		kernel_fpu_begin();
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				walk.nbytes);
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk, 0);
 	}
 
-	kernel_fpu_end();
-
 	return err;
 }
 
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 870f6d812a2d..03a86747b97d 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -184,6 +184,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+       bool fpu_enabled = ctx->fpu_enabled;
+
+       if (!fpu_enabled)
+               return;
+       serpent_fpu_end(fpu_enabled);
+       ctx->fpu_enabled = false;
+#endif
+}
+
+static void serpent_fpu_sched_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+#endif
+}
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
@@ -199,10 +224,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_fpu_sched_rt(ctx);
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
@@ -223,10 +250,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_fpu_sched_rt(ctx);
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 6f778d3daa22..3bf94fb39e47 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -218,16 +218,28 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+#endif
+}
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -241,10 +253,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index ac0e831943f5..66fd2a51836f 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -187,16 +187,28 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+#endif
+}
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -210,10 +222,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index b7a3904b953c..b9c8f72c6f0b 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -218,6 +218,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	twofish_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+#endif
+}
+
+static void twofish_fpu_sched_rt(struct crypt_priv *ctx)
+{
+#if CONFIG_PREEMPT_RT_FULL
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+#endif
+}
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = TF_BLOCK_SIZE;
@@ -228,12 +253,16 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		twofish_fpu_sched_rt(ctx);
 		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
 
+	twofish_fpu_end_rt(ctx);
 	nbytes %= bsize * 3;
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
@@ -250,11 +279,15 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		twofish_fpu_sched_rt(ctx);
 		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
+	twofish_fpu_end_rt(ctx);
 
 	nbytes %= bsize * 3;
 
-- 
2.15.0

* Re: [PATCH RT] crypto: limit more FPU-enabled sections
  2017-11-30 14:22 [PATCH RT] crypto: limit more FPU-enabled sections Sebastian Andrzej Siewior
@ 2017-11-30 14:30 ` Sebastian Andrzej Siewior
  2017-12-01 10:43   ` [PATCH RT] arm*: disable NEON in kernel mode Sebastian Andrzej Siewior
  2017-11-30 15:19 ` [PATCH RT] crypto: limit more FPU-enabled sections Steven Rostedt
  2017-11-30 15:29 ` [PATCH RT] " Steven Rostedt
  2 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-11-30 14:30 UTC (permalink / raw)
  To: linux-rt-users; +Cc: linux-kernel, tglx, Peter Zijlstra, Steven Rostedt

On 2017-11-30 15:22:17 [+0100], To linux-rt-users@vger.kernel.org wrote:
> Probably we should measure the performance of those ciphers in pure SW
> mode and with these optimisations to see if it makes sense to keep them
> for RT.

This is only x86. We have the same fun with kernel_neon_begin() on the
arm/arm64 side…

Sebastian

* Re: [PATCH RT] crypto: limit more FPU-enabled sections
  2017-11-30 14:22 [PATCH RT] crypto: limit more FPU-enabled sections Sebastian Andrzej Siewior
  2017-11-30 14:30 ` Sebastian Andrzej Siewior
@ 2017-11-30 15:19 ` Steven Rostedt
  2017-11-30 15:22   ` Sebastian Andrzej Siewior
  2017-11-30 15:29 ` [PATCH RT] " Steven Rostedt
  2 siblings, 1 reply; 26+ messages in thread
From: Steven Rostedt @ 2017-11-30 15:19 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, tglx, Peter Zijlstra

On Thu, 30 Nov 2017 15:22:17 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
> index 60907c139c4e..d7502c023475 100644
> --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
> +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
> @@ -206,6 +206,32 @@ struct crypt_priv {
>  	bool fpu_enabled;
>  };
>  
> +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +       bool fpu_enabled = ctx->fpu_enabled;
> +
> +       if (!fpu_enabled)
> +               return;
> +       camellia_fpu_end(fpu_enabled);
> +       ctx->fpu_enabled = false;
> +#endif
> +}
> +
> +static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +       bool fpu_enabled = ctx->fpu_enabled;
> +
> +       if (!fpu_enabled || !tif_need_resched_now())
> +               return;
> +       camellia_fpu_end(fpu_enabled);
> +       kernel_fpu_end();
> +       /* schedule due to preemptible */
> +       kernel_fpu_begin();
> +#endif
> +}

I think it would be cleaner to do:

#ifdef CONFIG_PREEMPT_RT_FULL
static void camellia_fpu_end_rt(struct crypt_priv *ctx)
{
       bool fpu_enabled = ctx->fpu_enabled;

       if (!fpu_enabled)
               return;
       camellia_fpu_end(fpu_enabled);
       ctx->fpu_enabled = false;
}

static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
{
       bool fpu_enabled = ctx->fpu_enabled;

       if (!fpu_enabled || !tif_need_resched_now())
               return;
       camellia_fpu_end(fpu_enabled);
       kernel_fpu_end();
       /* schedule due to preemptible */
       kernel_fpu_begin();
}
#else
static inline void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
static inline void camellia_fpu_sched_rt(struct crypt_priv *ctx) { }
#endif

It really shows that these functions are only used by RT. IMO it's bad
taste to have functions with the entire body encapsulated in #ifdefs.

And the same goes for the other functions in this patch.

-- Steve

* Re: [PATCH RT] crypto: limit more FPU-enabled sections
  2017-11-30 15:19 ` [PATCH RT] crypto: limit more FPU-enabled sections Steven Rostedt
@ 2017-11-30 15:22   ` Sebastian Andrzej Siewior
  2017-12-01 10:44     ` [PATCH RT v2] " Sebastian Andrzej Siewior
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-11-30 15:22 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: linux-rt-users, linux-kernel, tglx, Peter Zijlstra

On 2017-11-30 10:19:43 [-0500], Steven Rostedt wrote:
…
> It really shows that these functions are only used by RT. IMO it's bad
> taste to have functions with the entire body encapsulated in #ifdefs.
> 
> And the same goes for the other functions in this patch.

No problem, I will change that.

> -- Steve

Sebastian

* Re: [PATCH RT] crypto: limit more FPU-enabled sections
  2017-11-30 14:22 [PATCH RT] crypto: limit more FPU-enabled sections Sebastian Andrzej Siewior
  2017-11-30 14:30 ` Sebastian Andrzej Siewior
  2017-11-30 15:19 ` [PATCH RT] crypto: limit more FPU-enabled sections Steven Rostedt
@ 2017-11-30 15:29 ` Steven Rostedt
  2017-11-30 15:41   ` Sebastian Andrzej Siewior
  2 siblings, 1 reply; 26+ messages in thread
From: Steven Rostedt @ 2017-11-30 15:29 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, tglx, Peter Zijlstra

On Thu, 30 Nov 2017 15:22:17 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> Those crypto drivers use SSE/AVX/… for their crypto work and in order to
> do so in kernel they need to enable the "FPU" in kernel mode which
> disables preemption.
> There are two problems with the way they are used:
> - the while loop which processes X bytes may create latency spikes and
>   should be avoided or limited.
> - the cipher-walk-next part may allocate/free memory and may use
>   kmap_atomic().
> 
> The whole kernel_fpu_begin()/end() processing probably isn't that cheap.
> It most likely makes sense to prcess as much of those as possible in one

s/prcess/process/

> go. The new *_fpu_sched_rt() schedules only if an RT task is pending.
> 
> Probably we should measure the performance of those ciphers in pure SW
> mode and with these optimisations to see if it makes sense to keep them
> for RT.
> 
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>



> +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +       bool fpu_enabled = ctx->fpu_enabled;
> +
> +       if (!fpu_enabled)
> +               return;
> +       camellia_fpu_end(fpu_enabled);
> +       ctx->fpu_enabled = false;
> +#endif
> +}
> +
> +static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +       bool fpu_enabled = ctx->fpu_enabled;
> +
> +       if (!fpu_enabled || !tif_need_resched_now())
> +               return;
> +       camellia_fpu_end(fpu_enabled);
> +       kernel_fpu_end();
> +       /* schedule due to preemptible */
> +       kernel_fpu_begin();
> +#endif
> +}
> +


> +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled)
> +		return;
> +	camellia_fpu_end(fpu_enabled);
> +	ctx->fpu_enabled = false;
> +#endif
> +}
> +
> +static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled || !tif_need_resched_now())
> +		return;
> +	camellia_fpu_end(fpu_enabled);

I haven't looked deeply, but why does this call the camellia_fpu_end()
but other *_fpu_sched_rt() do not call the equivalent?

> +	kernel_fpu_end();
> +	/* schedule due to preemptible */
> +	kernel_fpu_begin();
> +#endif
> +}
> +

These are duplicate functions. Shouldn't they go into a header file?

Also, they are very similar:

static inline void camellia_fpu_end(bool fpu_enabled)
{
	glue_fpu_end(fpu_enabled);
}

static inline void cast6_fpu_end(bool fpu_enabled)
{
	glue_fpu_end(fpu_enabled);
}

static inline void serpent_fpu_end(bool fpu_enabled)
{
	glue_fpu_end(fpu_enabled);
}

static inline void twofish_fpu_end(bool fpu_enabled)
{
	glue_fpu_end(fpu_enabled);
}
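
So a single shared helper would be enough, something like (untested, just
to illustrate the idea):

#ifdef CONFIG_PREEMPT_RT_FULL
static inline void glue_fpu_end_rt(struct crypt_priv *ctx)
{
	if (!ctx->fpu_enabled)
		return;
	glue_fpu_end(ctx->fpu_enabled);
	ctx->fpu_enabled = false;
}
#else
static inline void glue_fpu_end_rt(struct crypt_priv *ctx) { }
#endif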

-- Steve

>

> +static void cast6_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled)
> +		return;
> +	cast6_fpu_end(fpu_enabled);
> +	ctx->fpu_enabled = false;
> +#endif
> +}
>

> +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +       bool fpu_enabled = ctx->fpu_enabled;
> +
> +       if (!fpu_enabled)
> +               return;
> +       serpent_fpu_end(fpu_enabled);
> +       ctx->fpu_enabled = false;
> +#endif
> +}
> +
> +static void serpent_fpu_sched_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled || !tif_need_resched_now())
> +		return;
> +	kernel_fpu_end();
> +	/* schedule due to preemptible */
> +	kernel_fpu_begin();
> +#endif
> +}
> +
>  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
>

> +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled)
> +		return;
> +	serpent_fpu_end(fpu_enabled);
> +	ctx->fpu_enabled = false;
> +#endif
> +}
> +
>

> diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
> index ac0e831943f5..66fd2a51836f 100644
> --- a/arch/x86/crypto/serpent_sse2_glue.c
> +++ b/arch/x86/crypto/serpent_sse2_glue.c
> @@ -187,16 +187,28 @@ struct crypt_priv {
>  	bool fpu_enabled;
>  };
>  
> +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled)
> +		return;
> +	serpent_fpu_end(fpu_enabled);
> +	ctx->fpu_enabled = false;
> +#endif
> +}
> +
> 

> +static void twofish_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled)
> +		return;
> +	twofish_fpu_end(fpu_enabled);
> +	ctx->fpu_enabled = false;
> +#endif
> +}
> +
> +static void twofish_fpu_sched_rt(struct crypt_priv *ctx)
> +{
> +#if CONFIG_PREEMPT_RT_FULL
> +	bool fpu_enabled = ctx->fpu_enabled;
> +
> +	if (!fpu_enabled || !tif_need_resched_now())
> +		return;
> +	kernel_fpu_end();
> +	/* schedule due to preemptible */
> +	kernel_fpu_begin();
> +#endif
> +}
> +
> 

* Re: [PATCH RT] crypto: limit more FPU-enabled sections
  2017-11-30 15:29 ` [PATCH RT] " Steven Rostedt
@ 2017-11-30 15:41   ` Sebastian Andrzej Siewior
  2017-11-30 16:18     ` Steven Rostedt
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-11-30 15:41 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: linux-rt-users, linux-kernel, tglx, Peter Zijlstra

On 2017-11-30 10:29:48 [-0500], Steven Rostedt wrote:
> > +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
> > +{
> > +#if CONFIG_PREEMPT_RT_FULL
> > +	bool fpu_enabled = ctx->fpu_enabled;
> > +
> > +	if (!fpu_enabled)
> > +		return;
> > +	camellia_fpu_end(fpu_enabled);
> > +	ctx->fpu_enabled = false;
> > +#endif
> > +}
> > +
> > +static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
> > +{
> > +#if CONFIG_PREEMPT_RT_FULL
> > +	bool fpu_enabled = ctx->fpu_enabled;
> > +
> > +	if (!fpu_enabled || !tif_need_resched_now())
> > +		return;
> > +	camellia_fpu_end(fpu_enabled);
> 
> I haven't looked deeply, but why does this call the camellia_fpu_end()
> but other *_fpu_sched_rt() do not call the equivalent?

There is lrw_encrypt() which has an "end" function after the end of the full
operation. So encrypt_callback() might be invoked multiple times but
only the first invocation does the FPU-enable part.

#1 We need to disable the FPU after one invocation of encrypt_callback()
   because blkcipher_walk_done() might alloc/free/map memory.

#2 camellia_fpu_sched_rt() is only there for RT, in order to enable
   preemption if need-resched is set for an RT task. For !RT we just continue.
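
To make it more concrete, the calling pattern is roughly this (simplified
from memory, not the exact lrw/glue code):

	struct crypt_priv crypt_ctx = { .ctx = ctx, .fpu_enabled = false };

	ret = lrw_crypt(desc, dst, src, nbytes, &req);	/* req.crypt_fn = encrypt_callback */
	/*
	 * lrw_crypt() walks the request and invokes encrypt_callback() once
	 * per chunk; the first invocation enables the FPU.  Between the
	 * chunks it calls blkcipher_walk_done() which may
	 * alloc/free/kmap_atomic(), so on RT the FPU has to be off again by
	 * the time encrypt_callback() returns.
	 */
	camellia_fpu_end(crypt_ctx.fpu_enabled);	/* the "end" after the full operation */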

> > +	kernel_fpu_end();
> > +	/* schedule due to preemptible */
> > +	kernel_fpu_begin();
> > +#endif
> > +}
> > +
> 
> These are duplicate functions. Shouldn't they go into a header file?
> 
> Also, they are very similar:

That whole thing is kind of ugly, yes. Those *_fpu_end_rt() functions use a
struct crypt_priv, but that struct is defined in each cipher glue file.

I would keep it for now until we decide whether we keep fixing these
things or disable them for RT because it is not worth it. As I said,
this is x86-specific and we have something similar on ARM with NEON.
But I guess the decision here will be postponed until someone posts
numbers with pure SW and with this workaround, and cyclictest with a
cycle of 1-10 ms.

Sebastian

* Re: [PATCH RT] crypto: limit more FPU-enabled sections
  2017-11-30 15:41   ` Sebastian Andrzej Siewior
@ 2017-11-30 16:18     ` Steven Rostedt
  0 siblings, 0 replies; 26+ messages in thread
From: Steven Rostedt @ 2017-11-30 16:18 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, tglx, Peter Zijlstra

On Thu, 30 Nov 2017 16:41:08 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> I would keep it for now until we decide whether we keep fixing these
> things or disable them for RT because it is not worth it. As I said,
> this is x86-specific and we have something similar on ARM with NEON.
> But I guess the decision here will be postponed until someone posts
> numbers with pure SW and with this workaround, and cyclictest with a
> cycle of 1-10 ms.

Fair enough.

-- Steve

* [PATCH RT] arm*: disable NEON in kernel mode
  2017-11-30 14:30 ` Sebastian Andrzej Siewior
@ 2017-12-01 10:43   ` Sebastian Andrzej Siewior
  2017-12-01 13:45     ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-01 10:43 UTC (permalink / raw)
  To: linux-rt-users; +Cc: linux-kernel, tglx, Peter Zijlstra, Steven Rostedt

NEON in kernel mode is used by the crypto algorithms and raid6 code.
While the raid6 code looks okay, the crypto algorithms do not: NEON
is enabled on first invocation and may allocate/free/map memory before
the NEON mode is disabled again.
This needs to be changed before it can be enabled.
On ARM, NEON in kernel mode can simply be disabled. On ARM64 it needs to
stay on due to possible EFI callbacks, so here I disable each algorithm.
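
The problematic shape is roughly this (illustrative only, not one specific
driver; do_neon_rounds() is a made-up name):

	kernel_neon_begin();				/* preemption off from here */
	while (walk.nbytes > 0) {
		do_neon_rounds(&walk);			/* the actual NEON work is fine */
		err = skcipher_walk_done(&walk, 0);	/* may alloc/free/map memory */
	}
	kernel_neon_end();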

Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/arm/Kconfig                  |  2 +-
 arch/arm64/crypto/Kconfig         | 20 ++++++++++----------
 arch/arm64/crypto/crc32-ce-glue.c |  3 ++-
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d1346a160760..914fecb088a8 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2164,7 +2164,7 @@ config NEON
 
 config KERNEL_MODE_NEON
 	bool "Support for NEON in kernel mode"
-	depends on NEON && AEABI
+	depends on NEON && AEABI && !PREEMPT_RT_BASE
 	help
 	  Say Y to include support for NEON in kernel mode.
 
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 70c517aa4501..2a5f05b5a19a 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
 
 config CRYPTO_SHA1_ARM64_CE
 	tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
-	depends on KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_HASH
 	select CRYPTO_SHA1
 
 config CRYPTO_SHA2_ARM64_CE
 	tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
-	depends on KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_HASH
 	select CRYPTO_SHA256_ARM64
 
 config CRYPTO_GHASH_ARM64_CE
 	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
-	depends on KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_HASH
 	select CRYPTO_GF128MUL
 	select CRYPTO_AES
@@ -39,7 +39,7 @@ config CRYPTO_GHASH_ARM64_CE
 
 config CRYPTO_CRCT10DIF_ARM64_CE
 	tristate "CRCT10DIF digest algorithm using PMULL instructions"
-	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
 	select CRYPTO_HASH
 
 config CRYPTO_CRC32_ARM64_CE
@@ -53,13 +53,13 @@ config CRYPTO_AES_ARM64
 
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_ALGAPI
 	select CRYPTO_AES_ARM64
 
 config CRYPTO_AES_ARM64_CE_CCM
 	tristate "AES in CCM mode using ARMv8 Crypto Extensions"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_ALGAPI
 	select CRYPTO_AES_ARM64_CE
 	select CRYPTO_AES_ARM64
@@ -67,7 +67,7 @@ config CRYPTO_AES_ARM64_CE_CCM
 
 config CRYPTO_AES_ARM64_CE_BLK
 	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
-	depends on KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_AES_ARM64_CE
 	select CRYPTO_AES_ARM64
@@ -75,7 +75,7 @@ config CRYPTO_AES_ARM64_CE_BLK
 
 config CRYPTO_AES_ARM64_NEON_BLK
 	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
-	depends on KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_AES_ARM64
 	select CRYPTO_AES
@@ -83,13 +83,13 @@ config CRYPTO_AES_ARM64_NEON_BLK
 
 config CRYPTO_CHACHA20_NEON
 	tristate "NEON accelerated ChaCha20 symmetric cipher"
-	depends on KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 
 config CRYPTO_AES_ARM64_BS
 	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
-	depends on KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_AES_ARM64_NEON_BLK
 	select CRYPTO_AES_ARM64
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
index 624f4137918c..599de95cd86d 100644
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -206,7 +206,8 @@ static struct shash_alg crc32_pmull_algs[] = { {
 
 static int __init crc32_pmull_mod_init(void)
 {
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
 		crc32_pmull_algs[0].update = crc32_pmull_update;
 		crc32_pmull_algs[1].update = crc32c_pmull_update;
 
-- 
2.15.0

* [PATCH RT v2] crypto: limit more FPU-enabled sections
  2017-11-30 15:22   ` Sebastian Andrzej Siewior
@ 2017-12-01 10:44     ` Sebastian Andrzej Siewior
  2017-12-01 11:32       ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-01 10:44 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: linux-rt-users, linux-kernel, tglx, Peter Zijlstra

Those crypto drivers use SSE/AVX/… for their crypto work and in order to
do so in kernel they need to enable the "FPU" in kernel mode which
disables preemption.
There are two problems with the way they are used:
- the while loop which processes X bytes may create latency spikes and
  should be avoided or limited.
- the cipher-walk-next part may allocate/free memory and may use
  kmap_atomic().

The whole kernel_fpu_begin()/end() processing probably isn't that cheap.
It most likely makes sense to process as much of those as possible in one
go. The new *_fpu_sched_rt() schedules only if an RT task is pending.

Probably we should measure the performance of those ciphers in pure SW
mode and with these optimisations to see if it makes sense to keep them
for RT.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/crypto/camellia_aesni_avx2_glue.c |   34 +++++++++++++++++++++++++
 arch/x86/crypto/camellia_aesni_avx_glue.c  |   32 +++++++++++++++++++++++
 arch/x86/crypto/cast6_avx_glue.c           |   24 ++++++++++++++---
 arch/x86/crypto/chacha20_glue.c            |    9 +++---
 arch/x86/crypto/serpent_avx2_glue.c        |   31 +++++++++++++++++++++++
 arch/x86/crypto/serpent_avx_glue.c         |   23 ++++++++++++++---
 arch/x86/crypto/serpent_sse2_glue.c        |   23 ++++++++++++++---
 arch/x86/crypto/twofish_avx_glue.c         |   39 +++++++++++++++++++++++++++--
 8 files changed, 196 insertions(+), 19 deletions(-)

--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -206,6 +206,34 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+       bool fpu_enabled = ctx->fpu_enabled;
+
+       if (!fpu_enabled)
+               return;
+       camellia_fpu_end(fpu_enabled);
+       ctx->fpu_enabled = false;
+}
+
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
+{
+       bool fpu_enabled = ctx->fpu_enabled;
+
+       if (!fpu_enabled || !tif_need_resched_now())
+               return;
+       camellia_fpu_end(fpu_enabled);
+       kernel_fpu_end();
+       /* schedule due to preemptible */
+       kernel_fpu_begin();
+}
+
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -221,16 +249,19 @@ static void encrypt_callback(void *priv,
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -251,16 +282,19 @@ static void decrypt_callback(void *priv,
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -210,6 +210,34 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	camellia_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	camellia_fpu_end(fpu_enabled);
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+}
+
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -225,10 +253,12 @@ static void encrypt_callback(void *priv,
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -249,10 +279,12 @@ static void decrypt_callback(void *priv,
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -205,19 +205,33 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	cast6_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAST6_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
-
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__cast6_encrypt(ctx->ctx, srcdst, srcdst);
 }
@@ -228,10 +242,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher
 
 	crypto_chacha20_init(state, ctx, walk.iv);
 
-	kernel_fpu_begin();
-
 	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+		kernel_fpu_begin();
+
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk,
 					 walk.nbytes % CHACHA20_BLOCK_SIZE);
 	}
 
 	if (walk.nbytes) {
+		kernel_fpu_begin();
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				walk.nbytes);
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk, 0);
 	}
 
-	kernel_fpu_end();
-
 	return err;
 }
 
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -184,6 +184,33 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+       bool fpu_enabled = ctx->fpu_enabled;
+
+       if (!fpu_enabled)
+               return;
+       serpent_fpu_end(fpu_enabled);
+       ctx->fpu_enabled = false;
+}
+
+static void serpent_fpu_sched_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+static void serpent_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
@@ -199,10 +226,12 @@ static void encrypt_callback(void *priv,
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_fpu_sched_rt(ctx);
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
@@ -223,10 +252,12 @@ static void decrypt_callback(void *priv,
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_fpu_sched_rt(ctx);
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -218,16 +218,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -241,10 +256,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -187,16 +187,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -210,10 +225,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -218,6 +218,33 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	twofish_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+static void twofish_fpu_sched_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+}
+
+#else
+static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
+static void twofish_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = TF_BLOCK_SIZE;
@@ -228,12 +255,16 @@ static void encrypt_callback(void *priv,
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		twofish_fpu_sched_rt(ctx);
 		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
 
+	twofish_fpu_end_rt(ctx);
 	nbytes %= bsize * 3;
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
@@ -250,11 +281,15 @@ static void decrypt_callback(void *priv,
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		twofish_fpu_sched_rt(ctx);
 		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
+	twofish_fpu_end_rt(ctx);
 
 	nbytes %= bsize * 3;
 

* Re: [PATCH RT v2] crypto: limit more FPU-enabled sections
  2017-12-01 10:44     ` [PATCH RT v2] " Sebastian Andrzej Siewior
@ 2017-12-01 11:32       ` Peter Zijlstra
  2017-12-01 13:32         ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2017-12-01 11:32 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Steven Rostedt, linux-rt-users, linux-kernel, tglx

On Fri, Dec 01, 2017 at 11:44:22AM +0100, Sebastian Andrzej Siewior wrote:
> --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
> +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
> @@ -206,6 +206,34 @@ struct crypt_priv {
>  	bool fpu_enabled;
>  };
>  
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
> +{
> +       bool fpu_enabled = ctx->fpu_enabled;
> +
> +       if (!fpu_enabled)
> +               return;
> +       camellia_fpu_end(fpu_enabled);
> +       ctx->fpu_enabled = false;
> +}
> +
> +static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
> +{
> +       bool fpu_enabled = ctx->fpu_enabled;
> +
> +       if (!fpu_enabled || !tif_need_resched_now())
> +               return;
> +       camellia_fpu_end(fpu_enabled);
> +       kernel_fpu_end();
> +       /* schedule due to preemptible */
> +       kernel_fpu_begin();
> +}

There's a ton of duplication in there; you're not nearly lazy enough.

Why can't we do something simple like kernel_fpu_resched() ?

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index f92a6593de1e..05321b98a55a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -130,6 +130,18 @@ void kernel_fpu_begin(void)
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_begin);
 
+void kernel_fpu_resched(void)
+{
+	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+
+	if (should_resched(PREEMPT_OFFSET)) {
+		kernel_fpu_end();
+		cond_resched();
+		kernel_fpu_begin();
+	}
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
+
 void kernel_fpu_end(void)
 {
 	__kernel_fpu_end();

* Re: [PATCH RT v2] crypto: limit more FPU-enabled sections
  2017-12-01 11:32       ` Peter Zijlstra
@ 2017-12-01 13:32         ` Sebastian Andrzej Siewior
  2017-12-01 13:44           ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-01 13:32 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Steven Rostedt, linux-rt-users, linux-kernel, tglx

On 2017-12-01 12:32:35 [+0100], Peter Zijlstra wrote:
> On Fri, Dec 01, 2017 at 11:44:22AM +0100, Sebastian Andrzej Siewior wrote:
> > +static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
> > +{
> > +       bool fpu_enabled = ctx->fpu_enabled;
> > +
> > +       if (!fpu_enabled || !tif_need_resched_now())
> > +               return;
> > +       camellia_fpu_end(fpu_enabled);
> > +       kernel_fpu_end();
> > +       /* schedule due to preemptible */
> > +       kernel_fpu_begin();
> > +}
> 
> There's a ton of duplication in there; you're not nearly lazy enough.
> 
> Why can't we do something simple like kernel_fpu_resched() ?
> 
> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> index f92a6593de1e..05321b98a55a 100644
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -130,6 +130,18 @@ void kernel_fpu_begin(void)
>  }
>  EXPORT_SYMBOL_GPL(kernel_fpu_begin);
>  
> +void kernel_fpu_resched(void)
> +{
> +	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
> +
> +	if (should_resched(PREEMPT_OFFSET)) {
> +		kernel_fpu_end();
> +		cond_resched();
> +		kernel_fpu_begin();

I can do that but I would still keep it RT-only to avoid
kernel_fpu_begin()/end() being invoked more often on !RT.
But why that cond_resched()? kernel_fpu_end() ends with preempt_enable()
and this one should do the trick.

> +	}
> +}
> +EXPORT_SYMBOL_GPL(kernel_fpu_resched);
> +
>  void kernel_fpu_end(void)
>  {
>  	__kernel_fpu_end();

Sebastian

* Re: [PATCH RT v2] crypto: limit more FPU-enabled sections
  2017-12-01 13:32         ` Sebastian Andrzej Siewior
@ 2017-12-01 13:44           ` Peter Zijlstra
  2017-12-01 13:50             ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2017-12-01 13:44 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Steven Rostedt, linux-rt-users, linux-kernel, tglx

On Fri, Dec 01, 2017 at 02:32:56PM +0100, Sebastian Andrzej Siewior wrote:
> > diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> > index f92a6593de1e..05321b98a55a 100644
> > --- a/arch/x86/kernel/fpu/core.c
> > +++ b/arch/x86/kernel/fpu/core.c
> > @@ -130,6 +130,18 @@ void kernel_fpu_begin(void)
> >  }
> >  EXPORT_SYMBOL_GPL(kernel_fpu_begin);
> >  
> > +void kernel_fpu_resched(void)
> > +{
> > +	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
> > +
> > +	if (should_resched(PREEMPT_OFFSET)) {
> > +		kernel_fpu_end();
> > +		cond_resched();
> > +		kernel_fpu_begin();
> 
> I can do that but I would still keep it RT only to avoid the
> kernel_fpu_begin/end to be invoked more often on !RT.
> But why that cond_resched()? kernel_fpu_end() ends with preempt_enable()
> and this one should do the trick.

!PREEMPT kernels. The above should work for everyone and would allow
using 'long' kernel_fpu loops:

	kernel_fpu_begin();
	while (work) {
		do_work();
		kernel_fpu_resched();
	}
	kernel_fpu_end();

regardless of preempt setting.
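
E.g. the serpent loop from the patch would then simply become (untested):

	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
		kernel_fpu_resched();
		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
	}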

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 10:43   ` [PATCH RT] arm*: disable NEON in kernel mode Sebastian Andrzej Siewior
@ 2017-12-01 13:45     ` Sebastian Andrzej Siewior
  2017-12-01 14:18       ` Mark Rutland
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-01 13:45 UTC (permalink / raw)
  To: linux-rt-users
  Cc: linux-kernel, tglx, Peter Zijlstra, Steven Rostedt,
	Catalin Marinas, Will Deacon, linux-arm-kernel

+arm folks, to let you know

On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
> NEON in kernel mode is used by the crypto algorithms and raid6 code.
> While the raid6 code looks okay, the crypto algorithms do not: NEON
> is enabled on first invocation and may allocate/free/map memory before
> the NEON mode is disabled again.
> This needs to be changed before it can be enabled.
> On ARM, NEON in kernel mode can simply be disabled. On ARM64 it needs to
> stay on due to possible EFI callbacks, so here I disable each algorithm.
> 
> Cc: stable-rt@vger.kernel.org
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  arch/arm/Kconfig                  |  2 +-
>  arch/arm64/crypto/Kconfig         | 20 ++++++++++----------
>  arch/arm64/crypto/crc32-ce-glue.c |  3 ++-
>  3 files changed, 13 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index d1346a160760..914fecb088a8 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -2164,7 +2164,7 @@ config NEON
>  
>  config KERNEL_MODE_NEON
>  	bool "Support for NEON in kernel mode"
> -	depends on NEON && AEABI
> +	depends on NEON && AEABI && !PREEMPT_RT_BASE
>  	help
>  	  Say Y to include support for NEON in kernel mode.
>  
> diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
> index 70c517aa4501..2a5f05b5a19a 100644
> --- a/arch/arm64/crypto/Kconfig
> +++ b/arch/arm64/crypto/Kconfig
> @@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
>  
>  config CRYPTO_SHA1_ARM64_CE
>  	tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
> -	depends on KERNEL_MODE_NEON
> +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_HASH
>  	select CRYPTO_SHA1
>  
>  config CRYPTO_SHA2_ARM64_CE
>  	tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
> -	depends on KERNEL_MODE_NEON
> +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_HASH
>  	select CRYPTO_SHA256_ARM64
>  
>  config CRYPTO_GHASH_ARM64_CE
>  	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
> -	depends on KERNEL_MODE_NEON
> +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_HASH
>  	select CRYPTO_GF128MUL
>  	select CRYPTO_AES
> @@ -39,7 +39,7 @@ config CRYPTO_GHASH_ARM64_CE
>  
>  config CRYPTO_CRCT10DIF_ARM64_CE
>  	tristate "CRCT10DIF digest algorithm using PMULL instructions"
> -	depends on KERNEL_MODE_NEON && CRC_T10DIF
> +	depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
>  	select CRYPTO_HASH
>  
>  config CRYPTO_CRC32_ARM64_CE
> @@ -53,13 +53,13 @@ config CRYPTO_AES_ARM64
>  
>  config CRYPTO_AES_ARM64_CE
>  	tristate "AES core cipher using ARMv8 Crypto Extensions"
> -	depends on ARM64 && KERNEL_MODE_NEON
> +	depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_ALGAPI
>  	select CRYPTO_AES_ARM64
>  
>  config CRYPTO_AES_ARM64_CE_CCM
>  	tristate "AES in CCM mode using ARMv8 Crypto Extensions"
> -	depends on ARM64 && KERNEL_MODE_NEON
> +	depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_ALGAPI
>  	select CRYPTO_AES_ARM64_CE
>  	select CRYPTO_AES_ARM64
> @@ -67,7 +67,7 @@ config CRYPTO_AES_ARM64_CE_CCM
>  
>  config CRYPTO_AES_ARM64_CE_BLK
>  	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
> -	depends on KERNEL_MODE_NEON
> +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_BLKCIPHER
>  	select CRYPTO_AES_ARM64_CE
>  	select CRYPTO_AES_ARM64
> @@ -75,7 +75,7 @@ config CRYPTO_AES_ARM64_CE_BLK
>  
>  config CRYPTO_AES_ARM64_NEON_BLK
>  	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
> -	depends on KERNEL_MODE_NEON
> +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_BLKCIPHER
>  	select CRYPTO_AES_ARM64
>  	select CRYPTO_AES
> @@ -83,13 +83,13 @@ config CRYPTO_AES_ARM64_NEON_BLK
>  
>  config CRYPTO_CHACHA20_NEON
>  	tristate "NEON accelerated ChaCha20 symmetric cipher"
> -	depends on KERNEL_MODE_NEON
> +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_BLKCIPHER
>  	select CRYPTO_CHACHA20
>  
>  config CRYPTO_AES_ARM64_BS
>  	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
> -	depends on KERNEL_MODE_NEON
> +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>  	select CRYPTO_BLKCIPHER
>  	select CRYPTO_AES_ARM64_NEON_BLK
>  	select CRYPTO_AES_ARM64
> diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
> index 624f4137918c..599de95cd86d 100644
> --- a/arch/arm64/crypto/crc32-ce-glue.c
> +++ b/arch/arm64/crypto/crc32-ce-glue.c
> @@ -206,7 +206,8 @@ static struct shash_alg crc32_pmull_algs[] = { {
>  
>  static int __init crc32_pmull_mod_init(void)
>  {
> -	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
> +	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
> +	    !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
>  		crc32_pmull_algs[0].update = crc32_pmull_update;
>  		crc32_pmull_algs[1].update = crc32c_pmull_update;
>  
> -- 
> 2.15.0

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT v2] crypto: limit more FPU-enabled sections
  2017-12-01 13:44           ` Peter Zijlstra
@ 2017-12-01 13:50             ` Sebastian Andrzej Siewior
  2017-12-01 14:03               ` [PATCH RT v3] " Sebastian Andrzej Siewior
  0 siblings, 1 reply; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-01 13:50 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Steven Rostedt, linux-rt-users, linux-kernel, tglx

On 2017-12-01 14:44:54 [+0100], Peter Zijlstra wrote:
> !PREEMPT kernels. The above should work for everyone and would allow
> using 'long' kernel_fpu loops:
> 
> 	kernel_fpu_begin();
> 	while (work) {
> 		do_work();
> 		kernel_fpu_resched();
> 	}
> 	kernel_fpu_end();
> 
> regardless of preempt setting.

Ach, I wasn't aware that you wanted that explicitly for !PREEMPT
kernels, too. Okay, so let me pick this hunk up as-is for RT, and before
I send it mainline I will add some numbers.

Sebastian

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH RT v3] crypto: limit more FPU-enabled sections
  2017-12-01 13:50             ` Sebastian Andrzej Siewior
@ 2017-12-01 14:03               ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-01 14:03 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Steven Rostedt, linux-rt-users, linux-kernel, tglx

Those crypto drivers use SSE/AVX/… for their crypto work and in order to
do so in kernel they need to enable the "FPU" in kernel mode which
disables preemption.
There are two problems with the way they are used:
- the while loop which processes X bytes may create latency spikes and
  should be avoided or limited.
- the cipher-walk-next part may allocate/free memory and may use
  kmap_atomic().

The whole kernel_fpu_begin()/end() processing probably isn't that cheap.
It most likely makes sense to process as much data as possible in one
go. The new *_fpu_end_rt() helpers end the FPU section on RT once the
SIMD part is done, and kernel_fpu_resched() drops out of the FPU section
if a reschedule is pending.

We should probably measure the performance of those ciphers in pure SW
mode and with these optimisations to see if it makes sense to keep them
for RT.

This kernel_fpu_resched() makes the code more preemptible, which might
hurt performance.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/x86/crypto/camellia_aesni_avx2_glue.c |   20 ++++++++++++++++++++
 arch/x86/crypto/camellia_aesni_avx_glue.c  |   19 +++++++++++++++++++
 arch/x86/crypto/cast6_avx_glue.c           |   24 +++++++++++++++++++-----
 arch/x86/crypto/chacha20_glue.c            |    9 +++++----
 arch/x86/crypto/serpent_avx2_glue.c        |   19 +++++++++++++++++++
 arch/x86/crypto/serpent_avx_glue.c         |   23 +++++++++++++++++++----
 arch/x86/crypto/serpent_sse2_glue.c        |   23 +++++++++++++++++++----
 arch/x86/crypto/twofish_avx_glue.c         |   27 +++++++++++++++++++++++++--
 arch/x86/include/asm/fpu/api.h             |    1 +
 arch/x86/kernel/fpu/core.c                 |   12 ++++++++++++
 10 files changed, 158 insertions(+), 19 deletions(-)

--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -206,6 +206,20 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	camellia_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -221,16 +235,19 @@ static void encrypt_callback(void *priv,
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -251,16 +268,19 @@ static void decrypt_callback(void *priv,
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -210,6 +210,21 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	camellia_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -225,10 +240,12 @@ static void encrypt_callback(void *priv,
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -249,10 +266,12 @@ static void decrypt_callback(void *priv,
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -205,19 +205,33 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	cast6_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAST6_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
-
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__cast6_encrypt(ctx->ctx, srcdst, srcdst);
 }
@@ -228,10 +242,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher
 
 	crypto_chacha20_init(state, ctx, walk.iv);
 
-	kernel_fpu_begin();
-
 	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+		kernel_fpu_begin();
+
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk,
 					 walk.nbytes % CHACHA20_BLOCK_SIZE);
 	}
 
 	if (walk.nbytes) {
+		kernel_fpu_begin();
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				walk.nbytes);
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk, 0);
 	}
 
-	kernel_fpu_end();
-
 	return err;
 }
 
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -184,6 +184,21 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
@@ -199,10 +214,12 @@ static void encrypt_callback(void *priv,
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
@@ -223,10 +240,12 @@ static void decrypt_callback(void *priv,
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		kernel_fpu_resched();
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -218,16 +218,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -241,10 +256,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -187,16 +187,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -210,10 +225,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -218,6 +218,21 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	twofish_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = TF_BLOCK_SIZE;
@@ -228,12 +243,16 @@ static void encrypt_callback(void *priv,
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		kernel_fpu_resched();
 		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
 
+	twofish_fpu_end_rt(ctx);
 	nbytes %= bsize * 3;
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
@@ -250,11 +269,15 @@ static void decrypt_callback(void *priv,
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		kernel_fpu_resched();
 		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
+	twofish_fpu_end_rt(ctx);
 
 	nbytes %= bsize * 3;
 
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void);
 extern void __kernel_fpu_end(void);
 extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
+extern void kernel_fpu_resched(void);
 extern bool irq_fpu_usable(void);
 
 /*
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -137,6 +137,18 @@ void kernel_fpu_end(void)
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
 
+void kernel_fpu_resched(void)
+{
+	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+
+	if (should_resched(PREEMPT_OFFSET)) {
+		kernel_fpu_end();
+		cond_resched();
+		kernel_fpu_begin();
+	}
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
+
 /*
  * Save the FPU state (mark it for reload if necessary):
  *

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 13:45     ` Sebastian Andrzej Siewior
@ 2017-12-01 14:18       ` Mark Rutland
  2017-12-01 14:36         ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 26+ messages in thread
From: Mark Rutland @ 2017-12-01 14:18 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, Peter Zijlstra, Catalin Marinas, Will Deacon,
	linux-kernel, Steven Rostedt, tglx, linux-arm-kernel,
	ard.biesheuvel

[Adding Ard, who wrote the NEON crypto code]

On Fri, Dec 01, 2017 at 02:45:06PM +0100, Sebastian Andrzej Siewior wrote:
> +arm folks, to let you know
> 
> On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
> > NEON in kernel mode is used by the crypto algorithms and raid6 code.
> > While the raid6 code looks okay, the crypto algorithms do not: NEON
> > is enabled on first invocation and may allocate/free/map memory before
> > the NEON mode is disabled again.

Could you elaborate on why this is a problem?

I guess this is because kernel_neon_{begin,end}() disable preemption?

... is this specific to RT?

Thanks,
Mark.

> > This needs to be changed until it can be enabled.
> > On ARM NEON in kernel mode can be simply disabled. on ARM64 it needs to
> > stay on due to possible EFI callbacks so here I disable each algorithm.
> > 
> > Cc: stable-rt@vger.kernel.org
> > Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> > ---
> >  arch/arm/Kconfig                  |  2 +-
> >  arch/arm64/crypto/Kconfig         | 20 ++++++++++----------
> >  arch/arm64/crypto/crc32-ce-glue.c |  3 ++-
> >  3 files changed, 13 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index d1346a160760..914fecb088a8 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -2164,7 +2164,7 @@ config NEON
> >  
> >  config KERNEL_MODE_NEON
> >  	bool "Support for NEON in kernel mode"
> > -	depends on NEON && AEABI
> > +	depends on NEON && AEABI && !PREEMPT_RT_BASE
> >  	help
> >  	  Say Y to include support for NEON in kernel mode.
> >  
> > diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
> > index 70c517aa4501..2a5f05b5a19a 100644
> > --- a/arch/arm64/crypto/Kconfig
> > +++ b/arch/arm64/crypto/Kconfig
> > @@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
> >  
> >  config CRYPTO_SHA1_ARM64_CE
> >  	tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
> > -	depends on KERNEL_MODE_NEON
> > +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_HASH
> >  	select CRYPTO_SHA1
> >  
> >  config CRYPTO_SHA2_ARM64_CE
> >  	tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
> > -	depends on KERNEL_MODE_NEON
> > +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_HASH
> >  	select CRYPTO_SHA256_ARM64
> >  
> >  config CRYPTO_GHASH_ARM64_CE
> >  	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
> > -	depends on KERNEL_MODE_NEON
> > +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_HASH
> >  	select CRYPTO_GF128MUL
> >  	select CRYPTO_AES
> > @@ -39,7 +39,7 @@ config CRYPTO_GHASH_ARM64_CE
> >  
> >  config CRYPTO_CRCT10DIF_ARM64_CE
> >  	tristate "CRCT10DIF digest algorithm using PMULL instructions"
> > -	depends on KERNEL_MODE_NEON && CRC_T10DIF
> > +	depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
> >  	select CRYPTO_HASH
> >  
> >  config CRYPTO_CRC32_ARM64_CE
> > @@ -53,13 +53,13 @@ config CRYPTO_AES_ARM64
> >  
> >  config CRYPTO_AES_ARM64_CE
> >  	tristate "AES core cipher using ARMv8 Crypto Extensions"
> > -	depends on ARM64 && KERNEL_MODE_NEON
> > +	depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_ALGAPI
> >  	select CRYPTO_AES_ARM64
> >  
> >  config CRYPTO_AES_ARM64_CE_CCM
> >  	tristate "AES in CCM mode using ARMv8 Crypto Extensions"
> > -	depends on ARM64 && KERNEL_MODE_NEON
> > +	depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_ALGAPI
> >  	select CRYPTO_AES_ARM64_CE
> >  	select CRYPTO_AES_ARM64
> > @@ -67,7 +67,7 @@ config CRYPTO_AES_ARM64_CE_CCM
> >  
> >  config CRYPTO_AES_ARM64_CE_BLK
> >  	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
> > -	depends on KERNEL_MODE_NEON
> > +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_BLKCIPHER
> >  	select CRYPTO_AES_ARM64_CE
> >  	select CRYPTO_AES_ARM64
> > @@ -75,7 +75,7 @@ config CRYPTO_AES_ARM64_CE_BLK
> >  
> >  config CRYPTO_AES_ARM64_NEON_BLK
> >  	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
> > -	depends on KERNEL_MODE_NEON
> > +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_BLKCIPHER
> >  	select CRYPTO_AES_ARM64
> >  	select CRYPTO_AES
> > @@ -83,13 +83,13 @@ config CRYPTO_AES_ARM64_NEON_BLK
> >  
> >  config CRYPTO_CHACHA20_NEON
> >  	tristate "NEON accelerated ChaCha20 symmetric cipher"
> > -	depends on KERNEL_MODE_NEON
> > +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_BLKCIPHER
> >  	select CRYPTO_CHACHA20
> >  
> >  config CRYPTO_AES_ARM64_BS
> >  	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
> > -	depends on KERNEL_MODE_NEON
> > +	depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >  	select CRYPTO_BLKCIPHER
> >  	select CRYPTO_AES_ARM64_NEON_BLK
> >  	select CRYPTO_AES_ARM64
> > diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
> > index 624f4137918c..599de95cd86d 100644
> > --- a/arch/arm64/crypto/crc32-ce-glue.c
> > +++ b/arch/arm64/crypto/crc32-ce-glue.c
> > @@ -206,7 +206,8 @@ static struct shash_alg crc32_pmull_algs[] = { {
> >  
> >  static int __init crc32_pmull_mod_init(void)
> >  {
> > -	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
> > +	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
> > +	    !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
> >  		crc32_pmull_algs[0].update = crc32_pmull_update;
> >  		crc32_pmull_algs[1].update = crc32c_pmull_update;
> >  
> > -- 
> > 2.15.0
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 14:18       ` Mark Rutland
@ 2017-12-01 14:36         ` Sebastian Andrzej Siewior
  2017-12-01 15:03           ` Ard Biesheuvel
  2017-12-01 17:14           ` Peter Zijlstra
  0 siblings, 2 replies; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-01 14:36 UTC (permalink / raw)
  To: Mark Rutland
  Cc: linux-rt-users, Peter Zijlstra, Catalin Marinas, Will Deacon,
	linux-kernel, Steven Rostedt, tglx, linux-arm-kernel,
	ard.biesheuvel

On 2017-12-01 14:18:28 [+0000], Mark Rutland wrote:
> [Adding Ard, who wrote the NEON crypto code]
> 
> On Fri, Dec 01, 2017 at 02:45:06PM +0100, Sebastian Andrzej Siewior wrote:
> > +arm folks, to let you know
> > 
> > On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
> > > NEON in kernel mode is used by the crypto algorithms and raid6 code.
> > > While the raid6 code looks okay, the crypto algorithms do not: NEON
> > > is enabled on first invocation and may allocate/free/map memory before
> > > the NEON mode is disabled again.
> 
> Could you elaborate on why this is a problem?
> 
> I guess this is because kernel_neon_{begin,end}() disable preemption?
> 
> ... is this specific to RT?

It is RT specific, yes. One thing is the unbounded latency, since
everything in this preempt_disable() section can take time depending on
the size of the request.
The other thing is code like in
  arch/arm64/crypto/aes-ce-ccm-glue.c:ccm_encrypt()

where skcipher_walk_done() is invoked within this preempt_disable()
section. That function can allocate/free/map memory, which is okay for
!RT but not for RT. I tried to break those loops for x86 [0] and simply
didn't have the time to do the same for ARM. I am aware that
saving/restoring the NEON registers (as with SSE and AVX) is expensive
and that doing a lot of operations in one go is desired. So for x86 I
would want to do some benchmarks and come up with numbers based on which
I can argue with people one way or another, depending on how much it
hurts and how long preemption can be disabled.

[0] https://www.spinics.net/lists/kernel/msg2663115.html
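
For illustration, a minimal sketch of the kind of restructuring meant
here, modelled on the chacha20 change from [0]. The names
my_cipher_encrypt() and my_cipher_do_neon() are made up for the example
and do not exist in the tree; IV handling and tail processing are
omitted:

#include <asm/neon.h>
#include <crypto/internal/skcipher.h>
#include <linux/types.h>

/* hypothetical one-chunk NEON transform, not an existing kernel function */
void my_cipher_do_neon(u8 *dst, const u8 *src, unsigned int nbytes);

static int my_cipher_encrypt(struct skcipher_request *req)
{
	struct skcipher_walk walk;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while (walk.nbytes > 0) {
		/* keep only the SIMD work inside the NEON section */
		kernel_neon_begin();
		my_cipher_do_neon(walk.dst.virt.addr, walk.src.virt.addr,
				  walk.nbytes);
		kernel_neon_end();

		/*
		 * skcipher_walk_done() may map/allocate/free memory, so it
		 * runs outside the NEON section and stays preemptible,
		 * which is what RT needs.
		 */
		err = skcipher_walk_done(&walk, 0);
	}

	return err;
}

On !RT this adds one kernel_neon_begin()/end() pair per walk step, which
is exactly why benchmark numbers are needed before pushing it further.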

> Thanks,
> Mark.

Sebastian

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 14:36         ` Sebastian Andrzej Siewior
@ 2017-12-01 15:03           ` Ard Biesheuvel
  2017-12-01 17:58             ` Dave Martin
  2017-12-01 17:14           ` Peter Zijlstra
  1 sibling, 1 reply; 26+ messages in thread
From: Ard Biesheuvel @ 2017-12-01 15:03 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Mark Rutland, linux-rt-users, Peter Zijlstra, Catalin Marinas,
	Will Deacon, linux-kernel, Steven Rostedt, tglx,
	linux-arm-kernel


> On 1 Dec 2017, at 14:36, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
>> On 2017-12-01 14:18:28 [+0000], Mark Rutland wrote:
>> [Adding Ard, who wrote the NEON crypto code]
>> 
>>> On Fri, Dec 01, 2017 at 02:45:06PM +0100, Sebastian Andrzej Siewior wrote:
>>> +arm folks, to let you know
>>> 
>>>> On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
>>>> NEON in kernel mode is used by the crypto algorithms and raid6 code.
>>>> While the raid6 code looks okay, the crypto algorithms do not: NEON
>>>> is enabled on first invocation and may allocate/free/map memory before
>>>> the NEON mode is disabled again.
>> 
>> Could you elaborate on why this is a problem?
>> 
>> I guess this is because kernel_neon_{begin,end}() disable preemption?
>> 
>> ... is this specific to RT?
> 
> It is RT specific, yes. One thing are the unbounded latencies since
> everything in this preempt_disable section can take time depending on
> the size of the request.
> The other thing is code like in
>  arch/arm64/crypto/aes-ce-ccm-glue.c:ccm_encrypt()
> 
> where within this preempt_disable() section skcipher_walk_done() is
> invoked. That function can allocate/free/map memory which is okay for
> !RT but is not for RT. I tried to break those loops for x86 [0] and I
> simply didn't had the time to do the same for ARM. I am aware that
> store/restore of the NEON registers (as SSE and AVX) is expensive and
> doing a lot of operations in one go is desired.

I wouldn't mind fixing the code instead. We never disable the NEON, but
only stack the contents of the registers the first time, and unstack
them only before returning to userland (with the exception of nested
NEON use in softirq context). When this code was introduced, we always
stacked/unstacked the whole register file eagerly every time.


> So for x86 I would want
> to do some benchmarks and come up with some numbers based on which I
> can argue with people one way or another depending on how much it hurts
> and how long preemption can be disabled.
> 
> [0] https://www.spinics.net/lists/kernel/msg2663115.html
> 
>> Thanks,
>> Mark.
> 
> Sebastian

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 14:36         ` Sebastian Andrzej Siewior
  2017-12-01 15:03           ` Ard Biesheuvel
@ 2017-12-01 17:14           ` Peter Zijlstra
  2017-12-01 17:31             ` Russell King - ARM Linux
  1 sibling, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2017-12-01 17:14 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Mark Rutland, linux-rt-users, Catalin Marinas, Will Deacon,
	linux-kernel, Steven Rostedt, tglx, linux-arm-kernel,
	ard.biesheuvel

On Fri, Dec 01, 2017 at 03:36:48PM +0100, Sebastian Andrzej Siewior wrote:
> On 2017-12-01 14:18:28 [+0000], Mark Rutland wrote:
> > [Adding Ard, who wrote the NEON crypto code]
> > 
> > On Fri, Dec 01, 2017 at 02:45:06PM +0100, Sebastian Andrzej Siewior wrote:
> > > +arm folks, to let you know
> > > 
> > > On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
> > > > NEON in kernel mode is used by the crypto algorithms and raid6 code.
> > > > While the raid6 code looks okay, the crypto algorithms do not: NEON
> > > > is enabled on first invocation and may allocate/free/map memory before
> > > > the NEON mode is disabled again.
> > 
> > Could you elaborate on why this is a problem?
> > 
> > I guess this is because kernel_neon_{begin,end}() disable preemption?
> > 
> > ... is this specific to RT?
> 
> It is RT specific, yes. One thing are the unbounded latencies since
> everything in this preempt_disable section can take time depending on
> the size of the request.

Well, PREEMPT cares about that too.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 17:14           ` Peter Zijlstra
@ 2017-12-01 17:31             ` Russell King - ARM Linux
  2017-12-01 17:39               ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Russell King - ARM Linux @ 2017-12-01 17:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Sebastian Andrzej Siewior, Mark Rutland, linux-rt-users,
	ard.biesheuvel, Catalin Marinas, Will Deacon, linux-kernel,
	Steven Rostedt, tglx, linux-arm-kernel

On Fri, Dec 01, 2017 at 06:14:58PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 01, 2017 at 03:36:48PM +0100, Sebastian Andrzej Siewior wrote:
> > On 2017-12-01 14:18:28 [+0000], Mark Rutland wrote:
> > > [Adding Ard, who wrote the NEON crypto code]
> > > 
> > > On Fri, Dec 01, 2017 at 02:45:06PM +0100, Sebastian Andrzej Siewior wrote:
> > > > +arm folks, to let you know
> > > > 
> > > > On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
> > > > > NEON in kernel mode is used by the crypto algorithms and raid6 code.
> > > > > While the raid6 code looks okay, the crypto algorithms do not: NEON
> > > > > is enabled on first invocation and may allocate/free/map memory before
> > > > > the NEON mode is disabled again.
> > > 
> > > Could you elaborate on why this is a problem?
> > > 
> > > I guess this is because kernel_neon_{begin,end}() disable preemption?
> > > 
> > > ... is this specific to RT?
> > 
> > It is RT specific, yes. One thing are the unbounded latencies since
> > everything in this preempt_disable section can take time depending on
> > the size of the request.
> 
> Well, PREEMPT cares about that too.

Preempt may care, but it's the hit you take to use neon in the kernel.
The neon register set is shared with the FPU, so preempting during that
path means that the normal FPU register saving would corrupt the
already saved user FPU context - and, even worse, would result in the
kernel's crypto function register contents being leaked to userspace.

If you care about preempt deeply, the only solution is to avoid using
kernel mode neon.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 17:31             ` Russell King - ARM Linux
@ 2017-12-01 17:39               ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2017-12-01 17:39 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Sebastian Andrzej Siewior, Mark Rutland, linux-rt-users,
	ard.biesheuvel, Catalin Marinas, Will Deacon, linux-kernel,
	Steven Rostedt, tglx, linux-arm-kernel

On Fri, Dec 01, 2017 at 05:31:20PM +0000, Russell King - ARM Linux wrote:
> On Fri, Dec 01, 2017 at 06:14:58PM +0100, Peter Zijlstra wrote:
> > Well, PREEMPT cares about that too.
> 
> Preempt may care, but it's the hit you take to use neon in the kernel.
> The neon register set shares with the FPU, so preempting during that
> path means that the normal FPU register saving would corrupt the
> already saved user FPU context - and even worse would result in the
> kernel's crypto function register contents being leaked to userspace.

Same thing on x86.

> If you care about preempt deeply, the only solution is to avoid using
> kernel mode neon.

Not quite: you can write the code such that it drops out of neon mode
regularly to allow preemption.

So set up a crypto block, enter neon, transform the block, drop out of
neon, rinse, repeat.
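
A rough sketch of that pattern, with made-up names (my_transform(),
my_transform_neon()) and an arbitrary 4 KiB bound on the work done per
NEON section:

#include <asm/neon.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/types.h>

#define MY_NEON_CHUNK	SZ_4K	/* arbitrary bound on work per NEON section */

/* hypothetical NEON transform for one contiguous chunk */
void my_transform_neon(u8 *dst, const u8 *src, unsigned int len);

static void my_transform(u8 *dst, const u8 *src, unsigned int len)
{
	while (len) {
		unsigned int chunk = min_t(unsigned int, len, MY_NEON_CHUNK);

		kernel_neon_begin();
		my_transform_neon(dst, src, chunk);
		kernel_neon_end();	/* preemption point between chunks */

		dst += chunk;
		src += chunk;
		len -= chunk;
	}
}

The cost is one extra register save/restore per chunk, so the chunk
size is a latency vs. throughput trade-off.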

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 15:03           ` Ard Biesheuvel
@ 2017-12-01 17:58             ` Dave Martin
  2017-12-01 18:08               ` Russell King - ARM Linux
  0 siblings, 1 reply; 26+ messages in thread
From: Dave Martin @ 2017-12-01 17:58 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Sebastian Andrzej Siewior, Mark Rutland, linux-rt-users,
	Peter Zijlstra, Catalin Marinas, Will Deacon, linux-kernel,
	Steven Rostedt, tglx, linux-arm-kernel

On Fri, Dec 01, 2017 at 03:03:35PM +0000, Ard Biesheuvel wrote:
> 
> l
> > On 1 Dec 2017, at 14:36, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> > 
> >> On 2017-12-01 14:18:28 [+0000], Mark Rutland wrote:
> >> [Adding Ard, who wrote the NEON crypto code]
> >> 
> >>> On Fri, Dec 01, 2017 at 02:45:06PM +0100, Sebastian Andrzej Siewior wrote:
> >>> +arm folks, to let you know
> >>> 
> >>>> On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
> >>>> NEON in kernel mode is used by the crypto algorithms and raid6 code.
> >>>> While the raid6 code looks okay, the crypto algorithms do not: NEON
> >>>> is enabled on first invocation and may allocate/free/map memory before
> >>>> the NEON mode is disabled again.
> >> 
> >> Could you elaborate on why this is a problem?
> >> 
> >> I guess this is because kernel_neon_{begin,end}() disable preemption?
> >> 
> >> ... is this specific to RT?
> > 
> > It is RT specific, yes. One thing are the unbounded latencies since
> > everything in this preempt_disable section can take time depending on
> > the size of the request.
> > The other thing is code like in
> >  arch/arm64/crypto/aes-ce-ccm-glue.c:ccm_encrypt()
> > 
> > where within this preempt_disable() section skcipher_walk_done() is
> > invoked. That function can allocate/free/map memory which is okay for
> > !RT but is not for RT. I tried to break those loops for x86 [0] and I
> > simply didn't had the time to do the same for ARM. I am aware that
> > store/restore of the NEON registers (as SSE and AVX) is expensive and
> > doing a lot of operations in one go is desired.
> 
> I wouldn’t mind fixing the code instead. We never disable the neon,
> but only stack the contents of the registers the first time, and
> unstack them only before returning to userland (with the exception of
> nested neon use in softirq context). When this code was introduced,
> we always stacked/unstacked the whole register file eagerly every
> time.

+1, at least for arm64.  I don't see a really compelling reason for
holding kernel-mode NEON around memory management now that we have a
strict save-once-restore-lazily model.

This may not work so well for arm though -- I haven't looked at that
code for a while.


If there is memory management in any core loop, you have already lost
the performance battle, and an extra
kernel_neon_end()+kernel_neon_begin() may not be that catastrophic.

[...]

Cheers
---Dave

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 17:58             ` Dave Martin
@ 2017-12-01 18:08               ` Russell King - ARM Linux
  2017-12-01 18:24                 ` Dave Martin
  0 siblings, 1 reply; 26+ messages in thread
From: Russell King - ARM Linux @ 2017-12-01 18:08 UTC (permalink / raw)
  To: Dave Martin
  Cc: Ard Biesheuvel, Mark Rutland, linux-rt-users, Peter Zijlstra,
	Catalin Marinas, Sebastian Andrzej Siewior, Will Deacon,
	linux-kernel, Steven Rostedt, tglx, linux-arm-kernel

On Fri, Dec 01, 2017 at 05:58:45PM +0000, Dave Martin wrote:
> +1, at least for arm64.  I don't see a really compelling reason for
> holding kernel-mode NEON around memory management now that we have a
> strict save-once-restore-lazily model.
> 
> This may not work so well for arm though -- I haven't looked at that
> code for a while.
> 
> 
> If there is memory manamement in any core loop, you already lost
> the performance battle, and an extra
> kernel_neon_end()+kernel_neon_begin() may not be that catastrophic.

Remember that we don't permit context switches while kernel neon is
in use on ARM - if there's any possibility of scheduling to occur,
the get_cpu() in kernel_neon_begin() should trigger a schedule-while-
atomic warning.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 18:08               ` Russell King - ARM Linux
@ 2017-12-01 18:24                 ` Dave Martin
  2017-12-01 19:20                   ` Ard Biesheuvel
  2017-12-04  9:21                   ` Sebastian Andrzej Siewior
  0 siblings, 2 replies; 26+ messages in thread
From: Dave Martin @ 2017-12-01 18:24 UTC (permalink / raw)
  To: Russell King - ARM Linux, Sebastian Andrzej Siewior
  Cc: Mark Rutland, linux-rt-users, Ard Biesheuvel, Peter Zijlstra,
	Catalin Marinas, Will Deacon, linux-kernel, Steven Rostedt, tglx,
	linux-arm-kernel

On Fri, Dec 01, 2017 at 06:08:28PM +0000, Russell King - ARM Linux wrote:
> On Fri, Dec 01, 2017 at 05:58:45PM +0000, Dave Martin wrote:
> > +1, at least for arm64.  I don't see a really compelling reason for
> > holding kernel-mode NEON around memory management now that we have a
> > strict save-once-restore-lazily model.
> > 
> > This may not work so well for arm though -- I haven't looked at that
> > code for a while.
> > 
> > 
> > If there is memory manamement in any core loop, you already lost
> > the performance battle, and an extra
> > kernel_neon_end()+kernel_neon_begin() may not be that catastrophic.
> 
> Remember that we don't permit context switches while kernel neon is
> in use on ARM - if there's any possibility of scheduling to occur,
> the get_cpu() in kernel_neon_begin() should trigger a schedule-while-
> atomic warning.

Agreed, and on arm64 the same is true.  (Actually softirq is disabled
too, for tortuous reasons involving SVE.)

On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
> NEON in kernel mode is used by the crypto algorithms and raid6 code.
> While the raid6 code looks okay, the crypto algorithms do not: NEON
> is enabled on first invocation and may allocate/free/map memory before
> the NEON mode is disabled again.
> This needs to be changed until it can be enabled.
> On ARM NEON in kernel mode can be simply disabled. on ARM64 it needs to
> stay on due to possible EFI callbacks so here I disable each algorithm.

[...]

> diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
> index 70c517aa4501..2a5f05b5a19a 100644
> --- a/arch/arm64/crypto/Kconfig
> +++ b/arch/arm64/crypto/Kconfig
> @@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
>
>  config CRYPTO_SHA1_ARM64_CE
>     tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
> -   depends on KERNEL_MODE_NEON
> +   depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>     select CRYPTO_HASH
>     select CRYPTO_SHA1

Sebastian, can you point to where sha1 (say) hits this issue?
I wonder whether this is really a sign that some refactoring is needed.

Cheers
---Dave

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 18:24                 ` Dave Martin
@ 2017-12-01 19:20                   ` Ard Biesheuvel
  2017-12-04  9:21                   ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 26+ messages in thread
From: Ard Biesheuvel @ 2017-12-01 19:20 UTC (permalink / raw)
  To: Dave Martin
  Cc: Russell King - ARM Linux, Sebastian Andrzej Siewior,
	Mark Rutland, linux-rt-users, Peter Zijlstra, Catalin Marinas,
	Will Deacon, linux-kernel, Steven Rostedt, tglx,
	linux-arm-kernel

On 1 December 2017 at 18:24, Dave Martin <Dave.Martin@arm.com> wrote:
> On Fri, Dec 01, 2017 at 06:08:28PM +0000, Russell King - ARM Linux wrote:
>> On Fri, Dec 01, 2017 at 05:58:45PM +0000, Dave Martin wrote:
>> > +1, at least for arm64.  I don't see a really compelling reason for
>> > holding kernel-mode NEON around memory management now that we have a
>> > strict save-once-restore-lazily model.
>> >
>> > This may not work so well for arm though -- I haven't looked at that
>> > code for a while.
>> >
>> >
>> > If there is memory manamement in any core loop, you already lost
>> > the performance battle, and an extra
>> > kernel_neon_end()+kernel_neon_begin() may not be that catastrophic.
>>
>> Remember that we don't permit context switches while kernel neon is
>> in use on ARM - if there's any possibility of scheduling to occur,
>> the get_cpu() in kernel_neon_begin() should trigger a schedule-while-
>> atomic warning.
>
> Agreed, and an arm64 the same is true.  (Actually softirq is disabled
> too, for tortuous reasons involving SVE.)
>
> On 2017-12-01 11:43:32 [+0100], To linux-rt-users@vger.kernel.org wrote:
>> NEON in kernel mode is used by the crypto algorithms and raid6 code.
>> While the raid6 code looks okay, the crypto algorithms do not: NEON
>> is enabled on first invocation and may allocate/free/map memory before
>> the NEON mode is disabled again.
>> This needs to be changed until it can be enabled.
>> On ARM NEON in kernel mode can be simply disabled. on ARM64 it needs to
>> stay on due to possible EFI callbacks so here I disable each algorithm.
>
> [...]
>
>> diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
>> index 70c517aa4501..2a5f05b5a19a 100644
>> --- a/arch/arm64/crypto/Kconfig
>> +++ b/arch/arm64/crypto/Kconfig
>> @@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
>>
>>  config CRYPTO_SHA1_ARM64_CE
>>     tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
>> -   depends on KERNEL_MODE_NEON
>> +   depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
>>     select CRYPTO_HASH
>>     select CRYPTO_SHA1
>
> Sebastian, can you piont to where sha1 (say) hits this issue?
> I wonder whether this is really a sign that some refactoring is needed.
>

I don't think the SHA code is affected. It is primarily the block
ciphers, where the scatterwalk API offers some guarantees to the API
client (the NEON crypto driver) that data is presented in suitable
blocks. This may involve copying data from the scatterlist to a temp
buffer if any of the chunks are not blocksize aligned.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RT] arm*: disable NEON in kernel mode
  2017-12-01 18:24                 ` Dave Martin
  2017-12-01 19:20                   ` Ard Biesheuvel
@ 2017-12-04  9:21                   ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 26+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-12-04  9:21 UTC (permalink / raw)
  To: Dave Martin
  Cc: Russell King - ARM Linux, Mark Rutland, linux-rt-users,
	Ard Biesheuvel, Peter Zijlstra, Catalin Marinas, Will Deacon,
	linux-kernel, Steven Rostedt, tglx, linux-arm-kernel

On 2017-12-01 18:24:10 [+0000], Dave Martin wrote:
> > diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
> > index 70c517aa4501..2a5f05b5a19a 100644
> > --- a/arch/arm64/crypto/Kconfig
> > +++ b/arch/arm64/crypto/Kconfig
> > @@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
> >
> >  config CRYPTO_SHA1_ARM64_CE
> >     tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
> > -   depends on KERNEL_MODE_NEON
> > +   depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
> >     select CRYPTO_HASH
> >     select CRYPTO_SHA1
> 
> Sebastian, can you piont to where sha1 (say) hits this issue?
> I wonder whether this is really a sign that some refactoring is needed.

I disabled all NEON in one go. Looking at this, it shouldn't be an issue
with the block-walk. The only thing that might be a problem is that it
can do multiple blocks in one go.

> Cheers
> ---Dave

Sebastian

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2017-12-04  9:22 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-30 14:22 [PATCH RT] crypto: limit more FPU-enabled sections Sebastian Andrzej Siewior
2017-11-30 14:30 ` Sebastian Andrzej Siewior
2017-12-01 10:43   ` [PATCH RT] arm*: disable NEON in kernel mode Sebastian Andrzej Siewior
2017-12-01 13:45     ` Sebastian Andrzej Siewior
2017-12-01 14:18       ` Mark Rutland
2017-12-01 14:36         ` Sebastian Andrzej Siewior
2017-12-01 15:03           ` Ard Biesheuvel
2017-12-01 17:58             ` Dave Martin
2017-12-01 18:08               ` Russell King - ARM Linux
2017-12-01 18:24                 ` Dave Martin
2017-12-01 19:20                   ` Ard Biesheuvel
2017-12-04  9:21                   ` Sebastian Andrzej Siewior
2017-12-01 17:14           ` Peter Zijlstra
2017-12-01 17:31             ` Russell King - ARM Linux
2017-12-01 17:39               ` Peter Zijlstra
2017-11-30 15:19 ` [PATCH RT] crypto: limit more FPU-enabled sections Steven Rostedt
2017-11-30 15:22   ` Sebastian Andrzej Siewior
2017-12-01 10:44     ` [PATCH RT v2] " Sebastian Andrzej Siewior
2017-12-01 11:32       ` Peter Zijlstra
2017-12-01 13:32         ` Sebastian Andrzej Siewior
2017-12-01 13:44           ` Peter Zijlstra
2017-12-01 13:50             ` Sebastian Andrzej Siewior
2017-12-01 14:03               ` [PATCH RT v3] " Sebastian Andrzej Siewior
2017-11-30 15:29 ` [PATCH RT] " Steven Rostedt
2017-11-30 15:41   ` Sebastian Andrzej Siewior
2017-11-30 16:18     ` Steven Rostedt
