From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1752260AbdLAKo2 convert rfc822-to-8bit (ORCPT );
	Fri, 1 Dec 2017 05:44:28 -0500
Received: from Galois.linutronix.de ([146.0.238.70]:49574 "EHLO
	Galois.linutronix.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751689AbdLAKo0 (ORCPT );
	Fri, 1 Dec 2017 05:44:26 -0500
Date: Fri, 1 Dec 2017 11:44:22 +0100
From: Sebastian Andrzej Siewior 
To: Steven Rostedt 
Cc: linux-rt-users@vger.kernel.org, linux-kernel@vger.kernel.org,
	tglx@linutronix.de, Peter Zijlstra 
Subject: [PATCH RT v2] crypto: limit more FPU-enabled sections
Message-ID: <20171201104422.GC1612@linutronix.de>
References: <20171130142216.GB12606@linutronix.de>
	<20171130101943.15f7ade4@gandalf.local.home>
	<20171130152236.GA11362@linutronix.de>
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
Content-Transfer-Encoding: 8BIT
In-Reply-To: <20171130152236.GA11362@linutronix.de>
User-Agent: Mutt/1.9.1 (2017-09-22)
Sender: linux-kernel-owner@vger.kernel.org
List-ID: 
X-Mailing-List: linux-kernel@vger.kernel.org

Those crypto drivers use SSE/AVX/… for their crypto work and, in order
to do so in the kernel, they need to enable the "FPU" in kernel mode,
which disables preemption. There are two problems with the way they are
used:

- the while loop which processes X bytes may create latency spikes and
  should be avoided or limited.

- the cipher-walk-next part may allocate/free memory and may use
  kmap_atomic().

The whole kernel_fpu_begin()/end() processing probably isn't that cheap,
so it most likely makes sense to process as much data as possible within
one FPU-enabled section. The new *_fpu_sched_rt() schedules only if an
RT task is pending.

We should probably measure the performance of those ciphers in pure SW
mode and with these optimisations to see whether it makes sense to keep
them for RT.
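For illustration only (a sketch, not part of the diff below; the function
name is made up, and the per-cipher helpers in the patch differ slightly,
e.g. the camellia variant also drops the cipher's own FPU state first),
the *_fpu_sched_rt() idea boils down to:

static void example_fpu_sched_rt(struct crypt_priv *ctx)
{
	/* only meaningful inside a kernel-FPU section, and only if an
	 * RT task is waiting for this CPU
	 */
	if (!ctx->fpu_enabled || !tif_need_resched_now())
		return;
	kernel_fpu_end();	/* preemption is enabled again here ... */
	/* ... so the scheduler can run the pending RT task */
	kernel_fpu_begin();	/* re-enter the FPU section and carry on */
}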
Signed-off-by: Sebastian Andrzej Siewior
---
 arch/x86/crypto/camellia_aesni_avx2_glue.c | 34 +++++++++++++++++++++++++
 arch/x86/crypto/camellia_aesni_avx_glue.c  | 32 +++++++++++++++++++++++
 arch/x86/crypto/cast6_avx_glue.c           | 24 ++++++++++++++---
 arch/x86/crypto/chacha20_glue.c            |  9 +++---
 arch/x86/crypto/serpent_avx2_glue.c        | 31 +++++++++++++++++++++++
 arch/x86/crypto/serpent_avx_glue.c         | 23 ++++++++++++++---
 arch/x86/crypto/serpent_sse2_glue.c        | 23 ++++++++++++++---
 arch/x86/crypto/twofish_avx_glue.c         | 39 +++++++++++++++++++++++++++--
 8 files changed, 196 insertions(+), 19 deletions(-)

--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -206,6 +206,34 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	camellia_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	camellia_fpu_end(fpu_enabled);
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+}
+
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -221,16 +249,19 @@ static void encrypt_callback(void *priv,
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -251,16 +282,19 @@ static void decrypt_callback(void *priv,
 	}
 
 	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -210,6 +210,34 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	camellia_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	camellia_fpu_end(fpu_enabled);
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+}
+
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+static void camellia_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -225,10 +253,12 @@ static void encrypt_callback(void *priv,
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -249,10 +279,12 @@ static void decrypt_callback(void *priv,
 	}
 
 	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_fpu_sched_rt(ctx);
 		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
 		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
 	}
+	camellia_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -205,19 +205,33 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	cast6_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = CAST6_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
-
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__cast6_encrypt(ctx->ctx, srcdst, srcdst);
 }
@@ -228,10 +242,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		cast6_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher
 
 	crypto_chacha20_init(state, ctx, walk.iv);
 
-	kernel_fpu_begin();
-
 	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+		kernel_fpu_begin();
+
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk,
 					 walk.nbytes % CHACHA20_BLOCK_SIZE);
 	}
 
 	if (walk.nbytes) {
+		kernel_fpu_begin();
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				walk.nbytes);
+		kernel_fpu_end();
 		err = skcipher_walk_done(&walk, 0);
 	}
 
-	kernel_fpu_end();
-
 	return err;
 }
 
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -184,6 +184,33 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+static void serpent_fpu_sched_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+static void serpent_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
@@ -199,10 +226,12 @@ static void encrypt_callback(void *priv,
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_fpu_sched_rt(ctx);
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
@@ -223,10 +252,12 @@ static void decrypt_callback(void *priv,
 	}
 
 	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_fpu_sched_rt(ctx);
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
 		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
 		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
 	}
+	serpent_fpu_end_rt(ctx);
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -218,16 +218,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -241,10 +256,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -187,16 +187,31 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	serpent_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = SERPENT_BLOCK_SIZE;
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
@@ -210,10 +225,10 @@ static void decrypt_callback(void *priv,
 	struct crypt_priv *ctx = priv;
 	int i;
 
-	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_fpu_end_rt(ctx);
 		return;
 	}
 
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -218,6 +218,33 @@ struct crypt_priv {
 	bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled)
+		return;
+	twofish_fpu_end(fpu_enabled);
+	ctx->fpu_enabled = false;
+}
+
+static void twofish_fpu_sched_rt(struct crypt_priv *ctx)
+{
+	bool fpu_enabled = ctx->fpu_enabled;
+
+	if (!fpu_enabled || !tif_need_resched_now())
+		return;
+	kernel_fpu_end();
+	/* schedule due to preemptible */
+	kernel_fpu_begin();
+}
+
+#else
+static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
+static void twofish_fpu_sched_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = TF_BLOCK_SIZE;
@@ -228,12 +255,16 @@ static void encrypt_callback(void *priv,
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		twofish_fpu_sched_rt(ctx);
 		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
+	twofish_fpu_end_rt(ctx);
 
 	nbytes %= bsize * 3;
 
 	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
@@ -250,11 +281,15 @@ static void decrypt_callback(void *priv,
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
 		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		twofish_fpu_end_rt(ctx);
 		return;
 	}
 
-	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+		twofish_fpu_sched_rt(ctx);
 		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+	}
+	twofish_fpu_end_rt(ctx);
 
 	nbytes %= bsize * 3;