linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/2] lib/crypto: blake2s: include as built-in
@ 2021-12-23 14:11 Jason A. Donenfeld
  2021-12-23 14:11 ` [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction Jason A. Donenfeld
                   ` (3 more replies)
  0 siblings, 4 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2021-12-23 14:11 UTC (permalink / raw)
  To: linux-kernel, tytso, gregkh
  Cc: Jason A. Donenfeld, Ard Biesheuvel, Masahiro Yamada,
	linux-kbuild, Herbert Xu, linux-crypto

In preparation for using blake2s in the RNG, we change the way that it
is wired-in to the build system. Instead of kconfig mazes and ifdefs, we
use weak symbols, so that an arch version can override the generic
version. Then we include the generic version in lib-y, so that it can be
removed from the image if the arch version doesn't fallback to it (as is
the case on arm though not x86). The result should be a bit simpler and
smaller than the code it replaces.

Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: linux-kbuild@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: linux-crypto@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
Herbert - I intend to take this via the crng/random.git tree, since it
forms a dependency and I'd like to send a pull early in 5.17 cycle.

 Makefile                          |  2 +-
 arch/arm/crypto/Kconfig           |  3 +--
 arch/arm/crypto/blake2s-core.S    |  8 ++++----
 arch/arm/crypto/blake2s-glue.c    |  6 +++---
 arch/s390/configs/debug_defconfig |  1 -
 arch/s390/configs/defconfig       |  1 -
 arch/x86/crypto/blake2s-glue.c    | 11 +++++------
 crypto/Kconfig                    |  5 +----
 drivers/net/Kconfig               |  1 -
 include/crypto/internal/blake2s.h |  6 +++---
 lib/Makefile                      |  2 +-
 lib/crypto/Kconfig                | 25 -------------------------
 lib/crypto/Makefile               |  7 +++----
 lib/crypto/blake2s-generic.c      |  6 +++++-
 lib/crypto/blake2s.c              |  6 ------
 15 files changed, 27 insertions(+), 63 deletions(-)

diff --git a/Makefile b/Makefile
index d85f1ff79f5c..892ea632ea63 100644
--- a/Makefile
+++ b/Makefile
@@ -668,7 +668,7 @@ drivers-y	:= drivers/ sound/
 drivers-$(CONFIG_SAMPLES) += samples/
 drivers-$(CONFIG_NET) += net/
 drivers-y	+= virt/
-libs-y		:= lib/
+libs-y		:= lib/ lib/crypto/
 endif # KBUILD_EXTMOD
 
 # The all: target is the default when no target is given on the
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 2b575792363e..47cb22645746 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -63,8 +63,7 @@ config CRYPTO_SHA512_ARM
 	  using optimized ARM assembler and NEON, when available.
 
 config CRYPTO_BLAKE2S_ARM
-	tristate "BLAKE2s digest algorithm (ARM)"
-	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+	bool "BLAKE2s digest algorithm (ARM)"
 	help
 	  BLAKE2s digest algorithm optimized with ARM scalar instructions.  This
 	  is faster than the generic implementations of BLAKE2s and BLAKE2b, but
diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
index 86345751bbf3..df40e46601f1 100644
--- a/arch/arm/crypto/blake2s-core.S
+++ b/arch/arm/crypto/blake2s-core.S
@@ -167,8 +167,8 @@
 .endm
 
 //
-// void blake2s_compress_arch(struct blake2s_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_state *state,
+//			 const u8 *block, size_t nblocks, u32 inc);
 //
 // Only the first three fields of struct blake2s_state are used:
 //	u32 h[8];	(inout)
@@ -176,7 +176,7 @@
 //	u32 f[2];	(in)
 //
 	.align		5
-ENTRY(blake2s_compress_arch)
+ENTRY(blake2s_compress)
 	push		{r0-r2,r4-r11,lr}	// keep this an even number
 
 .Lnext_block:
@@ -303,4 +303,4 @@ ENTRY(blake2s_compress_arch)
 	str		r3, [r12], #4
 	bne		1b
 	b		.Lcopy_block_done
-ENDPROC(blake2s_compress_arch)
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
index f2cc1e5fc9ec..09d3a0cabd2c 100644
--- a/arch/arm/crypto/blake2s-glue.c
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -11,17 +11,17 @@
 #include <linux/module.h>
 
 /* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_arm(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index e45cc27716de..caa3d1d6a0e8 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -757,7 +757,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_STATS=y
-CONFIG_CRYPTO_LIB_BLAKE2S=m
 CONFIG_CRYPTO_LIB_CURVE25519=m
 CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
 CONFIG_ZCRYPT=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 1c750bfca2d8..fffc6af5358c 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -744,7 +744,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_STATS=y
-CONFIG_CRYPTO_LIB_BLAKE2S=m
 CONFIG_CRYPTO_LIB_CURVE25519=m
 CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
 CONFIG_ZCRYPT=m
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..ef91a3167d27 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -28,9 +28,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-void blake2s_compress_arch(struct blake2s_state *state,
-			   const u8 *block, size_t nblocks,
-			   const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,17 +55,17 @@ void blake2s_compress_arch(struct blake2s_state *state,
 		block += blocks * BLAKE2S_BLOCK_SIZE;
 	} while (nblocks);
 }
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_x86(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 285f82647d2b..bfda2c82774d 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -685,7 +685,6 @@ config CRYPTO_BLAKE2B
 
 config CRYPTO_BLAKE2S
 	tristate "BLAKE2s digest algorithm"
-	select CRYPTO_LIB_BLAKE2S_GENERIC
 	select CRYPTO_HASH
 	help
 	  Implementation of cryptographic hash function BLAKE2s
@@ -702,10 +701,8 @@ config CRYPTO_BLAKE2S
 	  See https://blake2.net for further information.
 
 config CRYPTO_BLAKE2S_X86
-	tristate "BLAKE2s digest algorithm (x86 accelerated version)"
+	bool "BLAKE2s digest algorithm (x86 accelerated version)"
 	depends on X86 && 64BIT
-	select CRYPTO_LIB_BLAKE2S_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 
 config CRYPTO_CRCT10DIF
 	tristate "CRCT10DIF algorithm"
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6cccc3dc00bc..b2a4f998c180 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -81,7 +81,6 @@ config WIREGUARD
 	select CRYPTO
 	select CRYPTO_LIB_CURVE25519
 	select CRYPTO_LIB_CHACHA20POLY1305
-	select CRYPTO_LIB_BLAKE2S
 	select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
 	select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
 	select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 8e50d487500f..d39cfa0d333e 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -11,11 +11,11 @@
 #include <crypto/internal/hash.h>
 #include <linux/string.h>
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc);
 
-void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
-			   size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc);
 
 bool blake2s_selftest(void);
 
diff --git a/lib/Makefile b/lib/Makefile
index 364c23f15578..bb57b2e466fa 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -139,7 +139,7 @@ endif
 obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o
 CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any)
 
-obj-y += math/ crypto/
+obj-y += math/
 
 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
 obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 545ccbddf6a1..31c6e2be3b84 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -8,31 +8,6 @@ config CRYPTO_LIB_AES
 config CRYPTO_LIB_ARC4
 	tristate
 
-config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	tristate
-	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the Blake2s library interface,
-	  either builtin or as a module.
-
-config CRYPTO_LIB_BLAKE2S_GENERIC
-	tristate
-	help
-	  This symbol can be depended upon by arch implementations of the
-	  Blake2s library interface that require the generic code as a
-	  fallback, e.g., for SIMD implementations. If no arch specific
-	  implementation is enabled, this implementation serves the users
-	  of CRYPTO_LIB_BLAKE2S.
-
-config CRYPTO_LIB_BLAKE2S
-	tristate "BLAKE2s hash function library"
-	depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
-	help
-	  Enable the Blake2s library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
-
 config CRYPTO_ARCH_HAVE_LIB_CHACHA
 	tristate
 	help
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 73205ed269ba..42e1d932c077 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -10,10 +10,9 @@ libaes-y					:= aes.o
 obj-$(CONFIG_CRYPTO_LIB_ARC4)			+= libarc4.o
 libarc4-y					:= arc4.o
 
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= libblake2s-generic.o
-libblake2s-generic-y				+= blake2s-generic.o
-
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)		+= libblake2s.o
+# blake2s is used by the /dev/random driver which is always builtin
+lib-y						+= blake2s-generic.o
+obj-y						+= libblake2s.o
 libblake2s-y					+= blake2s.o
 
 obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)	+= libchacha20poly1305.o
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 04ff8df24513..75ccb3e633e6 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -37,7 +37,11 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
 	state->t[1] += (state->t[0] < inc);
 }
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
+		      __weak __alias(blake2s_compress_generic);
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc)
 {
 	u32 m[16];
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 4055aa593ec4..93f2ae051370 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -16,12 +16,6 @@
 #include <linux/init.h>
 #include <linux/bug.h>
 
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
-#  define blake2s_compress blake2s_compress_arch
-#else
-#  define blake2s_compress blake2s_compress_generic
-#endif
-
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
 {
 	__blake2s_update(state, in, inlen, blake2s_compress);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2021-12-23 14:11 [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Jason A. Donenfeld
@ 2021-12-23 14:11 ` Jason A. Donenfeld
  2021-12-24 20:56   ` Eric Biggers
                     ` (2 more replies)
  2021-12-23 14:20 ` [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Ard Biesheuvel
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2021-12-23 14:11 UTC (permalink / raw)
  To: linux-kernel, tytso, gregkh; +Cc: Jason A. Donenfeld, Jean-Philippe Aumasson

This commit addresses one of the lower hanging fruits of the RNG: its
usage of SHA1.

BLAKE2s is generally faster, and certainly more secure, than SHA1, which
has [1] been [2] really [3] very [4] broken [5]. Additionally, the
current construction in the RNG doesn't use the full SHA1 function, as
specified, and allows overwriting the IV with RDRAND output in an
undocumented way, even in the case when RDRAND isn't set to "trusted",
which means potential malicious IV choices. And its short length means
that keeping only half of it secret when feeding back into the mixer
gives us only 2^80 bits of forward secrecy. In other words, not only is
the choice of hash function dated, but the use of it isn't really great
either.

This commit aims to fix both of these issues while also keeping the
general structure and semantics as close to the original as possible.
Specifically:

   a) Rather than overwriting the hash IV with RDRAND, we put it into
      BLAKE2's documented "salt" and "personal" fields, which were
      specifically created for this type of usage.
   b) Since this function feeds the full hash result back into the
      entropy collector, we only return from it half the length of the
      hash, just as it was done before. This increases the
      construction's forward secrecy from 2^80 to a much more
      comfortable 2^128.
   c) Rather than using the raw "sha1_transform" function alone, we
      instead use the full proper BLAKE2s function, with finalization.

This also has the advantage of supplying 16 bytes at a time rather than
SHA1's 10 bytes, which, in addition to having a faster compression
function to begin with, means faster extraction in general. On an Intel
i7-11850H, this commit makes initial seeding around 131% faster.

BLAKE2s itself has the nice property of internally being based on the
ChaCha permutation, which the RNG is already using for expansion, so
there shouldn't be any issue with newness, funkiness, or surprising CPU
behavior, since it's based on something already in use.

[1] https://eprint.iacr.org/2005/010.pdf
[2] https://www.iacr.org/archive/crypto2005/36210017/36210017.pdf
[3] https://eprint.iacr.org/2015/967.pdf
[4] https://shattered.io/static/shattered.pdf
[5] https://www.usenix.org/system/files/sec20-leurent.pdf

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
v1->v2:
- Moved the blake2s lib/crypto/ build system changes to prior commit,
  since it's a bit more involved than initially thought.

 drivers/char/random.c | 64 ++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 37 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 13c968b950c5..e3827d45d2f2 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -78,12 +78,12 @@
  * an *estimate* of how many bits of randomness have been stored into
  * the random number generator's internal state.
  *
- * When random bytes are desired, they are obtained by taking the SHA
- * hash of the contents of the "entropy pool".  The SHA hash avoids
+ * When random bytes are desired, they are obtained by taking the BLAKE2s
+ * hash of the contents of the "entropy pool".  The BLAKE2s hash avoids
  * exposing the internal state of the entropy pool.  It is believed to
  * be computationally infeasible to derive any useful information
- * about the input of SHA from its output.  Even if it is possible to
- * analyze SHA in some clever way, as long as the amount of data
+ * about the input of BLAKE2s from its output.  Even if it is possible to
+ * analyze BLAKE2s in some clever way, as long as the amount of data
  * returned from the generator is less than the inherent entropy in
  * the pool, the output data is totally unpredictable.  For this
  * reason, the routine decreases its internal estimate of how many
@@ -93,7 +93,7 @@
  * If this estimate goes to zero, the routine can still generate
  * random numbers; however, an attacker may (at least in theory) be
  * able to infer the future output of the generator from prior
- * outputs.  This requires successful cryptanalysis of SHA, which is
+ * outputs.  This requires successful cryptanalysis of BLAKE2s, which is
  * not believed to be feasible, but there is a remote possibility.
  * Nonetheless, these numbers should be useful for the vast majority
  * of purposes.
@@ -347,7 +347,7 @@
 #include <linux/completion.h>
 #include <linux/uuid.h>
 #include <crypto/chacha.h>
-#include <crypto/sha1.h>
+#include <crypto/blake2s.h>
 
 #include <asm/processor.h>
 #include <linux/uaccess.h>
@@ -367,10 +367,7 @@
 #define INPUT_POOL_WORDS	(1 << (INPUT_POOL_SHIFT-5))
 #define OUTPUT_POOL_SHIFT	10
 #define OUTPUT_POOL_WORDS	(1 << (OUTPUT_POOL_SHIFT-5))
-#define EXTRACT_SIZE		10
-
-
-#define LONGS(x) (((x) + sizeof(unsigned long) - 1)/sizeof(unsigned long))
+#define EXTRACT_SIZE		(BLAKE2S_HASH_SIZE / 2)
 
 /*
  * To allow fractional bits to be tracked, the entropy_count field is
@@ -406,7 +403,7 @@ static int random_write_wakeup_bits = 28 * OUTPUT_POOL_WORDS;
  * Thanks to Colin Plumb for suggesting this.
  *
  * The mixing operation is much less sensitive than the output hash,
- * where we use SHA-1.  All that we want of mixing operation is that
+ * where we use BLAKE2s.  All that we want of mixing operation is that
  * it be a good non-cryptographic hash; i.e. it not produce collisions
  * when fed "random" data of the sort we expect to see.  As long as
  * the pool state differs for different inputs, we have preserved the
@@ -1384,54 +1381,47 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min,
  */
 static void extract_buf(struct entropy_store *r, __u8 *out)
 {
-	int i;
-	union {
-		__u32 w[5];
-		unsigned long l[LONGS(20)];
-	} hash;
-	__u32 workspace[SHA1_WORKSPACE_WORDS];
+	struct blake2s_state state __aligned(__alignof__(unsigned long));
+	u8 hash[BLAKE2S_HASH_SIZE];
+	unsigned long *salt;
 	unsigned long flags;
 
+	blake2s_init(&state, sizeof(hash));
+
 	/*
 	 * If we have an architectural hardware random number
-	 * generator, use it for SHA's initial vector
+	 * generator, use it for BLAKE2's salt & personal fields.
 	 */
-	sha1_init(hash.w);
-	for (i = 0; i < LONGS(20); i++) {
+	for (salt = (unsigned long *)&state.h[4];
+	     salt < (unsigned long *)&state.h[8]; ++salt) {
 		unsigned long v;
 		if (!arch_get_random_long(&v))
 			break;
-		hash.l[i] = v;
+		*salt ^= v;
 	}
 
-	/* Generate a hash across the pool, 16 words (512 bits) at a time */
+	/* Generate a hash across the pool */
 	spin_lock_irqsave(&r->lock, flags);
-	for (i = 0; i < r->poolinfo->poolwords; i += 16)
-		sha1_transform(hash.w, (__u8 *)(r->pool + i), workspace);
+	blake2s_update(&state, (const u8 *)r->pool,
+		       r->poolinfo->poolwords * sizeof(*r->pool));
+	blake2s_final(&state, hash); /* final zeros out state */
 
 	/*
 	 * We mix the hash back into the pool to prevent backtracking
 	 * attacks (where the attacker knows the state of the pool
 	 * plus the current outputs, and attempts to find previous
-	 * ouputs), unless the hash function can be inverted. By
-	 * mixing at least a SHA1 worth of hash data back, we make
+	 * outputs), unless the hash function can be inverted. By
+	 * mixing at least a hash worth of hash data back, we make
 	 * brute-forcing the feedback as hard as brute-forcing the
 	 * hash.
 	 */
-	__mix_pool_bytes(r, hash.w, sizeof(hash.w));
+	__mix_pool_bytes(r, hash, sizeof(hash));
 	spin_unlock_irqrestore(&r->lock, flags);
 
-	memzero_explicit(workspace, sizeof(workspace));
-
-	/*
-	 * In case the hash function has some recognizable output
-	 * pattern, we fold it in half. Thus, we always feed back
-	 * twice as much data as we output.
+	/* Note that EXTRACT_SIZE is half of hash size here, because above
+	 * we've dumped the full length back into mixer. By reducing the
+	 * amount that we emit, we retain a level of forward secrecy.
 	 */
-	hash.w[0] ^= hash.w[3];
-	hash.w[1] ^= hash.w[4];
-	hash.w[2] ^= rol32(hash.w[2], 16);
-
 	memcpy(out, &hash, EXTRACT_SIZE);
 	memzero_explicit(&hash, sizeof(hash));
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 1/2] lib/crypto: blake2s: include as built-in
  2021-12-23 14:11 [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Jason A. Donenfeld
  2021-12-23 14:11 ` [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction Jason A. Donenfeld
@ 2021-12-23 14:20 ` Ard Biesheuvel
  2021-12-24 13:35 ` Greg KH
  2021-12-25  9:26 ` Masahiro Yamada
  3 siblings, 0 replies; 61+ messages in thread
From: Ard Biesheuvel @ 2021-12-23 14:20 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Kernel Mailing List, Theodore Y. Ts'o,
	Greg Kroah-Hartman, Masahiro Yamada, Linux Kbuild mailing list,
	Herbert Xu, Linux Crypto Mailing List

On Thu, 23 Dec 2021 at 15:11, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> In preparation for using blake2s in the RNG, we change the way that it
> is wired-in to the build system. Instead of kconfig mazes and ifdefs, we
> use weak symbols, so that an arch version can override the generic
> version. Then we include the generic version in lib-y, so that it can be
> removed from the image if the arch version doesn't fallback to it (as is
> the case on arm though not x86). The result should be a bit simpler and
> smaller than the code it replaces.
>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Masahiro Yamada <masahiroy@kernel.org>
> Cc: linux-kbuild@vger.kernel.org
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: linux-crypto@vger.kernel.org
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
> Herbert - I intend to take this via the crng/random.git tree, since it
> forms a dependency and I'd like to send a pull early in 5.17 cycle.
>
>  Makefile                          |  2 +-
>  arch/arm/crypto/Kconfig           |  3 +--
>  arch/arm/crypto/blake2s-core.S    |  8 ++++----
>  arch/arm/crypto/blake2s-glue.c    |  6 +++---
>  arch/s390/configs/debug_defconfig |  1 -
>  arch/s390/configs/defconfig       |  1 -

You can drop these two hunks - not worth the risk of a conflict with
the S390 tree.

Other than that,

Acked-by: Ard Biesheuvel <ardb@kernel.org>

>  arch/x86/crypto/blake2s-glue.c    | 11 +++++------
>  crypto/Kconfig                    |  5 +----
>  drivers/net/Kconfig               |  1 -
>  include/crypto/internal/blake2s.h |  6 +++---
>  lib/Makefile                      |  2 +-
>  lib/crypto/Kconfig                | 25 -------------------------
>  lib/crypto/Makefile               |  7 +++----
>  lib/crypto/blake2s-generic.c      |  6 +++++-
>  lib/crypto/blake2s.c              |  6 ------
>  15 files changed, 27 insertions(+), 63 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index d85f1ff79f5c..892ea632ea63 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -668,7 +668,7 @@ drivers-y   := drivers/ sound/
>  drivers-$(CONFIG_SAMPLES) += samples/
>  drivers-$(CONFIG_NET) += net/
>  drivers-y      += virt/
> -libs-y         := lib/
> +libs-y         := lib/ lib/crypto/
>  endif # KBUILD_EXTMOD
>
>  # The all: target is the default when no target is given on the
> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
> index 2b575792363e..47cb22645746 100644
> --- a/arch/arm/crypto/Kconfig
> +++ b/arch/arm/crypto/Kconfig
> @@ -63,8 +63,7 @@ config CRYPTO_SHA512_ARM
>           using optimized ARM assembler and NEON, when available.
>
>  config CRYPTO_BLAKE2S_ARM
> -       tristate "BLAKE2s digest algorithm (ARM)"
> -       select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
> +       bool "BLAKE2s digest algorithm (ARM)"
>         help
>           BLAKE2s digest algorithm optimized with ARM scalar instructions.  This
>           is faster than the generic implementations of BLAKE2s and BLAKE2b, but
> diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
> index 86345751bbf3..df40e46601f1 100644
> --- a/arch/arm/crypto/blake2s-core.S
> +++ b/arch/arm/crypto/blake2s-core.S
> @@ -167,8 +167,8 @@
>  .endm
>
>  //
> -// void blake2s_compress_arch(struct blake2s_state *state,
> -//                           const u8 *block, size_t nblocks, u32 inc);
> +// void blake2s_compress(struct blake2s_state *state,
> +//                      const u8 *block, size_t nblocks, u32 inc);
>  //
>  // Only the first three fields of struct blake2s_state are used:
>  //     u32 h[8];       (inout)
> @@ -176,7 +176,7 @@
>  //     u32 f[2];       (in)
>  //
>         .align          5
> -ENTRY(blake2s_compress_arch)
> +ENTRY(blake2s_compress)
>         push            {r0-r2,r4-r11,lr}       // keep this an even number
>
>  .Lnext_block:
> @@ -303,4 +303,4 @@ ENTRY(blake2s_compress_arch)
>         str             r3, [r12], #4
>         bne             1b
>         b               .Lcopy_block_done
> -ENDPROC(blake2s_compress_arch)
> +ENDPROC(blake2s_compress)
> diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
> index f2cc1e5fc9ec..09d3a0cabd2c 100644
> --- a/arch/arm/crypto/blake2s-glue.c
> +++ b/arch/arm/crypto/blake2s-glue.c
> @@ -11,17 +11,17 @@
>  #include <linux/module.h>
>
>  /* defined in blake2s-core.S */
> -EXPORT_SYMBOL(blake2s_compress_arch);
> +EXPORT_SYMBOL(blake2s_compress);
>
>  static int crypto_blake2s_update_arm(struct shash_desc *desc,
>                                      const u8 *in, unsigned int inlen)
>  {
> -       return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
> +       return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
>  }
>
>  static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
>  {
> -       return crypto_blake2s_final(desc, out, blake2s_compress_arch);
> +       return crypto_blake2s_final(desc, out, blake2s_compress);
>  }
>
>  #define BLAKE2S_ALG(name, driver_name, digest_size)                    \
> diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
> index e45cc27716de..caa3d1d6a0e8 100644
> --- a/arch/s390/configs/debug_defconfig
> +++ b/arch/s390/configs/debug_defconfig
> @@ -757,7 +757,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
>  CONFIG_CRYPTO_USER_API_RNG=m
>  CONFIG_CRYPTO_USER_API_AEAD=m
>  CONFIG_CRYPTO_STATS=y
> -CONFIG_CRYPTO_LIB_BLAKE2S=m
>  CONFIG_CRYPTO_LIB_CURVE25519=m
>  CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
>  CONFIG_ZCRYPT=m
> diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
> index 1c750bfca2d8..fffc6af5358c 100644
> --- a/arch/s390/configs/defconfig
> +++ b/arch/s390/configs/defconfig
> @@ -744,7 +744,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
>  CONFIG_CRYPTO_USER_API_RNG=m
>  CONFIG_CRYPTO_USER_API_AEAD=m
>  CONFIG_CRYPTO_STATS=y
> -CONFIG_CRYPTO_LIB_BLAKE2S=m
>  CONFIG_CRYPTO_LIB_CURVE25519=m
>  CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
>  CONFIG_ZCRYPT=m
> diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
> index a40365ab301e..ef91a3167d27 100644
> --- a/arch/x86/crypto/blake2s-glue.c
> +++ b/arch/x86/crypto/blake2s-glue.c
> @@ -28,9 +28,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
>  static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
>  static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
>
> -void blake2s_compress_arch(struct blake2s_state *state,
> -                          const u8 *block, size_t nblocks,
> -                          const u32 inc)
> +void blake2s_compress(struct blake2s_state *state, const u8 *block,
> +                     size_t nblocks, const u32 inc)
>  {
>         /* SIMD disables preemption, so relax after processing each page. */
>         BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
> @@ -56,17 +55,17 @@ void blake2s_compress_arch(struct blake2s_state *state,
>                 block += blocks * BLAKE2S_BLOCK_SIZE;
>         } while (nblocks);
>  }
> -EXPORT_SYMBOL(blake2s_compress_arch);
> +EXPORT_SYMBOL(blake2s_compress);
>
>  static int crypto_blake2s_update_x86(struct shash_desc *desc,
>                                      const u8 *in, unsigned int inlen)
>  {
> -       return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
> +       return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
>  }
>
>  static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
>  {
> -       return crypto_blake2s_final(desc, out, blake2s_compress_arch);
> +       return crypto_blake2s_final(desc, out, blake2s_compress);
>  }
>
>  #define BLAKE2S_ALG(name, driver_name, digest_size)                    \
> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index 285f82647d2b..bfda2c82774d 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -685,7 +685,6 @@ config CRYPTO_BLAKE2B
>
>  config CRYPTO_BLAKE2S
>         tristate "BLAKE2s digest algorithm"
> -       select CRYPTO_LIB_BLAKE2S_GENERIC
>         select CRYPTO_HASH
>         help
>           Implementation of cryptographic hash function BLAKE2s
> @@ -702,10 +701,8 @@ config CRYPTO_BLAKE2S
>           See https://blake2.net for further information.
>
>  config CRYPTO_BLAKE2S_X86
> -       tristate "BLAKE2s digest algorithm (x86 accelerated version)"
> +       bool "BLAKE2s digest algorithm (x86 accelerated version)"
>         depends on X86 && 64BIT
> -       select CRYPTO_LIB_BLAKE2S_GENERIC
> -       select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
>
>  config CRYPTO_CRCT10DIF
>         tristate "CRCT10DIF algorithm"
> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> index 6cccc3dc00bc..b2a4f998c180 100644
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -81,7 +81,6 @@ config WIREGUARD
>         select CRYPTO
>         select CRYPTO_LIB_CURVE25519
>         select CRYPTO_LIB_CHACHA20POLY1305
> -       select CRYPTO_LIB_BLAKE2S
>         select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
>         select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
>         select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
> diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
> index 8e50d487500f..d39cfa0d333e 100644
> --- a/include/crypto/internal/blake2s.h
> +++ b/include/crypto/internal/blake2s.h
> @@ -11,11 +11,11 @@
>  #include <crypto/internal/hash.h>
>  #include <linux/string.h>
>
> -void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
> +void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
>                               size_t nblocks, const u32 inc);
>
> -void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
> -                          size_t nblocks, const u32 inc);
> +void blake2s_compress(struct blake2s_state *state, const u8 *block,
> +                     size_t nblocks, const u32 inc);
>
>  bool blake2s_selftest(void);
>
> diff --git a/lib/Makefile b/lib/Makefile
> index 364c23f15578..bb57b2e466fa 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -139,7 +139,7 @@ endif
>  obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o
>  CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any)
>
> -obj-y += math/ crypto/
> +obj-y += math/
>
>  obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
>  obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o
> diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
> index 545ccbddf6a1..31c6e2be3b84 100644
> --- a/lib/crypto/Kconfig
> +++ b/lib/crypto/Kconfig
> @@ -8,31 +8,6 @@ config CRYPTO_LIB_AES
>  config CRYPTO_LIB_ARC4
>         tristate
>
> -config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
> -       tristate
> -       help
> -         Declares whether the architecture provides an arch-specific
> -         accelerated implementation of the Blake2s library interface,
> -         either builtin or as a module.
> -
> -config CRYPTO_LIB_BLAKE2S_GENERIC
> -       tristate
> -       help
> -         This symbol can be depended upon by arch implementations of the
> -         Blake2s library interface that require the generic code as a
> -         fallback, e.g., for SIMD implementations. If no arch specific
> -         implementation is enabled, this implementation serves the users
> -         of CRYPTO_LIB_BLAKE2S.
> -
> -config CRYPTO_LIB_BLAKE2S
> -       tristate "BLAKE2s hash function library"
> -       depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
> -       select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
> -       help
> -         Enable the Blake2s library interface. This interface may be fulfilled
> -         by either the generic implementation or an arch-specific one, if one
> -         is available and enabled.
> -
>  config CRYPTO_ARCH_HAVE_LIB_CHACHA
>         tristate
>         help
> diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
> index 73205ed269ba..42e1d932c077 100644
> --- a/lib/crypto/Makefile
> +++ b/lib/crypto/Makefile
> @@ -10,10 +10,9 @@ libaes-y                                     := aes.o
>  obj-$(CONFIG_CRYPTO_LIB_ARC4)                  += libarc4.o
>  libarc4-y                                      := arc4.o
>
> -obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)       += libblake2s-generic.o
> -libblake2s-generic-y                           += blake2s-generic.o
> -
> -obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)               += libblake2s.o
> +# blake2s is used by the /dev/random driver which is always builtin
> +lib-y                                          += blake2s-generic.o
> +obj-y                                          += libblake2s.o
>  libblake2s-y                                   += blake2s.o
>
>  obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)      += libchacha20poly1305.o
> diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
> index 04ff8df24513..75ccb3e633e6 100644
> --- a/lib/crypto/blake2s-generic.c
> +++ b/lib/crypto/blake2s-generic.c
> @@ -37,7 +37,11 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
>         state->t[1] += (state->t[0] < inc);
>  }
>
> -void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
> +void blake2s_compress(struct blake2s_state *state, const u8 *block,
> +                     size_t nblocks, const u32 inc)
> +                     __weak __alias(blake2s_compress_generic);
> +
> +void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
>                               size_t nblocks, const u32 inc)
>  {
>         u32 m[16];
> diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
> index 4055aa593ec4..93f2ae051370 100644
> --- a/lib/crypto/blake2s.c
> +++ b/lib/crypto/blake2s.c
> @@ -16,12 +16,6 @@
>  #include <linux/init.h>
>  #include <linux/bug.h>
>
> -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
> -#  define blake2s_compress blake2s_compress_arch
> -#else
> -#  define blake2s_compress blake2s_compress_generic
> -#endif
> -
>  void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
>  {
>         __blake2s_update(state, in, inlen, blake2s_compress);
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 1/2] lib/crypto: blake2s: include as built-in
  2021-12-23 14:11 [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Jason A. Donenfeld
  2021-12-23 14:11 ` [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction Jason A. Donenfeld
  2021-12-23 14:20 ` [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Ard Biesheuvel
@ 2021-12-24 13:35 ` Greg KH
  2021-12-25  9:26 ` Masahiro Yamada
  3 siblings, 0 replies; 61+ messages in thread
From: Greg KH @ 2021-12-24 13:35 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, tytso, Ard Biesheuvel, Masahiro Yamada,
	linux-kbuild, Herbert Xu, linux-crypto

On Thu, Dec 23, 2021 at 03:11:12PM +0100, Jason A. Donenfeld wrote:
> In preparation for using blake2s in the RNG, we change the way that it
> is wired-in to the build system. Instead of kconfig mazes and ifdefs, we
> use weak symbols, so that an arch version can override the generic
> version. Then we include the generic version in lib-y, so that it can be
> removed from the image if the arch version doesn't fallback to it (as is
> the case on arm though not x86). The result should be a bit simpler and
> smaller than the code it replaces.
> 
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Masahiro Yamada <masahiroy@kernel.org>
> Cc: linux-kbuild@vger.kernel.org
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: linux-crypto@vger.kernel.org
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
> Herbert - I intend to take this via the crng/random.git tree, since it
> forms a dependency and I'd like to send a pull early in 5.17 cycle.
> 
>  Makefile                          |  2 +-
>  arch/arm/crypto/Kconfig           |  3 +--
>  arch/arm/crypto/blake2s-core.S    |  8 ++++----
>  arch/arm/crypto/blake2s-glue.c    |  6 +++---
>  arch/s390/configs/debug_defconfig |  1 -
>  arch/s390/configs/defconfig       |  1 -
>  arch/x86/crypto/blake2s-glue.c    | 11 +++++------
>  crypto/Kconfig                    |  5 +----
>  drivers/net/Kconfig               |  1 -
>  include/crypto/internal/blake2s.h |  6 +++---
>  lib/Makefile                      |  2 +-
>  lib/crypto/Kconfig                | 25 -------------------------
>  lib/crypto/Makefile               |  7 +++----
>  lib/crypto/blake2s-generic.c      |  6 +++++-
>  lib/crypto/blake2s.c              |  6 ------
>  15 files changed, 27 insertions(+), 63 deletions(-)

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2021-12-23 14:11 ` [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction Jason A. Donenfeld
@ 2021-12-24 20:56   ` Eric Biggers
  2022-01-05 21:42     ` Eric Biggers
  2021-12-27 15:52   ` Theodore Ts'o
  2022-01-11 11:38   ` Geert Uytterhoeven
  2 siblings, 1 reply; 61+ messages in thread
From: Eric Biggers @ 2021-12-24 20:56 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: linux-kernel, tytso, gregkh, Jean-Philippe Aumasson

On Thu, Dec 23, 2021 at 03:11:13PM +0100, Jason A. Donenfeld wrote:
> This commit addresses one of the lower hanging fruits of the RNG: its
> usage of SHA1.
> 
> BLAKE2s is generally faster, and certainly more secure, than SHA1, which
> has [1] been [2] really [3] very [4] broken [5]. Additionally, the
> current construction in the RNG doesn't use the full SHA1 function, as
> specified, and allows overwriting the IV with RDRAND output in an
> undocumented way, even in the case when RDRAND isn't set to "trusted",
> which means potential malicious IV choices. And its short length means
> that keeping only half of it secret when feeding back into the mixer
> gives us only 2^80 bits of forward secrecy. In other words, not only is
> the choice of hash function dated, but the use of it isn't really great
> either.
> 
> This commit aims to fix both of these issues while also keeping the
> general structure and semantics as close to the original as possible.
> Specifically:
> 
>    a) Rather than overwriting the hash IV with RDRAND, we put it into
>       BLAKE2's documented "salt" and "personal" fields, which were
>       specifically created for this type of usage.
>    b) Since this function feeds the full hash result back into the
>       entropy collector, we only return from it half the length of the
>       hash, just as it was done before. This increases the
>       construction's forward secrecy from 2^80 to a much more
>       comfortable 2^128.
>    c) Rather than using the raw "sha1_transform" function alone, we
>       instead use the full proper BLAKE2s function, with finalization.
> 
> This also has the advantage of supplying 16 bytes at a time rather than
> SHA1's 10 bytes, which, in addition to having a faster compression
> function to begin with, means faster extraction in general. On an Intel
> i7-11850H, this commit makes initial seeding around 131% faster.
> 
> BLAKE2s itself has the nice property of internally being based on the
> ChaCha permutation, which the RNG is already using for expansion, so
> there shouldn't be any issue with newness, funkiness, or surprising CPU
> behavior, since it's based on something already in use.
> 
> [1] https://eprint.iacr.org/2005/010.pdf
> [2] https://www.iacr.org/archive/crypto2005/36210017/36210017.pdf
> [3] https://eprint.iacr.org/2015/967.pdf
> [4] https://shattered.io/static/shattered.pdf
> [5] https://www.usenix.org/system/files/sec20-leurent.pdf
> 
> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Reviewed-by: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

Looks good.  I had thought about replacing this with SHA-256, but BLAKE2s is
arguably a better choice here.  You can add:

	Reviewed-by: Eric Biggers <ebiggers@google.com>

A couple comments about the new development process though:

It seems that you're applying patches to
https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git, but that git
repository isn't listed in the MAINTAINERS file entry.  Can you add it?

Also, this patch was only sent to linux-kernel, not to linux-crypto, so I only
found it because I happened to see it in the above git repository, then dig it
up from lore.kernel.org.  How about Cc'ing all random.c patches to linux-crypto,
and putting that in the MAINTAINERS file entry?

- Eric

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 1/2] lib/crypto: blake2s: include as built-in
  2021-12-23 14:11 [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Jason A. Donenfeld
                   ` (2 preceding siblings ...)
  2021-12-24 13:35 ` Greg KH
@ 2021-12-25  9:26 ` Masahiro Yamada
  2021-12-25 10:26   ` Ard Biesheuvel
  3 siblings, 1 reply; 61+ messages in thread
From: Masahiro Yamada @ 2021-12-25  9:26 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Kernel Mailing List, Theodore Ts'o, Greg Kroah-Hartman,
	Ard Biesheuvel, Linux Kbuild mailing list, Herbert Xu,
	Linux Crypto Mailing List

On Thu, Dec 23, 2021 at 11:11 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> In preparation for using blake2s in the RNG, we change the way that it
> is wired-in to the build system. Instead of kconfig mazes and ifdefs, we
> use weak symbols, so that an arch version can override the generic
> version. Then we include the generic version in lib-y, so that it can be
> removed from the image if the arch version doesn't fallback to it (as is
> the case on arm though not x86).


As I replied in another email, this does not work like that.

Since 7273ad2b08f8ac9563579d16a3cf528857b26f49,
libs-y are all linked when CONFIG_MODULES=y.



So, what this patch is doing are:

 - Add __weak to the generic function
 - Make modules into built-in.


Both generic functions and ARM-specific ones
will remain in vmlinux.

__weak makes it difficult to track which function is
actually used.
Using #ifdef CONFIG_* (as the current code does)
is better.



>
> diff --git a/Makefile b/Makefile
> index d85f1ff79f5c..892ea632ea63 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -668,7 +668,7 @@ drivers-y   := drivers/ sound/
>  drivers-$(CONFIG_SAMPLES) += samples/
>  drivers-$(CONFIG_NET) += net/
>  drivers-y      += virt/
> -libs-y         := lib/
> +libs-y         := lib/ lib/crypto/


If this is merged, someone will try to
add random patterns.
libs-y         := lib/ lib/crypto/ lib/foo/bar/ lib/foo/ba/baz



lib-y and libs-y are a bad idea in the first place
and should not be extended any more.

Since this patch is not working as the commit description
claims, and it is going in the bad direction, so

NACK




-- 
Best Regards
Masahiro Yamada

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 1/2] lib/crypto: blake2s: include as built-in
  2021-12-25  9:26 ` Masahiro Yamada
@ 2021-12-25 10:26   ` Ard Biesheuvel
  2021-12-25 15:47     ` Masahiro Yamada
  0 siblings, 1 reply; 61+ messages in thread
From: Ard Biesheuvel @ 2021-12-25 10:26 UTC (permalink / raw)
  To: Masahiro Yamada
  Cc: Jason A. Donenfeld, Linux Kernel Mailing List, Theodore Ts'o,
	Greg Kroah-Hartman, Linux Kbuild mailing list, Herbert Xu,
	Linux Crypto Mailing List

On Sat, 25 Dec 2021 at 10:28, Masahiro Yamada <masahiroy@kernel.org> wrote:
>
> On Thu, Dec 23, 2021 at 11:11 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> >
> > In preparation for using blake2s in the RNG, we change the way that it
> > is wired-in to the build system. Instead of kconfig mazes and ifdefs, we
> > use weak symbols, so that an arch version can override the generic
> > version. Then we include the generic version in lib-y, so that it can be
> > removed from the image if the arch version doesn't fallback to it (as is
> > the case on arm though not x86).
>
>
> As I replied in another email, this does not work like that.
>
> Since 7273ad2b08f8ac9563579d16a3cf528857b26f49,
> libs-y are all linked when CONFIG_MODULES=y.
>
>
>
> So, what this patch is doing are:
>
>  - Add __weak to the generic function
>  - Make modules into built-in.
>
>
> Both generic functions and ARM-specific ones
> will remain in vmlinux.
>
> __weak makes it difficult to track which function is
> actually used.
> Using #ifdef CONFIG_* (as the current code does)
> is better.
>
>
>
> >
> > diff --git a/Makefile b/Makefile
> > index d85f1ff79f5c..892ea632ea63 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -668,7 +668,7 @@ drivers-y   := drivers/ sound/
> >  drivers-$(CONFIG_SAMPLES) += samples/
> >  drivers-$(CONFIG_NET) += net/
> >  drivers-y      += virt/
> > -libs-y         := lib/
> > +libs-y         := lib/ lib/crypto/
>
>
> If this is merged, someone will try to
> add random patterns.
> libs-y         := lib/ lib/crypto/ lib/foo/bar/ lib/foo/ba/baz
>
>
>
> lib-y and libs-y are a bad idea in the first place
> and should not be extended any more.
>
> Since this patch is not working as the commit description
> claims, and it is going in the bad direction, so
>
> NACK
>

So we are no longer permitted to use static libraries to provide
routines that should only be pulled into vmlinux on demand? Has this
also changed for things like string routines etc?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 1/2] lib/crypto: blake2s: include as built-in
  2021-12-25 10:26   ` Ard Biesheuvel
@ 2021-12-25 15:47     ` Masahiro Yamada
  2021-12-27 13:43       ` Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Masahiro Yamada @ 2021-12-25 15:47 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Jason A. Donenfeld, Linux Kernel Mailing List, Theodore Ts'o,
	Greg Kroah-Hartman, Linux Kbuild mailing list, Herbert Xu,
	Linux Crypto Mailing List

On Sat, Dec 25, 2021 at 7:26 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Sat, 25 Dec 2021 at 10:28, Masahiro Yamada <masahiroy@kernel.org> wrote:
> >
> > On Thu, Dec 23, 2021 at 11:11 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > >
> > > In preparation for using blake2s in the RNG, we change the way that it
> > > is wired-in to the build system. Instead of kconfig mazes and ifdefs, we
> > > use weak symbols, so that an arch version can override the generic
> > > version. Then we include the generic version in lib-y, so that it can be
> > > removed from the image if the arch version doesn't fallback to it (as is
> > > the case on arm though not x86).
> >
> >
> > As I replied in another email, this does not work like that.
> >
> > Since 7273ad2b08f8ac9563579d16a3cf528857b26f49,
> > libs-y are all linked when CONFIG_MODULES=y.
> >
> >
> >
> > So, what this patch is doing are:
> >
> >  - Add __weak to the generic function
> >  - Make modules into built-in.
> >
> >
> > Both generic functions and ARM-specific ones
> > will remain in vmlinux.
> >
> > __weak makes it difficult to track which function is
> > actually used.
> > Using #ifdef CONFIG_* (as the current code does)
> > is better.
> >
> >
> >
> > >
> > > diff --git a/Makefile b/Makefile
> > > index d85f1ff79f5c..892ea632ea63 100644
> > > --- a/Makefile
> > > +++ b/Makefile
> > > @@ -668,7 +668,7 @@ drivers-y   := drivers/ sound/
> > >  drivers-$(CONFIG_SAMPLES) += samples/
> > >  drivers-$(CONFIG_NET) += net/
> > >  drivers-y      += virt/
> > > -libs-y         := lib/
> > > +libs-y         := lib/ lib/crypto/
> >
> >
> > If this is merged, someone will try to
> > add random patterns.
> > libs-y         := lib/ lib/crypto/ lib/foo/bar/ lib/foo/ba/baz
> >
> >
> >
> > lib-y and libs-y are a bad idea in the first place
> > and should not be extended any more.
> >
> > Since this patch is not working as the commit description
> > claims, and it is going in the bad direction, so
> >
> > NACK
> >
>
> So we are no longer permitted to use static libraries to provide
> routines that should only be pulled into vmlinux on demand? Has this
> also changed for things like string routines etc?

Utility functions such as string routines are intended to be used
anywhere on demand, not only in vmlinux but also in loadable
modules.

Therefore, such functions are very likely to be EXPORT_SYMBOL'ed.
As a matter of fact, most of the files listed in lib-y
contain EXPORT_SYMBOL.

Historically, static libraries did not work well with EXPORT_SYMBOL.

Originally, lib-y dropped functions that had no callsite in vmlinux, but
it was a wrong behavior. We must always keep exported functions, which
might be used by modules, even if not by vmlinux.

7f2084fa55e6cb61f61b4224d4a8bafaeee55f9f
added a workaround so that all of EXPORT_SYMBOL
are considered "referenced".

Since then, most of lib-y objects were linked anyway,
given the following:

- Most of *.c files listed in lib-y contain at least one EXPORT_SYMBOL
- In static library, if any one of symbol is referenced, the entire object
  is linked

So, lib-y was not helpful for reducing the kernel image size.

The exceptional cases are CONFIG_MODULES=n
or CONFIG_TRIM_UNUSED_KSYMS=y, but neither of
them is a common use-case.

To remove unused functions,
CONFIG_LD_DEAD_CODE_DATA_ELIMINATION (per-symbol
removal) seems to be a more sensible solution to me.



-- 
Best Regards
Masahiro Yamada

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 1/2] lib/crypto: blake2s: include as built-in
  2021-12-25 15:47     ` Masahiro Yamada
@ 2021-12-27 13:43       ` Jason A. Donenfeld
  2021-12-27 13:47         ` [PATCH v3] " Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2021-12-27 13:43 UTC (permalink / raw)
  To: Masahiro Yamada
  Cc: Ard Biesheuvel, Linux Kernel Mailing List, Theodore Ts'o,
	Greg Kroah-Hartman, Linux Kbuild mailing list, Herbert Xu,
	Linux Crypto Mailing List

Hi Masahiro,

Thanks for your feedback. Indeed it looks like you're right about the
CONFIG_MODULE case. We'll go back to using the kconfig system
normally, and cease tempting the beast with libs-y and such. I'll have
a v+1 for you shortly in case you're curious, though I expect it to be
sufficiently boring to be no longer worth your time :-).

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v3] lib/crypto: blake2s: include as built-in
  2021-12-27 13:43       ` Jason A. Donenfeld
@ 2021-12-27 13:47         ` Jason A. Donenfeld
  2021-12-27 14:20           ` [PATCH v4] " Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2021-12-27 13:47 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jason A. Donenfeld, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, Herbert Xu, linux-crypto

In preparation for using blake2s in the RNG, we change the way that it
is wired-in to the build system. Instead of using ifdefs to select the
right symbol, we use weak symbols. And because ARM doesn't need the
generic implementation, we make the generic one default only if an arch
library doesn't need it already, and then have arch libraries that do
need it opt-in.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: linux-kbuild@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: linux-crypto@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
Herbert - As mentioned with the vPrev, I intend to take this via the
crng/random.git tree, since it forms a dependency and I'd like to send a
pull early in 5.17 cycle.

Changes v2->v3:
- Rather than using lib-y, use obj-y, and retain the kconfig symbols
  for selection.

 arch/arm/crypto/blake2s-core.S    |  8 ++++----
 arch/arm/crypto/blake2s-glue.c    |  6 +++---
 arch/x86/crypto/blake2s-glue.c    | 11 +++++------
 crypto/Kconfig                    |  1 -
 drivers/net/Kconfig               |  1 -
 include/crypto/internal/blake2s.h |  6 +++---
 lib/crypto/Kconfig                | 13 ++-----------
 lib/crypto/Makefile               |  9 ++++-----
 lib/crypto/blake2s-generic.c      |  6 +++++-
 lib/crypto/blake2s.c              |  6 ------
 10 files changed, 26 insertions(+), 41 deletions(-)

diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
index 86345751bbf3..df40e46601f1 100644
--- a/arch/arm/crypto/blake2s-core.S
+++ b/arch/arm/crypto/blake2s-core.S
@@ -167,8 +167,8 @@
 .endm
 
 //
-// void blake2s_compress_arch(struct blake2s_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_state *state,
+//			 const u8 *block, size_t nblocks, u32 inc);
 //
 // Only the first three fields of struct blake2s_state are used:
 //	u32 h[8];	(inout)
@@ -176,7 +176,7 @@
 //	u32 f[2];	(in)
 //
 	.align		5
-ENTRY(blake2s_compress_arch)
+ENTRY(blake2s_compress)
 	push		{r0-r2,r4-r11,lr}	// keep this an even number
 
 .Lnext_block:
@@ -303,4 +303,4 @@ ENTRY(blake2s_compress_arch)
 	str		r3, [r12], #4
 	bne		1b
 	b		.Lcopy_block_done
-ENDPROC(blake2s_compress_arch)
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
index f2cc1e5fc9ec..09d3a0cabd2c 100644
--- a/arch/arm/crypto/blake2s-glue.c
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -11,17 +11,17 @@
 #include <linux/module.h>
 
 /* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_arm(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..ef91a3167d27 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -28,9 +28,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-void blake2s_compress_arch(struct blake2s_state *state,
-			   const u8 *block, size_t nblocks,
-			   const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,17 +55,17 @@ void blake2s_compress_arch(struct blake2s_state *state,
 		block += blocks * BLAKE2S_BLOCK_SIZE;
 	} while (nblocks);
 }
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_x86(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 285f82647d2b..f2cb34515afa 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -685,7 +685,6 @@ config CRYPTO_BLAKE2B
 
 config CRYPTO_BLAKE2S
 	tristate "BLAKE2s digest algorithm"
-	select CRYPTO_LIB_BLAKE2S_GENERIC
 	select CRYPTO_HASH
 	help
 	  Implementation of cryptographic hash function BLAKE2s
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6cccc3dc00bc..b2a4f998c180 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -81,7 +81,6 @@ config WIREGUARD
 	select CRYPTO
 	select CRYPTO_LIB_CURVE25519
 	select CRYPTO_LIB_CHACHA20POLY1305
-	select CRYPTO_LIB_BLAKE2S
 	select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
 	select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
 	select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 8e50d487500f..d39cfa0d333e 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -11,11 +11,11 @@
 #include <crypto/internal/hash.h>
 #include <linux/string.h>
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc);
 
-void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
-			   size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc);
 
 bool blake2s_selftest(void);
 
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 545ccbddf6a1..0f27976b5038 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -9,14 +9,14 @@ config CRYPTO_LIB_ARC4
 	tristate
 
 config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	tristate
+	bool
 	help
 	  Declares whether the architecture provides an arch-specific
 	  accelerated implementation of the Blake2s library interface,
 	  either builtin or as a module.
 
 config CRYPTO_LIB_BLAKE2S_GENERIC
-	tristate
+	def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 	help
 	  This symbol can be depended upon by arch implementations of the
 	  Blake2s library interface that require the generic code as a
@@ -24,15 +24,6 @@ config CRYPTO_LIB_BLAKE2S_GENERIC
 	  implementation is enabled, this implementation serves the users
 	  of CRYPTO_LIB_BLAKE2S.
 
-config CRYPTO_LIB_BLAKE2S
-	tristate "BLAKE2s hash function library"
-	depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
-	help
-	  Enable the Blake2s library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
-
 config CRYPTO_ARCH_HAVE_LIB_CHACHA
 	tristate
 	help
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 73205ed269ba..ed43a41f2dcc 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -10,11 +10,10 @@ libaes-y					:= aes.o
 obj-$(CONFIG_CRYPTO_LIB_ARC4)			+= libarc4.o
 libarc4-y					:= arc4.o
 
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= libblake2s-generic.o
-libblake2s-generic-y				+= blake2s-generic.o
-
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)		+= libblake2s.o
-libblake2s-y					+= blake2s.o
+# blake2s is used by the /dev/random driver which is always builtin
+obj-y						+= libblake2s.o
+libblake2s-y					:= blake2s.o
+libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= blake2s-generic.o
 
 obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)	+= libchacha20poly1305.o
 libchacha20poly1305-y				+= chacha20poly1305.o
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 04ff8df24513..75ccb3e633e6 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -37,7 +37,11 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
 	state->t[1] += (state->t[0] < inc);
 }
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
+		      __weak __alias(blake2s_compress_generic);
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc)
 {
 	u32 m[16];
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 4055aa593ec4..93f2ae051370 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -16,12 +16,6 @@
 #include <linux/init.h>
 #include <linux/bug.h>
 
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
-#  define blake2s_compress blake2s_compress_arch
-#else
-#  define blake2s_compress blake2s_compress_generic
-#endif
-
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
 {
 	__blake2s_update(state, in, inlen, blake2s_compress);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v4] lib/crypto: blake2s: include as built-in
  2021-12-27 13:47         ` [PATCH v3] " Jason A. Donenfeld
@ 2021-12-27 14:20           ` Jason A. Donenfeld
  2022-01-01 15:59             ` [PATCH v5] " Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2021-12-27 14:20 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jason A. Donenfeld, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, Herbert Xu, linux-crypto

In preparation for using blake2s in the RNG, we change the way that it
is wired-in to the build system. Instead of using ifdefs to select the
right symbol, we use weak symbols. And because ARM doesn't need the
generic implementation, we make the generic one default only if an arch
library doesn't need it already, and then have arch libraries that do
need it opt-in.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: linux-kbuild@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: linux-crypto@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
Herbert - As mentioned with the vPrev, I intend to take this via the
crng/random.git tree, since it forms a dependency and I'd like to send a
pull early in 5.17 cycle.

Changes v3->v4:
- Keep the generic one for the generic shash implementation.
Changes v2->v3:
- Rather than using lib-y, use obj-y, and retain the kconfig symbols
  for selection.

 arch/arm/crypto/blake2s-core.S    |  8 ++++----
 arch/arm/crypto/blake2s-glue.c    |  6 +++---
 arch/x86/crypto/blake2s-glue.c    | 11 +++++------
 drivers/net/Kconfig               |  1 -
 include/crypto/internal/blake2s.h |  6 +++---
 lib/crypto/Kconfig                | 13 ++-----------
 lib/crypto/Makefile               |  9 ++++-----
 lib/crypto/blake2s-generic.c      |  6 +++++-
 lib/crypto/blake2s.c              |  6 ------
 9 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
index 86345751bbf3..df40e46601f1 100644
--- a/arch/arm/crypto/blake2s-core.S
+++ b/arch/arm/crypto/blake2s-core.S
@@ -167,8 +167,8 @@
 .endm
 
 //
-// void blake2s_compress_arch(struct blake2s_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_state *state,
+//			 const u8 *block, size_t nblocks, u32 inc);
 //
 // Only the first three fields of struct blake2s_state are used:
 //	u32 h[8];	(inout)
@@ -176,7 +176,7 @@
 //	u32 f[2];	(in)
 //
 	.align		5
-ENTRY(blake2s_compress_arch)
+ENTRY(blake2s_compress)
 	push		{r0-r2,r4-r11,lr}	// keep this an even number
 
 .Lnext_block:
@@ -303,4 +303,4 @@ ENTRY(blake2s_compress_arch)
 	str		r3, [r12], #4
 	bne		1b
 	b		.Lcopy_block_done
-ENDPROC(blake2s_compress_arch)
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
index f2cc1e5fc9ec..09d3a0cabd2c 100644
--- a/arch/arm/crypto/blake2s-glue.c
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -11,17 +11,17 @@
 #include <linux/module.h>
 
 /* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_arm(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..ef91a3167d27 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -28,9 +28,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-void blake2s_compress_arch(struct blake2s_state *state,
-			   const u8 *block, size_t nblocks,
-			   const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,17 +55,17 @@ void blake2s_compress_arch(struct blake2s_state *state,
 		block += blocks * BLAKE2S_BLOCK_SIZE;
 	} while (nblocks);
 }
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_x86(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6cccc3dc00bc..b2a4f998c180 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -81,7 +81,6 @@ config WIREGUARD
 	select CRYPTO
 	select CRYPTO_LIB_CURVE25519
 	select CRYPTO_LIB_CHACHA20POLY1305
-	select CRYPTO_LIB_BLAKE2S
 	select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
 	select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
 	select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 8e50d487500f..d39cfa0d333e 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -11,11 +11,11 @@
 #include <crypto/internal/hash.h>
 #include <linux/string.h>
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc);
 
-void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
-			   size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc);
 
 bool blake2s_selftest(void);
 
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 545ccbddf6a1..0f27976b5038 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -9,14 +9,14 @@ config CRYPTO_LIB_ARC4
 	tristate
 
 config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	tristate
+	bool
 	help
 	  Declares whether the architecture provides an arch-specific
 	  accelerated implementation of the Blake2s library interface,
 	  either builtin or as a module.
 
 config CRYPTO_LIB_BLAKE2S_GENERIC
-	tristate
+	def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 	help
 	  This symbol can be depended upon by arch implementations of the
 	  Blake2s library interface that require the generic code as a
@@ -24,15 +24,6 @@ config CRYPTO_LIB_BLAKE2S_GENERIC
 	  implementation is enabled, this implementation serves the users
 	  of CRYPTO_LIB_BLAKE2S.
 
-config CRYPTO_LIB_BLAKE2S
-	tristate "BLAKE2s hash function library"
-	depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
-	help
-	  Enable the Blake2s library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
-
 config CRYPTO_ARCH_HAVE_LIB_CHACHA
 	tristate
 	help
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 73205ed269ba..ed43a41f2dcc 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -10,11 +10,10 @@ libaes-y					:= aes.o
 obj-$(CONFIG_CRYPTO_LIB_ARC4)			+= libarc4.o
 libarc4-y					:= arc4.o
 
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= libblake2s-generic.o
-libblake2s-generic-y				+= blake2s-generic.o
-
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)		+= libblake2s.o
-libblake2s-y					+= blake2s.o
+# blake2s is used by the /dev/random driver which is always builtin
+obj-y						+= libblake2s.o
+libblake2s-y					:= blake2s.o
+libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= blake2s-generic.o
 
 obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)	+= libchacha20poly1305.o
 libchacha20poly1305-y				+= chacha20poly1305.o
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 04ff8df24513..75ccb3e633e6 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -37,7 +37,11 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
 	state->t[1] += (state->t[0] < inc);
 }
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
+		      __weak __alias(blake2s_compress_generic);
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc)
 {
 	u32 m[16];
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 4055aa593ec4..93f2ae051370 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -16,12 +16,6 @@
 #include <linux/init.h>
 #include <linux/bug.h>
 
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
-#  define blake2s_compress blake2s_compress_arch
-#else
-#  define blake2s_compress blake2s_compress_generic
-#endif
-
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
 {
 	__blake2s_update(state, in, inlen, blake2s_compress);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2021-12-23 14:11 ` [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction Jason A. Donenfeld
  2021-12-24 20:56   ` Eric Biggers
@ 2021-12-27 15:52   ` Theodore Ts'o
  2022-01-11 11:38   ` Geert Uytterhoeven
  2 siblings, 0 replies; 61+ messages in thread
From: Theodore Ts'o @ 2021-12-27 15:52 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: linux-kernel, gregkh, Jean-Philippe Aumasson

On Thu, Dec 23, 2021 at 03:11:13PM +0100, Jason A. Donenfeld wrote:
> This commit addresses one of the lower hanging fruits of the RNG: its
> usage of SHA1.
> 
> BLAKE2s is generally faster, and certainly more secure, than SHA1, which
> has [1] been [2] really [3] very [4] broken [5]. Additionally, the
> current construction in the RNG doesn't use the full SHA1 function, as
> specified, and allows overwriting the IV with RDRAND output in an
> undocumented way, even in the case when RDRAND isn't set to "trusted",
> which means potential malicious IV choices. And its short length means
> that keeping only half of it secret when feeding back into the mixer
> gives us only 2^80 bits of forward secrecy. In other words, not only is
> the choice of hash function dated, but the use of it isn't really great
> either.
> 
> This commit aims to fix both of these issues while also keeping the
> general structure and semantics as close to the original as possible.
> Specifically:
> 
>    a) Rather than overwriting the hash IV with RDRAND, we put it into
>       BLAKE2's documented "salt" and "personal" fields, which were
>       specifically created for this type of usage.
>    b) Since this function feeds the full hash result back into the
>       entropy collector, we only return from it half the length of the
>       hash, just as it was done before. This increases the
>       construction's forward secrecy from 2^80 to a much more
>       comfortable 2^128.
>    c) Rather than using the raw "sha1_transform" function alone, we
>       instead use the full proper BLAKE2s function, with finalization.
> 
> This also has the advantage of supplying 16 bytes at a time rather than
> SHA1's 10 bytes, which, in addition to having a faster compression
> function to begin with, means faster extraction in general. On an Intel
> i7-11850H, this commit makes initial seeding around 131% faster.
> 
> BLAKE2s itself has the nice property of internally being based on the
> ChaCha permutation, which the RNG is already using for expansion, so
> there shouldn't be any issue with newness, funkiness, or surprising CPU
> behavior, since it's based on something already in use.
> 
> [1] https://eprint.iacr.org/2005/010.pdf
> [2] https://www.iacr.org/archive/crypto2005/36210017/36210017.pdf
> [3] https://eprint.iacr.org/2015/967.pdf
> [4] https://shattered.io/static/shattered.pdf
> [5] https://www.usenix.org/system/files/sec20-leurent.pdf
> 
> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Reviewed-by: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

Looks good, thanks for the work!

Reviewed-by: Theodore Ts'o <tytso@mit.edu>

						- Ted

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v5] lib/crypto: blake2s: include as built-in
  2021-12-27 14:20           ` [PATCH v4] " Jason A. Donenfeld
@ 2022-01-01 15:59             ` Jason A. Donenfeld
  2022-01-02 20:42               ` [PATCH v6] " Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-01 15:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jason A. Donenfeld, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, Herbert Xu, linux-crypto

In preparation for using blake2s in the RNG, we change the way that it
is wired-in to the build system. Instead of using ifdefs to select the
right symbol, we use weak symbols. And because ARM doesn't need the
generic implementation, we make the generic one default only if an arch
library doesn't need it already, and then have arch libraries that do
need it opt-in.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: linux-kbuild@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: linux-crypto@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
Herbert - As mentioned with the vPrev, I intend to take this via the
crng/random.git tree, since it forms a dependency and I'd like to send a
pull early in 5.17 cycle.

Changes v4->v5:
- Move sourcing the lib/crypto Kconfig file outside of 'if CRYPTO'.

 arch/arm/crypto/blake2s-core.S    |  8 ++++----
 arch/arm/crypto/blake2s-glue.c    |  6 +++---
 arch/x86/crypto/blake2s-glue.c    | 11 +++++------
 crypto/Kconfig                    |  3 ++-
 drivers/net/Kconfig               |  1 -
 include/crypto/internal/blake2s.h |  6 +++---
 lib/crypto/Kconfig                | 13 ++-----------
 lib/crypto/Makefile               |  9 ++++-----
 lib/crypto/blake2s-generic.c      |  6 +++++-
 lib/crypto/blake2s.c              |  6 ------
 10 files changed, 28 insertions(+), 41 deletions(-)

diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
index 86345751bbf3..df40e46601f1 100644
--- a/arch/arm/crypto/blake2s-core.S
+++ b/arch/arm/crypto/blake2s-core.S
@@ -167,8 +167,8 @@
 .endm
 
 //
-// void blake2s_compress_arch(struct blake2s_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_state *state,
+//			 const u8 *block, size_t nblocks, u32 inc);
 //
 // Only the first three fields of struct blake2s_state are used:
 //	u32 h[8];	(inout)
@@ -176,7 +176,7 @@
 //	u32 f[2];	(in)
 //
 	.align		5
-ENTRY(blake2s_compress_arch)
+ENTRY(blake2s_compress)
 	push		{r0-r2,r4-r11,lr}	// keep this an even number
 
 .Lnext_block:
@@ -303,4 +303,4 @@ ENTRY(blake2s_compress_arch)
 	str		r3, [r12], #4
 	bne		1b
 	b		.Lcopy_block_done
-ENDPROC(blake2s_compress_arch)
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
index f2cc1e5fc9ec..09d3a0cabd2c 100644
--- a/arch/arm/crypto/blake2s-glue.c
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -11,17 +11,17 @@
 #include <linux/module.h>
 
 /* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_arm(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..ef91a3167d27 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -28,9 +28,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-void blake2s_compress_arch(struct blake2s_state *state,
-			   const u8 *block, size_t nblocks,
-			   const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,17 +55,17 @@ void blake2s_compress_arch(struct blake2s_state *state,
 		block += blocks * BLAKE2S_BLOCK_SIZE;
 	} while (nblocks);
 }
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_x86(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 285f82647d2b..55718de56137 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1919,9 +1919,10 @@ config CRYPTO_STATS
 config CRYPTO_HASH_INFO
 	bool
 
-source "lib/crypto/Kconfig"
 source "drivers/crypto/Kconfig"
 source "crypto/asymmetric_keys/Kconfig"
 source "certs/Kconfig"
 
 endif	# if CRYPTO
+
+source "lib/crypto/Kconfig"
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6cccc3dc00bc..b2a4f998c180 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -81,7 +81,6 @@ config WIREGUARD
 	select CRYPTO
 	select CRYPTO_LIB_CURVE25519
 	select CRYPTO_LIB_CHACHA20POLY1305
-	select CRYPTO_LIB_BLAKE2S
 	select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
 	select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
 	select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 8e50d487500f..d39cfa0d333e 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -11,11 +11,11 @@
 #include <crypto/internal/hash.h>
 #include <linux/string.h>
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc);
 
-void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
-			   size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc);
 
 bool blake2s_selftest(void);
 
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 545ccbddf6a1..0f27976b5038 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -9,14 +9,14 @@ config CRYPTO_LIB_ARC4
 	tristate
 
 config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	tristate
+	bool
 	help
 	  Declares whether the architecture provides an arch-specific
 	  accelerated implementation of the Blake2s library interface,
 	  either builtin or as a module.
 
 config CRYPTO_LIB_BLAKE2S_GENERIC
-	tristate
+	def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 	help
 	  This symbol can be depended upon by arch implementations of the
 	  Blake2s library interface that require the generic code as a
@@ -24,15 +24,6 @@ config CRYPTO_LIB_BLAKE2S_GENERIC
 	  implementation is enabled, this implementation serves the users
 	  of CRYPTO_LIB_BLAKE2S.
 
-config CRYPTO_LIB_BLAKE2S
-	tristate "BLAKE2s hash function library"
-	depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
-	help
-	  Enable the Blake2s library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
-
 config CRYPTO_ARCH_HAVE_LIB_CHACHA
 	tristate
 	help
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 73205ed269ba..ed43a41f2dcc 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -10,11 +10,10 @@ libaes-y					:= aes.o
 obj-$(CONFIG_CRYPTO_LIB_ARC4)			+= libarc4.o
 libarc4-y					:= arc4.o
 
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= libblake2s-generic.o
-libblake2s-generic-y				+= blake2s-generic.o
-
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)		+= libblake2s.o
-libblake2s-y					+= blake2s.o
+# blake2s is used by the /dev/random driver which is always builtin
+obj-y						+= libblake2s.o
+libblake2s-y					:= blake2s.o
+libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= blake2s-generic.o
 
 obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)	+= libchacha20poly1305.o
 libchacha20poly1305-y				+= chacha20poly1305.o
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 04ff8df24513..75ccb3e633e6 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -37,7 +37,11 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
 	state->t[1] += (state->t[0] < inc);
 }
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
+		      __weak __alias(blake2s_compress_generic);
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc)
 {
 	u32 m[16];
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 4055aa593ec4..93f2ae051370 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -16,12 +16,6 @@
 #include <linux/init.h>
 #include <linux/bug.h>
 
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
-#  define blake2s_compress blake2s_compress_arch
-#else
-#  define blake2s_compress blake2s_compress_generic
-#endif
-
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
 {
 	__blake2s_update(state, in, inlen, blake2s_compress);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v6] lib/crypto: blake2s: include as built-in
  2022-01-01 15:59             ` [PATCH v5] " Jason A. Donenfeld
@ 2022-01-02 20:42               ` Jason A. Donenfeld
  2022-01-03  3:23                 ` Herbert Xu
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-02 20:42 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jason A. Donenfeld, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, Herbert Xu, linux-crypto

In preparation for using blake2s in the RNG, we change the way that it
is wired-in to the build system. Instead of using ifdefs to select the
right symbol, we use weak symbols. And because ARM doesn't need the
generic implementation, we make the generic one default only if an arch
library doesn't need it already, and then have arch libraries that do
need it opt-in.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: linux-kbuild@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: linux-crypto@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
Herbert - As mentioned with the vPrev, I intend to take this via the
crng/random.git tree, since it forms a dependency and I'd like to send a
pull early in 5.17 cycle.

Changes v5->v6:
- Make accelerated versions bool instead of tristate.
Changes v4->v5:
- Move sourcing the lib/crypto Kconfig file outside of 'if CRYPTO'.

 arch/arm/crypto/Kconfig           |  2 +-
 arch/arm/crypto/blake2s-core.S    |  8 ++++----
 arch/arm/crypto/blake2s-glue.c    |  6 +++---
 arch/x86/crypto/blake2s-glue.c    | 11 +++++------
 crypto/Kconfig                    |  5 +++--
 drivers/net/Kconfig               |  1 -
 include/crypto/internal/blake2s.h |  6 +++---
 lib/crypto/Kconfig                | 13 ++-----------
 lib/crypto/Makefile               |  9 ++++-----
 lib/crypto/blake2s-generic.c      |  6 +++++-
 lib/crypto/blake2s.c              |  6 ------
 11 files changed, 30 insertions(+), 43 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 2b575792363e..f1bcf804b8b5 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -63,7 +63,7 @@ config CRYPTO_SHA512_ARM
 	  using optimized ARM assembler and NEON, when available.
 
 config CRYPTO_BLAKE2S_ARM
-	tristate "BLAKE2s digest algorithm (ARM)"
+	bool "BLAKE2s digest algorithm (ARM)"
 	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 	help
 	  BLAKE2s digest algorithm optimized with ARM scalar instructions.  This
diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
index 86345751bbf3..df40e46601f1 100644
--- a/arch/arm/crypto/blake2s-core.S
+++ b/arch/arm/crypto/blake2s-core.S
@@ -167,8 +167,8 @@
 .endm
 
 //
-// void blake2s_compress_arch(struct blake2s_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_state *state,
+//			 const u8 *block, size_t nblocks, u32 inc);
 //
 // Only the first three fields of struct blake2s_state are used:
 //	u32 h[8];	(inout)
@@ -176,7 +176,7 @@
 //	u32 f[2];	(in)
 //
 	.align		5
-ENTRY(blake2s_compress_arch)
+ENTRY(blake2s_compress)
 	push		{r0-r2,r4-r11,lr}	// keep this an even number
 
 .Lnext_block:
@@ -303,4 +303,4 @@ ENTRY(blake2s_compress_arch)
 	str		r3, [r12], #4
 	bne		1b
 	b		.Lcopy_block_done
-ENDPROC(blake2s_compress_arch)
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
index f2cc1e5fc9ec..09d3a0cabd2c 100644
--- a/arch/arm/crypto/blake2s-glue.c
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -11,17 +11,17 @@
 #include <linux/module.h>
 
 /* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_arm(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..ef91a3167d27 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -28,9 +28,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-void blake2s_compress_arch(struct blake2s_state *state,
-			   const u8 *block, size_t nblocks,
-			   const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,17 +55,17 @@ void blake2s_compress_arch(struct blake2s_state *state,
 		block += blocks * BLAKE2S_BLOCK_SIZE;
 	} while (nblocks);
 }
-EXPORT_SYMBOL(blake2s_compress_arch);
+EXPORT_SYMBOL(blake2s_compress);
 
 static int crypto_blake2s_update_x86(struct shash_desc *desc,
 				     const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
 }
 
 static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
 {
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+	return crypto_blake2s_final(desc, out, blake2s_compress);
 }
 
 #define BLAKE2S_ALG(name, driver_name, digest_size)			\
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 285f82647d2b..b7a2e50dcbc8 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -702,7 +702,7 @@ config CRYPTO_BLAKE2S
 	  See https://blake2.net for further information.
 
 config CRYPTO_BLAKE2S_X86
-	tristate "BLAKE2s digest algorithm (x86 accelerated version)"
+	bool "BLAKE2s digest algorithm (x86 accelerated version)"
 	depends on X86 && 64BIT
 	select CRYPTO_LIB_BLAKE2S_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
@@ -1919,9 +1919,10 @@ config CRYPTO_STATS
 config CRYPTO_HASH_INFO
 	bool
 
-source "lib/crypto/Kconfig"
 source "drivers/crypto/Kconfig"
 source "crypto/asymmetric_keys/Kconfig"
 source "certs/Kconfig"
 
 endif	# if CRYPTO
+
+source "lib/crypto/Kconfig"
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6cccc3dc00bc..b2a4f998c180 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -81,7 +81,6 @@ config WIREGUARD
 	select CRYPTO
 	select CRYPTO_LIB_CURVE25519
 	select CRYPTO_LIB_CHACHA20POLY1305
-	select CRYPTO_LIB_BLAKE2S
 	select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
 	select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
 	select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 8e50d487500f..d39cfa0d333e 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -11,11 +11,11 @@
 #include <crypto/internal/hash.h>
 #include <linux/string.h>
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc);
 
-void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
-			   size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc);
 
 bool blake2s_selftest(void);
 
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 545ccbddf6a1..0f27976b5038 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -9,14 +9,14 @@ config CRYPTO_LIB_ARC4
 	tristate
 
 config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	tristate
+	bool
 	help
 	  Declares whether the architecture provides an arch-specific
 	  accelerated implementation of the Blake2s library interface,
 	  either builtin or as a module.
 
 config CRYPTO_LIB_BLAKE2S_GENERIC
-	tristate
+	def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 	help
 	  This symbol can be depended upon by arch implementations of the
 	  Blake2s library interface that require the generic code as a
@@ -24,15 +24,6 @@ config CRYPTO_LIB_BLAKE2S_GENERIC
 	  implementation is enabled, this implementation serves the users
 	  of CRYPTO_LIB_BLAKE2S.
 
-config CRYPTO_LIB_BLAKE2S
-	tristate "BLAKE2s hash function library"
-	depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
-	help
-	  Enable the Blake2s library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
-
 config CRYPTO_ARCH_HAVE_LIB_CHACHA
 	tristate
 	help
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 73205ed269ba..ed43a41f2dcc 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -10,11 +10,10 @@ libaes-y					:= aes.o
 obj-$(CONFIG_CRYPTO_LIB_ARC4)			+= libarc4.o
 libarc4-y					:= arc4.o
 
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= libblake2s-generic.o
-libblake2s-generic-y				+= blake2s-generic.o
-
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)		+= libblake2s.o
-libblake2s-y					+= blake2s.o
+# blake2s is used by the /dev/random driver which is always builtin
+obj-y						+= libblake2s.o
+libblake2s-y					:= blake2s.o
+libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= blake2s-generic.o
 
 obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)	+= libchacha20poly1305.o
 libchacha20poly1305-y				+= chacha20poly1305.o
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 04ff8df24513..75ccb3e633e6 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -37,7 +37,11 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
 	state->t[1] += (state->t[0] < inc);
 }
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
+		      __weak __alias(blake2s_compress_generic);
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc)
 {
 	u32 m[16];
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 4055aa593ec4..93f2ae051370 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -16,12 +16,6 @@
 #include <linux/init.h>
 #include <linux/bug.h>
 
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
-#  define blake2s_compress blake2s_compress_arch
-#else
-#  define blake2s_compress blake2s_compress_generic
-#endif
-
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
 {
 	__blake2s_update(state, in, inlen, blake2s_compress);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v6] lib/crypto: blake2s: include as built-in
  2022-01-02 20:42               ` [PATCH v6] " Jason A. Donenfeld
@ 2022-01-03  3:23                 ` Herbert Xu
  2022-01-03  3:45                   ` Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Herbert Xu @ 2022-01-03  3:23 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, linux-crypto

On Sun, Jan 02, 2022 at 09:42:03PM +0100, Jason A. Donenfeld wrote:
> In preparation for using blake2s in the RNG, we change the way that it
> is wired-in to the build system. Instead of using ifdefs to select the
> right symbol, we use weak symbols. And because ARM doesn't need the
> generic implementation, we make the generic one default only if an arch
> library doesn't need it already, and then have arch libraries that do
> need it opt-in.
> 
> Acked-by: Ard Biesheuvel <ardb@kernel.org>
> Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Masahiro Yamada <masahiroy@kernel.org>
> Cc: linux-kbuild@vger.kernel.org
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: linux-crypto@vger.kernel.org
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
> Herbert - As mentioned with the vPrev, I intend to take this via the
> crng/random.git tree, since it forms a dependency and I'd like to send a
> pull early in 5.17 cycle.

At this point I think we should push this through crypto.  The
changes are too invasive with respect to the crypto Kconfig files.

> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index 285f82647d2b..b7a2e50dcbc8 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -702,7 +702,7 @@ config CRYPTO_BLAKE2S
>  	  See https://blake2.net for further information.
>  
>  config CRYPTO_BLAKE2S_X86
> -	tristate "BLAKE2s digest algorithm (x86 accelerated version)"
> +	bool "BLAKE2s digest algorithm (x86 accelerated version)"
>  	depends on X86 && 64BIT
>  	select CRYPTO_LIB_BLAKE2S_GENERIC
>  	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S

This will break when CRYPTO is disabled because the x86 crypto
glue code depends on the crypto subsystem.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v6] lib/crypto: blake2s: include as built-in
  2022-01-03  3:23                 ` Herbert Xu
@ 2022-01-03  3:45                   ` Jason A. Donenfeld
  2022-01-03  4:06                     ` Herbert Xu
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-03  3:45 UTC (permalink / raw)
  To: Herbert Xu
  Cc: linux-kernel, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, linux-crypto

On 1/3/22, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> At this point I think we should push this through crypto.  The
> changes are too invasive with respect to the crypto Kconfig files.

Ugh, can we please not? That will really make things much harder and
more annoying for me. I have an early pull planned, and you'll quickly
be able to rebase on top of it. It also doesn't appear to conflict
with anything you have queued up. Please, I would really appreciate
some straight forward linearity here, and I don't think my taking it
will negatively impact the flow.

>
>> diff --git a/crypto/Kconfig b/crypto/Kconfig
>> index 285f82647d2b..b7a2e50dcbc8 100644
>> --- a/crypto/Kconfig
>> +++ b/crypto/Kconfig
>> @@ -702,7 +702,7 @@ config CRYPTO_BLAKE2S
>>  	  See https://blake2.net for further information.
>>
>>  config CRYPTO_BLAKE2S_X86
>> -	tristate "BLAKE2s digest algorithm (x86 accelerated version)"
>> +	bool "BLAKE2s digest algorithm (x86 accelerated version)"
>>  	depends on X86 && 64BIT
>>  	select CRYPTO_LIB_BLAKE2S_GENERIC
>>  	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
>
> This will break when CRYPTO is disabled because the x86 crypto
> glue code depends on the crypto subsystem.

That snippet is inside an 'if CRYPTO' block, so it can't be selected
without CRYPTO being enabled.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v6] lib/crypto: blake2s: include as built-in
  2022-01-03  3:45                   ` Jason A. Donenfeld
@ 2022-01-03  4:06                     ` Herbert Xu
  2022-01-03 11:57                       ` Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Herbert Xu @ 2022-01-03  4:06 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, linux-crypto

On Mon, Jan 03, 2022 at 04:45:10AM +0100, Jason A. Donenfeld wrote:
> 
> Ugh, can we please not? That will really make things much harder and
> more annoying for me. I have an early pull planned, and you'll quickly
> be able to rebase on top of it. It also doesn't appear to conflict
> with anything you have queued up. Please, I would really appreciate
> some straight forward linearity here, and I don't think my taking it
> will negatively impact the flow.

Your patches as they stand will break the crypto tree.  So
that's why they should not go in without the proper changes.

> That snippet is inside an 'if CRYPTO' block, so it can't be selected
> without CRYPTO being enabled.

No CONFIG_CRYPTO is not the issue.  This depends on specific
bits of the Crypto API such as CRYPTO_HASH.  Simply selecting
it is also not acceptable because you will be forcing all of the
Crypto API into vmlinux even though none of it is required by
/dev/random.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v6] lib/crypto: blake2s: include as built-in
  2022-01-03  4:06                     ` Herbert Xu
@ 2022-01-03 11:57                       ` Jason A. Donenfeld
  2022-01-03 12:31                         ` [PATCH v7] " Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-03 11:57 UTC (permalink / raw)
  To: Herbert Xu
  Cc: LKML, Ard Biesheuvel, Greg Kroah-Hartman, Masahiro Yamada,
	Linux Kbuild mailing list, Linux Crypto Mailing List

Hi Herbert,

On Mon, Jan 3, 2022 at 5:07 AM Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Mon, Jan 03, 2022 at 04:45:10AM +0100, Jason A. Donenfeld wrote:
> >
> > Ugh, can we please not? That will really make things much harder and
> > more annoying for me. I have an early pull planned, and you'll quickly
> > be able to rebase on top of it. It also doesn't appear to conflict
> > with anything you have queued up. Please, I would really appreciate
> > some straight forward linearity here, and I don't think my taking it
> > will negatively impact the flow.
>
> Your patches as they stand will break the crypto tree.  So
> that's why they should not go in without the proper changes.

Okay, I'll try to fix them up so that they don't break the crypto
tree, and given below, I think I should be able to do this with fewer
changes to some of the Kconfig, which will hopefully address your
concerns and enable me to take this patch so that things are a bit
more straightforward.


>
> > That snippet is inside an 'if CRYPTO' block, so it can't be selected
> > without CRYPTO being enabled.
>
> No CONFIG_CRYPTO is not the issue.  This depends on specific
> bits of the Crypto API such as CRYPTO_HASH.  Simply selecting
> it is also not acceptable because you will be forcing all of the
> Crypto API into vmlinux even though none of it is required by
> /dev/random.

Thanks, I think I see what your concern is now. I'll take a stab at
addressing that.

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-03 11:57                       ` Jason A. Donenfeld
@ 2022-01-03 12:31                         ` Jason A. Donenfeld
  2022-01-04  1:21                           ` Herbert Xu
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-03 12:31 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jason A. Donenfeld, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, Herbert Xu, linux-crypto

In preparation for using blake2s in the RNG, we change the way that it
is wired-in to the build system. Instead of using ifdefs to select the
right symbol, we use weak symbols. And because ARM doesn't need the
generic implementation, we make the generic one default only if an arch
library doesn't need it already, and then have arch libraries that do
need it opt-in. So that the arch libraries can remain tristate rather
than bool, we then split the shash part from the glue code.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: linux-kbuild@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: linux-crypto@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
Herbert - As discussed, I still intend to take this via the
crng/random.git tree because it forms a dependency, and I'd like to send
a pull very early in the 5.17 cycle. I've taken some care to minimize
changes to the {arch/*/}crypto/Kconfig files, as you mentioned this
might cause some conflicts. Your tree should work cleanly on top of this
commit.

Changes v6->v7:
- Split arch shash implementations out from the glue code, so that they
  can remain as tristates, and we thus don't need to touch
  arch/*/crypto/Kconfig at all.
Changes v5->v6:
- Make accelerated versions bool instead of tristate.
Changes v4->v5:
- Move sourcing the lib/crypto Kconfig file outside of 'if CRYPTO'.

 arch/arm/crypto/Makefile          |  4 +-
 arch/arm/crypto/blake2s-core.S    |  8 ++--
 arch/arm/crypto/blake2s-glue.c    | 73 +----------------------------
 arch/arm/crypto/blake2s-shash.c   | 75 ++++++++++++++++++++++++++++++
 arch/x86/crypto/Makefile          |  4 +-
 arch/x86/crypto/blake2s-glue.c    | 68 +++------------------------
 arch/x86/crypto/blake2s-shash.c   | 77 +++++++++++++++++++++++++++++++
 crypto/Kconfig                    |  3 +-
 drivers/net/Kconfig               |  1 -
 include/crypto/internal/blake2s.h |  6 +--
 lib/crypto/Kconfig                | 23 +++------
 lib/crypto/Makefile               |  9 ++--
 lib/crypto/blake2s-generic.c      |  6 ++-
 lib/crypto/blake2s.c              |  6 ---
 14 files changed, 189 insertions(+), 174 deletions(-)
 create mode 100644 arch/arm/crypto/blake2s-shash.c
 create mode 100644 arch/x86/crypto/blake2s-shash.c

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index eafa898ba6a7..0274f81cc8ea 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
+obj-$(if $(CONFIG_CRYPTO_BLAKE2S_ARM),y) += libblake2s-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
@@ -31,7 +32,8 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
 sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
 sha512-arm-y	:= sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
-blake2s-arm-y   := blake2s-core.o blake2s-glue.o
+blake2s-arm-y   := blake2s-shash.o
+libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
 blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
index 86345751bbf3..df40e46601f1 100644
--- a/arch/arm/crypto/blake2s-core.S
+++ b/arch/arm/crypto/blake2s-core.S
@@ -167,8 +167,8 @@
 .endm
 
 //
-// void blake2s_compress_arch(struct blake2s_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_state *state,
+//			 const u8 *block, size_t nblocks, u32 inc);
 //
 // Only the first three fields of struct blake2s_state are used:
 //	u32 h[8];	(inout)
@@ -176,7 +176,7 @@
 //	u32 f[2];	(in)
 //
 	.align		5
-ENTRY(blake2s_compress_arch)
+ENTRY(blake2s_compress)
 	push		{r0-r2,r4-r11,lr}	// keep this an even number
 
 .Lnext_block:
@@ -303,4 +303,4 @@ ENTRY(blake2s_compress_arch)
 	str		r3, [r12], #4
 	bne		1b
 	b		.Lcopy_block_done
-ENDPROC(blake2s_compress_arch)
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
index f2cc1e5fc9ec..0238a70d9581 100644
--- a/arch/arm/crypto/blake2s-glue.c
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -1,78 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * BLAKE2s digest algorithm, ARM scalar implementation
- *
- * Copyright 2020 Google LLC
- */
 
 #include <crypto/internal/blake2s.h>
-#include <crypto/internal/hash.h>
-
 #include <linux/module.h>
 
 /* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress_arch);
-
-static int crypto_blake2s_update_arm(struct shash_desc *desc,
-				     const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
-}
-
-static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
-{
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size)			\
-	{								\
-		.base.cra_name		= name,				\
-		.base.cra_driver_name	= driver_name,			\
-		.base.cra_priority	= 200,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
-		.base.cra_blocksize	= BLAKE2S_BLOCK_SIZE,		\
-		.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE,			\
-		.digestsize		= digest_size,			\
-		.setkey			= crypto_blake2s_setkey,	\
-		.init			= crypto_blake2s_init,		\
-		.update			= crypto_blake2s_update_arm,	\
-		.final			= crypto_blake2s_final_arm,	\
-		.descsize		= sizeof(struct blake2s_state),	\
-	}
-
-static struct shash_alg blake2s_arm_algs[] = {
-	BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
-};
-
-static int __init blake2s_arm_mod_init(void)
-{
-	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
-		crypto_register_shashes(blake2s_arm_algs,
-					ARRAY_SIZE(blake2s_arm_algs)) : 0;
-}
-
-static void __exit blake2s_arm_mod_exit(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
-		crypto_unregister_shashes(blake2s_arm_algs,
-					  ARRAY_SIZE(blake2s_arm_algs));
-}
-
-module_init(blake2s_arm_mod_init);
-module_exit(blake2s_arm_mod_exit);
-
-MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-arm");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-arm");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-arm");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-arm");
+EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c
new file mode 100644
index 000000000000..17c1c3bfe2f5
--- /dev/null
+++ b/arch/arm/crypto/blake2s-shash.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/module.h>
+
+static int crypto_blake2s_update_arm(struct shash_desc *desc,
+				     const u8 *in, unsigned int inlen)
+{
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
+}
+
+static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
+{
+	return crypto_blake2s_final(desc, out, blake2s_compress);
+}
+
+#define BLAKE2S_ALG(name, driver_name, digest_size)			\
+	{								\
+		.base.cra_name		= name,				\
+		.base.cra_driver_name	= driver_name,			\
+		.base.cra_priority	= 200,				\
+		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
+		.base.cra_blocksize	= BLAKE2S_BLOCK_SIZE,		\
+		.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx), \
+		.base.cra_module	= THIS_MODULE,			\
+		.digestsize		= digest_size,			\
+		.setkey			= crypto_blake2s_setkey,	\
+		.init			= crypto_blake2s_init,		\
+		.update			= crypto_blake2s_update_arm,	\
+		.final			= crypto_blake2s_final_arm,	\
+		.descsize		= sizeof(struct blake2s_state),	\
+	}
+
+static struct shash_alg blake2s_arm_algs[] = {
+	BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
+	BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
+	BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
+	BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
+};
+
+static int __init blake2s_arm_mod_init(void)
+{
+	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+		crypto_register_shashes(blake2s_arm_algs,
+					ARRAY_SIZE(blake2s_arm_algs)) : 0;
+}
+
+static void __exit blake2s_arm_mod_exit(void)
+{
+	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+		crypto_unregister_shashes(blake2s_arm_algs,
+					  ARRAY_SIZE(blake2s_arm_algs));
+}
+
+module_init(blake2s_arm_mod_init);
+module_exit(blake2s_arm_mod_exit);
+
+MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-arm");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-arm");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-arm");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-arm");
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index f307c93fc90a..c3af959648e6 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -62,7 +62,9 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 
 obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+blake2s-x86_64-y := blake2s-shash.o
+obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
+libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
 
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..69853c13e8fb 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -5,7 +5,6 @@
 
 #include <crypto/internal/blake2s.h>
 #include <crypto/internal/simd.h>
-#include <crypto/internal/hash.h>
 
 #include <linux/types.h>
 #include <linux/jump_label.h>
@@ -28,9 +27,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
 
-void blake2s_compress_arch(struct blake2s_state *state,
-			   const u8 *block, size_t nblocks,
-			   const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
 	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,49 +54,12 @@ void blake2s_compress_arch(struct blake2s_state *state,
 		block += blocks * BLAKE2S_BLOCK_SIZE;
 	} while (nblocks);
 }
-EXPORT_SYMBOL(blake2s_compress_arch);
-
-static int crypto_blake2s_update_x86(struct shash_desc *desc,
-				     const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
-}
-
-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
-{
-	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size)			\
-	{								\
-		.base.cra_name		= name,				\
-		.base.cra_driver_name	= driver_name,			\
-		.base.cra_priority	= 200,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
-		.base.cra_blocksize	= BLAKE2S_BLOCK_SIZE,		\
-		.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE,			\
-		.digestsize		= digest_size,			\
-		.setkey			= crypto_blake2s_setkey,	\
-		.init			= crypto_blake2s_init,		\
-		.update			= crypto_blake2s_update_x86,	\
-		.final			= crypto_blake2s_final_x86,	\
-		.descsize		= sizeof(struct blake2s_state),	\
-	}
-
-static struct shash_alg blake2s_algs[] = {
-	BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
-	BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
-};
+EXPORT_SYMBOL(blake2s_compress);
 
 static int __init blake2s_mod_init(void)
 {
-	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return 0;
-
-	static_branch_enable(&blake2s_use_ssse3);
+	if (boot_cpu_has(X86_FEATURE_SSSE3))
+		static_branch_enable(&blake2s_use_ssse3);
 
 	if (IS_ENABLED(CONFIG_AS_AVX512) &&
 	    boot_cpu_has(X86_FEATURE_AVX) &&
@@ -109,26 +70,9 @@ static int __init blake2s_mod_init(void)
 			      XFEATURE_MASK_AVX512, NULL))
 		static_branch_enable(&blake2s_use_avx512);
 
-	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
-		crypto_register_shashes(blake2s_algs,
-					ARRAY_SIZE(blake2s_algs)) : 0;
-}
-
-static void __exit blake2s_mod_exit(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
-		crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+	return 0;
 }
 
 module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
 
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
 MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c
new file mode 100644
index 000000000000..f9e2fecdb761
--- /dev/null
+++ b/arch/x86/crypto/blake2s-shash.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static int crypto_blake2s_update_x86(struct shash_desc *desc,
+				     const u8 *in, unsigned int inlen)
+{
+	return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
+}
+
+static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
+{
+	return crypto_blake2s_final(desc, out, blake2s_compress);
+}
+
+#define BLAKE2S_ALG(name, driver_name, digest_size)			\
+	{								\
+		.base.cra_name		= name,				\
+		.base.cra_driver_name	= driver_name,			\
+		.base.cra_priority	= 200,				\
+		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
+		.base.cra_blocksize	= BLAKE2S_BLOCK_SIZE,		\
+		.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx), \
+		.base.cra_module	= THIS_MODULE,			\
+		.digestsize		= digest_size,			\
+		.setkey			= crypto_blake2s_setkey,	\
+		.init			= crypto_blake2s_init,		\
+		.update			= crypto_blake2s_update_x86,	\
+		.final			= crypto_blake2s_final_x86,	\
+		.descsize		= sizeof(struct blake2s_state),	\
+	}
+
+static struct shash_alg blake2s_algs[] = {
+	BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
+	BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
+	BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
+	BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
+};
+
+static int __init blake2s_mod_init(void)
+{
+	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+		return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+	return 0;
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+		crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+module_init(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
+MODULE_LICENSE("GPL v2");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 285f82647d2b..55718de56137 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1919,9 +1919,10 @@ config CRYPTO_STATS
 config CRYPTO_HASH_INFO
 	bool
 
-source "lib/crypto/Kconfig"
 source "drivers/crypto/Kconfig"
 source "crypto/asymmetric_keys/Kconfig"
 source "certs/Kconfig"
 
 endif	# if CRYPTO
+
+source "lib/crypto/Kconfig"
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6cccc3dc00bc..b2a4f998c180 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -81,7 +81,6 @@ config WIREGUARD
 	select CRYPTO
 	select CRYPTO_LIB_CURVE25519
 	select CRYPTO_LIB_CHACHA20POLY1305
-	select CRYPTO_LIB_BLAKE2S
 	select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
 	select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
 	select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 8e50d487500f..d39cfa0d333e 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -11,11 +11,11 @@
 #include <crypto/internal/hash.h>
 #include <linux/string.h>
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc);
 
-void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
-			   size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc);
 
 bool blake2s_selftest(void);
 
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 545ccbddf6a1..8620f38e117c 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
-comment "Crypto library routines"
-
 config CRYPTO_LIB_AES
 	tristate
 
@@ -9,14 +7,14 @@ config CRYPTO_LIB_ARC4
 	tristate
 
 config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	tristate
+	bool
 	help
 	  Declares whether the architecture provides an arch-specific
 	  accelerated implementation of the Blake2s library interface,
 	  either builtin or as a module.
 
 config CRYPTO_LIB_BLAKE2S_GENERIC
-	tristate
+	def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
 	help
 	  This symbol can be depended upon by arch implementations of the
 	  Blake2s library interface that require the generic code as a
@@ -24,15 +22,6 @@ config CRYPTO_LIB_BLAKE2S_GENERIC
 	  implementation is enabled, this implementation serves the users
 	  of CRYPTO_LIB_BLAKE2S.
 
-config CRYPTO_LIB_BLAKE2S
-	tristate "BLAKE2s hash function library"
-	depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
-	help
-	  Enable the Blake2s library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
-
 config CRYPTO_ARCH_HAVE_LIB_CHACHA
 	tristate
 	help
@@ -51,7 +40,7 @@ config CRYPTO_LIB_CHACHA_GENERIC
 	  of CRYPTO_LIB_CHACHA.
 
 config CRYPTO_LIB_CHACHA
-	tristate "ChaCha library interface"
+	tristate
 	depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
 	select CRYPTO_LIB_CHACHA_GENERIC if CRYPTO_ARCH_HAVE_LIB_CHACHA=n
 	help
@@ -76,7 +65,7 @@ config CRYPTO_LIB_CURVE25519_GENERIC
 	  of CRYPTO_LIB_CURVE25519.
 
 config CRYPTO_LIB_CURVE25519
-	tristate "Curve25519 scalar multiplication library"
+	tristate
 	depends on CRYPTO_ARCH_HAVE_LIB_CURVE25519 || !CRYPTO_ARCH_HAVE_LIB_CURVE25519
 	select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
 	help
@@ -111,7 +100,7 @@ config CRYPTO_LIB_POLY1305_GENERIC
 	  of CRYPTO_LIB_POLY1305.
 
 config CRYPTO_LIB_POLY1305
-	tristate "Poly1305 library interface"
+	tristate
 	depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
 	select CRYPTO_LIB_POLY1305_GENERIC if CRYPTO_ARCH_HAVE_LIB_POLY1305=n
 	help
@@ -120,7 +109,7 @@ config CRYPTO_LIB_POLY1305
 	  is available and enabled.
 
 config CRYPTO_LIB_CHACHA20POLY1305
-	tristate "ChaCha20-Poly1305 AEAD support (8-byte nonce library version)"
+	tristate
 	depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
 	depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
 	select CRYPTO_LIB_CHACHA
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 73205ed269ba..ed43a41f2dcc 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -10,11 +10,10 @@ libaes-y					:= aes.o
 obj-$(CONFIG_CRYPTO_LIB_ARC4)			+= libarc4.o
 libarc4-y					:= arc4.o
 
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= libblake2s-generic.o
-libblake2s-generic-y				+= blake2s-generic.o
-
-obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)		+= libblake2s.o
-libblake2s-y					+= blake2s.o
+# blake2s is used by the /dev/random driver which is always builtin
+obj-y						+= libblake2s.o
+libblake2s-y					:= blake2s.o
+libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= blake2s-generic.o
 
 obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)	+= libchacha20poly1305.o
 libchacha20poly1305-y				+= chacha20poly1305.o
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 04ff8df24513..75ccb3e633e6 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -37,7 +37,11 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
 	state->t[1] += (state->t[0] < inc);
 }
 
-void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+		      size_t nblocks, const u32 inc)
+		      __weak __alias(blake2s_compress_generic);
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 			      size_t nblocks, const u32 inc)
 {
 	u32 m[16];
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 4055aa593ec4..93f2ae051370 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -16,12 +16,6 @@
 #include <linux/init.h>
 #include <linux/bug.h>
 
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
-#  define blake2s_compress blake2s_compress_arch
-#else
-#  define blake2s_compress blake2s_compress_generic
-#endif
-
 void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
 {
 	__blake2s_update(state, in, inlen, blake2s_compress);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-03 12:31                         ` [PATCH v7] " Jason A. Donenfeld
@ 2022-01-04  1:21                           ` Herbert Xu
  2022-01-04 17:02                             ` Ard Biesheuvel
  0 siblings, 1 reply; 61+ messages in thread
From: Herbert Xu @ 2022-01-04  1:21 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, Ard Biesheuvel, Greg Kroah-Hartman,
	Masahiro Yamada, linux-kbuild, linux-crypto

On Mon, Jan 03, 2022 at 01:31:52PM +0100, Jason A. Donenfeld wrote:
> In preparation for using blake2s in the RNG, we change the way that it
> is wired-in to the build system. Instead of using ifdefs to select the
> right symbol, we use weak symbols. And because ARM doesn't need the
> generic implementation, we make the generic one default only if an arch
> library doesn't need it already, and then have arch libraries that do
> need it opt-in. So that the arch libraries can remain tristate rather
> than bool, we then split the shash part from the glue code.
> 
> Acked-by: Ard Biesheuvel <ardb@kernel.org>
> Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Masahiro Yamada <masahiroy@kernel.org>
> Cc: linux-kbuild@vger.kernel.org
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: linux-crypto@vger.kernel.org
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
> Herbert - As discussed, I still intend to take this via the
> crng/random.git tree because it forms a dependency, and I'd like to send
> a pull very early in the 5.17 cycle. I've taken some care to minimize
> changes to the {arch/*/}crypto/Kconfig files, as you mentioned this
> might cause some conflicts. Your tree should work cleanly on top of this
> commit.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>

> Changes v6->v7:
> - Split arch shash implementations out from the glue code, so that they
>   can remain as tristates, and we thus don't need to touch
>   arch/*/crypto/Kconfig at all.

This looks good to me although I confess that I haven't actually
tried to build it :) Hopefully the build robots will take care of
this.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-04  1:21                           ` Herbert Xu
@ 2022-01-04 17:02                             ` Ard Biesheuvel
  2022-01-04 17:04                               ` Jason A. Donenfeld
                                                 ` (2 more replies)
  0 siblings, 3 replies; 61+ messages in thread
From: Ard Biesheuvel @ 2022-01-04 17:02 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Jason A. Donenfeld, Linux Kernel Mailing List,
	Greg Kroah-Hartman, Masahiro Yamada, Linux Kbuild mailing list,
	Linux Crypto Mailing List

On Tue, 4 Jan 2022 at 02:22, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Mon, Jan 03, 2022 at 01:31:52PM +0100, Jason A. Donenfeld wrote:
> > In preparation for using blake2s in the RNG, we change the way that it
> > is wired-in to the build system. Instead of using ifdefs to select the
> > right symbol, we use weak symbols. And because ARM doesn't need the
> > generic implementation, we make the generic one default only if an arch
> > library doesn't need it already, and then have arch libraries that do
> > need it opt-in. So that the arch libraries can remain tristate rather
> > than bool, we then split the shash part from the glue code.
> >
> > Acked-by: Ard Biesheuvel <ardb@kernel.org>
> > Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > Cc: Masahiro Yamada <masahiroy@kernel.org>
> > Cc: linux-kbuild@vger.kernel.org
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: linux-crypto@vger.kernel.org
> > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> > ---
> > Herbert - As discussed, I still intend to take this via the
> > crng/random.git tree because it forms a dependency, and I'd like to send
> > a pull very early in the 5.17 cycle. I've taken some care to minimize
> > changes to the {arch/*/}crypto/Kconfig files, as you mentioned this
> > might cause some conflicts. Your tree should work cleanly on top of this
> > commit.
>
> Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> > Changes v6->v7:
> > - Split arch shash implementations out from the glue code, so that they
> >   can remain as tristates, and we thus don't need to touch
> >   arch/*/crypto/Kconfig at all.
>
> This looks good to me although I confess that I haven't actually
> tried to build it :) Hopefully the build robots will take care of
> this.
>

The only downside here is that the ARM/x86 accelerated shashes and the
generic shash now use the same core transform, right? Given that the
generic blake2s shash is never used for anything in the kernel, the
only reason for its existence was to be able to use the randomized
crypto testing infrastructure to test the arch code.

Ergo, there is no point in retaining the blake2s shashes and we can
simply remove all of them. (Note that blake2b is used as an shash via
the crypto API by btrfs, but blake2s is only used via the library API)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-04 17:02                             ` Ard Biesheuvel
@ 2022-01-04 17:04                               ` Jason A. Donenfeld
  2022-01-05  0:28                               ` Herbert Xu
  2022-01-05 21:53                               ` Eric Biggers
  2 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-04 17:04 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Herbert Xu, Linux Kernel Mailing List, Greg Kroah-Hartman,
	Masahiro Yamada, Linux Kbuild mailing list,
	Linux Crypto Mailing List

Hi Ard,

On Tue, Jan 4, 2022 at 6:03 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> The only downside here is that the ARM/x86 accelerated shashes and the
> generic shash now use the same core transform, right? Given that the
> generic blake2s shash is never used for anything in the kernel, the
> only reason for its existence was to be able to use the randomized
> crypto testing infrastructure to test the arch code.
>
> Ergo, there is no point in retaining the blake2s shashes and we can
> simply remove all of them. (Note that blake2b is used as an shash via
> the crypto API by btrfs, but blake2s is only used via the library API)

That makes sense and is fine with me. Let's do that in a separate
commit later. I've got a bunch of things I'd like to fix up in the
general lib/crypto vs crypto split that are kind of interrelated.

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-04 17:02                             ` Ard Biesheuvel
  2022-01-04 17:04                               ` Jason A. Donenfeld
@ 2022-01-05  0:28                               ` Herbert Xu
  2022-01-05 21:53                               ` Eric Biggers
  2 siblings, 0 replies; 61+ messages in thread
From: Herbert Xu @ 2022-01-05  0:28 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Jason A. Donenfeld, Linux Kernel Mailing List,
	Greg Kroah-Hartman, Masahiro Yamada, Linux Kbuild mailing list,
	Linux Crypto Mailing List

On Tue, Jan 04, 2022 at 06:02:52PM +0100, Ard Biesheuvel wrote:
>
> The only downside here is that the ARM/x86 accelerated shashes and the
> generic shash now use the same core transform, right? Given that the
> generic blake2s shash is never used for anything in the kernel, the
> only reason for its existence was to be able to use the randomized
> crypto testing infrastructure to test the arch code.
> 
> Ergo, there is no point in retaining the blake2s shashes and we can
> simply remove all of them. (Note that blake2b is used as an shash via
> the crypto API by btrfs, but blake2s is only used via the library API)

I have no objections to removing blake2s.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2021-12-24 20:56   ` Eric Biggers
@ 2022-01-05 21:42     ` Eric Biggers
  0 siblings, 0 replies; 61+ messages in thread
From: Eric Biggers @ 2022-01-05 21:42 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: linux-kernel, tytso, gregkh, Jean-Philippe Aumasson

Hi Jason,

On Fri, Dec 24, 2021 at 02:56:44PM -0600, Eric Biggers wrote:
> Also, this patch was only sent to linux-kernel, not to linux-crypto, so I only
> found it because I happened to see it in the above git repository, then dig it
> up from lore.kernel.org.  How about Cc'ing all random.c patches to linux-crypto,
> and putting that in the MAINTAINERS file entry?

Any thoughts about this?  I see more patches appearing in random.git, but they
were only sent to linux-kernel.  I'd expect the MAINTAINERS entry to contain:

	L:      linux-crypto@vger.kernel.org

- Eric

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-04 17:02                             ` Ard Biesheuvel
  2022-01-04 17:04                               ` Jason A. Donenfeld
  2022-01-05  0:28                               ` Herbert Xu
@ 2022-01-05 21:53                               ` Eric Biggers
  2022-01-05 22:01                                 ` Ard Biesheuvel
  2 siblings, 1 reply; 61+ messages in thread
From: Eric Biggers @ 2022-01-05 21:53 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Herbert Xu, Jason A. Donenfeld, Linux Kernel Mailing List,
	Greg Kroah-Hartman, Masahiro Yamada, Linux Kbuild mailing list,
	Linux Crypto Mailing List

On Tue, Jan 04, 2022 at 06:02:52PM +0100, Ard Biesheuvel wrote:
> The only downside here is that the ARM/x86 accelerated shashes and the
> generic shash now use the same core transform, right?

I don't see how this is the case, given that crypto/blake2s_generic.c still uses
blake2s_compress_generic(), not blake2s_compress().

- Eric

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-05 21:53                               ` Eric Biggers
@ 2022-01-05 22:01                                 ` Ard Biesheuvel
  2022-01-05 22:09                                   ` Eric Biggers
  0 siblings, 1 reply; 61+ messages in thread
From: Ard Biesheuvel @ 2022-01-05 22:01 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Herbert Xu, Jason A. Donenfeld, Linux Kernel Mailing List,
	Greg Kroah-Hartman, Masahiro Yamada, Linux Kbuild mailing list,
	Linux Crypto Mailing List

On Wed, 5 Jan 2022 at 22:53, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Tue, Jan 04, 2022 at 06:02:52PM +0100, Ard Biesheuvel wrote:
> > The only downside here is that the ARM/x86 accelerated shashes and the
> > generic shash now use the same core transform, right?
>
> I don't see how this is the case, given that crypto/blake2s_generic.c still uses
> blake2s_compress_generic(), not blake2s_compress().
>

Ah ok, I stand corrected then.

So what are your thoughts on this? Should we keep the shashes while
they have no users?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] lib/crypto: blake2s: include as built-in
  2022-01-05 22:01                                 ` Ard Biesheuvel
@ 2022-01-05 22:09                                   ` Eric Biggers
  0 siblings, 0 replies; 61+ messages in thread
From: Eric Biggers @ 2022-01-05 22:09 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Herbert Xu, Jason A. Donenfeld, Linux Kernel Mailing List,
	Greg Kroah-Hartman, Masahiro Yamada, Linux Kbuild mailing list,
	Linux Crypto Mailing List

On Wed, Jan 05, 2022 at 11:01:04PM +0100, Ard Biesheuvel wrote:
> On Wed, 5 Jan 2022 at 22:53, Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Tue, Jan 04, 2022 at 06:02:52PM +0100, Ard Biesheuvel wrote:
> > > The only downside here is that the ARM/x86 accelerated shashes and the
> > > generic shash now use the same core transform, right?
> >
> > I don't see how this is the case, given that crypto/blake2s_generic.c still uses
> > blake2s_compress_generic(), not blake2s_compress().
> >
> 
> Ah ok, I stand corrected then.
> 
> So what are your thoughts on this? Should we keep the shashes while
> they have no users?

I don't know.  Removing unused stuff is good per se, but I wouldn't have
expected this to be something that is being considered here.  It's not like this
is a "controversial" algorithm, blake2b is already supported, and there could be
users of it already (dm-integrity, dm-verity, AF_ALG, etc.).  If this is going
to happen, then the acceptance criteria for new algorithms need to get *much*
stricter, so that algorithms aren't constantly being added and removed.

- Eric

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2021-12-23 14:11 ` [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction Jason A. Donenfeld
  2021-12-24 20:56   ` Eric Biggers
  2021-12-27 15:52   ` Theodore Ts'o
@ 2022-01-11 11:38   ` Geert Uytterhoeven
  2022-01-11 12:28     ` Jason A. Donenfeld
  2 siblings, 1 reply; 61+ messages in thread
From: Geert Uytterhoeven @ 2022-01-11 11:38 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH, Jean-Philippe Aumasson

Hi Jsaon,

On Sat, Dec 25, 2021 at 1:52 AM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> This commit addresses one of the lower hanging fruits of the RNG: its
> usage of SHA1.
>
> BLAKE2s is generally faster, and certainly more secure, than SHA1, which
> has [1] been [2] really [3] very [4] broken [5]. Additionally, the
> current construction in the RNG doesn't use the full SHA1 function, as
> specified, and allows overwriting the IV with RDRAND output in an
> undocumented way, even in the case when RDRAND isn't set to "trusted",
> which means potential malicious IV choices. And its short length means
> that keeping only half of it secret when feeding back into the mixer
> gives us only 2^80 bits of forward secrecy. In other words, not only is
> the choice of hash function dated, but the use of it isn't really great
> either.

m68k bloat-o-meter:

blake2s_compress_generic                       -    4448   +4448
blake2s256_hmac                                -     302    +302
blake2s_update                                 -     156    +156
blake2s_final                                  -     124    +124
blake2s_init.constprop                         -      94     +94
__ksymtab_blake2s_update                       -      12     +12
__ksymtab_blake2s_final                        -      12     +12
__ksymtab_blake2s_compress_generic             -      12     +12
__ksymtab_blake2s256_hmac                      -      12     +12
blake2s_mod_init                               -       4      +4
__initcall__kmod_libblake2s__101_82_blake2s_mod_init6       -       4      +4

Unfortunately we cannot get rid of the sha1 code yet (lib/sha1.o is
built-in unconditionally), as there are other users...

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 11:38   ` Geert Uytterhoeven
@ 2022-01-11 12:28     ` Jason A. Donenfeld
  2022-01-11 12:50       ` Jason A. Donenfeld
  2022-01-11 12:51       ` Geert Uytterhoeven
  0 siblings, 2 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 12:28 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH, Jean-Philippe Aumasson

Hi Geert,

On Tue, Jan 11, 2022 at 12:38 PM Geert Uytterhoeven
<geert@linux-m68k.org> wrote:
> Unfortunately we cannot get rid of the sha1 code yet (lib/sha1.o is
> built-in unconditionally), as there are other users...

I think that's just how things go and a price for progress. We're not
going to stick with sha1, and blake2s has some nice properties that we
certainly want. In the future hopefully this can decrease in other
ways based on other future improvements. But that's where we are now.

If you're really quite concerned about m68k code size, I can probably
do some things to reduce that. For example, blake2s256_hmac is only
used by wireguard and it could probably be made local there. And with
some trivial loop re-rolling, I can shave off another 2300 bytes. And
I bet I can find a few other things too. The question is: how
important is this to you?

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 12:28     ` Jason A. Donenfeld
@ 2022-01-11 12:50       ` Jason A. Donenfeld
  2022-01-11 12:57         ` Geert Uytterhoeven
                           ` (2 more replies)
  2022-01-11 12:51       ` Geert Uytterhoeven
  1 sibling, 3 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 12:50 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH, Jean-Philippe Aumasson

On Tue, Jan 11, 2022 at 1:28 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> If you're really quite concerned about m68k code size, I can probably
> do some things to reduce that. For example, blake2s256_hmac is only
> used by wireguard and it could probably be made local there. And with
> some trivial loop re-rolling, I can shave off another 2300 bytes. And
> I bet I can find a few other things too. The question is: how
> important is this to you?

And with another trick (see below), another extra 1000 bytes or so
shaved off. Aside from moving blake2s256_hmac, I'm not really super
enthusiastic about making these changes, but depending on how important
this is to you, maybe we can make something work. There are probably
additional possibilities too with the code.

====

diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 75ccb3e633e6..8e3c6372363a 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -46,7 +46,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 {
 	u32 m[16];
 	u32 v[16];
-	int i;
+	int i, j;
 
 	WARN_ON(IS_ENABLED(DEBUG) &&
 		(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
@@ -76,29 +76,11 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 	b = ror32(b ^ c, 7); \
 } while (0)
 
-#define ROUND(r) do { \
-	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
-	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
-	G(r, 2, v[2], v[ 6], v[10], v[14]); \
-	G(r, 3, v[3], v[ 7], v[11], v[15]); \
-	G(r, 4, v[0], v[ 5], v[10], v[15]); \
-	G(r, 5, v[1], v[ 6], v[11], v[12]); \
-	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
-	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
-} while (0)
-		ROUND(0);
-		ROUND(1);
-		ROUND(2);
-		ROUND(3);
-		ROUND(4);
-		ROUND(5);
-		ROUND(6);
-		ROUND(7);
-		ROUND(8);
-		ROUND(9);
-
+		for (i = 0; i < 10; ++i) {
+			for (j = 0; j < 8; ++j)
+				G(i, j, v[j % 4], v[((j + (j / 4)) % 4) + 4], v[((j + 2 * (j / 4)) % 4) + 8], v[((j + 3 * (j / 4)) % 4) + 12]);
+		}
 #undef G
-#undef ROUND
 
 		for (i = 0; i < 8; ++i)
 			state->h[i] ^= v[i] ^ v[i + 8];


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 12:28     ` Jason A. Donenfeld
  2022-01-11 12:50       ` Jason A. Donenfeld
@ 2022-01-11 12:51       ` Geert Uytterhoeven
  2022-01-11 13:02         ` Jason A. Donenfeld
  1 sibling, 1 reply; 61+ messages in thread
From: Geert Uytterhoeven @ 2022-01-11 12:51 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH,
	Jean-Philippe Aumasson, bpf, netdev

Hi Jason,

CC bpf, netdev

On Tue, Jan 11, 2022 at 1:28 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> On Tue, Jan 11, 2022 at 12:38 PM Geert Uytterhoeven
> <geert@linux-m68k.org> wrote:
> > Unfortunately we cannot get rid of the sha1 code yet (lib/sha1.o is
> > built-in unconditionally), as there are other users...

kernel/bpf/core.c and net/ipv6/addrconf.c
Could they be switched to blake2s, too?

> I think that's just how things go and a price for progress. We're not
> going to stick with sha1, and blake2s has some nice properties that we
> certainly want. In the future hopefully this can decrease in other
> ways based on other future improvements. But that's where we are now.
>
> If you're really quite concerned about m68k code size, I can probably
> do some things to reduce that. For example, blake2s256_hmac is only
> used by wireguard and it could probably be made local there. And with
> some trivial loop re-rolling, I can shave off another 2300 bytes. And
> I bet I can find a few other things too. The question is: how
> important is this to you?

No problem, I just try to report all measurable impact on kernel size,
so there is some record of it.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 12:50       ` Jason A. Donenfeld
@ 2022-01-11 12:57         ` Geert Uytterhoeven
  2022-01-11 13:00           ` Jason A. Donenfeld
  2022-01-11 15:46         ` David Laight
  2022-01-14 17:27         ` David Laight
  2 siblings, 1 reply; 61+ messages in thread
From: Geert Uytterhoeven @ 2022-01-11 12:57 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH, Jean-Philippe Aumasson

Hi Jason,

On Tue, Jan 11, 2022 at 1:50 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> On Tue, Jan 11, 2022 at 1:28 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > If you're really quite concerned about m68k code size, I can probably

It's not just m68k. There exist ARM SoCs with 8 MiB builtin SRAM that
are used in products running Linux.

> > do some things to reduce that. For example, blake2s256_hmac is only
> > used by wireguard and it could probably be made local there. And with
> > some trivial loop re-rolling, I can shave off another 2300 bytes. And
> > I bet I can find a few other things too. The question is: how
> > important is this to you?
>
> And with another trick (see below), another extra 1000 bytes or so
> shaved off. Aside from moving blake2s256_hmac, I'm not really super
> enthusiastic about making these changes, but depending on how important
> this is to you, maybe we can make something work. There are probably
> additional possibilities too with the code.

Cool, much more than 1000 bytes:

add/remove: 1/0 grow/shrink: 0/1 up/down: 160/-4032 (-3872)
Function                                     old     new   delta
blake2s_sigma                                  -     160    +160
blake2s_compress_generic                    4448     416   -4032
Total: Before=4227876, After=4224004, chg -0.09%

I don't know what the impact is on performance, and if the compiler
might do a good job unrolling this again when performance matters
(i.e. if CONFIG_CC_OPTIMIZE_FOR_SIZE is not set).

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 12:57         ` Geert Uytterhoeven
@ 2022-01-11 13:00           ` Jason A. Donenfeld
  0 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 13:00 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH, Jean-Philippe Aumasson

On Tue, Jan 11, 2022 at 1:58 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> Cool, much more than 1000 bytes:
>
> add/remove: 1/0 grow/shrink: 0/1 up/down: 160/-4032 (-3872)
> Function                                     old     new   delta
> blake2s_sigma                                  -     160    +160
> blake2s_compress_generic                    4448     416   -4032
> Total: Before=4227876, After=4224004, chg -0.09%
>
> I don't know what the impact is on performance, and if the compiler
> might do a good job unrolling this again when performance matters
> (i.e. if CONFIG_CC_OPTIMIZE_FOR_SIZE is not set).

Oh, wow, that's a lot indeed.

I guess the new code could be ifdef'd in a CONFIG_CC_OPTIMIZE_FOR_SIZE block?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 12:51       ` Geert Uytterhoeven
@ 2022-01-11 13:02         ` Jason A. Donenfeld
  2022-01-11 13:49           ` [PATCH crypto 0/2] smaller blake2s code size on m68k and other small platforms Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 13:02 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH,
	Jean-Philippe Aumasson, bpf, netdev

On Tue, Jan 11, 2022 at 1:51 PM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> On Tue, Jan 11, 2022 at 1:28 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > On Tue, Jan 11, 2022 at 12:38 PM Geert Uytterhoeven
> > <geert@linux-m68k.org> wrote:
> > > Unfortunately we cannot get rid of the sha1 code yet (lib/sha1.o is
> > > built-in unconditionally), as there are other users...
>
> kernel/bpf/core.c and net/ipv6/addrconf.c
> Could they be switched to blake2s, too?

I've brought this up before and the conclusion is that they probably
can't for compatibility reasons. No need to rehash that discussion
again. Alternatively, send a patch to netdev and see how it goes.

Anyway, we can follow up with the code size reduction patches we
discussed elsewhere in this thread.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH crypto 0/2] smaller blake2s code size on m68k and other small platforms
  2022-01-11 13:02         ` Jason A. Donenfeld
@ 2022-01-11 13:49           ` Jason A. Donenfeld
  2022-01-11 13:49             ` [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems Jason A. Donenfeld
                               ` (2 more replies)
  0 siblings, 3 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 13:49 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld

Hi,

Geert emailed me this afternoon concerned about blake2s codesize on m68k
and other small systems. We identified two extremely effective ways of
chopping down the size. One of them moves some wireguard-specific things
into wireguard proper. The other one adds a slower codepath for
CONFIG_CC_OPTIMIZE_FOR_SIZE configurations. I really don't like that
slower codepath, but since it is configuration gated, at least it stays
out of the way except for people who know they need a tiny kernel image

Thanks,
Jason

Jason A. Donenfeld (2):
  lib/crypto: blake2s-generic: reduce code size on small systems
  lib/crypto: blake2s: move hmac construction into wireguard

 drivers/net/wireguard/noise.c | 45 ++++++++++++++++++++++++++++++-----
 include/crypto/blake2s.h      |  3 ---
 lib/crypto/blake2s-generic.c  | 30 +++++++++++++----------
 lib/crypto/blake2s-selftest.c | 31 ------------------------
 lib/crypto/blake2s.c          | 37 ----------------------------
 5 files changed, 57 insertions(+), 89 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems
  2022-01-11 13:49           ` [PATCH crypto 0/2] smaller blake2s code size on m68k and other small platforms Jason A. Donenfeld
@ 2022-01-11 13:49             ` Jason A. Donenfeld
  2022-01-12 10:57               ` Geert Uytterhoeven
  2022-01-12 18:31               ` Eric Biggers
  2022-01-11 13:49             ` [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
  2022-01-11 18:10             ` [PATCH crypto v2 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2 siblings, 2 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 13:49 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld, Herbert Xu

Re-wind the loops entirely on kernels optimized for code size. This is
really not good at all performance-wise. But on m68k, it shaves off 4k
of code size, which is apparently important.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 lib/crypto/blake2s-generic.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 75ccb3e633e6..990f000e22ee 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -46,7 +46,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 {
 	u32 m[16];
 	u32 v[16];
-	int i;
+	int i, j;
 
 	WARN_ON(IS_ENABLED(DEBUG) &&
 		(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
@@ -86,17 +86,23 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
 	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
 	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
 } while (0)
-		ROUND(0);
-		ROUND(1);
-		ROUND(2);
-		ROUND(3);
-		ROUND(4);
-		ROUND(5);
-		ROUND(6);
-		ROUND(7);
-		ROUND(8);
-		ROUND(9);
-
+		if (IS_ENABLED(CONFIG_CC_OPTIMIZE_FOR_SIZE)) {
+			for (i = 0; i < 10; ++i) {
+				for (j = 0; j < 8; ++j)
+					G(i, j, v[j % 4], v[((j + (j / 4)) % 4) + 4], v[((j + 2 * (j / 4)) % 4) + 8], v[((j + 3 * (j / 4)) % 4) + 12]);
+			}
+		} else {
+			ROUND(0);
+			ROUND(1);
+			ROUND(2);
+			ROUND(3);
+			ROUND(4);
+			ROUND(5);
+			ROUND(6);
+			ROUND(7);
+			ROUND(8);
+			ROUND(9);
+		}
 #undef G
 #undef ROUND
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard
  2022-01-11 13:49           ` [PATCH crypto 0/2] smaller blake2s code size on m68k and other small platforms Jason A. Donenfeld
  2022-01-11 13:49             ` [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems Jason A. Donenfeld
@ 2022-01-11 13:49             ` Jason A. Donenfeld
  2022-01-11 14:43               ` Ard Biesheuvel
  2022-01-12 18:35               ` Eric Biggers
  2022-01-11 18:10             ` [PATCH crypto v2 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2 siblings, 2 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 13:49 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld, Herbert Xu

Basically nobody should use blake2s in an HMAC construction; it already
has a keyed variant. But for unfortunately historical reasons, Noise,
used by WireGuard, uses HKDF quite strictly, which means we have to use
this. Because this really shouldn't be used by others, this commit moves
it into wireguard's noise.c locally, so that kernels that aren't using
WireGuard don't get this superfluous code baked in. On m68k systems,
this shaves off ~314 bytes.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: netdev@vger.kernel.org
Cc: wireguard@lists.zx2c4.com
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/wireguard/noise.c | 45 ++++++++++++++++++++++++++++++-----
 include/crypto/blake2s.h      |  3 ---
 lib/crypto/blake2s-selftest.c | 31 ------------------------
 lib/crypto/blake2s.c          | 37 ----------------------------
 4 files changed, 39 insertions(+), 77 deletions(-)

diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c
index c0cfd9b36c0b..720952b92e78 100644
--- a/drivers/net/wireguard/noise.c
+++ b/drivers/net/wireguard/noise.c
@@ -302,6 +302,41 @@ void wg_noise_set_static_identity_private_key(
 		static_identity->static_public, private_key);
 }
 
+static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen)
+{
+	struct blake2s_state state;
+	u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
+	u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
+	int i;
+
+	if (keylen > BLAKE2S_BLOCK_SIZE) {
+		blake2s_init(&state, BLAKE2S_HASH_SIZE);
+		blake2s_update(&state, key, keylen);
+		blake2s_final(&state, x_key);
+	} else
+		memcpy(x_key, key, keylen);
+
+	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+		x_key[i] ^= 0x36;
+
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, in, inlen);
+	blake2s_final(&state, i_hash);
+
+	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+		x_key[i] ^= 0x5c ^ 0x36;
+
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
+	blake2s_final(&state, i_hash);
+
+	memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
+	memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
+	memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
+}
+
 /* This is Hugo Krawczyk's HKDF:
  *  - https://eprint.iacr.org/2010/264.pdf
  *  - https://tools.ietf.org/html/rfc5869
@@ -322,14 +357,14 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 		 ((third_len || third_dst) && (!second_len || !second_dst))));
 
 	/* Extract entropy from data into secret */
-	blake2s256_hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
+	hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
 
 	if (!first_dst || !first_len)
 		goto out;
 
 	/* Expand first key: key = secret, data = 0x1 */
 	output[0] = 1;
-	blake2s256_hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
 	memcpy(first_dst, output, first_len);
 
 	if (!second_dst || !second_len)
@@ -337,8 +372,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 
 	/* Expand second key: key = secret, data = first-key || 0x2 */
 	output[BLAKE2S_HASH_SIZE] = 2;
-	blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
-			BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
 	memcpy(second_dst, output, second_len);
 
 	if (!third_dst || !third_len)
@@ -346,8 +380,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 
 	/* Expand third key: key = secret, data = second-key || 0x3 */
 	output[BLAKE2S_HASH_SIZE] = 3;
-	blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
-			BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
 	memcpy(third_dst, output, third_len);
 
 out:
diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index bc3fb59442ce..4e30e1799e61 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -101,7 +101,4 @@ static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
 	blake2s_final(&state, out);
 }
 
-void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
-		     const size_t keylen);
-
 #endif /* _CRYPTO_BLAKE2S_H */
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
index 5d9ea53be973..409e4b728770 100644
--- a/lib/crypto/blake2s-selftest.c
+++ b/lib/crypto/blake2s-selftest.c
@@ -15,7 +15,6 @@
  * #include <stdio.h>
  *
  * #include <openssl/evp.h>
- * #include <openssl/hmac.h>
  *
  * #define BLAKE2S_TESTVEC_COUNT	256
  *
@@ -58,16 +57,6 @@
  *	}
  *	printf("};\n\n");
  *
- *	printf("static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
- *
- *	HMAC(EVP_blake2s256(), key, sizeof(key), buf, sizeof(buf), hash, NULL);
- *	print_vec(hash, BLAKE2S_OUTBYTES);
- *
- *	HMAC(EVP_blake2s256(), buf, sizeof(buf), key, sizeof(key), hash, NULL);
- *	print_vec(hash, BLAKE2S_OUTBYTES);
- *
- *	printf("};\n");
- *
  *	return 0;
  *}
  */
@@ -554,15 +543,6 @@ static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
     0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
 };
 
-static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
-  { 0xce, 0xe1, 0x57, 0x69, 0x82, 0xdc, 0xbf, 0x43, 0xad, 0x56, 0x4c, 0x70,
-    0xed, 0x68, 0x16, 0x96, 0xcf, 0xa4, 0x73, 0xe8, 0xe8, 0xfc, 0x32, 0x79,
-    0x08, 0x0a, 0x75, 0x82, 0xda, 0x3f, 0x05, 0x11, },
-  { 0x77, 0x2f, 0x0c, 0x71, 0x41, 0xf4, 0x4b, 0x2b, 0xb3, 0xc6, 0xb6, 0xf9,
-    0x60, 0xde, 0xe4, 0x52, 0x38, 0x66, 0xe8, 0xbf, 0x9b, 0x96, 0xc4, 0x9f,
-    0x60, 0xd9, 0x24, 0x37, 0x99, 0xd6, 0xec, 0x31, },
-};
-
 bool __init blake2s_selftest(void)
 {
 	u8 key[BLAKE2S_KEY_SIZE];
@@ -607,16 +587,5 @@ bool __init blake2s_selftest(void)
 		}
 	}
 
-	if (success) {
-		blake2s256_hmac(hash, buf, key, sizeof(buf), sizeof(key));
-		success &= !memcmp(hash, blake2s_hmac_testvecs[0], BLAKE2S_HASH_SIZE);
-
-		blake2s256_hmac(hash, key, buf, sizeof(key), sizeof(buf));
-		success &= !memcmp(hash, blake2s_hmac_testvecs[1], BLAKE2S_HASH_SIZE);
-
-		if (!success)
-			pr_err("blake2s256_hmac self-test: FAIL\n");
-	}
-
 	return success;
 }
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 93f2ae051370..9364f79937b8 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -30,43 +30,6 @@ void blake2s_final(struct blake2s_state *state, u8 *out)
 }
 EXPORT_SYMBOL(blake2s_final);
 
-void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
-		     const size_t keylen)
-{
-	struct blake2s_state state;
-	u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
-	u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
-	int i;
-
-	if (keylen > BLAKE2S_BLOCK_SIZE) {
-		blake2s_init(&state, BLAKE2S_HASH_SIZE);
-		blake2s_update(&state, key, keylen);
-		blake2s_final(&state, x_key);
-	} else
-		memcpy(x_key, key, keylen);
-
-	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
-		x_key[i] ^= 0x36;
-
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, in, inlen);
-	blake2s_final(&state, i_hash);
-
-	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
-		x_key[i] ^= 0x5c ^ 0x36;
-
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
-	blake2s_final(&state, i_hash);
-
-	memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
-	memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
-	memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
-}
-EXPORT_SYMBOL(blake2s256_hmac);
-
 static int __init blake2s_mod_init(void)
 {
 	if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard
  2022-01-11 13:49             ` [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
@ 2022-01-11 14:43               ` Ard Biesheuvel
  2022-01-12 18:35               ` Eric Biggers
  1 sibling, 0 replies; 61+ messages in thread
From: Ard Biesheuvel @ 2022-01-11 14:43 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Crypto Mailing List,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT),
	wireguard, Linux Kernel Mailing List,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT),
	Geert Uytterhoeven, Theodore Y. Ts'o, Greg Kroah-Hartman,
	Jean-Philippe Aumasson, Herbert Xu

On Tue, 11 Jan 2022 at 14:49, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Basically nobody should use blake2s in an HMAC construction; it already
> has a keyed variant. But for unfortunately historical reasons, Noise,

-ly

> used by WireGuard, uses HKDF quite strictly, which means we have to use
> this. Because this really shouldn't be used by others, this commit moves
> it into wireguard's noise.c locally, so that kernels that aren't using
> WireGuard don't get this superfluous code baked in. On m68k systems,
> this shaves off ~314 bytes.
>
> Cc: Geert Uytterhoeven <geert@linux-m68k.org>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: netdev@vger.kernel.org
> Cc: wireguard@lists.zx2c4.com
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

Acked-by: Ard Biesheuvel <ardb@kernel.org>

> ---
>  drivers/net/wireguard/noise.c | 45 ++++++++++++++++++++++++++++++-----
>  include/crypto/blake2s.h      |  3 ---
>  lib/crypto/blake2s-selftest.c | 31 ------------------------
>  lib/crypto/blake2s.c          | 37 ----------------------------
>  4 files changed, 39 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c
> index c0cfd9b36c0b..720952b92e78 100644
> --- a/drivers/net/wireguard/noise.c
> +++ b/drivers/net/wireguard/noise.c
> @@ -302,6 +302,41 @@ void wg_noise_set_static_identity_private_key(
>                 static_identity->static_public, private_key);
>  }
>
> +static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen)
> +{
> +       struct blake2s_state state;
> +       u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
> +       u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
> +       int i;
> +
> +       if (keylen > BLAKE2S_BLOCK_SIZE) {
> +               blake2s_init(&state, BLAKE2S_HASH_SIZE);
> +               blake2s_update(&state, key, keylen);
> +               blake2s_final(&state, x_key);
> +       } else
> +               memcpy(x_key, key, keylen);
> +
> +       for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
> +               x_key[i] ^= 0x36;
> +
> +       blake2s_init(&state, BLAKE2S_HASH_SIZE);
> +       blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
> +       blake2s_update(&state, in, inlen);
> +       blake2s_final(&state, i_hash);
> +
> +       for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
> +               x_key[i] ^= 0x5c ^ 0x36;
> +
> +       blake2s_init(&state, BLAKE2S_HASH_SIZE);
> +       blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
> +       blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
> +       blake2s_final(&state, i_hash);
> +
> +       memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
> +       memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
> +       memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
> +}
> +
>  /* This is Hugo Krawczyk's HKDF:
>   *  - https://eprint.iacr.org/2010/264.pdf
>   *  - https://tools.ietf.org/html/rfc5869
> @@ -322,14 +357,14 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
>                  ((third_len || third_dst) && (!second_len || !second_dst))));
>
>         /* Extract entropy from data into secret */
> -       blake2s256_hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
> +       hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
>
>         if (!first_dst || !first_len)
>                 goto out;
>
>         /* Expand first key: key = secret, data = 0x1 */
>         output[0] = 1;
> -       blake2s256_hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
> +       hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
>         memcpy(first_dst, output, first_len);
>
>         if (!second_dst || !second_len)
> @@ -337,8 +372,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
>
>         /* Expand second key: key = secret, data = first-key || 0x2 */
>         output[BLAKE2S_HASH_SIZE] = 2;
> -       blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
> -                       BLAKE2S_HASH_SIZE);
> +       hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
>         memcpy(second_dst, output, second_len);
>
>         if (!third_dst || !third_len)
> @@ -346,8 +380,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
>
>         /* Expand third key: key = secret, data = second-key || 0x3 */
>         output[BLAKE2S_HASH_SIZE] = 3;
> -       blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
> -                       BLAKE2S_HASH_SIZE);
> +       hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
>         memcpy(third_dst, output, third_len);
>
>  out:
> diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
> index bc3fb59442ce..4e30e1799e61 100644
> --- a/include/crypto/blake2s.h
> +++ b/include/crypto/blake2s.h
> @@ -101,7 +101,4 @@ static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
>         blake2s_final(&state, out);
>  }
>
> -void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
> -                    const size_t keylen);
> -
>  #endif /* _CRYPTO_BLAKE2S_H */
> diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
> index 5d9ea53be973..409e4b728770 100644
> --- a/lib/crypto/blake2s-selftest.c
> +++ b/lib/crypto/blake2s-selftest.c
> @@ -15,7 +15,6 @@
>   * #include <stdio.h>
>   *
>   * #include <openssl/evp.h>
> - * #include <openssl/hmac.h>
>   *
>   * #define BLAKE2S_TESTVEC_COUNT       256
>   *
> @@ -58,16 +57,6 @@
>   *     }
>   *     printf("};\n\n");
>   *
> - *     printf("static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
> - *
> - *     HMAC(EVP_blake2s256(), key, sizeof(key), buf, sizeof(buf), hash, NULL);
> - *     print_vec(hash, BLAKE2S_OUTBYTES);
> - *
> - *     HMAC(EVP_blake2s256(), buf, sizeof(buf), key, sizeof(key), hash, NULL);
> - *     print_vec(hash, BLAKE2S_OUTBYTES);
> - *
> - *     printf("};\n");
> - *
>   *     return 0;
>   *}
>   */
> @@ -554,15 +543,6 @@ static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
>      0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
>  };
>
> -static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
> -  { 0xce, 0xe1, 0x57, 0x69, 0x82, 0xdc, 0xbf, 0x43, 0xad, 0x56, 0x4c, 0x70,
> -    0xed, 0x68, 0x16, 0x96, 0xcf, 0xa4, 0x73, 0xe8, 0xe8, 0xfc, 0x32, 0x79,
> -    0x08, 0x0a, 0x75, 0x82, 0xda, 0x3f, 0x05, 0x11, },
> -  { 0x77, 0x2f, 0x0c, 0x71, 0x41, 0xf4, 0x4b, 0x2b, 0xb3, 0xc6, 0xb6, 0xf9,
> -    0x60, 0xde, 0xe4, 0x52, 0x38, 0x66, 0xe8, 0xbf, 0x9b, 0x96, 0xc4, 0x9f,
> -    0x60, 0xd9, 0x24, 0x37, 0x99, 0xd6, 0xec, 0x31, },
> -};
> -
>  bool __init blake2s_selftest(void)
>  {
>         u8 key[BLAKE2S_KEY_SIZE];
> @@ -607,16 +587,5 @@ bool __init blake2s_selftest(void)
>                 }
>         }
>
> -       if (success) {
> -               blake2s256_hmac(hash, buf, key, sizeof(buf), sizeof(key));
> -               success &= !memcmp(hash, blake2s_hmac_testvecs[0], BLAKE2S_HASH_SIZE);
> -
> -               blake2s256_hmac(hash, key, buf, sizeof(key), sizeof(buf));
> -               success &= !memcmp(hash, blake2s_hmac_testvecs[1], BLAKE2S_HASH_SIZE);
> -
> -               if (!success)
> -                       pr_err("blake2s256_hmac self-test: FAIL\n");
> -       }
> -
>         return success;
>  }
> diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
> index 93f2ae051370..9364f79937b8 100644
> --- a/lib/crypto/blake2s.c
> +++ b/lib/crypto/blake2s.c
> @@ -30,43 +30,6 @@ void blake2s_final(struct blake2s_state *state, u8 *out)
>  }
>  EXPORT_SYMBOL(blake2s_final);
>
> -void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
> -                    const size_t keylen)
> -{
> -       struct blake2s_state state;
> -       u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
> -       u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
> -       int i;
> -
> -       if (keylen > BLAKE2S_BLOCK_SIZE) {
> -               blake2s_init(&state, BLAKE2S_HASH_SIZE);
> -               blake2s_update(&state, key, keylen);
> -               blake2s_final(&state, x_key);
> -       } else
> -               memcpy(x_key, key, keylen);
> -
> -       for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
> -               x_key[i] ^= 0x36;
> -
> -       blake2s_init(&state, BLAKE2S_HASH_SIZE);
> -       blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
> -       blake2s_update(&state, in, inlen);
> -       blake2s_final(&state, i_hash);
> -
> -       for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
> -               x_key[i] ^= 0x5c ^ 0x36;
> -
> -       blake2s_init(&state, BLAKE2S_HASH_SIZE);
> -       blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
> -       blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
> -       blake2s_final(&state, i_hash);
> -
> -       memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
> -       memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
> -       memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
> -}
> -EXPORT_SYMBOL(blake2s256_hmac);
> -
>  static int __init blake2s_mod_init(void)
>  {
>         if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* RE: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 12:50       ` Jason A. Donenfeld
  2022-01-11 12:57         ` Geert Uytterhoeven
@ 2022-01-11 15:46         ` David Laight
  2022-01-11 18:26           ` Jason A. Donenfeld
  2022-01-14 17:27         ` David Laight
  2 siblings, 1 reply; 61+ messages in thread
From: David Laight @ 2022-01-11 15:46 UTC (permalink / raw)
  To: 'Jason A. Donenfeld', Geert Uytterhoeven
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH, Jean-Philippe Aumasson

From: Jason A. Donenfeld
> Sent: 11 January 2022 12:50
> 
> On Tue, Jan 11, 2022 at 1:28 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > If you're really quite concerned about m68k code size, I can probably
> > do some things to reduce that. For example, blake2s256_hmac is only
> > used by wireguard and it could probably be made local there. And with
> > some trivial loop re-rolling, I can shave off another 2300 bytes. And
> > I bet I can find a few other things too. The question is: how
> > important is this to you?
> 
> And with another trick (see below), another extra 1000 bytes or so
> shaved off. Aside from moving blake2s256_hmac, I'm not really super
> enthusiastic about making these changes, but depending on how important
> this is to you, maybe we can make something work. There are probably
> additional possibilities too with the code.
> 
> ====
> 
> diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
> index 75ccb3e633e6..8e3c6372363a 100644
> --- a/lib/crypto/blake2s-generic.c
> +++ b/lib/crypto/blake2s-generic.c
> @@ -46,7 +46,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
>  {
>  	u32 m[16];
>  	u32 v[16];
> -	int i;
> +	int i, j;

Use unsigned int i, j;
Ensures the '% 4' are done as '& 3' and the divides as shifts.
Unless the compiler manages to track the valid values that will
even generate better code on x86-64.
(Saves a sign extension prior to the array indexes.)

>  	WARN_ON(IS_ENABLED(DEBUG) &&
>  		(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
> @@ -76,29 +76,11 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
>  	b = ror32(b ^ c, 7); \
>  } while (0)
> 
> -#define ROUND(r) do { \
> -	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
> -	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
> -	G(r, 2, v[2], v[ 6], v[10], v[14]); \
> -	G(r, 3, v[3], v[ 7], v[11], v[15]); \
> -	G(r, 4, v[0], v[ 5], v[10], v[15]); \
> -	G(r, 5, v[1], v[ 6], v[11], v[12]); \
> -	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
> -	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
> -} while (0)
> -		ROUND(0);
> -		ROUND(1);
> -		ROUND(2);
> -		ROUND(3);
> -		ROUND(4);
> -		ROUND(5);
> -		ROUND(6);
> -		ROUND(7);
> -		ROUND(8);
> -		ROUND(9);
> -
> +		for (i = 0; i < 10; ++i) {
> +			for (j = 0; j < 8; ++j)
> +				G(i, j, v[j % 4], v[((j + (j / 4)) % 4) + 4], v[((j + 2 * (j / 4)) % 4) + 8],
> v[((j + 3 * (j / 4)) % 4) + 12]);

I think I'd look at doing [0..3] then [4..7] to save execution time.

I actually wonder how large a block you need to be processing to get
a gain from all that unrolling on architectures like x86-64.
The 'cold cache' timing must be horrid.
Never mind the side effects of displacing that much other code from the I-cache.

There aren't enough registers to hold all the v[] values so they'll
all be memory reads - and there are probably others as well.
The other instructions will happen in parallel - even 3 or 4 for each memory read.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH crypto v2 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-11 13:49           ` [PATCH crypto 0/2] smaller blake2s code size on m68k and other small platforms Jason A. Donenfeld
  2022-01-11 13:49             ` [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems Jason A. Donenfeld
  2022-01-11 13:49             ` [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
@ 2022-01-11 18:10             ` Jason A. Donenfeld
  2022-01-11 18:10               ` [PATCH crypto v2 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
                                 ` (2 more replies)
  2 siblings, 3 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 18:10 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld

Hi,

Geert emailed me this afternoon concerned about blake2s codesize on m68k
and other small systems. We identified two effective ways of chopping
down the size. One of them moves some wireguard-specific things into
wireguard proper. The other one adds a slower codepath for small
machines to blake2s. This worked, and was v1 of this patchset, but I
wasn't so much of a fan. Then someone pointed out that the generic C
SHA-1 implementation is still unrolled, which is a *lot* of extra code.
Simply rerolling that saves about as much as v1 did. So, we instead do
that in this v2 patchset. SHA-1 is being phased out, and soon it won't
be included at all (hopefully). And nothing performance-oriented has
anything to do with it anyway.

The result of these two patches mitigates Geert's feared code size
increase for 5.17.

Thanks,
Jason


Jason A. Donenfeld (2):
  lib/crypto: blake2s: move hmac construction into wireguard
  lib/crypto: sha1: re-roll loops to reduce code size

 drivers/net/wireguard/noise.c |  45 +++++++++++--
 include/crypto/blake2s.h      |   3 -
 lib/crypto/blake2s-selftest.c |  31 ---------
 lib/crypto/blake2s.c          |  37 -----------
 lib/sha1.c                    | 117 ++++++++--------------------------
 5 files changed, 64 insertions(+), 169 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH crypto v2 1/2] lib/crypto: blake2s: move hmac construction into wireguard
  2022-01-11 18:10             ` [PATCH crypto v2 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
@ 2022-01-11 18:10               ` Jason A. Donenfeld
  2022-01-11 18:10               ` [PATCH crypto v2 2/2] lib/crypto: sha1: re-roll loops to reduce code size Jason A. Donenfeld
  2022-01-11 22:05               ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 18:10 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld, Herbert Xu

Basically nobody should use blake2s in an HMAC construction; it already
has a keyed variant. But unfortunately for historical reasons, Noise,
used by WireGuard, uses HKDF quite strictly, which means we have to use
this. Because this really shouldn't be used by others, this commit moves
it into wireguard's noise.c locally, so that kernels that aren't using
WireGuard don't get this superfluous code baked in. On m68k systems,
this shaves off ~314 bytes.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/wireguard/noise.c | 45 ++++++++++++++++++++++++++++++-----
 include/crypto/blake2s.h      |  3 ---
 lib/crypto/blake2s-selftest.c | 31 ------------------------
 lib/crypto/blake2s.c          | 37 ----------------------------
 4 files changed, 39 insertions(+), 77 deletions(-)

diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c
index c0cfd9b36c0b..720952b92e78 100644
--- a/drivers/net/wireguard/noise.c
+++ b/drivers/net/wireguard/noise.c
@@ -302,6 +302,41 @@ void wg_noise_set_static_identity_private_key(
 		static_identity->static_public, private_key);
 }
 
+static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen)
+{
+	struct blake2s_state state;
+	u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
+	u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
+	int i;
+
+	if (keylen > BLAKE2S_BLOCK_SIZE) {
+		blake2s_init(&state, BLAKE2S_HASH_SIZE);
+		blake2s_update(&state, key, keylen);
+		blake2s_final(&state, x_key);
+	} else
+		memcpy(x_key, key, keylen);
+
+	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+		x_key[i] ^= 0x36;
+
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, in, inlen);
+	blake2s_final(&state, i_hash);
+
+	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+		x_key[i] ^= 0x5c ^ 0x36;
+
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
+	blake2s_final(&state, i_hash);
+
+	memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
+	memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
+	memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
+}
+
 /* This is Hugo Krawczyk's HKDF:
  *  - https://eprint.iacr.org/2010/264.pdf
  *  - https://tools.ietf.org/html/rfc5869
@@ -322,14 +357,14 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 		 ((third_len || third_dst) && (!second_len || !second_dst))));
 
 	/* Extract entropy from data into secret */
-	blake2s256_hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
+	hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
 
 	if (!first_dst || !first_len)
 		goto out;
 
 	/* Expand first key: key = secret, data = 0x1 */
 	output[0] = 1;
-	blake2s256_hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
 	memcpy(first_dst, output, first_len);
 
 	if (!second_dst || !second_len)
@@ -337,8 +372,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 
 	/* Expand second key: key = secret, data = first-key || 0x2 */
 	output[BLAKE2S_HASH_SIZE] = 2;
-	blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
-			BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
 	memcpy(second_dst, output, second_len);
 
 	if (!third_dst || !third_len)
@@ -346,8 +380,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 
 	/* Expand third key: key = secret, data = second-key || 0x3 */
 	output[BLAKE2S_HASH_SIZE] = 3;
-	blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
-			BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
 	memcpy(third_dst, output, third_len);
 
 out:
diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index bc3fb59442ce..4e30e1799e61 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -101,7 +101,4 @@ static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
 	blake2s_final(&state, out);
 }
 
-void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
-		     const size_t keylen);
-
 #endif /* _CRYPTO_BLAKE2S_H */
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
index 5d9ea53be973..409e4b728770 100644
--- a/lib/crypto/blake2s-selftest.c
+++ b/lib/crypto/blake2s-selftest.c
@@ -15,7 +15,6 @@
  * #include <stdio.h>
  *
  * #include <openssl/evp.h>
- * #include <openssl/hmac.h>
  *
  * #define BLAKE2S_TESTVEC_COUNT	256
  *
@@ -58,16 +57,6 @@
  *	}
  *	printf("};\n\n");
  *
- *	printf("static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
- *
- *	HMAC(EVP_blake2s256(), key, sizeof(key), buf, sizeof(buf), hash, NULL);
- *	print_vec(hash, BLAKE2S_OUTBYTES);
- *
- *	HMAC(EVP_blake2s256(), buf, sizeof(buf), key, sizeof(key), hash, NULL);
- *	print_vec(hash, BLAKE2S_OUTBYTES);
- *
- *	printf("};\n");
- *
  *	return 0;
  *}
  */
@@ -554,15 +543,6 @@ static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
     0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
 };
 
-static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
-  { 0xce, 0xe1, 0x57, 0x69, 0x82, 0xdc, 0xbf, 0x43, 0xad, 0x56, 0x4c, 0x70,
-    0xed, 0x68, 0x16, 0x96, 0xcf, 0xa4, 0x73, 0xe8, 0xe8, 0xfc, 0x32, 0x79,
-    0x08, 0x0a, 0x75, 0x82, 0xda, 0x3f, 0x05, 0x11, },
-  { 0x77, 0x2f, 0x0c, 0x71, 0x41, 0xf4, 0x4b, 0x2b, 0xb3, 0xc6, 0xb6, 0xf9,
-    0x60, 0xde, 0xe4, 0x52, 0x38, 0x66, 0xe8, 0xbf, 0x9b, 0x96, 0xc4, 0x9f,
-    0x60, 0xd9, 0x24, 0x37, 0x99, 0xd6, 0xec, 0x31, },
-};
-
 bool __init blake2s_selftest(void)
 {
 	u8 key[BLAKE2S_KEY_SIZE];
@@ -607,16 +587,5 @@ bool __init blake2s_selftest(void)
 		}
 	}
 
-	if (success) {
-		blake2s256_hmac(hash, buf, key, sizeof(buf), sizeof(key));
-		success &= !memcmp(hash, blake2s_hmac_testvecs[0], BLAKE2S_HASH_SIZE);
-
-		blake2s256_hmac(hash, key, buf, sizeof(key), sizeof(buf));
-		success &= !memcmp(hash, blake2s_hmac_testvecs[1], BLAKE2S_HASH_SIZE);
-
-		if (!success)
-			pr_err("blake2s256_hmac self-test: FAIL\n");
-	}
-
 	return success;
 }
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 93f2ae051370..9364f79937b8 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -30,43 +30,6 @@ void blake2s_final(struct blake2s_state *state, u8 *out)
 }
 EXPORT_SYMBOL(blake2s_final);
 
-void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
-		     const size_t keylen)
-{
-	struct blake2s_state state;
-	u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
-	u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
-	int i;
-
-	if (keylen > BLAKE2S_BLOCK_SIZE) {
-		blake2s_init(&state, BLAKE2S_HASH_SIZE);
-		blake2s_update(&state, key, keylen);
-		blake2s_final(&state, x_key);
-	} else
-		memcpy(x_key, key, keylen);
-
-	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
-		x_key[i] ^= 0x36;
-
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, in, inlen);
-	blake2s_final(&state, i_hash);
-
-	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
-		x_key[i] ^= 0x5c ^ 0x36;
-
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
-	blake2s_final(&state, i_hash);
-
-	memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
-	memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
-	memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
-}
-EXPORT_SYMBOL(blake2s256_hmac);
-
 static int __init blake2s_mod_init(void)
 {
 	if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH crypto v2 2/2] lib/crypto: sha1: re-roll loops to reduce code size
  2022-01-11 18:10             ` [PATCH crypto v2 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2022-01-11 18:10               ` [PATCH crypto v2 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
@ 2022-01-11 18:10               ` Jason A. Donenfeld
  2022-01-11 22:05               ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 18:10 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld, Herbert Xu

With SHA-1 no longer being used for anything performance oriented, and
also soon to be phased out entirely, we can make up for the space added
by unrolled BLAKE2s by simply re-rolling SHA-1. Since SHA-1 is so much
more complex, re-rolling it more or less takes care of the code size
added by BLAKE2s. And eventually, hopefully we'll see SHA-1 removed
entirely from most small kernel builds.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 lib/sha1.c | 117 ++++++++++++-----------------------------------------
 1 file changed, 25 insertions(+), 92 deletions(-)

diff --git a/lib/sha1.c b/lib/sha1.c
index 9bd1935a1472..f2acfa294e64 100644
--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/bitops.h>
+#include <linux/string.h>
 #include <crypto/sha1.h>
 #include <asm/unaligned.h>
 
@@ -83,109 +84,41 @@
  */
 void sha1_transform(__u32 *digest, const char *data, __u32 *array)
 {
-	__u32 A, B, C, D, E;
+	u32 d[5];
+	unsigned int i = 0;
 
-	A = digest[0];
-	B = digest[1];
-	C = digest[2];
-	D = digest[3];
-	E = digest[4];
+	memcpy(d, digest, sizeof(d));
 
 	/* Round 1 - iterations 0-16 take their input from 'data' */
-	T_0_15( 0, A, B, C, D, E);
-	T_0_15( 1, E, A, B, C, D);
-	T_0_15( 2, D, E, A, B, C);
-	T_0_15( 3, C, D, E, A, B);
-	T_0_15( 4, B, C, D, E, A);
-	T_0_15( 5, A, B, C, D, E);
-	T_0_15( 6, E, A, B, C, D);
-	T_0_15( 7, D, E, A, B, C);
-	T_0_15( 8, C, D, E, A, B);
-	T_0_15( 9, B, C, D, E, A);
-	T_0_15(10, A, B, C, D, E);
-	T_0_15(11, E, A, B, C, D);
-	T_0_15(12, D, E, A, B, C);
-	T_0_15(13, C, D, E, A, B);
-	T_0_15(14, B, C, D, E, A);
-	T_0_15(15, A, B, C, D, E);
+	for (; i < 16; ++i)
+		T_0_15(i, d[(-6 - i) % 5], d[(-5 - i) % 5],
+		       d[(-4 - i) % 5], d[(-3 - i) % 5], d[(-2 - i) % 5]);
 
 	/* Round 1 - tail. Input from 512-bit mixing array */
-	T_16_19(16, E, A, B, C, D);
-	T_16_19(17, D, E, A, B, C);
-	T_16_19(18, C, D, E, A, B);
-	T_16_19(19, B, C, D, E, A);
+	for (; i < 20; ++i)
+		T_16_19(i, d[(-6 - i) % 5], d[(-5 - i) % 5],
+			d[(-4 - i) % 5], d[(-3 - i) % 5], d[(-2 - i) % 5]);
 
 	/* Round 2 */
-	T_20_39(20, A, B, C, D, E);
-	T_20_39(21, E, A, B, C, D);
-	T_20_39(22, D, E, A, B, C);
-	T_20_39(23, C, D, E, A, B);
-	T_20_39(24, B, C, D, E, A);
-	T_20_39(25, A, B, C, D, E);
-	T_20_39(26, E, A, B, C, D);
-	T_20_39(27, D, E, A, B, C);
-	T_20_39(28, C, D, E, A, B);
-	T_20_39(29, B, C, D, E, A);
-	T_20_39(30, A, B, C, D, E);
-	T_20_39(31, E, A, B, C, D);
-	T_20_39(32, D, E, A, B, C);
-	T_20_39(33, C, D, E, A, B);
-	T_20_39(34, B, C, D, E, A);
-	T_20_39(35, A, B, C, D, E);
-	T_20_39(36, E, A, B, C, D);
-	T_20_39(37, D, E, A, B, C);
-	T_20_39(38, C, D, E, A, B);
-	T_20_39(39, B, C, D, E, A);
+	for (; i < 40; ++i)
+		T_20_39(i, d[(-6 - i) % 5], d[(-5 - i) % 5],
+			d[(-4 - i) % 5], d[(-3 - i) % 5], d[(-2 - i) % 5]);
 
 	/* Round 3 */
-	T_40_59(40, A, B, C, D, E);
-	T_40_59(41, E, A, B, C, D);
-	T_40_59(42, D, E, A, B, C);
-	T_40_59(43, C, D, E, A, B);
-	T_40_59(44, B, C, D, E, A);
-	T_40_59(45, A, B, C, D, E);
-	T_40_59(46, E, A, B, C, D);
-	T_40_59(47, D, E, A, B, C);
-	T_40_59(48, C, D, E, A, B);
-	T_40_59(49, B, C, D, E, A);
-	T_40_59(50, A, B, C, D, E);
-	T_40_59(51, E, A, B, C, D);
-	T_40_59(52, D, E, A, B, C);
-	T_40_59(53, C, D, E, A, B);
-	T_40_59(54, B, C, D, E, A);
-	T_40_59(55, A, B, C, D, E);
-	T_40_59(56, E, A, B, C, D);
-	T_40_59(57, D, E, A, B, C);
-	T_40_59(58, C, D, E, A, B);
-	T_40_59(59, B, C, D, E, A);
+	for (; i < 60; ++i)
+		T_40_59(i, d[(-6 - i) % 5], d[(-5 - i) % 5],
+			d[(-4 - i) % 5], d[(-3 - i) % 5], d[(-2 - i) % 5]);
 
 	/* Round 4 */
-	T_60_79(60, A, B, C, D, E);
-	T_60_79(61, E, A, B, C, D);
-	T_60_79(62, D, E, A, B, C);
-	T_60_79(63, C, D, E, A, B);
-	T_60_79(64, B, C, D, E, A);
-	T_60_79(65, A, B, C, D, E);
-	T_60_79(66, E, A, B, C, D);
-	T_60_79(67, D, E, A, B, C);
-	T_60_79(68, C, D, E, A, B);
-	T_60_79(69, B, C, D, E, A);
-	T_60_79(70, A, B, C, D, E);
-	T_60_79(71, E, A, B, C, D);
-	T_60_79(72, D, E, A, B, C);
-	T_60_79(73, C, D, E, A, B);
-	T_60_79(74, B, C, D, E, A);
-	T_60_79(75, A, B, C, D, E);
-	T_60_79(76, E, A, B, C, D);
-	T_60_79(77, D, E, A, B, C);
-	T_60_79(78, C, D, E, A, B);
-	T_60_79(79, B, C, D, E, A);
-
-	digest[0] += A;
-	digest[1] += B;
-	digest[2] += C;
-	digest[3] += D;
-	digest[4] += E;
+	for (; i < 80; ++i)
+		T_60_79(i, d[(-6 - i) % 5], d[(-5 - i) % 5],
+			d[(-4 - i) % 5], d[(-3 - i) % 5], d[(-2 - i) % 5]);
+
+	digest[0] += d[0];
+	digest[1] += d[1];
+	digest[2] += d[2];
+	digest[3] += d[3];
+	digest[4] += d[4];
 }
 EXPORT_SYMBOL(sha1_transform);
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 15:46         ` David Laight
@ 2022-01-11 18:26           ` Jason A. Donenfeld
  0 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 18:26 UTC (permalink / raw)
  To: David Laight
  Cc: Geert Uytterhoeven, Linux Kernel Mailing List, Theodore Tso,
	Greg KH, Jean-Philippe Aumasson

On Tue, Jan 11, 2022 at 4:47 PM David Laight <David.Laight@aculab.com> wrote:
> > -     int i;
> > +     int i, j;
>
> Use unsigned int i, j;
> Ensures the '% 4' are done as '& 3' and the divides as shifts.
> Unless the compiler manages to track the valid values that will
> even generate better code on x86-64.
> (Saves a sign extension prior to the array indexes.)

Ack.

> I think I'd look at doing [0..3] then [4..7] to save execution time.

I actually wound up making the same change to sha1 instead of blake2s
for v2 of this, and achieved pretty similar results, but I think
that's more satisfactory of a conclusion. v2 is here:
https://lore.kernel.org/linux-crypto/20220111181037.632969-1-Jason@zx2c4.com/T/#u

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-11 18:10             ` [PATCH crypto v2 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2022-01-11 18:10               ` [PATCH crypto v2 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
  2022-01-11 18:10               ` [PATCH crypto v2 2/2] lib/crypto: sha1: re-roll loops to reduce code size Jason A. Donenfeld
@ 2022-01-11 22:05               ` Jason A. Donenfeld
  2022-01-11 22:05                 ` [PATCH crypto v3 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
                                   ` (2 more replies)
  2 siblings, 3 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 22:05 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld

Hi,

Geert emailed me this afternoon concerned about blake2s codesize on m68k
and other small systems. We identified two effective ways of chopping
down the size. One of them moves some wireguard-specific things into
wireguard proper. The other one adds a slower codepath for small
machines to blake2s. This worked, and was v1 of this patchset, but I
wasn't so much of a fan. Then someone pointed out that the generic C
SHA-1 implementation is still unrolled, which is a *lot* of extra code.
Simply rerolling that saves about as much as v1 did. So, we instead do
that in this patchset. SHA-1 is being phased out, and soon it won't
be included at all (hopefully). And nothing performance-oriented has
anything to do with it anyway.

The result of these two patches mitigates Geert's feared code size
increase for 5.17.

v3 improves on v2 by making the re-rolling of SHA-1 much simpler,
resulting in even larger code size reduction and much better
performance. The reason I'm sending yet a third version in such a short
amount of time is because the trick here feels obvious and substantial
enough that I'd hate for Geert to waste time measuring the impact of the
previous commit.

Thanks,
Jason

Jason A. Donenfeld (2):
  lib/crypto: blake2s: move hmac construction into wireguard
  lib/crypto: sha1: re-roll loops to reduce code size

 drivers/net/wireguard/noise.c | 45 ++++++++++++++---
 include/crypto/blake2s.h      |  3 --
 lib/crypto/blake2s-selftest.c | 31 ------------
 lib/crypto/blake2s.c          | 37 --------------
 lib/sha1.c                    | 95 ++++++-----------------------------
 5 files changed, 53 insertions(+), 158 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH crypto v3 1/2] lib/crypto: blake2s: move hmac construction into wireguard
  2022-01-11 22:05               ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
@ 2022-01-11 22:05                 ` Jason A. Donenfeld
  2022-01-11 22:05                 ` [PATCH crypto v3 2/2] lib/crypto: sha1: re-roll loops to reduce code size Jason A. Donenfeld
  2022-01-12 10:59                 ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Geert Uytterhoeven
  2 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 22:05 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld, Herbert Xu

Basically nobody should use blake2s in an HMAC construction; it already
has a keyed variant. But unfortunately for historical reasons, Noise,
used by WireGuard, uses HKDF quite strictly, which means we have to use
this. Because this really shouldn't be used by others, this commit moves
it into wireguard's noise.c locally, so that kernels that aren't using
WireGuard don't get this superfluous code baked in. On m68k systems,
this shaves off ~314 bytes.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/wireguard/noise.c | 45 ++++++++++++++++++++++++++++++-----
 include/crypto/blake2s.h      |  3 ---
 lib/crypto/blake2s-selftest.c | 31 ------------------------
 lib/crypto/blake2s.c          | 37 ----------------------------
 4 files changed, 39 insertions(+), 77 deletions(-)

diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c
index c0cfd9b36c0b..720952b92e78 100644
--- a/drivers/net/wireguard/noise.c
+++ b/drivers/net/wireguard/noise.c
@@ -302,6 +302,41 @@ void wg_noise_set_static_identity_private_key(
 		static_identity->static_public, private_key);
 }
 
+static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen)
+{
+	struct blake2s_state state;
+	u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
+	u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
+	int i;
+
+	if (keylen > BLAKE2S_BLOCK_SIZE) {
+		blake2s_init(&state, BLAKE2S_HASH_SIZE);
+		blake2s_update(&state, key, keylen);
+		blake2s_final(&state, x_key);
+	} else
+		memcpy(x_key, key, keylen);
+
+	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+		x_key[i] ^= 0x36;
+
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, in, inlen);
+	blake2s_final(&state, i_hash);
+
+	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+		x_key[i] ^= 0x5c ^ 0x36;
+
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
+	blake2s_final(&state, i_hash);
+
+	memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
+	memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
+	memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
+}
+
 /* This is Hugo Krawczyk's HKDF:
  *  - https://eprint.iacr.org/2010/264.pdf
  *  - https://tools.ietf.org/html/rfc5869
@@ -322,14 +357,14 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 		 ((third_len || third_dst) && (!second_len || !second_dst))));
 
 	/* Extract entropy from data into secret */
-	blake2s256_hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
+	hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
 
 	if (!first_dst || !first_len)
 		goto out;
 
 	/* Expand first key: key = secret, data = 0x1 */
 	output[0] = 1;
-	blake2s256_hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
 	memcpy(first_dst, output, first_len);
 
 	if (!second_dst || !second_len)
@@ -337,8 +372,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 
 	/* Expand second key: key = secret, data = first-key || 0x2 */
 	output[BLAKE2S_HASH_SIZE] = 2;
-	blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
-			BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
 	memcpy(second_dst, output, second_len);
 
 	if (!third_dst || !third_len)
@@ -346,8 +380,7 @@ static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
 
 	/* Expand third key: key = secret, data = second-key || 0x3 */
 	output[BLAKE2S_HASH_SIZE] = 3;
-	blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
-			BLAKE2S_HASH_SIZE);
+	hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1, BLAKE2S_HASH_SIZE);
 	memcpy(third_dst, output, third_len);
 
 out:
diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index bc3fb59442ce..4e30e1799e61 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -101,7 +101,4 @@ static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
 	blake2s_final(&state, out);
 }
 
-void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
-		     const size_t keylen);
-
 #endif /* _CRYPTO_BLAKE2S_H */
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
index 5d9ea53be973..409e4b728770 100644
--- a/lib/crypto/blake2s-selftest.c
+++ b/lib/crypto/blake2s-selftest.c
@@ -15,7 +15,6 @@
  * #include <stdio.h>
  *
  * #include <openssl/evp.h>
- * #include <openssl/hmac.h>
  *
  * #define BLAKE2S_TESTVEC_COUNT	256
  *
@@ -58,16 +57,6 @@
  *	}
  *	printf("};\n\n");
  *
- *	printf("static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
- *
- *	HMAC(EVP_blake2s256(), key, sizeof(key), buf, sizeof(buf), hash, NULL);
- *	print_vec(hash, BLAKE2S_OUTBYTES);
- *
- *	HMAC(EVP_blake2s256(), buf, sizeof(buf), key, sizeof(key), hash, NULL);
- *	print_vec(hash, BLAKE2S_OUTBYTES);
- *
- *	printf("};\n");
- *
  *	return 0;
  *}
  */
@@ -554,15 +543,6 @@ static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
     0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
 };
 
-static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
-  { 0xce, 0xe1, 0x57, 0x69, 0x82, 0xdc, 0xbf, 0x43, 0xad, 0x56, 0x4c, 0x70,
-    0xed, 0x68, 0x16, 0x96, 0xcf, 0xa4, 0x73, 0xe8, 0xe8, 0xfc, 0x32, 0x79,
-    0x08, 0x0a, 0x75, 0x82, 0xda, 0x3f, 0x05, 0x11, },
-  { 0x77, 0x2f, 0x0c, 0x71, 0x41, 0xf4, 0x4b, 0x2b, 0xb3, 0xc6, 0xb6, 0xf9,
-    0x60, 0xde, 0xe4, 0x52, 0x38, 0x66, 0xe8, 0xbf, 0x9b, 0x96, 0xc4, 0x9f,
-    0x60, 0xd9, 0x24, 0x37, 0x99, 0xd6, 0xec, 0x31, },
-};
-
 bool __init blake2s_selftest(void)
 {
 	u8 key[BLAKE2S_KEY_SIZE];
@@ -607,16 +587,5 @@ bool __init blake2s_selftest(void)
 		}
 	}
 
-	if (success) {
-		blake2s256_hmac(hash, buf, key, sizeof(buf), sizeof(key));
-		success &= !memcmp(hash, blake2s_hmac_testvecs[0], BLAKE2S_HASH_SIZE);
-
-		blake2s256_hmac(hash, key, buf, sizeof(key), sizeof(buf));
-		success &= !memcmp(hash, blake2s_hmac_testvecs[1], BLAKE2S_HASH_SIZE);
-
-		if (!success)
-			pr_err("blake2s256_hmac self-test: FAIL\n");
-	}
-
 	return success;
 }
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 93f2ae051370..9364f79937b8 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -30,43 +30,6 @@ void blake2s_final(struct blake2s_state *state, u8 *out)
 }
 EXPORT_SYMBOL(blake2s_final);
 
-void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
-		     const size_t keylen)
-{
-	struct blake2s_state state;
-	u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
-	u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
-	int i;
-
-	if (keylen > BLAKE2S_BLOCK_SIZE) {
-		blake2s_init(&state, BLAKE2S_HASH_SIZE);
-		blake2s_update(&state, key, keylen);
-		blake2s_final(&state, x_key);
-	} else
-		memcpy(x_key, key, keylen);
-
-	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
-		x_key[i] ^= 0x36;
-
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, in, inlen);
-	blake2s_final(&state, i_hash);
-
-	for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
-		x_key[i] ^= 0x5c ^ 0x36;
-
-	blake2s_init(&state, BLAKE2S_HASH_SIZE);
-	blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
-	blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
-	blake2s_final(&state, i_hash);
-
-	memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
-	memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
-	memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
-}
-EXPORT_SYMBOL(blake2s256_hmac);
-
 static int __init blake2s_mod_init(void)
 {
 	if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH crypto v3 2/2] lib/crypto: sha1: re-roll loops to reduce code size
  2022-01-11 22:05               ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2022-01-11 22:05                 ` [PATCH crypto v3 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
@ 2022-01-11 22:05                 ` Jason A. Donenfeld
  2022-01-12 10:59                 ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Geert Uytterhoeven
  2 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-11 22:05 UTC (permalink / raw)
  To: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb
  Cc: Jason A. Donenfeld, Herbert Xu

With SHA-1 no longer being used for anything performance oriented, and
also soon to be phased out entirely, we can make up for the space added
by unrolled BLAKE2s by simply re-rolling SHA-1. Since SHA-1 is so much
more complex, re-rolling it more or less takes care of the code size
added by BLAKE2s. And eventually, hopefully we'll see SHA-1 removed
entirely from most small kernel builds.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 lib/sha1.c | 95 ++++++++----------------------------------------------
 1 file changed, 14 insertions(+), 81 deletions(-)

diff --git a/lib/sha1.c b/lib/sha1.c
index 9bd1935a1472..0494766fc574 100644
--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/bitops.h>
+#include <linux/string.h>
 #include <crypto/sha1.h>
 #include <asm/unaligned.h>
 
@@ -55,7 +56,8 @@
 #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
 	__u32 TEMP = input(t); setW(t, TEMP); \
 	E += TEMP + rol32(A,5) + (fn) + (constant); \
-	B = ror32(B, 2); } while (0)
+	B = ror32(B, 2); \
+	TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0)
 
 #define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
 #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
@@ -84,6 +86,7 @@
 void sha1_transform(__u32 *digest, const char *data, __u32 *array)
 {
 	__u32 A, B, C, D, E;
+	unsigned int i = 0;
 
 	A = digest[0];
 	B = digest[1];
@@ -92,94 +95,24 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array)
 	E = digest[4];
 
 	/* Round 1 - iterations 0-16 take their input from 'data' */
-	T_0_15( 0, A, B, C, D, E);
-	T_0_15( 1, E, A, B, C, D);
-	T_0_15( 2, D, E, A, B, C);
-	T_0_15( 3, C, D, E, A, B);
-	T_0_15( 4, B, C, D, E, A);
-	T_0_15( 5, A, B, C, D, E);
-	T_0_15( 6, E, A, B, C, D);
-	T_0_15( 7, D, E, A, B, C);
-	T_0_15( 8, C, D, E, A, B);
-	T_0_15( 9, B, C, D, E, A);
-	T_0_15(10, A, B, C, D, E);
-	T_0_15(11, E, A, B, C, D);
-	T_0_15(12, D, E, A, B, C);
-	T_0_15(13, C, D, E, A, B);
-	T_0_15(14, B, C, D, E, A);
-	T_0_15(15, A, B, C, D, E);
+	for (; i < 16; ++i)
+		T_0_15(i, A, B, C, D, E);
 
 	/* Round 1 - tail. Input from 512-bit mixing array */
-	T_16_19(16, E, A, B, C, D);
-	T_16_19(17, D, E, A, B, C);
-	T_16_19(18, C, D, E, A, B);
-	T_16_19(19, B, C, D, E, A);
+	for (; i < 20; ++i)
+		T_16_19(i, A, B, C, D, E);
 
 	/* Round 2 */
-	T_20_39(20, A, B, C, D, E);
-	T_20_39(21, E, A, B, C, D);
-	T_20_39(22, D, E, A, B, C);
-	T_20_39(23, C, D, E, A, B);
-	T_20_39(24, B, C, D, E, A);
-	T_20_39(25, A, B, C, D, E);
-	T_20_39(26, E, A, B, C, D);
-	T_20_39(27, D, E, A, B, C);
-	T_20_39(28, C, D, E, A, B);
-	T_20_39(29, B, C, D, E, A);
-	T_20_39(30, A, B, C, D, E);
-	T_20_39(31, E, A, B, C, D);
-	T_20_39(32, D, E, A, B, C);
-	T_20_39(33, C, D, E, A, B);
-	T_20_39(34, B, C, D, E, A);
-	T_20_39(35, A, B, C, D, E);
-	T_20_39(36, E, A, B, C, D);
-	T_20_39(37, D, E, A, B, C);
-	T_20_39(38, C, D, E, A, B);
-	T_20_39(39, B, C, D, E, A);
+	for (; i < 40; ++i)
+		T_20_39(i, A, B, C, D, E);
 
 	/* Round 3 */
-	T_40_59(40, A, B, C, D, E);
-	T_40_59(41, E, A, B, C, D);
-	T_40_59(42, D, E, A, B, C);
-	T_40_59(43, C, D, E, A, B);
-	T_40_59(44, B, C, D, E, A);
-	T_40_59(45, A, B, C, D, E);
-	T_40_59(46, E, A, B, C, D);
-	T_40_59(47, D, E, A, B, C);
-	T_40_59(48, C, D, E, A, B);
-	T_40_59(49, B, C, D, E, A);
-	T_40_59(50, A, B, C, D, E);
-	T_40_59(51, E, A, B, C, D);
-	T_40_59(52, D, E, A, B, C);
-	T_40_59(53, C, D, E, A, B);
-	T_40_59(54, B, C, D, E, A);
-	T_40_59(55, A, B, C, D, E);
-	T_40_59(56, E, A, B, C, D);
-	T_40_59(57, D, E, A, B, C);
-	T_40_59(58, C, D, E, A, B);
-	T_40_59(59, B, C, D, E, A);
+	for (; i < 60; ++i)
+		T_40_59(i, A, B, C, D, E);
 
 	/* Round 4 */
-	T_60_79(60, A, B, C, D, E);
-	T_60_79(61, E, A, B, C, D);
-	T_60_79(62, D, E, A, B, C);
-	T_60_79(63, C, D, E, A, B);
-	T_60_79(64, B, C, D, E, A);
-	T_60_79(65, A, B, C, D, E);
-	T_60_79(66, E, A, B, C, D);
-	T_60_79(67, D, E, A, B, C);
-	T_60_79(68, C, D, E, A, B);
-	T_60_79(69, B, C, D, E, A);
-	T_60_79(70, A, B, C, D, E);
-	T_60_79(71, E, A, B, C, D);
-	T_60_79(72, D, E, A, B, C);
-	T_60_79(73, C, D, E, A, B);
-	T_60_79(74, B, C, D, E, A);
-	T_60_79(75, A, B, C, D, E);
-	T_60_79(76, E, A, B, C, D);
-	T_60_79(77, D, E, A, B, C);
-	T_60_79(78, C, D, E, A, B);
-	T_60_79(79, B, C, D, E, A);
+	for (; i < 80; ++i)
+		T_60_79(i, A, B, C, D, E);
 
 	digest[0] += A;
 	digest[1] += B;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems
  2022-01-11 13:49             ` [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems Jason A. Donenfeld
@ 2022-01-12 10:57               ` Geert Uytterhoeven
  2022-01-12 13:16                 ` Jason A. Donenfeld
  2022-01-12 18:31               ` Eric Biggers
  1 sibling, 1 reply; 61+ messages in thread
From: Geert Uytterhoeven @ 2022-01-12 10:57 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Crypto Mailing List, netdev, wireguard,
	Linux Kernel Mailing List, bpf, Theodore Tso, Greg KH,
	jeanphilippe.aumasson, Ard Biesheuvel, Herbert Xu

Hi Jason,

On Tue, Jan 11, 2022 at 2:49 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> Re-wind the loops entirely on kernels optimized for code size. This is
> really not good at all performance-wise. But on m68k, it shaves off 4k
> of code size, which is apparently important.

On arm32:

add/remove: 1/0 grow/shrink: 0/1 up/down: 160/-4212 (-4052)
Function                                     old     new   delta
blake2s_sigma                                  -     160    +160
blake2s_compress_generic                    4872     660   -4212
Total: Before=9846148, After=9842096, chg -0.04%

On arm64:

add/remove: 1/2 grow/shrink: 0/1 up/down: 160/-4584 (-4424)
Function                                     old     new   delta
blake2s_sigma                                  -     160    +160
e843419@0710_00007634_e8a0                     8       -      -8
e843419@0441_0000423a_178c                     8       -      -8
blake2s_compress_generic                    5088     520   -4568
Total: Before=32800278, After=32795854, chg -0.01%

> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

For the size reduction:
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-11 22:05               ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
  2022-01-11 22:05                 ` [PATCH crypto v3 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
  2022-01-11 22:05                 ` [PATCH crypto v3 2/2] lib/crypto: sha1: re-roll loops to reduce code size Jason A. Donenfeld
@ 2022-01-12 10:59                 ` Geert Uytterhoeven
  2022-01-12 13:18                   ` Jason A. Donenfeld
  2 siblings, 1 reply; 61+ messages in thread
From: Geert Uytterhoeven @ 2022-01-12 10:59 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Linux Crypto Mailing List, netdev, wireguard,
	Linux Kernel Mailing List, bpf, Theodore Tso, Greg KH,
	jeanphilippe.aumasson, Ard Biesheuvel

Hi Jason,

On Tue, Jan 11, 2022 at 11:05 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> Geert emailed me this afternoon concerned about blake2s codesize on m68k
> and other small systems. We identified two effective ways of chopping
> down the size. One of them moves some wireguard-specific things into
> wireguard proper. The other one adds a slower codepath for small
> machines to blake2s. This worked, and was v1 of this patchset, but I
> wasn't so much of a fan. Then someone pointed out that the generic C
> SHA-1 implementation is still unrolled, which is a *lot* of extra code.
> Simply rerolling that saves about as much as v1 did. So, we instead do
> that in this patchset. SHA-1 is being phased out, and soon it won't
> be included at all (hopefully). And nothing performance-oriented has
> anything to do with it anyway.
>
> The result of these two patches mitigates Geert's feared code size
> increase for 5.17.
>
> v3 improves on v2 by making the re-rolling of SHA-1 much simpler,
> resulting in even larger code size reduction and much better
> performance. The reason I'm sending yet a third version in such a short
> amount of time is because the trick here feels obvious and substantial
> enough that I'd hate for Geert to waste time measuring the impact of the
> previous commit.
>
> Thanks,
> Jason
>
> Jason A. Donenfeld (2):
>   lib/crypto: blake2s: move hmac construction into wireguard
>   lib/crypto: sha1: re-roll loops to reduce code size

Thanks for the series!

On m68k:
add/remove: 1/4 grow/shrink: 0/1 up/down: 4/-4232 (-4228)
Function                                     old     new   delta
__ksymtab_blake2s256_hmac                     12       -     -12
blake2s_init.constprop                        94       -     -94
blake2s256_hmac                              302       -    -302
sha1_transform                              4402     582   -3820
Total: Before=4230537, After=4226309, chg -0.10%

Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems
  2022-01-12 10:57               ` Geert Uytterhoeven
@ 2022-01-12 13:16                 ` Jason A. Donenfeld
  0 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-12 13:16 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linux Crypto Mailing List, netdev, WireGuard mailing list,
	Linux Kernel Mailing List, bpf, Theodore Tso, Greg KH,
	Jean-Philippe Aumasson, Ard Biesheuvel, Herbert Xu

Hi Geert,

Thanks for testing this. However, I've *abandoned* this patch, due to
unacceptable performance hits, and figuring out that we can accomplish
basically the same thing without as large of a hit by modifying the
obsolete sha1 implementation.

Herbert - please do not apply this patch. Instead, later versions of
this patchset (e.g. v3 [1] and potentially later if it comes to that)
are what should be applied.

Jason

[1] https://lore.kernel.org/linux-crypto/20220111220506.742067-1-Jason@zx2c4.com/

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-12 10:59                 ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Geert Uytterhoeven
@ 2022-01-12 13:18                   ` Jason A. Donenfeld
  2022-01-18  6:42                     ` Herbert Xu
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-12 13:18 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linux Crypto Mailing List, netdev, WireGuard mailing list,
	Linux Kernel Mailing List, bpf, Theodore Tso, Greg KH,
	Jean-Philippe Aumasson, Ard Biesheuvel, Herbert Xu

Hi Geert,

On Wed, Jan 12, 2022 at 12:00 PM Geert Uytterhoeven
<geert@linux-m68k.org> wrote:
> Thanks for the series!
>
> On m68k:
> add/remove: 1/4 grow/shrink: 0/1 up/down: 4/-4232 (-4228)
> Function                                     old     new   delta
> __ksymtab_blake2s256_hmac                     12       -     -12
> blake2s_init.constprop                        94       -     -94
> blake2s256_hmac                              302       -    -302
> sha1_transform                              4402     582   -3820
> Total: Before=4230537, After=4226309, chg -0.10%
>
> Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>

Excellent, thanks for the breakdown. So this shaves off ~4k, which was
about what we were shooting for here, so I think indeed this series
accomplishes its goal of counteracting the addition of BLAKE2s.
Hopefully Herbert will apply this series for 5.17.

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems
  2022-01-11 13:49             ` [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems Jason A. Donenfeld
  2022-01-12 10:57               ` Geert Uytterhoeven
@ 2022-01-12 18:31               ` Eric Biggers
  2022-01-12 18:50                 ` Jason A. Donenfeld
  1 sibling, 1 reply; 61+ messages in thread
From: Eric Biggers @ 2022-01-12 18:31 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb, Herbert Xu

On Tue, Jan 11, 2022 at 02:49:33PM +0100, Jason A. Donenfeld wrote:
> Re-wind the loops entirely on kernels optimized for code size. This is
> really not good at all performance-wise. But on m68k, it shaves off 4k
> of code size, which is apparently important.
> 
> Cc: Geert Uytterhoeven <geert@linux-m68k.org>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
>  lib/crypto/blake2s-generic.c | 30 ++++++++++++++++++------------
>  1 file changed, 18 insertions(+), 12 deletions(-)
> 
> diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
> index 75ccb3e633e6..990f000e22ee 100644
> --- a/lib/crypto/blake2s-generic.c
> +++ b/lib/crypto/blake2s-generic.c
> @@ -46,7 +46,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
>  {
>  	u32 m[16];
>  	u32 v[16];
> -	int i;
> +	int i, j;
>  
>  	WARN_ON(IS_ENABLED(DEBUG) &&
>  		(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
> @@ -86,17 +86,23 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
>  	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
>  	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
>  } while (0)
> -		ROUND(0);
> -		ROUND(1);
> -		ROUND(2);
> -		ROUND(3);
> -		ROUND(4);
> -		ROUND(5);
> -		ROUND(6);
> -		ROUND(7);
> -		ROUND(8);
> -		ROUND(9);
> -
> +		if (IS_ENABLED(CONFIG_CC_OPTIMIZE_FOR_SIZE)) {
> +			for (i = 0; i < 10; ++i) {
> +				for (j = 0; j < 8; ++j)
> +					G(i, j, v[j % 4], v[((j + (j / 4)) % 4) + 4], v[((j + 2 * (j / 4)) % 4) + 8], v[((j + 3 * (j / 4)) % 4) + 12]);
> +			}

How about unrolling the inner loop but not the outer one?  Wouldn't that give
most of the benefit, without hurting performance as much?

If you stay with this approach and don't unroll either loop, can you use 'r' and
'i' instead of 'i' and 'j', to match the naming in G()?

Also, please wrap lines at 80 columns.

- Eric

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard
  2022-01-11 13:49             ` [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
  2022-01-11 14:43               ` Ard Biesheuvel
@ 2022-01-12 18:35               ` Eric Biggers
  1 sibling, 0 replies; 61+ messages in thread
From: Eric Biggers @ 2022-01-12 18:35 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-crypto, netdev, wireguard, linux-kernel, bpf, geert, tytso,
	gregkh, jeanphilippe.aumasson, ardb, Herbert Xu

On Tue, Jan 11, 2022 at 02:49:34PM +0100, Jason A. Donenfeld wrote:
> Basically nobody should use blake2s in an HMAC construction; it already
> has a keyed variant. But for unfortunately historical reasons, Noise,
> used by WireGuard, uses HKDF quite strictly, which means we have to use
> this. Because this really shouldn't be used by others, this commit moves
> it into wireguard's noise.c locally, so that kernels that aren't using
> WireGuard don't get this superfluous code baked in. On m68k systems,
> this shaves off ~314 bytes.
> 
> Cc: Geert Uytterhoeven <geert@linux-m68k.org>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: netdev@vger.kernel.org
> Cc: wireguard@lists.zx2c4.com
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---

Reviewed-by: Eric Biggers <ebiggers@google.com>

- Eric

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems
  2022-01-12 18:31               ` Eric Biggers
@ 2022-01-12 18:50                 ` Jason A. Donenfeld
  2022-01-12 21:27                   ` David Laight
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-12 18:50 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Linux Crypto Mailing List, Netdev, WireGuard mailing list, LKML,
	bpf, Geert Uytterhoeven, Theodore Ts'o, Greg Kroah-Hartman,
	Jean-Philippe Aumasson, Ard Biesheuvel, Herbert Xu

On Wed, Jan 12, 2022 at 7:32 PM Eric Biggers <ebiggers@kernel.org> wrote:
> How about unrolling the inner loop but not the outer one?  Wouldn't that give
> most of the benefit, without hurting performance as much?
>
> If you stay with this approach and don't unroll either loop, can you use 'r' and
> 'i' instead of 'i' and 'j', to match the naming in G()?

All this might work, sure. But as mentioned earlier, I've abandoned
this entirely, as I don't think this patch is necessary. See the v3
patchset instead:

https://lore.kernel.org/linux-crypto/20220111220506.742067-1-Jason@zx2c4.com/

^ permalink raw reply	[flat|nested] 61+ messages in thread

* RE: [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems
  2022-01-12 18:50                 ` Jason A. Donenfeld
@ 2022-01-12 21:27                   ` David Laight
  2022-01-12 22:00                     ` Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: David Laight @ 2022-01-12 21:27 UTC (permalink / raw)
  To: 'Jason A. Donenfeld', Eric Biggers
  Cc: Linux Crypto Mailing List, Netdev, WireGuard mailing list, LKML,
	bpf, Geert Uytterhoeven, Theodore Ts'o, Greg Kroah-Hartman,
	Jean-Philippe Aumasson, Ard Biesheuvel, Herbert Xu

From: Jason A. Donenfeld
> Sent: 12 January 2022 18:51
> 
> On Wed, Jan 12, 2022 at 7:32 PM Eric Biggers <ebiggers@kernel.org> wrote:
> > How about unrolling the inner loop but not the outer one?  Wouldn't that give
> > most of the benefit, without hurting performance as much?
> >
> > If you stay with this approach and don't unroll either loop, can you use 'r' and
> > 'i' instead of 'i' and 'j', to match the naming in G()?
> 
> All this might work, sure. But as mentioned earlier, I've abandoned
> this entirely, as I don't think this patch is necessary. See the v3
> patchset instead:
> 
> https://lore.kernel.org/linux-crypto/20220111220506.742067-1-Jason@zx2c4.com/

I think you mentioned in another thread that the buffers (eg for IPv6
addresses) are actually often quite short.

For short buffers the 'rolled-up' loop may be of similar performance
to the unrolled one because of the time taken to read all the instructions
into the I-cache and decode them.
If the loop ends up small enough it will fit into the 'decoded loop
buffer' of modern Intel x86 cpu and won't even need decoding on
each iteration.

I really suspect that the heavily unrolled loop is only really fast
for big buffers and/or when it is already in the I-cache.
In real life I wonder how often that actually happens?
Especially for the uses the kernel is making of the code.

You need to benchmark single executions of the function
(doable on x86 with the performance monitor cycle counter)
to get typical/best clocks/byte figures rather than a
big average for repeated operation on a long buffer.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems
  2022-01-12 21:27                   ` David Laight
@ 2022-01-12 22:00                     ` Jason A. Donenfeld
  0 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-12 22:00 UTC (permalink / raw)
  To: David Laight
  Cc: Eric Biggers, Linux Crypto Mailing List, Netdev,
	WireGuard mailing list, LKML, bpf, Geert Uytterhoeven,
	Theodore Ts'o, Greg Kroah-Hartman, Jean-Philippe Aumasson,
	Ard Biesheuvel, Herbert Xu

Hi David,

On 1/12/22, David Laight <David.Laight@aculab.com> wrote:
> I think you mentioned in another thread that the buffers (eg for IPv6
> addresses) are actually often quite short.
>
> For short buffers the 'rolled-up' loop may be of similar performance
> to the unrolled one because of the time taken to read all the instructions
> into the I-cache and decode them.
> If the loop ends up small enough it will fit into the 'decoded loop
> buffer' of modern Intel x86 cpu and won't even need decoding on
> each iteration.
>
> I really suspect that the heavily unrolled loop is only really fast
> for big buffers and/or when it is already in the I-cache.
> In real life I wonder how often that actually happens?
> Especially for the uses the kernel is making of the code.
>
> You need to benchmark single executions of the function
> (doable on x86 with the performance monitor cycle counter)
> to get typical/best clocks/byte figures rather than a
> big average for repeated operation on a long buffer.
>
> 	David

This patch has been dropped entirely from future revisions. The latest
as of writing is at:

https://lore.kernel.org/linux-crypto/20220111220506.742067-1-Jason@zx2c4.com/

If you'd like to do something with blake2s, by all means submit a
patch and include various rationale and metrics and benchmarks. I do
not intend to do that myself and do not think my particular patch here
should be merged. But if you'd like to do something, feel free to CC
me for a review. However, as mentioned, I don't think much needs to be
done here.

Again, v3 is here:
https://lore.kernel.org/linux-crypto/20220111220506.742067-1-Jason@zx2c4.com/

Thanks,
Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* RE: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-11 12:50       ` Jason A. Donenfeld
  2022-01-11 12:57         ` Geert Uytterhoeven
  2022-01-11 15:46         ` David Laight
@ 2022-01-14 17:27         ` David Laight
  2022-01-14 17:31           ` Jason A. Donenfeld
  2 siblings, 1 reply; 61+ messages in thread
From: David Laight @ 2022-01-14 17:27 UTC (permalink / raw)
  To: 'Jason A. Donenfeld', Geert Uytterhoeven
  Cc: Linux Kernel Mailing List, Theodore Tso, Greg KH, Jean-Philippe Aumasson

From: Jason A. Donenfeld
> Sent: 11 January 2022 12:50
>
> On Tue, Jan 11, 2022 at 1:28 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > If you're really quite concerned about m68k code size, I can probably
> > do some things to reduce that. For example, blake2s256_hmac is only
> > used by wireguard and it could probably be made local there. And with
> > some trivial loop re-rolling, I can shave off another 2300 bytes. And
> > I bet I can find a few other things too. The question is: how
> > important is this to you?
> 
> And with another trick (see below), another extra 1000 bytes or so
> shaved off. Aside from moving blake2s256_hmac, I'm not really super
> enthusiastic about making these changes, but depending on how important
> this is to you, maybe we can make something work. There are probably
> additional possibilities too with the code.

Quite clearly whoever wrote the unrolled loops needs their head examined.
It is extremely unlikely that a cpu has enough registers to implement it
effeciently.
(Of course, a pipelined implementation on a fgpa is another matter.)

So every read of v[] is going to be a memory read.
Much better to do that than to spill values that change.
The memory reads won't really hit performance either.
They add a bit of latency - but that will be handled by
instruction scheduling - either by the compiler of cpu hardware.

> -#define ROUND(r) do { \
> -	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
> -	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
> -	G(r, 2, v[2], v[ 6], v[10], v[14]); \
> -	G(r, 3, v[3], v[ 7], v[11], v[15]); \
> -	G(r, 4, v[0], v[ 5], v[10], v[15]); \
> -	G(r, 5, v[1], v[ 6], v[11], v[12]); \
> -	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
> -	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
> -} while (0)
> -		ROUND(0);
> -		ROUND(1);
> -		ROUND(2);
> -		ROUND(3);
> -		ROUND(4);
> -		ROUND(5);
> -		ROUND(6);
> -		ROUND(7);
> -		ROUND(8);
> -		ROUND(9);

The v[] values clearly don't change in the above.
Use 4 separate arrays so you have:

#define ROUND(r) do { \
	G(r, 0, v[0], w[0], x[0], y[0]); \
	G(r, 1, v[1], w[1], x[1], y[1]); \
	G(r, 2, v[2], w[2], x[2], y[2]); \
	G(r, 3, v[3], w[3], x[3], y[3]); \
	G(r, 4, v[0], w[1], x[2], y[3]); \
	G(r, 5, v[1], w[2], x[3], y[0]); \
	G(r, 6, v[2], w[3], x[0], y[1]); \
	G(r, 7, v[3], w[0], x[1], y[2]); \

Now double the sizes of v/w/x/y array and write the correct
values when they are created/updated and you get:

#define ROUND(r) do { \
	G(r, 0, v[0], w[0], x[0], y[0]); \
	G(r, 1, v[1], w[1], x[1], y[1]); \
	G(r, 2, v[2], w[2], x[2], y[2]); \
	G(r, 3, v[3], w[3], x[3], y[3]); \
	G(r, 4, v[4], w[4], x[4], y[4]); \
	G(r, 5, v[5], w[5], x[5], y[5]); \
	G(r, 6, v[6], w[6], x[6], y[6]); \
	G(r, 7, v[7], w[7], x[7], y[7]); \

Oh - that is a nice loop...
So we get:
	for (r = 0; r < 10; r++)
		for (j = 0; j < 8; j++)
			G(r, j, v[j], w[j], x[j], y[j]);

Which is likely to be just as fast as any other version.

You might need to give the compiler some great big hints
in order to get sensible code.
Possible make v[] w[] x[] and y[] all volatile and replace
the inner loop body with:
			v_j = v[j]; w_j = x[j]; x_j = x[j]; y_j = z[j];
			G(r, j, v_j, w_j, x_j, y_j);

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction
  2022-01-14 17:27         ` David Laight
@ 2022-01-14 17:31           ` Jason A. Donenfeld
  0 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-14 17:31 UTC (permalink / raw)
  To: David Laight
  Cc: Geert Uytterhoeven, Linux Kernel Mailing List, Theodore Tso,
	Greg KH, Jean-Philippe Aumasson

On Fri, Jan 14, 2022 at 6:27 PM David Laight <David.Laight@aculab.com> wrote:
>
> From: Jason A. Donenfeld
> > Sent: 11 January 2022 12:50
> >
> > On Tue, Jan 11, 2022 at 1:28 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > If you're really quite concerned about m68k code size, I can probably
> > > do some things to reduce that. For example, blake2s256_hmac is only
> > > used by wireguard and it could probably be made local there. And with
> > > some trivial loop re-rolling, I can shave off another 2300 bytes. And
> > > I bet I can find a few other things too. The question is: how
> > > important is this to you?
> >
> > And with another trick (see below), another extra 1000 bytes or so
> > shaved off. Aside from moving blake2s256_hmac, I'm not really super
> > enthusiastic about making these changes, but depending on how important
> > this is to you, maybe we can make something work. There are probably
> > additional possibilities too with the code.
>
> Quite clearly whoever wrote the unrolled loops needs their head examined.
> It is extremely unlikely that a cpu has enough registers to implement it
> effeciently.

Feel free to send a patch doing this, along with benchmarks. It
doesn't seem impossible to me that re-rolling the rounds might be
better on some platforms. The question is - is it really? And if so,
which ones? And for what varieties of inputs? If you put some research
into this, please do CC me on patches.

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-12 13:18                   ` Jason A. Donenfeld
@ 2022-01-18  6:42                     ` Herbert Xu
  2022-01-18 11:43                       ` Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: Herbert Xu @ 2022-01-18  6:42 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: geert, linux-crypto, netdev, wireguard, linux-kernel, bpf, tytso,
	gregkh, jeanphilippe.aumasson, ardb

Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Excellent, thanks for the breakdown. So this shaves off ~4k, which was
> about what we were shooting for here, so I think indeed this series
> accomplishes its goal of counteracting the addition of BLAKE2s.
> Hopefully Herbert will apply this series for 5.17.

As the patches that triggered this weren't part of the crypto
tree, this will have to go through the random tree if you want
them for 5.17.

Otherwise if you're happy to wait then I can pull them through
cryptodev.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-18  6:42                     ` Herbert Xu
@ 2022-01-18 11:43                       ` Jason A. Donenfeld
  2022-01-18 12:44                         ` David Laight
  0 siblings, 1 reply; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-18 11:43 UTC (permalink / raw)
  To: Herbert Xu
  Cc: geert, linux-crypto, netdev, wireguard, linux-kernel, bpf, tytso,
	gregkh, jeanphilippe.aumasson, ardb

On 1/18/22, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> As the patches that triggered this weren't part of the crypto
> tree, this will have to go through the random tree if you want
> them for 5.17.

Sure, will do.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* RE: [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-18 11:43                       ` Jason A. Donenfeld
@ 2022-01-18 12:44                         ` David Laight
  2022-01-18 12:50                           ` Jason A. Donenfeld
  0 siblings, 1 reply; 61+ messages in thread
From: David Laight @ 2022-01-18 12:44 UTC (permalink / raw)
  To: 'Jason A. Donenfeld', Herbert Xu
  Cc: geert, linux-crypto, netdev, wireguard, linux-kernel, bpf, tytso,
	gregkh, jeanphilippe.aumasson, ardb

From: Jason A. Donenfeld
> Sent: 18 January 2022 11:43
> 
> On 1/18/22, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > As the patches that triggered this weren't part of the crypto
> > tree, this will have to go through the random tree if you want
> > them for 5.17.
> 
> Sure, will do.

I've rammed the code through godbolt... https://godbolt.org/z/Wv64z9zG8

Some things I've noticed;

1) There is no point having all the inline functions.
   Far better to have real functions to do the work.
   Given the cost of hashing 64 bytes of data the extra
   function call won't matter.
   Indeed for repeated calls it will help because the required
   code will be in the I-cache.

2) The compiles I tried do manage to remove the blake2_sigma[][]
   when unrolling everything - which is a slight gain for the full
   unroll. But I doubt it is that significant if the access can
   get sensibly optimised.
   For non-x86 that might require all the values by multiplied by 4.

3) Although G() is a massive register dependency chain the compiler
   knows that G(,[0-3],) are independent and can execute in parallel.
   This does help execution time on multi-issue cpu (like x86).
   With care it ought to be possible to use the same code for G(,[4-7],)
   without stopping the compiler interleaving all the instructions.

4) I strongly suspect that using a loop for the rounds will have
   minimal impact on performance - especially if the first call is
   'cold cache'.
   But I've not got time to test the code.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms
  2022-01-18 12:44                         ` David Laight
@ 2022-01-18 12:50                           ` Jason A. Donenfeld
  0 siblings, 0 replies; 61+ messages in thread
From: Jason A. Donenfeld @ 2022-01-18 12:50 UTC (permalink / raw)
  To: David Laight
  Cc: Herbert Xu, geert, linux-crypto, netdev, wireguard, linux-kernel,
	bpf, tytso, gregkh, jeanphilippe.aumasson, ardb

On Tue, Jan 18, 2022 at 1:45 PM David Laight <David.Laight@aculab.com> wrote:
> I've rammed the code through godbolt... https://godbolt.org/z/Wv64z9zG8
>
> Some things I've noticed;

It seems like you've done a lot of work here but...

>    But I've not got time to test the code.

But you're not going to take it all the way. So it unfortunately
amounts to mailing list armchair optimization. That's too bad because
it really seems like you might be onto something worth seeing through.
As I've mentioned a few times now, I've dropped the blake2s
optimization patch, and I won't be developing that further. But it
appears as though you've really been captured by it, so I urge you:
please send a real patch with benchmarks on various platforms! (And CC
me on the patch.) Faster reference code would really be terrific.

Jason

^ permalink raw reply	[flat|nested] 61+ messages in thread

end of thread, other threads:[~2022-01-18 12:51 UTC | newest]

Thread overview: 61+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-23 14:11 [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Jason A. Donenfeld
2021-12-23 14:11 ` [PATCH v2 2/2] random: use BLAKE2s instead of SHA1 in extraction Jason A. Donenfeld
2021-12-24 20:56   ` Eric Biggers
2022-01-05 21:42     ` Eric Biggers
2021-12-27 15:52   ` Theodore Ts'o
2022-01-11 11:38   ` Geert Uytterhoeven
2022-01-11 12:28     ` Jason A. Donenfeld
2022-01-11 12:50       ` Jason A. Donenfeld
2022-01-11 12:57         ` Geert Uytterhoeven
2022-01-11 13:00           ` Jason A. Donenfeld
2022-01-11 15:46         ` David Laight
2022-01-11 18:26           ` Jason A. Donenfeld
2022-01-14 17:27         ` David Laight
2022-01-14 17:31           ` Jason A. Donenfeld
2022-01-11 12:51       ` Geert Uytterhoeven
2022-01-11 13:02         ` Jason A. Donenfeld
2022-01-11 13:49           ` [PATCH crypto 0/2] smaller blake2s code size on m68k and other small platforms Jason A. Donenfeld
2022-01-11 13:49             ` [PATCH crypto 1/2] lib/crypto: blake2s-generic: reduce code size on small systems Jason A. Donenfeld
2022-01-12 10:57               ` Geert Uytterhoeven
2022-01-12 13:16                 ` Jason A. Donenfeld
2022-01-12 18:31               ` Eric Biggers
2022-01-12 18:50                 ` Jason A. Donenfeld
2022-01-12 21:27                   ` David Laight
2022-01-12 22:00                     ` Jason A. Donenfeld
2022-01-11 13:49             ` [PATCH crypto 2/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
2022-01-11 14:43               ` Ard Biesheuvel
2022-01-12 18:35               ` Eric Biggers
2022-01-11 18:10             ` [PATCH crypto v2 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
2022-01-11 18:10               ` [PATCH crypto v2 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
2022-01-11 18:10               ` [PATCH crypto v2 2/2] lib/crypto: sha1: re-roll loops to reduce code size Jason A. Donenfeld
2022-01-11 22:05               ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Jason A. Donenfeld
2022-01-11 22:05                 ` [PATCH crypto v3 1/2] lib/crypto: blake2s: move hmac construction into wireguard Jason A. Donenfeld
2022-01-11 22:05                 ` [PATCH crypto v3 2/2] lib/crypto: sha1: re-roll loops to reduce code size Jason A. Donenfeld
2022-01-12 10:59                 ` [PATCH crypto v3 0/2] reduce code size from blake2s on m68k and other small platforms Geert Uytterhoeven
2022-01-12 13:18                   ` Jason A. Donenfeld
2022-01-18  6:42                     ` Herbert Xu
2022-01-18 11:43                       ` Jason A. Donenfeld
2022-01-18 12:44                         ` David Laight
2022-01-18 12:50                           ` Jason A. Donenfeld
2021-12-23 14:20 ` [PATCH v2 1/2] lib/crypto: blake2s: include as built-in Ard Biesheuvel
2021-12-24 13:35 ` Greg KH
2021-12-25  9:26 ` Masahiro Yamada
2021-12-25 10:26   ` Ard Biesheuvel
2021-12-25 15:47     ` Masahiro Yamada
2021-12-27 13:43       ` Jason A. Donenfeld
2021-12-27 13:47         ` [PATCH v3] " Jason A. Donenfeld
2021-12-27 14:20           ` [PATCH v4] " Jason A. Donenfeld
2022-01-01 15:59             ` [PATCH v5] " Jason A. Donenfeld
2022-01-02 20:42               ` [PATCH v6] " Jason A. Donenfeld
2022-01-03  3:23                 ` Herbert Xu
2022-01-03  3:45                   ` Jason A. Donenfeld
2022-01-03  4:06                     ` Herbert Xu
2022-01-03 11:57                       ` Jason A. Donenfeld
2022-01-03 12:31                         ` [PATCH v7] " Jason A. Donenfeld
2022-01-04  1:21                           ` Herbert Xu
2022-01-04 17:02                             ` Ard Biesheuvel
2022-01-04 17:04                               ` Jason A. Donenfeld
2022-01-05  0:28                               ` Herbert Xu
2022-01-05 21:53                               ` Eric Biggers
2022-01-05 22:01                                 ` Ard Biesheuvel
2022-01-05 22:09                                   ` Eric Biggers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).