linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] random: use rejection sampling for uniform bounded random integers
@ 2022-10-17  2:37 Jason A. Donenfeld
  2022-10-17 18:27 ` Eric Biggers
  2022-10-17 20:39 ` [PATCH] " Yann Droneaud
  0 siblings, 2 replies; 6+ messages in thread
From: Jason A. Donenfeld @ 2022-10-17  2:37 UTC (permalink / raw)
  To: linux-kernel, linux-crypto; +Cc: sneves, Jason A. Donenfeld

Until the very recent commits, many bounded random integers were
calculated using `get_random_u32() % max_plus_one`, which not only
incurs the price of a division -- indicating performance mostly was not
a real issue -- but also does not result in a uniformly distributed
output if max_plus_one is not a power of two. Recent commits moved to
using `prandom_u32_max(max_plus_one)`, which replaces the division with
a faster multiplication, but still does not solve the issue with
non-uniform output.

For some users, maybe this isn't a problem, and for others, maybe it is,
but for the majority of users, probably the question has never been
posed and analyzed, and nobody thought much about it, probably assuming
random is random is random. In other words, the unthinking expectation
of most users is likely that the resultant numbers are uniform.

So we implement here an efficient way of generating uniform bounded
random integers. Through use of compile-time evaluation, and avoiding
divisions as much as possible, this commit introduces no measurable
overhead. At least for hot-path uses tested, any potential difference
was lost in the noise. On both clang and gcc, code generation is pretty
small.

The new function, get_random_u32_below(), lives in random.h, rather than
prandom.h, and has a "get_random_xxx" function name, because it is
suitable for all uses, including cryptography.

In order to be efficient, we implement a kernel-specific variant of
Daniel Lemire's algorithm from "Fast Random Integer Generation in an
Interval", linked below. The kernel's variant takes advantage of
constant folding to avoid divisions entirely in the vast majority of
cases, works on both 32-bit and 64-bit architectures, and requests a
minimal amount of bytes from the RNG.

Link: https://arxiv.org/pdf/1805.10941.pdf
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/char/random.c   | 13 +++++++++++++
 include/linux/prandom.h | 18 ++----------------
 include/linux/random.h  | 27 +++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2fe28eeb2f38..02b2b3bc1b7f 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -160,6 +160,7 @@ EXPORT_SYMBOL(wait_for_random_bytes);
  *	u8 get_random_u8()
  *	u16 get_random_u16()
  *	u32 get_random_u32()
+ *	u32 get_random_u32_below(u32 ceil)
  *	u64 get_random_u64()
  *	unsigned long get_random_long()
  *
@@ -510,6 +511,18 @@ DEFINE_BATCHED_ENTROPY(u16)
 DEFINE_BATCHED_ENTROPY(u32)
 DEFINE_BATCHED_ENTROPY(u64)
 
+u32 __get_random_u32_below(u32 ceil)
+{
+	u64 mult = (u64)ceil * get_random_u32();
+	if (unlikely((u32)mult < ceil)) {
+		u32 bound = -ceil % ceil;
+		while (unlikely((u32)mult < bound))
+			mult = (u64)ceil * get_random_u32();
+	}
+	return mult >> 32;
+}
+EXPORT_SYMBOL(__get_random_u32_below);
+
 #ifdef CONFIG_SMP
 /*
  * This function is called when the CPU is coming up, with entry
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index e0a0759dd09c..1f4a0de7b019 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -23,24 +23,10 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);
 #define prandom_init_once(pcpu_state)			\
 	DO_ONCE(prandom_seed_full_state, (pcpu_state))
 
-/**
- * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro)
- * @ep_ro: right open interval endpoint
- *
- * Returns a pseudo-random number that is in interval [0, ep_ro). This is
- * useful when requesting a random index of an array containing ep_ro elements,
- * for example. The result is somewhat biased when ep_ro is not a power of 2,
- * so do not use this for cryptographic purposes.
- *
- * Returns: pseudo-random number in interval [0, ep_ro)
- */
+/* Deprecated: use get_random_u32_below() instead. */
 static inline u32 prandom_u32_max(u32 ep_ro)
 {
-	if (__builtin_constant_p(ep_ro <= 1U << 8) && ep_ro <= 1U << 8)
-		return (get_random_u8() * ep_ro) >> 8;
-	if (__builtin_constant_p(ep_ro <= 1U << 16) && ep_ro <= 1U << 16)
-		return (get_random_u16() * ep_ro) >> 16;
-	return ((u64)get_random_u32() * ep_ro) >> 32;
+	return get_random_u32_below(ep_ro);
 }
 
 /*
diff --git a/include/linux/random.h b/include/linux/random.h
index 147a5e0d0b8e..54b0fa687c32 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -51,6 +51,33 @@ static inline unsigned long get_random_long(void)
 #endif
 }
 
+u32 __get_random_u32_below(u32 ceil);
+/* Returns a random integer in the interval [0, ceil), with uniform distribution. */
+static inline u32 get_random_u32_below(u32 ceil)
+{
+	if (!__builtin_constant_p(ceil))
+		return __get_random_u32_below(ceil);
+
+	BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0");
+	if (ceil <= 1)
+		return 0;
+	for (;;) {
+		if (ceil <= 1U << 8) {
+			u32 mult = ceil * get_random_u8();
+			if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil))
+				return mult >> 8;
+		} else if (ceil <= 1U << 16) {
+			u32 mult = ceil * get_random_u16();
+			if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil))
+				return mult >> 16;
+		} else {
+			u64 mult = (u64)ceil * get_random_u32();
+			if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil))
+				return mult >> 32;
+		}
+	}
+}
+
 /*
  * On 64-bit architectures, protect against non-terminated C string overflows
  * by zeroing out the first byte of the canary; this leaves 56 bits of entropy.
-- 
2.37.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] random: use rejection sampling for uniform bounded random integers
  2022-10-17  2:37 [PATCH] random: use rejection sampling for uniform bounded random integers Jason A. Donenfeld
@ 2022-10-17 18:27 ` Eric Biggers
  2022-10-18  5:31   ` Jason A. Donenfeld
  2022-10-17 20:39 ` [PATCH] " Yann Droneaud
  1 sibling, 1 reply; 6+ messages in thread
From: Eric Biggers @ 2022-10-17 18:27 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: linux-kernel, linux-crypto, sneves

On Sun, Oct 16, 2022 at 08:37:53PM -0600, Jason A. Donenfeld wrote:
> In order to be efficient, we implement a kernel-specific variant of
> Daniel Lemire's algorithm from "Fast Random Integer Generation in an
> Interval", linked below. The kernel's variant takes advantage of
> constant folding to avoid divisions entirely in the vast majority of
> cases, works on both 32-bit and 64-bit architectures, and requests a
> minimal amount of bytes from the RNG.
> 
> Link: https://arxiv.org/pdf/1805.10941.pdf

Thanks for doing this!  Your code looks correct, but it was hard for me to
understand until I read the paper that is linked to.  Could you include a brief
comment in the code that explains the algorithm?  Also, though the code looks
correct, I assume that you've also explicitly tested that each of the four code
paths produce uniform random numbers as intended?

- Eric

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] random: use rejection sampling for uniform bounded random integers
  2022-10-17  2:37 [PATCH] random: use rejection sampling for uniform bounded random integers Jason A. Donenfeld
  2022-10-17 18:27 ` Eric Biggers
@ 2022-10-17 20:39 ` Yann Droneaud
  1 sibling, 0 replies; 6+ messages in thread
From: Yann Droneaud @ 2022-10-17 20:39 UTC (permalink / raw)
  To: Eric Biggers, Jason A. Donenfeld; +Cc: linux-kernel, linux-crypto, sneves

Hi, 

17 octobre 2022 à 20:27 "Eric Biggers" a écrit:
> On Sun, Oct 16, 2022 at 08:37:53PM -0600, Jason A. Donenfeld wrote:
> 
> > 
> > In order to be efficient, we implement a kernel-specific variant of
> >  Daniel Lemire's algorithm from "Fast Random Integer Generation in an
> >  Interval", linked below. The kernel's variant takes advantage of
> >  constant folding to avoid divisions entirely in the vast majority of
> >  cases, works on both 32-bit and 64-bit architectures, and requests a
> >  minimal amount of bytes from the RNG.
> >  
> >  Link: https://arxiv.org/pdf/1805.10941.pdf
> > 
> 

> Thanks for doing this! Your code looks correct, but it was hard for me to
> understand until I read the paper that is linked to. 

Other algorithms exist that might be easier to understand than Lemire's one.
M.E. O'Neill has written a long blog post on many possible alternatives.

https://www.pcg-random.org/posts/bounded-rands.html

Regards.

-- 
Yann Droneaud
OPTEYA

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] random: use rejection sampling for uniform bounded random integers
  2022-10-17 18:27 ` Eric Biggers
@ 2022-10-18  5:31   ` Jason A. Donenfeld
  2022-10-18  7:04     ` Eric Biggers
  0 siblings, 1 reply; 6+ messages in thread
From: Jason A. Donenfeld @ 2022-10-18  5:31 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-kernel, linux-crypto, sneves

On Mon, Oct 17, 2022 at 11:27:17AM -0700, Eric Biggers wrote:
> On Sun, Oct 16, 2022 at 08:37:53PM -0600, Jason A. Donenfeld wrote:
> > In order to be efficient, we implement a kernel-specific variant of
> > Daniel Lemire's algorithm from "Fast Random Integer Generation in an
> > Interval", linked below. The kernel's variant takes advantage of
> > constant folding to avoid divisions entirely in the vast majority of
> > cases, works on both 32-bit and 64-bit architectures, and requests a
> > minimal amount of bytes from the RNG.
> > 
> > Link: https://arxiv.org/pdf/1805.10941.pdf
> 
> Thanks for doing this!  Your code looks correct, but it was hard for me to
> understand until I read the paper that is linked to.  Could you include a brief
> comment in the code that explains the algorithm?  Also, though the code looks
> correct, I assume that you've also explicitly tested that each of the four code
> paths produce uniform random numbers as intended?

Yes, I've tested those, and they work. (Threw a lot of cores and ram at
it.)

I could include a comment, sure. What do you have in mind? A
line-by-line thing, or just a short blurb at the top of the function?

Jason

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] random: use rejection sampling for uniform bounded random integers
  2022-10-18  5:31   ` Jason A. Donenfeld
@ 2022-10-18  7:04     ` Eric Biggers
  2022-10-18 17:34       ` [PATCH v2] " Jason A. Donenfeld
  0 siblings, 1 reply; 6+ messages in thread
From: Eric Biggers @ 2022-10-18  7:04 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: linux-kernel, linux-crypto, sneves

On Mon, Oct 17, 2022 at 11:31:03PM -0600, Jason A. Donenfeld wrote:
> On Mon, Oct 17, 2022 at 11:27:17AM -0700, Eric Biggers wrote:
> > On Sun, Oct 16, 2022 at 08:37:53PM -0600, Jason A. Donenfeld wrote:
> > > In order to be efficient, we implement a kernel-specific variant of
> > > Daniel Lemire's algorithm from "Fast Random Integer Generation in an
> > > Interval", linked below. The kernel's variant takes advantage of
> > > constant folding to avoid divisions entirely in the vast majority of
> > > cases, works on both 32-bit and 64-bit architectures, and requests a
> > > minimal amount of bytes from the RNG.
> > > 
> > > Link: https://arxiv.org/pdf/1805.10941.pdf
> > 
> > Thanks for doing this!  Your code looks correct, but it was hard for me to
> > understand until I read the paper that is linked to.  Could you include a brief
> > comment in the code that explains the algorithm?  Also, though the code looks
> > correct, I assume that you've also explicitly tested that each of the four code
> > paths produce uniform random numbers as intended?
> 
> Yes, I've tested those, and they work. (Threw a lot of cores and ram at
> it.)
> 
> I could include a comment, sure. What do you have in mind? A
> line-by-line thing, or just a short blurb at the top of the function?
> 
> Jason

A comment at the top of the function would be good.

- Eric

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v2] random: use rejection sampling for uniform bounded random integers
  2022-10-18  7:04     ` Eric Biggers
@ 2022-10-18 17:34       ` Jason A. Donenfeld
  0 siblings, 0 replies; 6+ messages in thread
From: Jason A. Donenfeld @ 2022-10-18 17:34 UTC (permalink / raw)
  To: linux-kernel, linux-crypto; +Cc: sneves, ebiggers, Jason A. Donenfeld

Until the very recent commits, many bounded random integers were
calculated using `get_random_u32() % max_plus_one`, which not only
incurs the price of a division -- indicating performance mostly was not
a real issue -- but also does not result in a uniformly distributed
output if max_plus_one is not a power of two. Recent commits moved to
using `prandom_u32_max(max_plus_one)`, which replaces the division with
a faster multiplication, but still does not solve the issue with
non-uniform output.

For some users, maybe this isn't a problem, and for others, maybe it is,
but for the majority of users, probably the question has never been
posed and analyzed, and nobody thought much about it, probably assuming
random is random is random. In other words, the unthinking expectation
of most users is likely that the resultant numbers are uniform.

So we implement here an efficient way of generating uniform bounded
random integers. Through use of compile-time evaluation, and avoiding
divisions as much as possible, this commit introduces no measurable
overhead. At least for hot-path uses tested, any potential difference
was lost in the noise. On both clang and gcc, code generation is pretty
small.

The new function, get_random_u32_below(), lives in random.h, rather than
prandom.h, and has a "get_random_xxx" function name, because it is
suitable for all uses, including cryptography.

In order to be efficient, we implement a kernel-specific variant of
Daniel Lemire's algorithm from "Fast Random Integer Generation in an
Interval", linked below. The kernel's variant takes advantage of
constant folding to avoid divisions entirely in the vast majority of
cases, works on both 32-bit and 64-bit architectures, and requests a
minimal amount of bytes from the RNG.

Link: https://arxiv.org/pdf/1805.10941.pdf
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
Changes v1->v2:
- Add a few explanatory comments.

 drivers/char/random.c   | 22 ++++++++++++++++++++++
 include/linux/prandom.h | 18 ++----------------
 include/linux/random.h  | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2fe28eeb2f38..e3cf4f51ed58 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -160,6 +160,7 @@ EXPORT_SYMBOL(wait_for_random_bytes);
  *	u8 get_random_u8()
  *	u16 get_random_u16()
  *	u32 get_random_u32()
+ *	u32 get_random_u32_below(u32 ceil)
  *	u64 get_random_u64()
  *	unsigned long get_random_long()
  *
@@ -510,6 +511,27 @@ DEFINE_BATCHED_ENTROPY(u16)
 DEFINE_BATCHED_ENTROPY(u32)
 DEFINE_BATCHED_ENTROPY(u64)
 
+u32 __get_random_u32_below(u32 ceil)
+{
+	/*
+	 * This is the slow path for variable ceil. It is still fast, most of
+	 * the time, by doing traditional reciprocal multiplication and
+	 * opportunistically comparing the lower half to ceil itself, before
+	 * falling back to computing a larger bound, and then rejecting samples
+	 * whose lower half would indicate a range indivisible by ceil. The use
+	 * of `-ceil % ceil` is analogous to `2^32 % ceil`, but is computable
+	 * in 32-bits.
+	 */
+	u64 mult = (u64)ceil * get_random_u32();
+	if (unlikely((u32)mult < ceil)) {
+		u32 bound = -ceil % ceil;
+		while (unlikely((u32)mult < bound))
+			mult = (u64)ceil * get_random_u32();
+	}
+	return mult >> 32;
+}
+EXPORT_SYMBOL(__get_random_u32_below);
+
 #ifdef CONFIG_SMP
 /*
  * This function is called when the CPU is coming up, with entry
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index e0a0759dd09c..1f4a0de7b019 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -23,24 +23,10 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);
 #define prandom_init_once(pcpu_state)			\
 	DO_ONCE(prandom_seed_full_state, (pcpu_state))
 
-/**
- * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro)
- * @ep_ro: right open interval endpoint
- *
- * Returns a pseudo-random number that is in interval [0, ep_ro). This is
- * useful when requesting a random index of an array containing ep_ro elements,
- * for example. The result is somewhat biased when ep_ro is not a power of 2,
- * so do not use this for cryptographic purposes.
- *
- * Returns: pseudo-random number in interval [0, ep_ro)
- */
+/* Deprecated: use get_random_u32_below() instead. */
 static inline u32 prandom_u32_max(u32 ep_ro)
 {
-	if (__builtin_constant_p(ep_ro <= 1U << 8) && ep_ro <= 1U << 8)
-		return (get_random_u8() * ep_ro) >> 8;
-	if (__builtin_constant_p(ep_ro <= 1U << 16) && ep_ro <= 1U << 16)
-		return (get_random_u16() * ep_ro) >> 16;
-	return ((u64)get_random_u32() * ep_ro) >> 32;
+	return get_random_u32_below(ep_ro);
 }
 
 /*
diff --git a/include/linux/random.h b/include/linux/random.h
index 147a5e0d0b8e..3a82c0a8bc46 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -51,6 +51,46 @@ static inline unsigned long get_random_long(void)
 #endif
 }
 
+u32 __get_random_u32_below(u32 ceil);
+
+/*
+ * Returns a random integer in the interval [0, ceil), with uniform
+ * distribution, suitable for all uses. Fastest when ceil is a constant, but
+ * still fast for variable ceil as well.
+ */
+static inline u32 get_random_u32_below(u32 ceil)
+{
+	if (!__builtin_constant_p(ceil))
+		return __get_random_u32_below(ceil);
+
+	/*
+	 * For the fast path, below, all operations on ceil are precomputed by
+	 * the compiler, so this incurs no overhead for checking pow2, doing
+	 * divisions, or branching based on integer size. The resultant
+	 * algorithm does traditional reciprocal multiplication (typically
+	 * optimized by the compiler into shifts and adds), rejecting samples
+	 * whose lower half would indicate a range indivisible by ceil.
+	 */
+	BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0");
+	if (ceil <= 1)
+		return 0;
+	for (;;) {
+		if (ceil <= 1U << 8) {
+			u32 mult = ceil * get_random_u8();
+			if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil))
+				return mult >> 8;
+		} else if (ceil <= 1U << 16) {
+			u32 mult = ceil * get_random_u16();
+			if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil))
+				return mult >> 16;
+		} else {
+			u64 mult = (u64)ceil * get_random_u32();
+			if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil))
+				return mult >> 32;
+		}
+	}
+}
+
 /*
  * On 64-bit architectures, protect against non-terminated C string overflows
  * by zeroing out the first byte of the canary; this leaves 56 bits of entropy.
-- 
2.37.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-10-18 17:35 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-17  2:37 [PATCH] random: use rejection sampling for uniform bounded random integers Jason A. Donenfeld
2022-10-17 18:27 ` Eric Biggers
2022-10-18  5:31   ` Jason A. Donenfeld
2022-10-18  7:04     ` Eric Biggers
2022-10-18 17:34       ` [PATCH v2] " Jason A. Donenfeld
2022-10-17 20:39 ` [PATCH] " Yann Droneaud

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).