Revising prandom_32 generator

* Revising prandom_32 generator
@ 2019-03-26 11:10 George Spelvin
  2019-03-26 11:17 ` George Spelvin
  2019-03-26 14:58 ` Stephen Hemminger
  0 siblings, 2 replies; 9+ messages in thread
From: George Spelvin @ 2019-03-26 11:10 UTC (permalink / raw)
  To: daniel, hannes, Borkmann, Daniel, Frederic, Hannes, Sowa
  Cc: lkml, George, netdev, Spelvin

I started on a project to correct all of the instances of
"prandom_u32() % FOO" in the kernel (there are lots)
to "prandom_u32_max(FOO)".

This has snowballed into an ongoing audit of random number use in the
kernel, including uses of get_random_bytes() when get_random_u32 or
prandom_u32() would suffice.

But one of the more potentially-controversial changes in the series,
which might benefit from more discussion time, is a change of the
actual prandom_u32() generator itself.

The current lfsr113 generator isn't bad, but like all LFSRs, it fails
linear complexity tests in suites like testU01 and PractRand.

And there are generators with the same 128-bit state which produce
better output in about half the time.  So I figured as long as I was
neck-deep in the code, I might as well tweak that, too.

Thw ones that seem interesting to me are:
- Chris Doty-Humphrey's sfc32.  This is a 96-bit chaotic generator
  (meaning period *probably* long but not well defined) fed with
  a 32-bit counter to ensure a minimum period.  It's extremely
  fast, and the author is also the author of PractRand, so it's
  well-tested.
- Vigna and Bacman's xoshiro128**.  This is a 128-bit LFSR with some
  output postprocessing.
- (on 64-bit machines) xoroshiro128**, by the same authors.
  This is only efficient on 64-bit machines, so it would need
  a have a 32-bit backup.
- Bob Jenkins' jsf (originally "flea").  128 bits, good mixing,
  fully chaotic.  I prefer the safety of a guaranteed minimum
  period, but this is well thought of.
- A lag-3 mutiply-with-carry generator.  2^32 - 1736 is the largest
  "safe prime" mutiplier.

I'm currently planning on using the first.  It also has the advantage
for lockless reseeding that there are no bad states to avoid.

Some discussion of most of these at
http://www.pcg-random.org/posts/some-prng-implementations.html

Here are some timing numbers.  Clock cycles per 1e7 iterations,
sp 8-19 cycles per iteration.

Core 2:
         lcg32: cycles 89018580
         lcg64: cycles 130782048
           mwc: cycles 90385992
   prandom_u32: cycles 168780348
  xoshiro128ss: cycles 90331056
xoroshiro128ss: cycles 106012008
         sfc32: cycles 90318276
         jsf32: cycles 90364464
        jsf32a: cycles 100382724
        gjrand: cycles 131713680

Opteron:
         lcg32: cycles 103434699
         lcg64: cycles 80064205
           mwc: cycles 110313786
   prandom_u32: cycles 190311062
  xoshiro128ss: cycles 110115475
xoroshiro128ss: cycles 100114345
         sfc32: cycles 100163397
         jsf32: cycles 100104957
        jsf32a: cycles 110133030
        gjrand: cycles 110122007

(The LCGs are not 128-bit state; they're just there as a speed baseine.)

As you can see, the current lfsr113-based prandom_u32 takes almost
twice the time of the aternatives.

My quick & dirty test code is appended for anyone who wants to play.

#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;
struct rnd_state {
	u32 s1, s2, s3, s4;
};

u32 lcg32(struct rnd_state *state)
{
	u32 x = state->s1 * 2891336453 + 1;

	return state->s1 = x;
}

u32 lcg64(struct rnd_state *state)
{
	u64 x = *(u64 *)&state->s1;
	//x = x * 0xf2fc5985 + 1;
	x = x * 6364136223846793005 + 1;
	*(u64 *)&state->s1 = x;
	return x >> 32;
}

u32 mwc(struct rnd_state *state)
{
	u64 x = state->s1 + (u64)state->s2 * -(u32)1736;
	state->s2 = state->s3;
	state->s3 = state->s4;
	state->s1 = x >> 32;
	return state->s4 = (u32)x;
}

u32 prandom_u32_state(struct rnd_state *state)
{
#define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b)
	u32 x = state->s1 = TAUSWORTHE(state->s1,  6, 13,   ~1u, 18);
	x    ^= state->s2 = TAUSWORTHE(state->s2,  2, 27,   ~7u,  2);
	//barrier();
	x    ^= state->s3 = TAUSWORTHE(state->s3, 13, 21,  ~15u,  7);
	x    ^= state->s4 = TAUSWORTHE(state->s4,  3, 12, ~127u, 13);

	return x;
}

static inline uint32_t rol32(uint32_t x, int k)
{
	return x << k | x >> (32 - k);
}

uint32_t xoshiro128ss(uint32_t s[4])
{
#if 0
	const uint32_t res = rol32(s[0] * 5, 7) * 9;
	const uint32_t t = s[1] << 9;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];
	s[2] ^= t;
	s[3] = rol32(s[3], 11);

	return res;
#else
	/* xoshiro128** */
	uint32_t a = s[0], b = s[1], c = s[2] ^ a, d = s[3] ^ b;

	s[0] = a ^ d;
	s[1] = b ^ c;
	s[2] = c ^ (b << 9);
	s[3] = rol32(d, 11);

	return rol32(a * 5, 7) * 9;	/* The "**" non-linear function */
#endif
}

static inline uint64_t rol64(uint64_t x, int k)
{
	return x << k | x >> (64 - k);
}

uint64_t xoroshiro128ss(uint64_t s[2])
{
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1] ^ s0;
	const uint64_t res = rol64(s0 * 5, 7) * 9;

	s[1] = rol64(s1, 37); // c
	s[0] = rol64(s0, 24) ^ s1 ^ (s1 << 16); // a, b

	return res;
}

uint32_t sfc32(uint32_t s[4])
{
	uint32_t b = s[1], c = s[2];
	uint32_t const res = s[0] + b + s[3]++;

	// good sets include {21,9,3},{15,8,3};
	// older versions used {25,8,3} which wasn't as good
	s[0] = b ^ (b >> 9);
	s[1] = c + (c << 3);
	s[2] = rol32(c, 21) + res;
	return res;
}

/* http://burtleburtle.net/bob/rand/smallprng.html */
uint32_t jsf32(uint32_t s[4])
{
	uint32_t  e = s[0] - rol32(s[1], 27);
	       s[0] = s[1] ^ rol32(s[2], 17);
	       s[1] = s[2] + s[3];
	       s[2] = s[3] + e;
	return s[3] = e + s[0];
}

uint32_t jsf32a(uint32_t s[4])
{
	uint32_t  e = s[0] - rol32(s[1], 23);
	       s[0] = s[1] ^ rol32(s[2], 16);
	       s[1] = s[2] + rol32(s[3], 11);
	       s[2] = s[3] + e;
	return s[3] = e + s[0];
}

/* See https://gist.github.com/imneme/7a783e20f71259cc13e219829bcea4ac */
uint32_t gjrand(uint32_t s[4])
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3] += 0x96a5;

        b += c;            a  =  rol32(a, 16);
	c ^= b;            a += b;
	c  = rol32(c, 11); b ^= a;
        a += c;            b  = rol32(b, 19);
        c += a;            b += d;
	s[0] = a;
	s[1] = b;
	s[2] = c;
	return a;
}

#include <stdio.h>

static uint64_t rdtsc()
{
	uint32_t low, high;
	asm volatile("rdtsc" : "=a" (low), "=d" (high));
	return (uint64_t)high << 32 | low;
}

typedef uint32_t (*rng_func)(uint64_t s[2]);

#define FUNCS 10

int
main(void)
{
	static const rng_func funcs[FUNCS] = {
		(rng_func)lcg32,
		(rng_func)lcg64,
		(rng_func)mwc,
		(rng_func)prandom_u32_state,
		(rng_func)xoshiro128ss,
		(rng_func)xoroshiro128ss,
		(rng_func)sfc32,
		(rng_func)jsf32,
		(rng_func)jsf32a,
		(rng_func)gjrand
	};
	static const char rng_names[FUNCS][16] = {
		"         lcg32",
		"         lcg64",
		"           mwc",
		"   prandom_u32",
		"  xoshiro128ss",
		"xoroshiro128ss",
		"         sfc32",
		"         jsf32",
		"        jsf32a",
		"        gjrand"
	};
	uint32_t sums[FUNCS];
	int i, j, k;
	uint64_t timing[FUNCS], min_timing[FUNCS];

	for (k = 0; k < 10; k++) {
		for (i = 0; i < FUNCS; i++) {
			uint32_t sum = 0;
			uint64_t s[2] = { -1ul, -1ul };
			uint64_t start = rdtsc();
			asm("" : "+g" (start));

			for (j = 0; j < 10000000; j++)
				sum += funcs[i](s);
			asm("" : "+g" (start));
			timing[i] = rdtsc() - start;
			sums[i] = sum;
		}
		for (i = 0; i < FUNCS; i++) {
			printf("%s: sum %08x cycles %lu\n",
				rng_names[i], sums[i], timing[i]);
			if (!k || timing[i] < min_timing[i])
				min_timing[i] = timing[i];
		}
	}
	puts("Minimum timings:");
	for (i = 0; i < FUNCS; i++)
		printf("%s: cycles %lu\n", rng_names[i], min_timing[i]);
	return 0;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread