Re: [PATCH v4 1/3] random: add rte_drand() function

From: "Mattias Rönnblom" <hofors@lysator.liu.se>
To: Stephen Hemminger <stephen@networkplumber.org>, dev@dpdk.org
Cc: "Mattias Rönnblom" <mattias.ronnblom@ericsson.com>,
	"Ray Kinsella" <mdr@ashroe.eu>
Subject: Re: [PATCH v4 1/3] random: add rte_drand() function
Date: Thu, 26 May 2022 15:20:29 +0200	[thread overview]
Message-ID: <84afee2e-fa4c-faf1-d046-febb0ae77c09@lysator.liu.se> (raw)
In-Reply-To: <20220525203123.277180-2-stephen@networkplumber.org>

On 2022-05-25 22:31, Stephen Hemminger wrote:
> The PIE code and other applications can benefit from having a
> fast way to get a random floating point value. This new function
> is equivalent to drand() in the standard library.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>   app/test/test_rand_perf.c              |  7 +++++
>   doc/guides/rel_notes/release_22_07.rst |  5 ++++
>   lib/eal/common/rte_random.c            | 41 ++++++++++++++++++++++++++
>   lib/eal/include/rte_random.h           | 18 +++++++++++
>   lib/eal/meson.build                    |  3 ++
>   lib/eal/version.map                    |  1 +
>   6 files changed, 75 insertions(+)
> 
> diff --git a/app/test/test_rand_perf.c b/app/test/test_rand_perf.c
> index fe797ebfa1ca..26fb1d9a586e 100644
> --- a/app/test/test_rand_perf.c
> +++ b/app/test/test_rand_perf.c
> @@ -20,6 +20,7 @@ static volatile uint64_t vsum;
>   
>   enum rand_type {
>   	rand_type_64,
> +	rand_type_float,
>   	rand_type_bounded_best_case,
>   	rand_type_bounded_worst_case
>   };
> @@ -30,6 +31,8 @@ rand_type_desc(enum rand_type rand_type)
>   	switch (rand_type) {
>   	case rand_type_64:
>   		return "Full 64-bit [rte_rand()]";
> +	case rand_type_float:
> +		return "Floating point [rte_drand()]";
>   	case rand_type_bounded_best_case:
>   		return "Bounded average best-case [rte_rand_max()]";
>   	case rand_type_bounded_worst_case:
> @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
>   		case rand_type_64:
>   			sum += rte_rand();
>   			break;
> +		case rand_type_float:
> +			sum += 1000. * rte_drand();

Including this floating point multiplication will lead to an 
overestimation of rte_drand() latency.

You could refactor this function to be a macro, and pass the return type 
as a parameter to this macro. I did just that, and on both an AMD 
5900X and a Cortex-A72 the multiplication didn't add more than ~5% to 
the measured latency, so I don't think the refactoring is necessary.

> +			break;
>   		case rand_type_bounded_best_case:
>   			sum += rte_rand_max(BEST_CASE_BOUND);
>   			break;
> @@ -83,6 +89,7 @@ test_rand_perf(void)
>   	printf("Pseudo-random number generation latencies:\n");
>   
>   	test_rand_perf_type(rand_type_64);
> +	test_rand_perf_type(rand_type_float);
>   	test_rand_perf_type(rand_type_bounded_best_case);
>   	test_rand_perf_type(rand_type_bounded_worst_case);
>   
> diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
> index e49cacecefd4..b131ea577226 100644
> --- a/doc/guides/rel_notes/release_22_07.rst
> +++ b/doc/guides/rel_notes/release_22_07.rst
> @@ -104,6 +104,11 @@ New Features
>     * ``RTE_EVENT_QUEUE_ATTR_WEIGHT``
>     * ``RTE_EVENT_QUEUE_ATTR_AFFINITY``
>   
> +* ** Added function get random floating point number.**
> +
> +  Added the function ``rte_drand()`` to provide a pseudo-random
> +  floating point number.
> +
>   
>   Removed Items
>   -------------
> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> index 4535cc980cec..3dc3484ee655 100644
> --- a/lib/eal/common/rte_random.c
> +++ b/lib/eal/common/rte_random.c
> @@ -6,6 +6,9 @@
>   #include <x86intrin.h>
>   #endif
>   #include <unistd.h>
> +#ifdef RTE_LIBEAL_USE_IEEE754
> +#include <ieee754.h>
> +#endif
>   
>   #include <rte_branch_prediction.h>
>   #include <rte_cycles.h>
> @@ -173,6 +176,44 @@ rte_rand_max(uint64_t upper_bound)
>   	return res;
>   }
>   
> +double
> +rte_drand(void)
> +{
> +	struct rte_rand_state *state = __rte_rand_get_state();
> +	uint64_t rand64 = __rte_rand_lfsr258(state);
> +#ifdef RTE_LIBEAL_USE_IEEE754
> +	union ieee754_double u = {
> +		.ieee = {
> +			.negative = 0,
> +			.exponent = IEEE754_DOUBLE_BIAS,
> +		},
> +	};
> +
> +	/* Take 64 bit random value and put it into the mantissa
> +	 * This uses direct access to IEEE format to avoid doing
> +	 * any direct floating point math here.
> +	 */
> +	u.ieee.mantissa0 = rand64 >> 32;
> +	u.ieee.mantissa1 = rand64;
> +
> +	return u.d - 1.0;
> +#else
> +	/* Slower method requiring floating point divide
> +	 *

Do you know how much slower? I ran rand_perf_test on two of my systems.

                       AMD 5900X     Pi4 (ARM Cortex-A72)
IEEE754 version          12              1.19
Non-IEEE754 version      11              1.16
Naive version*           24              1.16

* (double)rte_rand() / (double)UINT64_MAX

Numbers are TSC cycles/op.

Surprisingly, it seems like the IEEE754 version is slower on both of 
these machines.

Do you have a machine (or a different use case) where the supposedly 
more optimized version actually runs faster?

> +	 * The double mantissa only has 53 bits, so we uniformly mask off the
> +	 * high 11 bits and then floating-point divide by 2^53 to achieve a
> +	 * result in [0, 1).
> +	 *
> +	 * We are not allowed to emit 1.0, so denom must be one greater than
> +	 * the possible range of the preceeding step.
> +	 */
> +	static const uint64_t denom = (uint64_t)1 << 53;

Remove "static const". Surely, this can't make a difference (at least 
not in a positive direction).

> +
> +	rand64 &= denom - 1;
> +	return (double)rand64 / denom;
> +#endif
> +}
> +
>   static uint64_t
>   __rte_random_initial_seed(void)
>   {
> diff --git a/lib/eal/include/rte_random.h b/lib/eal/include/rte_random.h
> index 29f5f1325a30..f6541c2b0f08 100644
> --- a/lib/eal/include/rte_random.h
> +++ b/lib/eal/include/rte_random.h
> @@ -65,6 +65,24 @@ rte_rand(void);
>   uint64_t
>   rte_rand_max(uint64_t upper_bound);
>   
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Generates a pseudo-random floating point number.
> + *
> + * This function returns a nonnegative double-precision floating random
> + * number uniformly distributed over the interval [0.0, 1.0).
> + *
> + * The generator is not cryptographically secure.
> + * If called from lcore threads, this function is thread-safe.
> + *
> + * @return
> + *   A pseudo-random value between 0 and 1.0.
> + */
> +__rte_experimental
> +double rte_drand(void);
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/lib/eal/meson.build b/lib/eal/meson.build
> index 056beb946119..e50524901c98 100644
> --- a/lib/eal/meson.build
> +++ b/lib/eal/meson.build
> @@ -32,3 +32,6 @@ endif
>   if cc.has_function('getentropy', prefix : '#include <unistd.h>')
>       cflags += '-DRTE_LIBEAL_USE_GETENTROPY'
>   endif
> +if cc.has_header_symbol('ieee754.h', 'union ieee754_double')
> +    cflags += '-DRTE_LIBEAL_USE_IEEE754'
> +endif
> diff --git a/lib/eal/version.map b/lib/eal/version.map
> index d49e30bd042f..cfbade9a33e9 100644
> --- a/lib/eal/version.map
> +++ b/lib/eal/version.map
> @@ -422,6 +422,7 @@ EXPERIMENTAL {
>   	rte_intr_type_set;
>   
>   	# added in 22.07
> +	rte_drand;
>   	rte_thread_get_affinity_by_id;
>   	rte_thread_self;
>   	rte_thread_set_affinity_by_id;