linux-kernel.vger.kernel.org archive mirror
* [PATCH] random: use an improved fast_mix() function
From: Theodore Ts'o @ 2014-06-15  2:19 UTC
  To: Linux Kernel Developers List; +Cc: Theodore Ts'o, George Spelvin

Use a more efficient fast_mix() function.  Thanks to George Spelvin
for doing the legwork to find it.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: George Spelvin <linux@horizon.com>
---
 drivers/char/random.c | 92 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 68 insertions(+), 24 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 7ff6dd4..b19edad 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -267,6 +267,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/random.h>
 
+/* #define ADD_INTERRUPT_BENCH */
+
 /*
  * Configuration information
  */
@@ -558,25 +560,29 @@ struct fast_pool {
  * collector.  It's hardcoded for an 128 bit pool and assumes that any
  * locks that might be needed are taken by the caller.
  */
-static void fast_mix(struct fast_pool *f, __u32 input[4])
+static void fast_mix(struct fast_pool *f)
 {
-	__u32		w;
-	unsigned	input_rotate = f->rotate;
-
-	w = rol32(input[0], input_rotate) ^ f->pool[0] ^ f->pool[3];
-	f->pool[0] = (w >> 3) ^ twist_table[w & 7];
-	input_rotate = (input_rotate + 14) & 31;
-	w = rol32(input[1], input_rotate) ^ f->pool[1] ^ f->pool[0];
-	f->pool[1] = (w >> 3) ^ twist_table[w & 7];
-	input_rotate = (input_rotate + 7) & 31;
-	w = rol32(input[2], input_rotate) ^ f->pool[2] ^ f->pool[1];
-	f->pool[2] = (w >> 3) ^ twist_table[w & 7];
-	input_rotate = (input_rotate + 7) & 31;
-	w = rol32(input[3], input_rotate) ^ f->pool[3] ^ f->pool[2];
-	f->pool[3] = (w >> 3) ^ twist_table[w & 7];
-	input_rotate = (input_rotate + 7) & 31;
-
-	f->rotate = input_rotate;
+	__u32 a = f->pool[0],	b = f->pool[1];
+	__u32 c = f->pool[2],	d = f->pool[3];
+
+	a += b;			c += d;
+	b = rol32(a, 6);	d = rol32(c, 27);
+	d ^= a;			b ^= c;
+
+	a += b;			c += d;
+	b = rol32(a, 16);	d = rol32(c, 14);
+	d ^= a;			b ^= c;
+
+	a += b;			c += d;
+	b = rol32(a, 6);	d = rol32(c, 27);
+	d ^= a;			b ^= c;
+
+	a += b;			c += d;
+	b = rol32(a, 16);	d = rol32(c, 14);
+	d ^= a;			b ^= c;
+
+	f->pool[0] = a;  f->pool[1] = b;
+	f->pool[2] = c;  f->pool[3] = d;
 	f->count++;
 }
 
@@ -829,6 +835,27 @@ EXPORT_SYMBOL_GPL(add_input_randomness);
 
 static DEFINE_PER_CPU(struct fast_pool, irq_randomness);
 
+#ifdef ADD_INTERRUPT_BENCH
+static unsigned long avg_cycles, avg_deviation;
+
+#define AVG_SHIFT 8     /* Exponential average factor k=1/256 */
+#define FIXED_1_2 (1 << (AVG_SHIFT-1))
+
+static void add_interrupt_bench(cycles_t start)
+{
+        long delta = random_get_entropy() - start;
+
+        /* Use a weighted moving average */
+        delta = delta - ((avg_cycles + FIXED_1_2) >> AVG_SHIFT);
+        avg_cycles += delta;
+        /* And average deviation */
+        delta = abs(delta) - ((avg_deviation + FIXED_1_2) >> AVG_SHIFT);
+        avg_deviation += delta;
+}
+#else
+#define add_interrupt_bench(x)
+#endif
+
 void add_interrupt_randomness(int irq, int irq_flags)
 {
 	struct entropy_store	*r;
@@ -836,22 +863,23 @@ void add_interrupt_randomness(int irq, int irq_flags)
 	struct pt_regs		*regs = get_irq_regs();
 	unsigned long		now = jiffies;
 	cycles_t		cycles = random_get_entropy();
-	__u32			input[4], c_high, j_high;
+	__u32			c_high, j_high;
 	__u64			ip;
 	unsigned long		seed;
 	int			credit = 0;
 
 	c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
 	j_high = (sizeof(now) > 4) ? now >> 32 : 0;
-	input[0] = cycles ^ j_high ^ irq;
-	input[1] = now ^ c_high;
+	fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
+	fast_pool->pool[1] ^= now ^ c_high;
 	ip = regs ? instruction_pointer(regs) : _RET_IP_;
-	input[2] = ip;
-	input[3] = ip >> 32;
+	fast_pool->pool[2] ^= ip;
+	fast_pool->pool[3] ^= ip >> 32;
 
-	fast_mix(fast_pool, input);
+	fast_mix(fast_pool);
 	if ((irq_flags & __IRQF_TIMER) == 0)
 		fast_pool->notimer_count++;
+	add_interrupt_bench(cycles);
 
 	if (cycles) {
 		if ((fast_pool->count < 64) &&
@@ -1648,6 +1676,22 @@ struct ctl_table random_table[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_do_uuid,
 	},
+#ifdef ADD_INTERRUPT_BENCH
+	{
+		.procname	= "add_interrupt_avg_cycles",
+		.data		= &avg_cycles,
+		.maxlen		= sizeof(avg_cycles),
+		.mode		= 0444,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "add_interrupt_avg_deviation",
+		.data		= &avg_deviation,
+		.maxlen		= sizeof(avg_deviation),
+		.mode		= 0444,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+#endif
 	{ }
 };
 #endif 	/* CONFIG_SYSCTL */
-- 
2.0.0
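
For anyone who wants to reproduce these cycle counts outside the
kernel, here is a minimal userspace sketch of the new 32-bit round
function.  It is not kernel code: rol32() and the rdtsc harness are
local stand-ins, and it assumes gcc/clang on x86.

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

static inline uint32_t rol32(uint32_t w, unsigned s)
{
	return (w << s) | (w >> (32 - s));
}

static uint32_t pool[4] = { 1, 2, 3, 4 };

/* The four double-rounds from the patch, two per loop iteration. */
static void fast_mix_test(void)
{
	uint32_t a = pool[0], b = pool[1];
	uint32_t c = pool[2], d = pool[3];
	int i;

	for (i = 0; i < 2; i++) {
		a += b;			c += d;
		b = rol32(a, 6);	d = rol32(c, 27);
		d ^= a;			b ^= c;

		a += b;			c += d;
		b = rol32(a, 16);	d = rol32(c, 14);
		d ^= a;			b ^= c;
	}
	pool[0] = a; pool[1] = b;
	pool[2] = c; pool[3] = d;
}

int main(void)
{
	enum { N = 1000000 };
	uint64_t t0 = __rdtsc();
	int i;

	for (i = 0; i < N; i++)
		fast_mix_test();
	printf("%.1f cycles/call (rough; no serialization)\n",
	       (double)(__rdtsc() - t0) / N);
	return 0;
}

With ADD_INTERRUPT_BENCH defined, the in-kernel averages should appear
under /proc/sys/kernel/random/add_interrupt_avg_cycles and
add_interrupt_avg_deviation, scaled by 2^AVG_SHIFT = 256.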



* Surprising 64-bit performance anomaly (was Re: [PATCH] random: use an improved fast_mix() function)
From: Theodore Ts'o @ 2014-06-15  3:04 UTC
  To: Linux Kernel Developers List; +Cc: George Spelvin

Hi George,

On top of the above patch, I applied the following to add 64-bit pool
support.  I had to use a union to avoid type-punning warnings.
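
Outside the kernel the same trick looks like this; a minimal sketch
with illustrative names:

#include <stdint.h>

/*
 * Viewing the same 128-bit pool as both 4 x u32 and 2 x u64 through
 * pointer casts trips gcc's strict-aliasing warnings; a union makes
 * the dual view explicit and well-defined.
 */
union mix_pool {
	uint32_t pool32[4];
	uint64_t pool64[2];
};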

When building a 64-bit kernel and running under KVM, I'm finding
that the 64-bit mix function which you suggested is twice as slow.

Using the (new) 32-bit function:

31821 23970
31629 24366
30856 24182

Using the 64-bit mixing function:

60438 44369
60820 45402
58778 45419

				- Ted


diff --git a/drivers/char/random.c b/drivers/char/random.c
index b19edad..0685413 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -257,6 +257,7 @@
 #include <linux/kmemcheck.h>
 #include <linux/workqueue.h>
 #include <linux/irq.h>
+#include <linux/bitops.h>
 
 #include <asm/processor.h>
 #include <asm/uaccess.h>
@@ -267,7 +268,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/random.h>
 
-/* #define ADD_INTERRUPT_BENCH */
+#define ADD_INTERRUPT_BENCH
+/* #define FORCE_32_VERSION */
 
 /*
  * Configuration information
@@ -548,7 +550,10 @@ static void mix_pool_bytes(struct entropy_store *r, const void *in,
 }
 
 struct fast_pool {
-	__u32		pool[4];
+	union {
+		__u32	pool32[4];
+		__u64	pool64[2];
+	} pool;
 	unsigned long	last;
 	unsigned char	count;
 	unsigned char	notimer_count;
@@ -560,10 +565,32 @@ struct fast_pool {
  * collector.  It's hardcoded for an 128 bit pool and assumes that any
  * locks that might be needed are taken by the caller.
  */
+#if (BITS_PER_LONG == 64) && !defined(FORCE_32_VERSION)
+#warning Building 64 bit fast_mix
 static void fast_mix(struct fast_pool *f)
 {
-	__u32 a = f->pool[0],	b = f->pool[1];
-	__u32 c = f->pool[2],	d = f->pool[3];
+	__u64 a = f->pool.pool64[0], b = f->pool.pool64[1];
+
+	a += b; b = rol64(b, 52);
+	b ^= a; a = rol64(a, 10);
+	a += b; b = rol64(b, 47);
+	b ^= a; a = rol64(a, 17);
+
+	a += b; b = rol64(b, 52);
+	b ^= a; a = rol64(a, 10);
+	a += b; b = rol64(b, 47);
+	b ^= a; a = rol64(a, 17);
+
+	f->pool.pool64[0] = a;  f->pool.pool64[1] = b;
+	f->count++;
+}
+
+#else
+#warning Building 32 bit fast_mix
+static void fast_mix(struct fast_pool *f)
+{
+	__u32 a = f->pool.pool32[0], b = f->pool.pool32[1];
+	__u32 c = f->pool.pool32[2], d = f->pool.pool32[3];
 
 	a += b;			c += d;
 	b = rol32(a, 6);	d = rol32(c, 27);
@@ -581,10 +608,11 @@ static void fast_mix(struct fast_pool *f)
 	b = rol32(a, 16);	d = rol32(c, 14);
 	d ^= a;			b ^= c;
 
-	f->pool[0] = a;  f->pool[1] = b;
-	f->pool[2] = c;  f->pool[3] = d;
+	f->pool.pool32[0] = a;  f->pool.pool32[1] = b;
+	f->pool.pool32[2] = c;  f->pool.pool32[3] = d;
 	f->count++;
 }
+#endif
 
 /*
  * Credit (or debit) the entropy store with n bits of entropy.
@@ -870,11 +898,11 @@ void add_interrupt_randomness(int irq, int irq_flags)
 
 	c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
 	j_high = (sizeof(now) > 4) ? now >> 32 : 0;
-	fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
-	fast_pool->pool[1] ^= now ^ c_high;
+	fast_pool->pool.pool32[0] ^= cycles ^ j_high ^ irq;
+	fast_pool->pool.pool32[1] ^= now ^ c_high;
 	ip = regs ? instruction_pointer(regs) : _RET_IP_;
-	fast_pool->pool[2] ^= ip;
-	fast_pool->pool[3] ^= ip >> 32;
+	fast_pool->pool.pool32[2] ^= ip;
+	fast_pool->pool.pool32[3] ^= ip >> 32;
 
 	fast_mix(fast_pool);
 	if ((irq_flags & __IRQF_TIMER) == 0)


* Re: Surprising 64-bit performance anomaly (was Re: [PATCH] random: use an improved fast_mix() function)
From: Theodore Ts'o @ 2014-06-15  3:19 UTC
  To: Linux Kernel Developers List, George Spelvin

On Sat, Jun 14, 2014 at 11:04:41PM -0400, Theodore Ts'o wrote:
> Hi George,
> 
> On top of the above patch, I applied the following to add 64-bit pool
> support.  I had to use a union to avoid type punning warnings.
> 
> When building a 64-bit kernel and running under KVM, I'm finding
> that the 64-bit mix function which you suggested is twice as slow.
> 
> Using the (new) 32-bit function:
> 
> 31821 23970
> 31629 24366
> 30856 24182
> 
> Using the 64-bit mixing function:
> 
> 60438 44369
> 60820 45402
> 58778 45419

I'm now getting a different set of timing numbers for the 64-bit
mixing function.  As before, the first number is the weighted moving
average; the second is the deviation:

48035 34322
47452 34413
46974 34350

I'm not sure why I'm getting slightly better figures now, but they
are still worse than the 32-bit fast_mix3() algorithm.  Given the
additional complexity, I'm also not sure it's worth having a
different mixing algorithm for 64-bit platforms unless it's
significantly better; right now I'm seeing numbers about 50% worse,
at least on my test platform.
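
For reference, the statistic behind these numbers is the fixed-point
running average from add_interrupt_bench(); here is the same update
rule as a standalone userspace sketch, fed hypothetical samples:

#include <stdio.h>
#include <stdlib.h>

#define AVG_SHIFT 8	/* exponential average factor k = 1/256 */
#define FIXED_1_2 (1 << (AVG_SHIFT - 1))	/* 0.5 in fixed point, for rounding */

static unsigned long avg_cycles, avg_deviation;

static void update(long sample)
{
	/* Weighted moving average, stored scaled by 2^AVG_SHIFT */
	long delta = sample - ((avg_cycles + FIXED_1_2) >> AVG_SHIFT);
	avg_cycles += delta;
	/* ...and the average absolute deviation, with the same scaling */
	delta = labs(delta) - ((avg_deviation + FIXED_1_2) >> AVG_SHIFT);
	avg_deviation += delta;
}

int main(void)
{
	/* Hypothetical cycle counts; in the kernel these are TSC deltas. */
	static const long samples[] = { 120, 135, 118, 300, 125, 122 };
	int i, j;

	/* Repeat so the k = 1/256 average has time to converge. */
	for (j = 0; j < 1000; j++)
		for (i = 0; i < 6; i++)
			update(samples[i]);
	printf("raw avg=%lu dev=%lu (divide by 256 for cycles)\n",
	       avg_cycles, avg_deviation);
	return 0;
}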

	   	      	      	   	      - Ted


* Re: Surprising 64-bit performance anomaly (was Re: [PATCH] random: use an improved fast_mix() function)
From: George Spelvin @ 2014-06-15  9:07 UTC
  To: linux-kernel, linux, tytso

That is just *weird*.  I'm testing on bare metal, and I keep measuring
significantly better performance for the 64-bit code.  Here's an i7-2700K
@ 3.50GHz.  (I added a "rolled x2" measurement just for interest's sake.
The raw figures below are the sysctl readings, i.e. the averages scaled
by 2^AVG_SHIFT = 256, so e.g. 33812 raw is about 132.1 cycles.)

The idle performance is very similar for the original and unrolled x2
versions, but under load the original degrades far more (174.5 cycles
vs. 132.1 for fast_mix3).

And the 64-bit code takes less than 2/3 the time of the 32-bit code
under load (84.3 cycles average = 132.1 * 63.8%).


Idle (avg, dev)	make -j9 (avg, dev)

Rolled x3:
27043 1212	62897 28037
26910 1547	58415 25790
27095 1302	59158 26083
30684 6233	37242 15848
26979 1339	59683 24658
27042 1534	61609 27537
27069 1402	56379 25877
27301 1162	56782 26379
27300 1535	55344 25003
26873 1507	48627 22207

Rolled x2:
25697 2859	34779 11771
27097 5065	31805 12125
25564 2646	54448 22853
25123 2191	57469 29098
24931 2106	57621 27855
25223 2207	63852 31484
24956 2112	74393 34080
25889 2340	65434 33316
25170 2031	68246 31109
25541 2296	70891 34294

Unrolled x3:
23066 1358	50698 29156
22990 1569	59566 28935
22797 1269	58363 28813
22321 1004	54652 30148
23122 1545	52651 29113
22704 875	57904 33033
22793 979	51882 28880
22413 891	55537 30635
22814 997	52760 27188
22998 1020	59592 30133

Original:
22038 2303	26656 18045
21209 2082	37559 22256
21134 1626	40082 23261
20689 1436	34227 19133
20804 1503	48391 28563
20979 1617	48235 28095
20752 1427	52101 29817
21046 1830	56703 30179
20828 1362	51196 28658
20865 1659	51641 29680

Unrolled x2:
21051 1667	29000 17989
20889 1381	28328 16046
21244 1664	33015 21364
21130 1628	32588 21417
20753 1497	42376 25412
21943 2917	39879 24932
20880 1279	24064 15512
21303 1389	38724 24107
21416 1712	36570 23549
20956 1652	33578 22983

64-bit, unrolled x2:
16037 1031	16651 9740
16664 2094	19199 12689
16129 1057	20216 8590
16211 905	18382 9964
16637 2056	20125 8495
16229 1218	24190 13176
16515 1632	28717 18022
16220 1096	23762 13147
16307 1103	19599 5382
16326 1145	24844 12837
