* [PATCH v4 0/2] random: PREEMPT_RT fixes
@ 2022-02-09 12:56 Jason A. Donenfeld
  2022-02-09 12:56 ` [PATCH v4 1/2] random: remove batched entropy locking Jason A. Donenfeld
  2022-02-09 12:56 ` [PATCH v4 2/2] random: defer fast pool mixing to worker Jason A. Donenfeld
  0 siblings, 2 replies; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-09 12:56 UTC (permalink / raw)
  To: linux-kernel, linux-crypto
  Cc: Jason A. Donenfeld, Sebastian Andrzej Siewior, Thomas Gleixner,
	Peter Zijlstra, Theodore Ts'o, Sultan Alsawaf,
	Jonathan Neuschäfer, Eric Biggers

Here are the two patches we've been looking at thus far in separate
threads, now brought together as one patchset. This doesn't fix _all_ of
the PREEMPT_RT issues, but it does move very much in the right
direction, with less locking in the places where that matters.

v4 improves on v3 by fixing documentation comments and copying the fast
pool to the stack before mixing.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Sultan Alsawaf <sultan@kerneltoast.com>
Cc: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Cc: Eric Biggers <ebiggers@kernel.org>

Jason A. Donenfeld (2):
  random: remove batched entropy locking
  random: defer fast pool mixing to worker

 drivers/char/random.c         | 109 +++++++++++++++++++---------------
 include/trace/events/random.h |   6 --
 2 files changed, 62 insertions(+), 53 deletions(-)

-- 
2.35.0



* [PATCH v4 1/2] random: remove batched entropy locking
  2022-02-09 12:56 [PATCH v4 0/2] random: PREEMPT_RT fixes Jason A. Donenfeld
@ 2022-02-09 12:56 ` Jason A. Donenfeld
  2022-02-10 16:24   ` Sebastian Andrzej Siewior
  2022-02-21  2:30   ` Eric Biggers
  2022-02-09 12:56 ` [PATCH v4 2/2] random: defer fast pool mixing to worker Jason A. Donenfeld
  1 sibling, 2 replies; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-09 12:56 UTC (permalink / raw)
  To: linux-kernel, linux-crypto
  Cc: Jason A. Donenfeld, Sebastian Andrzej Siewior, Thomas Gleixner,
	Peter Zijlstra, Theodore Ts'o, Sultan Alsawaf,
	Jonathan Neuschäfer, Eric Biggers, Andy Lutomirski

Rather than use spinlocks to protect batched entropy, we can instead
disable interrupts locally, since we're dealing with per-cpu data, and
manage resets with a basic generation counter. At the same time, we
can't quite do this on PREEMPT_RT, where we still want spinlocks-as-
mutexes semantics. So we use a local_lock_t, which provides the right
behavior for each. Because this is a per-cpu lock, that generation
counter is still doing the necessary CPU-to-CPU communication.

This should improve performance a bit. It will also fix the linked splat
that Jonathan received with PROVE_RAW_LOCK_NESTING=y.

Suggested-by: Andy Lutomirski <luto@kernel.org>
Reported-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Tested-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Link: https://lore.kernel.org/lkml/YfMa0QgsjCVdRAvJ@latitude/
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Sultan Alsawaf <sultan@kerneltoast.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/char/random.c | 55 ++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 968c415d1f45..ceded1c4f73b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1731,13 +1731,16 @@ static int __init random_sysctls_init(void)
 device_initcall(random_sysctls_init);
 #endif	/* CONFIG_SYSCTL */
 
+static atomic_t batch_generation = ATOMIC_INIT(0);
+
 struct batched_entropy {
 	union {
 		u64 entropy_u64[CHACHA_BLOCK_SIZE / sizeof(u64)];
 		u32 entropy_u32[CHACHA_BLOCK_SIZE / sizeof(u32)];
 	};
+	local_lock_t lock;
 	unsigned int position;
-	spinlock_t batch_lock;
+	int generation;
 };
 
 /*
@@ -1749,7 +1752,7 @@ struct batched_entropy {
  * point prior.
  */
 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64) = {
-	.batch_lock = __SPIN_LOCK_UNLOCKED(batched_entropy_u64.lock),
+	.lock = INIT_LOCAL_LOCK(batched_entropy_u64.lock)
 };
 
 u64 get_random_u64(void)
@@ -1758,67 +1761,65 @@ u64 get_random_u64(void)
 	unsigned long flags;
 	struct batched_entropy *batch;
 	static void *previous;
+	int next_gen;
 
 	warn_unseeded_randomness(&previous);
 
+	local_lock_irqsave(&batched_entropy_u64.lock, flags);
 	batch = raw_cpu_ptr(&batched_entropy_u64);
-	spin_lock_irqsave(&batch->batch_lock, flags);
-	if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
+
+	next_gen = atomic_read(&batch_generation);
+	if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0 ||
+	    next_gen != batch->generation) {
 		extract_crng((u8 *)batch->entropy_u64);
 		batch->position = 0;
+		batch->generation = next_gen;
 	}
+
 	ret = batch->entropy_u64[batch->position++];
-	spin_unlock_irqrestore(&batch->batch_lock, flags);
+	local_unlock_irqrestore(&batched_entropy_u64.lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(get_random_u64);
 
 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32) = {
-	.batch_lock = __SPIN_LOCK_UNLOCKED(batched_entropy_u32.lock),
+	.lock = INIT_LOCAL_LOCK(batched_entropy_u32.lock)
 };
+
 u32 get_random_u32(void)
 {
 	u32 ret;
 	unsigned long flags;
 	struct batched_entropy *batch;
 	static void *previous;
+	int next_gen;
 
 	warn_unseeded_randomness(&previous);
 
+	local_lock_irqsave(&batched_entropy_u32.lock, flags);
 	batch = raw_cpu_ptr(&batched_entropy_u32);
-	spin_lock_irqsave(&batch->batch_lock, flags);
-	if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
+
+	next_gen = atomic_read(&batch_generation);
+	if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0 ||
+	    next_gen != batch->generation) {
 		extract_crng((u8 *)batch->entropy_u32);
 		batch->position = 0;
+		batch->generation = next_gen;
 	}
+
 	ret = batch->entropy_u32[batch->position++];
-	spin_unlock_irqrestore(&batch->batch_lock, flags);
+	local_unlock_irqrestore(&batched_entropy_u32.lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(get_random_u32);
 
 /* It's important to invalidate all potential batched entropy that might
  * be stored before the crng is initialized, which we can do lazily by
- * simply resetting the counter to zero so that it's re-extracted on the
- * next usage. */
+ * bumping the generation counter.
+ */
 static void invalidate_batched_entropy(void)
 {
-	int cpu;
-	unsigned long flags;
-
-	for_each_possible_cpu(cpu) {
-		struct batched_entropy *batched_entropy;
-
-		batched_entropy = per_cpu_ptr(&batched_entropy_u32, cpu);
-		spin_lock_irqsave(&batched_entropy->batch_lock, flags);
-		batched_entropy->position = 0;
-		spin_unlock(&batched_entropy->batch_lock);
-
-		batched_entropy = per_cpu_ptr(&batched_entropy_u64, cpu);
-		spin_lock(&batched_entropy->batch_lock);
-		batched_entropy->position = 0;
-		spin_unlock_irqrestore(&batched_entropy->batch_lock, flags);
-	}
+	atomic_inc(&batch_generation);
 }
 
 /**
-- 
2.35.0



* [PATCH v4 2/2] random: defer fast pool mixing to worker
  2022-02-09 12:56 [PATCH v4 0/2] random: PREEMPT_RT fixes Jason A. Donenfeld
  2022-02-09 12:56 ` [PATCH v4 1/2] random: remove batched entropy locking Jason A. Donenfeld
@ 2022-02-09 12:56 ` Jason A. Donenfeld
  2022-02-10 18:04   ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-09 12:56 UTC (permalink / raw)
  To: linux-kernel, linux-crypto
  Cc: Jason A. Donenfeld, Sebastian Andrzej Siewior, Thomas Gleixner,
	Peter Zijlstra, Theodore Ts'o, Sultan Alsawaf,
	Jonathan Neuschäfer, Eric Biggers

On PREEMPT_RT, it's problematic to take spinlocks from hard irq
handlers. We can fix this by deferring the dumping of the fast pool into
the input pool to a workqueue.

We accomplish this with some careful rules on fast_pool->count:

  - When it's incremented to >= 64, we schedule the work.
  - If the top bit is set, we never schedule the work, even if >= 64.
  - The worker is responsible for setting it back to 0 when it's done.

In the worst case, an irq handler is mixing a new irq into the pool at
the same time as the worker is dumping it into the input pool. In this
case, we only ever set the count back to 0 _after_ we're done, so that
subsequent cycles will require a full 64 interrupts to dump it in again.
In other words, the result of this race is only ever adding a little bit
more information than normal, but never less, and never crediting any
extra entropy for this partial additional information.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Sultan Alsawaf <sultan@kerneltoast.com>
Cc: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/char/random.c         | 54 ++++++++++++++++++++++-------------
 include/trace/events/random.h |  6 ----
 2 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index ceded1c4f73b..f985d84872de 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -377,12 +377,6 @@ static void _mix_pool_bytes(const void *in, int nbytes)
 	blake2s_update(&input_pool.hash, in, nbytes);
 }
 
-static void __mix_pool_bytes(const void *in, int nbytes)
-{
-	trace_mix_pool_bytes_nolock(nbytes, _RET_IP_);
-	_mix_pool_bytes(in, nbytes);
-}
-
 static void mix_pool_bytes(const void *in, int nbytes)
 {
 	unsigned long flags;
@@ -394,11 +388,13 @@ static void mix_pool_bytes(const void *in, int nbytes)
 }
 
 struct fast_pool {
-	u32 pool[4];
+	struct work_struct mix;
 	unsigned long last;
+	u32 pool[4];
+	unsigned int count;
 	u16 reg_idx;
-	u8 count;
 };
+#define FAST_POOL_MIX_INFLIGHT (1U << 31)
 
 /*
  * This is a fast mixing routine used by the interrupt randomness
@@ -428,7 +424,6 @@ static void fast_mix(struct fast_pool *f)
 
 	f->pool[0] = a;  f->pool[1] = b;
 	f->pool[2] = c;  f->pool[3] = d;
-	f->count++;
 }
 
 static void process_random_ready_list(void)
@@ -977,12 +972,34 @@ static u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
 	return *ptr;
 }
 
+static void mix_interrupt_randomness(struct work_struct *work)
+{
+	struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
+	u8 pool[sizeof(fast_pool->pool)];
+
+	/*
+	 * Since this is the result of a trip through the scheduler, xor in
+	 * a cycle counter. It can't hurt, and might help.
+	 */
+	fast_pool->pool[3] ^= random_get_entropy();
+	/* Copy the pool to the stack so that the mixer always has a consistent view. */
+	memcpy(pool, fast_pool->pool, sizeof(pool));
+	/* We take care to zero out the count only after we're done reading the pool. */
+	WRITE_ONCE(fast_pool->count, 0);
+	fast_pool->last = jiffies;
+
+	mix_pool_bytes(pool, sizeof(pool));
+	credit_entropy_bits(1);
+	memzero_explicit(pool, sizeof(pool));
+}
+
 void add_interrupt_randomness(int irq)
 {
 	struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
 	struct pt_regs *regs = get_irq_regs();
 	unsigned long now = jiffies;
 	cycles_t cycles = random_get_entropy();
+	unsigned int new_count;
 	u32 c_high, j_high;
 	u64 ip;
 
@@ -999,9 +1016,10 @@ void add_interrupt_randomness(int irq)
 
 	fast_mix(fast_pool);
 	add_interrupt_bench(cycles);
+	new_count = ++fast_pool->count;
 
 	if (unlikely(crng_init == 0)) {
-		if ((fast_pool->count >= 64) &&
+		if (new_count >= 64 &&
 		    crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {
 			fast_pool->count = 0;
 			fast_pool->last = now;
@@ -1009,20 +1027,16 @@ void add_interrupt_randomness(int irq)
 		return;
 	}
 
-	if ((fast_pool->count < 64) && !time_after(now, fast_pool->last + HZ))
+	if (new_count & FAST_POOL_MIX_INFLIGHT)
 		return;
 
-	if (!spin_trylock(&input_pool.lock))
+	if (new_count < 64 && !time_after(now, fast_pool->last + HZ))
 		return;
 
-	fast_pool->last = now;
-	__mix_pool_bytes(&fast_pool->pool, sizeof(fast_pool->pool));
-	spin_unlock(&input_pool.lock);
-
-	fast_pool->count = 0;
-
-	/* award one bit for the contents of the fast pool */
-	credit_entropy_bits(1);
+	if (unlikely(!fast_pool->mix.func))
+		INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
+	fast_pool->count |= FAST_POOL_MIX_INFLIGHT;
+	queue_work_on(raw_smp_processor_id(), system_highpri_wq, &fast_pool->mix);
 }
 EXPORT_SYMBOL_GPL(add_interrupt_randomness);
 
diff --git a/include/trace/events/random.h b/include/trace/events/random.h
index ad149aeaf42c..833f42afc70f 100644
--- a/include/trace/events/random.h
+++ b/include/trace/events/random.h
@@ -52,12 +52,6 @@ DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes,
 	TP_ARGS(bytes, IP)
 );
 
-DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes_nolock,
-	TP_PROTO(int bytes, unsigned long IP),
-
-	TP_ARGS(bytes, IP)
-);
-
 TRACE_EVENT(credit_entropy_bits,
 	TP_PROTO(int bits, int entropy_count, unsigned long IP),
 
-- 
2.35.0



* Re: [PATCH v4 1/2] random: remove batched entropy locking
  2022-02-09 12:56 ` [PATCH v4 1/2] random: remove batched entropy locking Jason A. Donenfeld
@ 2022-02-10 16:24   ` Sebastian Andrzej Siewior
  2022-02-21  2:30   ` Eric Biggers
  1 sibling, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-10 16:24 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, linux-crypto, Thomas Gleixner, Peter Zijlstra,
	Theodore Ts'o, Sultan Alsawaf, Jonathan Neuschäfer,
	Eric Biggers, Andy Lutomirski

On 2022-02-09 13:56:43 [+0100], Jason A. Donenfeld wrote:
> Rather than use spinlocks to protect batched entropy, we can instead
> disable interrupts locally, since we're dealing with per-cpu data, and
> manage resets with a basic generation counter. At the same time, we
> can't quite do this on PREEMPT_RT, where we still want spinlocks-as-
> mutexes semantics. So we use a local_lock_t, which provides the right
> behavior for each. Because this is a per-cpu lock, that generation
> counter is still doing the necessary CPU-to-CPU communication.
> 
> This should improve performance a bit. It will also fix the linked splat
> that Jonathan received with a PROVE_RAW_LOCK_NESTING=y.
> 
> Suggested-by: Andy Lutomirski <luto@kernel.org>
> Reported-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
> Tested-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
> Link: https://lore.kernel.org/lkml/YfMa0QgsjCVdRAvJ@latitude/
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: Sultan Alsawaf <sultan@kerneltoast.com>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Sebastian


* Re: [PATCH v4 2/2] random: defer fast pool mixing to worker
  2022-02-09 12:56 ` [PATCH v4 2/2] random: defer fast pool mixing to worker Jason A. Donenfeld
@ 2022-02-10 18:04   ` Sebastian Andrzej Siewior
  2022-02-11  0:42     ` Jason A. Donenfeld
  2022-02-11  8:25     ` Dominik Brodowski
  0 siblings, 2 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-10 18:04 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, linux-crypto, Thomas Gleixner, Peter Zijlstra,
	Theodore Ts'o, Sultan Alsawaf, Jonathan Neuschäfer,
	Eric Biggers

On 2022-02-09 13:56:44 [+0100], Jason A. Donenfeld wrote:
> +static void mix_interrupt_randomness(struct work_struct *work)
> +{
> +	struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
> +	u8 pool[sizeof(fast_pool->pool)];

So.
- CPU1 schedules a worker
- CPU1 goes offline before the worker gets on the CPU.
- The worker runs on CPU2
- CPU1 is back online
- and now
   CPU1						CPU2
   new_count = ++fast_pool->count;
    reg = fast_pool->count (FAST_POOL_MIX_INFLIGHT | 64)
    incl reg (FAST_POOL_MIX_INFLIGHT | 65)
    						WRITE_ONCE(fast_pool->count, 0);
    fast_pool->count = reg (FAST_POOL_MIX_INFLIGHT | 65)

So we lost the WRITE_ONCE(, 0), FAST_POOL_MIX_INFLIGHT is still set and
the worker is not scheduled. Not easy to trigger, not by an ordinary
user. Just wanted to mention…

…
> @@ -999,9 +1016,10 @@ void add_interrupt_randomness(int irq)
>  
>  	fast_mix(fast_pool);
>  	add_interrupt_bench(cycles);
> +	new_count = ++fast_pool->count;
>  
>  	if (unlikely(crng_init == 0)) {
> -		if ((fast_pool->count >= 64) &&
> +		if (new_count >= 64 &&
>  		    crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {

crng_fast_load() does spin_trylock_irqsave() in hardirq context. It does
not produce any warning on RT but is still wrong IMHO:
- lockdep will see a random task and I remember in the past it produced
  strange lock chains based on this.

- Should another task attempt to acquire this lock then it will PI-boost the
  wrong task.

If we could just move this, too.

I don't know how timing-critical this is, but the first backtrace from
crng_fast_load() came (to my surprise) from hwrng_fillfn() (a kthread)
and added 64 bytes in one go.

I did move that crng_fast_load() into the worker and gathered some
numbers:
           <idle>-0       [000] d..h1..     2.069924: add_interrupt_randomness: Tick

first interrupt
…
        swapper/0-1       [000] d..h.11     2.341938: add_interrupt_randomness: Tick
        swapper/0-1       [000] d..h.11     2.341938: add_interrupt_randomness: work

the 64th interrupt, scheduling the worker.

        swapper/0-1       [000] d..h.11     2.345937: add_interrupt_randomness: Tick
        swapper/0-1       [000] d..h111     2.349938: add_interrupt_randomness: Tick
        swapper/0-1       [000] d..h.11     2.353939: add_interrupt_randomness: Tick
        swapper/0-1       [000] d..h.11     2.357940: add_interrupt_randomness: Tick
        swapper/0-1       [000] d..h111     2.361939: add_interrupt_randomness: Tick
        swapper/0-1       [000] d..h111     2.365939: add_interrupt_randomness: Tick
        swapper/0-1       [000] d..h.11     2.369941: add_interrupt_randomness: Tick
     kworker/0:0H-6       [000] .......     2.384714: mix_interrupt_randomness: load
     kworker/0:0H-6       [000] .......     2.384715: crng_fast_load: 16
           <idle>-0       [001] dn.h1..     3.205766: add_interrupt_randomness: Tick
           <idle>-0       [019] dn.h1..     6.771047: add_interrupt_randomness: Tick

7 interrupts got lost before the worker could run & load the first 16
bytes. The workqueue core gets initialized at that point and spawns the
first worker. After that the interrupts took a break.
And then the work-to-load delay was quite low:

           <idle>-0       [019] dn.h1..     7.586234: add_interrupt_randomness: Tick
           <idle>-0       [019] dn.h1..     7.586234: add_interrupt_randomness: work
    kworker/19:0H-175     [019] .......     7.586504: mix_interrupt_randomness: load
    kworker/19:0H-175     [019] .......     7.586507: crng_fast_load: 16
           <idle>-0       [020] dn.h1..     7.614649: add_interrupt_randomness: Tick
           <idle>-0       [020] dn.h1..     7.614651: add_interrupt_randomness: work
           <idle>-0       [020] dn.h1..     7.614736: add_interrupt_randomness: Tick
    kworker/20:0H-183     [020] dn.h...     7.614859: add_interrupt_randomness: Tick
    kworker/20:0H-183     [020] .......     7.614871: mix_interrupt_randomness: load
    kworker/20:0H-183     [020] .......     7.614872: crng_fast_load: 16
           <idle>-0       [018] dn.h1..     8.352423: add_interrupt_randomness: Tick
           <idle>-0       [018] dn.h1..     8.352423: add_interrupt_randomness: work
    kworker/18:0H-167     [018] dn.h1..     8.352438: add_interrupt_randomness: Tick
    kworker/18:0H-167     [018] dn.h1..     8.352448: add_interrupt_randomness: Tick
    kworker/18:0H-167     [018] dn.h1..     8.352459: add_interrupt_randomness: Tick
    kworker/18:0H-167     [018] dn.h1..     8.352491: add_interrupt_randomness: Tick
    kworker/18:0H-167     [018] .......     8.352505: mix_interrupt_randomness: load
    kworker/18:0H-167     [018] .......     8.352506: crng_fast_load: 16

In total we lost 13 ticks.
I did the same test on PREEMPT_VOLUNTARY and lost only 2 ticks.

Sebastian


* Re: [PATCH v4 2/2] random: defer fast pool mixing to worker
  2022-02-10 18:04   ` Sebastian Andrzej Siewior
@ 2022-02-11  0:42     ` Jason A. Donenfeld
  2022-02-11  1:14       ` [PATCH] random: ensure mix_interrupt_randomness() is consistent Jason A. Donenfeld
  2022-02-11  7:09       ` [PATCH v4 2/2] random: defer fast pool mixing to worker Sebastian Andrzej Siewior
  2022-02-11  8:25     ` Dominik Brodowski
  1 sibling, 2 replies; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-11  0:42 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: LKML, Linux Crypto Mailing List, Thomas Gleixner, Peter Zijlstra,
	Theodore Ts'o, Sultan Alsawaf, Jonathan Neuschäfer,
	Eric Biggers

Hi Sebastian,

On Thu, Feb 10, 2022 at 7:04 PM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
> So.
> - CPU1 schedules a worker
> - CPU1 goes offline before the gets on the CPU.
> - The worker runs CPU2
> - CPU2 is back online
> - and now
>    CPU1                                         CPU2
>    new_count = ++fast_pool->count;
>     reg = fast_pool->count (FAST_POOL_MIX_INFLIGHT | 64)
>     incl reg (FAST_POOL_MIX_INFLIGHT | 65)
>                                                 WRITE_ONCE(fast_pool->count, 0);
>     fast_pool->count = reg ((FAST_POOL_MIX_INFLIGHT | 65)
>
> So we lost the WRITE_ONCE(, 0), FAST_POOL_MIX_INFLIGHT is still set and
> worker is not scheduled. Not easy to trigger, not by an ordinary user.
> Just wanted to mention…

Thanks for pointing this out. I'll actually fix this using atomics,
and fix another minor issue at the same time the same way, and move to
making sure the worker is running on the right CPU like we originally
discussed. I'm going to send that as an additional patch so that we
can narrow in on the issue there. It's a little bit involved but not
too bad. I'll have that for you shortly.

> crng_fast_load() does spin_trylock_irqsave() in hardirq context. It does
> not produce any warning on RT but is still wrong IMHO:
> If we just could move this, too.
> I don't know how timing critical this is but the first backtrace from
> crng_fast_load() came (to my surprise) from hwrng_fillfn() (a kthread)
> and added 64bytes in one go.

I'll look into seeing if I can do it. On my first pass a few days ago,
it seemed a bit too tricky, but I'll revisit after this part here
settles. Thanks for your benchmarks, by the way. That's useful.

Jason


* [PATCH] random: ensure mix_interrupt_randomness() is consistent
  2022-02-11  0:42     ` Jason A. Donenfeld
@ 2022-02-11  1:14       ` Jason A. Donenfeld
  2022-02-11  8:16         ` Sebastian Andrzej Siewior
  2022-02-11  7:09       ` [PATCH v4 2/2] random: defer fast pool mixing to worker Sebastian Andrzej Siewior
  1 sibling, 1 reply; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-11  1:14 UTC (permalink / raw)
  To: linux-kernel, linux-crypto
  Cc: Jason A. Donenfeld, Sebastian Andrzej Siewior, Thomas Gleixner,
	Theodore Ts'o, Dominik Brodowski, Sultan Alsawaf

This addresses two issues alluded to in an earlier commit.

The first issue is that mix_interrupt_randomness() might be migrated to
another CPU during CPU hotplug. This issue is rectified by checking that
it hasn't been migrated (after disabling migration). If it has been
migrated, then we set the count to zero, so that when the CPU comes
online again, it can requeue the work. As part of this, we switch to
using an atomic_t, so that the increment in the irq handler doesn't wipe
out the zeroing if the CPU comes back online while this worker is
running.

The second issue is that, though relatively minor in effect, we probably
do after all want to make sure we get a consistent copy of the pool onto
the stack, in case the copy is interrupted by an irq while reading. To
do this, we simply read count before and after the memcpy and make sure
they're the same. If they're not, we try again. The likelihood of
actually hitting this is very low, as we're talking about a 2 or 4 word
mov, and we're executing on the same CPU as the potential interruption.
Nonetheless, it's easy enough to fix, so we do so here.

If we were only concerned with the first issue rather than the second,
we could fix this entirely by just using an atomic_t. But in order to
fix them both, a bit of coordination is required, which this patch
tackles.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Sultan Alsawaf <sultan@kerneltoast.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/char/random.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 9c779f1bda34..caaf3c33bb38 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1152,10 +1152,11 @@ struct fast_pool {
 	union {
 		u64 pool64[2];
 		u32 pool32[4];
+		unsigned long pool_long[16 / sizeof(long)];
 	};
 	struct work_struct mix;
 	unsigned long last;
-	unsigned int count;
+	atomic_t count;
 	u16 reg_idx;
 };
 #define FAST_POOL_MIX_INFLIGHT (1U << 31)
@@ -1210,14 +1211,39 @@ static u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
 static void mix_interrupt_randomness(struct work_struct *work)
 {
 	struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
-	u32 pool[ARRAY_SIZE(fast_pool->pool32)];
+	unsigned long pool[ARRAY_SIZE(fast_pool->pool_long)];
+	unsigned int count_snapshot;
+	size_t i;
 
-	/* Copy the pool to the stack so that the mixer always has a consistent view. */
-	memcpy(pool, fast_pool->pool32, sizeof(pool));
+	/* Check to see if we're running on the wrong CPU due to hotplug. */
+	migrate_disable();
+	if (fast_pool != this_cpu_ptr(&irq_randomness)) {
+		migrate_enable();
+		/*
+		 * If we are unlucky enough to have been moved to another CPU,
+		 * then we set our count to zero atomically so that when the
+		 * CPU comes back online, it can enqueue work again.
+		 */
+		atomic_set_release(&fast_pool->count, 0);
+		return;
+	}
+
+	/*
+	 * Copy the pool to the stack so that the mixer always has a
+	 * consistent view. It's extremely unlikely but possible that
+	 * this 2 or 4 word read is interrupted by an irq, but in case
+	 * it is, we double check that count stays the same.
+	 */
+	do {
+		count_snapshot = (unsigned int)atomic_read(&fast_pool->count);
+		for (i = 0; i < ARRAY_SIZE(pool); ++i)
+			pool[i] = READ_ONCE(fast_pool->pool_long[i]);
+	} while (count_snapshot != (unsigned int)atomic_read(&fast_pool->count));
 
 	/* We take care to zero out the count only after we're done reading the pool. */
-	WRITE_ONCE(fast_pool->count, 0);
+	atomic_set(&fast_pool->count, 0);
 	fast_pool->last = jiffies;
+	migrate_enable();
 
 	mix_pool_bytes(pool, sizeof(pool));
 	credit_entropy_bits(1);
@@ -1246,12 +1272,12 @@ void add_interrupt_randomness(int irq)
 	}
 
 	fast_mix(fast_pool->pool32);
-	new_count = ++fast_pool->count;
+	new_count = (unsigned int)atomic_inc_return_acquire(&fast_pool->count);
 
 	if (unlikely(crng_init == 0)) {
 		if (new_count >= 64 &&
 		    crng_fast_load(fast_pool->pool32, sizeof(fast_pool->pool32)) > 0) {
-			fast_pool->count = 0;
+			atomic_set(&fast_pool->count, 0);
 			fast_pool->last = now;
 
 			/*
@@ -1273,7 +1299,7 @@ void add_interrupt_randomness(int irq)
 
 	if (unlikely(!fast_pool->mix.func))
 		INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
-	fast_pool->count |= FAST_POOL_MIX_INFLIGHT;
+	atomic_or(FAST_POOL_MIX_INFLIGHT, &fast_pool->count);
 	queue_work_on(raw_smp_processor_id(), system_highpri_wq, &fast_pool->mix);
 }
 EXPORT_SYMBOL_GPL(add_interrupt_randomness);
-- 
2.35.0



* Re: [PATCH v4 2/2] random: defer fast pool mixing to worker
  2022-02-11  0:42     ` Jason A. Donenfeld
  2022-02-11  1:14       ` [PATCH] random: ensure mix_interrupt_randomness() is consistent Jason A. Donenfeld
@ 2022-02-11  7:09       ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-11  7:09 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: LKML, Linux Crypto Mailing List, Thomas Gleixner, Peter Zijlstra,
	Theodore Ts'o, Sultan Alsawaf, Jonathan Neuschäfer,
	Eric Biggers

On 2022-02-11 01:42:56 [+0100], Jason A. Donenfeld wrote:
> Hi Sebastian,
Hi Jason,

> Thanks for pointing this out. I'll actually fix this using atomics,
> and fix another minor issue at the same time the same way, and move to
> making sure the worker is running on the right CPU like we originally
> discussed. I'm going to send that as an additional patch so that we
> can narrow in on the issue there. It's a little bit involved but not
> too bad. I'll have that for you shortly.

oki.

> > crng_fast_load() does spin_trylock_irqsave() in hardirq context. It does
> > not produce any warning on RT but is still wrong IMHO:
> > If we just could move this, too.
> > I don't know how timing critical this is but the first backtrace from
> > crng_fast_load() came (to my surprise) from hwrng_fillfn() (a kthread)
> > and added 64bytes in one go.
> 
> I'll look into seeing if I can do it. On my first pass a few days ago,
> it seemed a bit too tricky, but I'll revisit after this part here
> settles. Thanks for your benchmarks, by the way. That's useful.

If you want me to do it again or on other machines, just let me know.
That was from a bigger x86 machine. I added that series and the patch at
the bottom to my RT queue.

-------->8--------

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 10 Feb 2022 18:22:05 +0100
Subject: [PATCH] random: Move crng_fast_load() to the worker.

crng_fast_load() is invoked from hard IRQ context and acquires a
spinlock_t via a trylock. If the lock is locked in hard IRQ context then
the following locking attempt (on another CPU) will PI-boost the wrong
task.

Move the crng_fast_load() invocation into the worker.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 drivers/char/random.c |   27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1047,6 +1047,17 @@ static void mix_interrupt_randomness(str
 	struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
 	u8 pool[sizeof(fast_pool->pool)];
 
+	if (unlikely(crng_init == 0)) {
+		size_t ret;
+
+		ret = crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool));
+		if (ret) {
+			WRITE_ONCE(fast_pool->count, 0);
+			fast_pool->last = jiffies;
+			return;
+		}
+	}
+
 	/*
 	 * Since this is the result of a trip through the scheduler, xor in
 	 * a cycle counter. It can't hurt, and might help.
@@ -1089,11 +1100,17 @@ void add_interrupt_randomness(int irq)
 	new_count = ++fast_pool->count;
 
 	if (unlikely(crng_init == 0)) {
-		if (new_count >= 64 &&
-		    crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {
-			fast_pool->count = 0;
-			fast_pool->last = now;
-		}
+		if (new_count & FAST_POOL_MIX_INFLIGHT)
+			return;
+
+		if (new_count < 64)
+			return;
+
+		fast_pool->count |= FAST_POOL_MIX_INFLIGHT;
+		if (unlikely(!fast_pool->mix.func))
+			INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
+		queue_work_on(raw_smp_processor_id(), system_highpri_wq,
+			      &fast_pool->mix);
 		return;
 	}
 

> Jason

Sebastian


* Re: [PATCH] random: ensure mix_interrupt_randomness() is consistent
  2022-02-11  1:14       ` [PATCH] random: ensure mix_interrupt_randomness() is consistent Jason A. Donenfeld
@ 2022-02-11  8:16         ` Sebastian Andrzej Siewior
  2022-02-11 10:48           ` Jason A. Donenfeld
  2022-02-11 13:04           ` Jason A. Donenfeld
  0 siblings, 2 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-11  8:16 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, linux-crypto, Thomas Gleixner, Theodore Ts'o,
	Dominik Brodowski, Sultan Alsawaf

On 2022-02-11 02:14:46 [+0100], Jason A. Donenfeld wrote:
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Theodore Ts'o <tytso@mit.edu>
> Cc: Dominik Brodowski <linux@dominikbrodowski.net>
> Cc: Sultan Alsawaf <sultan@kerneltoast.com>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
>  drivers/char/random.c | 42 ++++++++++++++++++++++++++++++++++--------
>  1 file changed, 34 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index 9c779f1bda34..caaf3c33bb38 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -1210,14 +1211,39 @@ static u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
>  static void mix_interrupt_randomness(struct work_struct *work)
>  {
>  	struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
> -	u32 pool[ARRAY_SIZE(fast_pool->pool32)];
> +	unsigned long pool[ARRAY_SIZE(fast_pool->pool_long)];
> +	unsigned int count_snapshot;
> +	size_t i;
>  
> -	/* Copy the pool to the stack so that the mixer always has a consistent view. */
> -	memcpy(pool, fast_pool->pool32, sizeof(pool));
> +	/* Check to see if we're running on the wrong CPU due to hotplug. */
> +	migrate_disable();
> +	if (fast_pool != this_cpu_ptr(&irq_randomness)) {

I am not sure that acquire and release semantics are needed, and if so,
a comment would probably be helpful to explain why.
But I'm trying to avoid the migrate_disable(), so:
To close the race of losing the workqueue bit, wouldn't it be
sufficient to set it to zero via atomic_cmpxchg()? Also, if the counter
before the memcpy() and after (at cmpxchg time) didn't change, then the
pool wasn't modified. So basically

 do {
 	counter = atomic_read(&fast_pool->count); // no need to cast
	memcpy(pool, fast_pool->pool_long, ARRAY_SIZE(pool));
    } while (atomic_cmpxchg(&fast_pool->count, counter, 0) != counter);


then it also shouldn't matter if we are _accidentally_ on the wrong CPU.

> +		migrate_enable();
> +		/*
> +		 * If we are unlucky enough to have been moved to another CPU,
> +		 * then we set our count to zero atomically so that when the
> +		 * CPU comes back online, it can enqueue work again.
> +		 */
> +		atomic_set_release(&fast_pool->count, 0);
> +		return;
> +	}
> +
> +	/*
> +	 * Copy the pool to the stack so that the mixer always has a
> +	 * consistent view. It's extremely unlikely but possible that
> +	 * this 2 or 4 word read is interrupted by an irq, but in case
> +	 * it is, we double check that count stays the same.
> +	 */
> +	do {
> +		count_snapshot = (unsigned int)atomic_read(&fast_pool->count);
> +		for (i = 0; i < ARRAY_SIZE(pool); ++i)
> +			pool[i] = READ_ONCE(fast_pool->pool_long[i]);

Why do you avoid memcpy()? Since it is a small memcpy, I'm sure the
compiler will inline the register moves.

Sebastian


* Re: [PATCH v4 2/2] random: defer fast pool mixing to worker
  2022-02-10 18:04   ` Sebastian Andrzej Siewior
  2022-02-11  0:42     ` Jason A. Donenfeld
@ 2022-02-11  8:25     ` Dominik Brodowski
  2022-02-11 14:18       ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 17+ messages in thread
From: Dominik Brodowski @ 2022-02-11  8:25 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Jason A. Donenfeld, linux-kernel, linux-crypto, Thomas Gleixner,
	Peter Zijlstra, Theodore Ts'o, Sultan Alsawaf,
	Jonathan Neuschäfer, Eric Biggers

On Thu, Feb 10, 2022 at 07:04:20PM +0100, Sebastian Andrzej Siewior wrote:
> > @@ -999,9 +1016,10 @@ void add_interrupt_randomness(int irq)
> >  
> >  	fast_mix(fast_pool);
> >  	add_interrupt_bench(cycles);
> > +	new_count = ++fast_pool->count;
> >  
> >  	if (unlikely(crng_init == 0)) {
> > -		if ((fast_pool->count >= 64) &&
> > +		if (new_count >= 64 &&
> >  		    crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {
> 
> crng_fast_load() does spin_trylock_irqsave() in hardirq context. It does
> not produce any warning on RT but is still wrong IMHO:
> - lockdep will see a random task and I remember in the past it produced
>   strange lock chains based on this.
> 
> - Should another task attempt to acquire this lock then it will PI-boost the
>   wrong task.
> 
> If we just could move this, too.
> 
> I don't know how timing critical this is but the first backtrace from
> crng_fast_load() came (to my surprise) from hwrng_fillfn() (a kthread)
> and added 64bytes in one go.

That's a hw rng (such as a tpm chip or the virtio-rng driver) providing some
entropy; if it's 64 bytes of input, crng_init progresses to 1, and
crng_fast_load() should never be called again.[*] I'm a bit surprised that the
hw_rng input occurred so early (it's only at device_initcall() level), and
earlier than 64 interrupts. But that may differ from system to system.

Note that crng_fast_load() will also never be called from
add_interrupt_randomness() if

	EFI, DT or kexec provides bootloader entropy of at least 64 bytes,
	and CONFIG_RANDOM_TRUST_BOOTLOADER is set

and/or	CONFIG_RANDOM_TRUST_CPU is set and the RDRAND/RDSEED instructions do
	not fail.

If none of these three conditions (hw_rng run early, bootloader or CPU
randomness) is met, the initial and early seeding of the base_crng depends
on add_interrupt_randomness(), and should happen rather quickly.

> I did move that crng_fast_load() into the worker and did made some
> numbers:
>            <idle>-0       [000] d..h1..     2.069924: add_interrupt_randomness: Tick
> 
> first interrupt
> …
>         swapper/0-1       [000] d..h.11     2.341938: add_interrupt_randomness: Tick
>         swapper/0-1       [000] d..h.11     2.341938: add_interrupt_randomness: work
> 
> the 64th interrupt, scheduling the worker.
> 
>         swapper/0-1       [000] d..h.11     2.345937: add_interrupt_randomness: Tick
>         swapper/0-1       [000] d..h111     2.349938: add_interrupt_randomness: Tick
>         swapper/0-1       [000] d..h.11     2.353939: add_interrupt_randomness: Tick
>         swapper/0-1       [000] d..h.11     2.357940: add_interrupt_randomness: Tick
>         swapper/0-1       [000] d..h111     2.361939: add_interrupt_randomness: Tick
>         swapper/0-1       [000] d..h111     2.365939: add_interrupt_randomness: Tick
>         swapper/0-1       [000] d..h.11     2.369941: add_interrupt_randomness: Tick
>      kworker/0:0H-6       [000] .......     2.384714: mix_interrupt_randomness: load
>      kworker/0:0H-6       [000] .......     2.384715: crng_fast_load: 16
>            <idle>-0       [001] dn.h1..     3.205766: add_interrupt_randomness: Tick
>            <idle>-0       [019] dn.h1..     6.771047: add_interrupt_randomness: Tick
> 
> 7 interrupts got lost before the worker could run & load first 16 bytes.
> The workqueue core gets initialized at that point and spawns first
> worker.

So the reason for the longer delay here is that the workqueue core had not
been initialized beforehand?

> After that the interrupts took a break.
> And then the work-to-load delay was quite low:
> 
>            <idle>-0       [019] dn.h1..     7.586234: add_interrupt_randomness: Tick
>            <idle>-0       [019] dn.h1..     7.586234: add_interrupt_randomness: work
>     kworker/19:0H-175     [019] .......     7.586504: mix_interrupt_randomness: load
>     kworker/19:0H-175     [019] .......     7.586507: crng_fast_load: 16
>            <idle>-0       [020] dn.h1..     7.614649: add_interrupt_randomness: Tick
>            <idle>-0       [020] dn.h1..     7.614651: add_interrupt_randomness: work
>            <idle>-0       [020] dn.h1..     7.614736: add_interrupt_randomness: Tick
>     kworker/20:0H-183     [020] dn.h...     7.614859: add_interrupt_randomness: Tick
>     kworker/20:0H-183     [020] .......     7.614871: mix_interrupt_randomness: load
>     kworker/20:0H-183     [020] .......     7.614872: crng_fast_load: 16
>            <idle>-0       [018] dn.h1..     8.352423: add_interrupt_randomness: Tick
>            <idle>-0       [018] dn.h1..     8.352423: add_interrupt_randomness: work
>     kworker/18:0H-167     [018] dn.h1..     8.352438: add_interrupt_randomness: Tick
>     kworker/18:0H-167     [018] dn.h1..     8.352448: add_interrupt_randomness: Tick
>     kworker/18:0H-167     [018] dn.h1..     8.352459: add_interrupt_randomness: Tick
>     kworker/18:0H-167     [018] dn.h1..     8.352491: add_interrupt_randomness: Tick
>     kworker/18:0H-167     [018] .......     8.352505: mix_interrupt_randomness: load
>     kworker/18:0H-167     [018] .......     8.352506: crng_fast_load: 16
> 
> In total we lost 13 ticks.

Was this still way before the initramfs was up and running?

> I did the same test on PREEMPT_VOLUNTARY and lost 2 ticks only.

Thanks,
	Dominik

[*] Actually, there's some contradiction going on: If we do not trust the
hw_rng device (that is, its quality setting is 0), crng_fast_load() will be
called nonetheless, and the hw_rng-provided input will be used to increment
crng_init to 1. If !CONFIG_RANDOM_TRUST_BOOTLOADER, only crng_slow_load() is
called, and crng_init will remain at 0. Similar for
!CONFIG_RANDOM_TRUST_CPU.


* Re: [PATCH] random: ensure mix_interrupt_randomness() is consistent
  2022-02-11  8:16         ` Sebastian Andrzej Siewior
@ 2022-02-11 10:48           ` Jason A. Donenfeld
  2022-02-11 14:51             ` Sebastian Andrzej Siewior
  2022-02-11 13:04           ` Jason A. Donenfeld
  1 sibling, 1 reply; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-11 10:48 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: LKML, Linux Crypto Mailing List, Thomas Gleixner,
	Theodore Ts'o, Dominik Brodowski, Sultan Alsawaf

Hi Sebastian,

On Fri, Feb 11, 2022 at 9:16 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
> But I'm trying to avoid the migrate_disable(), so:
> To close the racy with losing the workqueue bit, wouldn't it be
> sufficient to set it to zero via atomic_cmpxchg()? Also if the counter
> before the memcpy() and after (at cmpxchg time) didn't change then the
> pool wasn't modified. So basically
>
>  do {
>         counter = atomic_read(&fast_pool->count); // no need to cast
>         memcpy(pool, fast_pool->pool_long, ARRAY_SIZE(pool));
>     } while (atomic_cmpxchg(&fast_pool->count, counter, 0) != counter);
>
>
> then it also shouldn't matter if we are _accidentally_ on the wrong CPU.

This won't work. If we're executing on a different CPU, the CPU
mutating the pool won't necessarily update the count at the right
time. This isn't actually a seqlock or something like that. Rather, it
depends on running on the same CPU, where the interrupting irq handler
runs in full before giving control back, so that count and pool are
either both updated or not at all. Making this work across CPUs makes
things a lot more complicated and I'd rather not do that.

Actually, though, a nicer fix would be to just disable local
interrupts for that *2 word copy*. That's a tiny period of time. If
you permit me, that seems nicer. But if you don't like that, I'll keep
that loop.
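
Concretely, I mean something like this in mix_interrupt_randomness(),
right after the migration check - only a sketch using the field names
from the patch above, not tested:

	unsigned long flags;

	/* The irq handler that writes this pool runs on this same CPU,
	 * so masking local interrupts around the small copy is enough
	 * to snapshot the pool and the count together consistently. */
	local_irq_save(flags);
	memcpy(pool, fast_pool->pool_long, sizeof(pool));
	atomic_set(&fast_pool->count, 0);
	fast_pool->last = jiffies;
	local_irq_restore(flags);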

Unfortunately, though, I think disabling migration is required. Sultan
(CC'd) found that these workqueues can migrate even midway through
running. And generally the whole idea is to keep this on the *same*
CPU so that we don't have to introduce locks and synchronization.

I'll add comments around the acquire/release. The remaining question,
I believe, is: would you prefer disabling irqs during the 2 word memcpy,
or this counter double-read loop?


Jason


* Re: [PATCH] random: ensure mix_interrupt_randomness() is consistent
  2022-02-11  8:16         ` Sebastian Andrzej Siewior
  2022-02-11 10:48           ` Jason A. Donenfeld
@ 2022-02-11 13:04           ` Jason A. Donenfeld
  1 sibling, 0 replies; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-11 13:04 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: LKML, Linux Crypto Mailing List, Thomas Gleixner,
	Theodore Ts'o, Dominik Brodowski, Sultan Alsawaf

Sorry, missed this in your last email:

On Fri, Feb 11, 2022 at 9:16 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
> > +     do {
> > +             count_snapshot = (unsigned int)atomic_read(&fast_pool->count);
> > +             for (i = 0; i < ARRAY_SIZE(pool); ++i)
> > +                     pool[i] = READ_ONCE(fast_pool->pool_long[i]);
>
> Why do you avoid memcpy()? Since it is a small memcpy, I'm sure the
> compile will inline the register moves.

Because the compiler will otherwise reorder it to be after the two
counter reads. I checked. And a barrier() is too heavy as it flushes
the writes to the stack instead of keeping them read into registers.
READ_ONCE() gives exactly the semantics we care about here.

Jason


* Re: [PATCH v4 2/2] random: defer fast pool mixing to worker
  2022-02-11  8:25     ` Dominik Brodowski
@ 2022-02-11 14:18       ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-11 14:18 UTC (permalink / raw)
  To: Dominik Brodowski
  Cc: Jason A. Donenfeld, linux-kernel, linux-crypto, Thomas Gleixner,
	Peter Zijlstra, Theodore Ts'o, Sultan Alsawaf,
	Jonathan Neuschäfer, Eric Biggers

On 2022-02-11 09:25:00 [+0100], Dominik Brodowski wrote:
> That's a hw rng (such as a tpm chip or the virtio-rng driver) providing some
> entropy; if it's 64 bytes of input, crng_init progresses to 1, and
> crng_fast_load() should never be called again.[*] I'm a bit suprised that the
> hw_rng input occurred so early (it's only at device_initcall() level), and
> earlier than 64 interrupts. But that may differ from system to system.
> 
> Note that crng_fast_load() will also never be called from
> add_interrupt_randomness() if
> 
> 	EFI, DT or kexec provides bootloader entropy of at least 64 bytes,
> 	and CONFIG_RANDOM_TRUST_BOOTLOADER is set
> 
> and/or	CONFIG_RANDOM_TRUST_CPU is set and the RDRAND/RDSEED instructions do
> 	not fail.
> 
> If neither of these three conditions (hw_rng is run early, bootloader or CPU
> randomness) are met, the initial and early seeding of the base_crng depends
> on add_interrupt_randomness(), and should happen rather quickly.

Right.

> > I did move that crng_fast_load() into the worker and did made some
> > numbers:
> >            <idle>-0       [000] d..h1..     2.069924: add_interrupt_randomness: Tick
> > 
> > first interrupt
> > …
> >         swapper/0-1       [000] d..h.11     2.341938: add_interrupt_randomness: Tick
> >         swapper/0-1       [000] d..h.11     2.341938: add_interrupt_randomness: work
> > 
> > the 64th interrupt, scheduling the worker.
> > 
> >         swapper/0-1       [000] d..h.11     2.345937: add_interrupt_randomness: Tick
> >         swapper/0-1       [000] d..h111     2.349938: add_interrupt_randomness: Tick
> >         swapper/0-1       [000] d..h.11     2.353939: add_interrupt_randomness: Tick
> >         swapper/0-1       [000] d..h.11     2.357940: add_interrupt_randomness: Tick
> >         swapper/0-1       [000] d..h111     2.361939: add_interrupt_randomness: Tick
> >         swapper/0-1       [000] d..h111     2.365939: add_interrupt_randomness: Tick
> >         swapper/0-1       [000] d..h.11     2.369941: add_interrupt_randomness: Tick
> >      kworker/0:0H-6       [000] .......     2.384714: mix_interrupt_randomness: load
> >      kworker/0:0H-6       [000] .......     2.384715: crng_fast_load: 16
> >            <idle>-0       [001] dn.h1..     3.205766: add_interrupt_randomness: Tick
> >            <idle>-0       [019] dn.h1..     6.771047: add_interrupt_randomness: Tick
> > 
> > 7 interrupts got lost before the worker could run & load first 16 bytes.
> > The workqueue core gets initialized at that point and spawns first
> > worker.
> 
> So the reason for the longer delay here is that the workqueue core had not
> been initialized beforehand?

Kind of, yes. workqueue_init_early() happens quite early, so we don't
have to worry about a NULL pointer for system_highpri_wq. The worker
started after workqueue_init() populated the worker pools.

> > After that the interrupts took a break.
> > And then the work-to-load delay was quite low:
> > 
> >            <idle>-0       [019] dn.h1..     7.586234: add_interrupt_randomness: Tick
> >            <idle>-0       [019] dn.h1..     7.586234: add_interrupt_randomness: work
> >     kworker/19:0H-175     [019] .......     7.586504: mix_interrupt_randomness: load
> >     kworker/19:0H-175     [019] .......     7.586507: crng_fast_load: 16
> >            <idle>-0       [020] dn.h1..     7.614649: add_interrupt_randomness: Tick
> >            <idle>-0       [020] dn.h1..     7.614651: add_interrupt_randomness: work
> >            <idle>-0       [020] dn.h1..     7.614736: add_interrupt_randomness: Tick
> >     kworker/20:0H-183     [020] dn.h...     7.614859: add_interrupt_randomness: Tick
> >     kworker/20:0H-183     [020] .......     7.614871: mix_interrupt_randomness: load
> >     kworker/20:0H-183     [020] .......     7.614872: crng_fast_load: 16
> >            <idle>-0       [018] dn.h1..     8.352423: add_interrupt_randomness: Tick
> >            <idle>-0       [018] dn.h1..     8.352423: add_interrupt_randomness: work
> >     kworker/18:0H-167     [018] dn.h1..     8.352438: add_interrupt_randomness: Tick
> >     kworker/18:0H-167     [018] dn.h1..     8.352448: add_interrupt_randomness: Tick
> >     kworker/18:0H-167     [018] dn.h1..     8.352459: add_interrupt_randomness: Tick
> >     kworker/18:0H-167     [018] dn.h1..     8.352491: add_interrupt_randomness: Tick
> >     kworker/18:0H-167     [018] .......     8.352505: mix_interrupt_randomness: load
> >     kworker/18:0H-167     [018] .......     8.352506: crng_fast_load: 16
> > 
> > In total we lost 13 ticks.
> 
> Was this still way before the initramfs was up and running?

After the initramfs was unpacked. From the current boot:

| [    5.901462] Unpacking initramfs...
| [    6.758747] sd 1:0:0:0: [sda] Attached SCSI disk
| [    7.886651] Freeing initrd memory: 9532K
| [    7.893753] Freeing unused kernel image (initmem) memory: 2184K
| [    7.963519] Write protecting the kernel read-only data: 20480k
| [    7.971465] Freeing unused kernel image (text/rodata gap) memory: 2032K
| [    7.979711] Freeing unused kernel image (rodata/data gap) memory: 1980K
| [    7.987132] Run /init as init process
| Loading, please wait...
| 
| Starting version 250.3-2
| [    8.157529] igb 0000:07:00.0 eno0: renamed from eth0
| [    8.203836] igb 0000:07:00.1 enp7s0f1: renamed from eth1
| [    8.219489] random: fast init done
| Begin: Loading essential drivers ... done.
| Begin: Running /scripts/init-premount ... done.
| Begin: Mounting root file system ... Begin: Running /scripts/local-top ... done.
| Begin: Running /scripts/local-premount ... done.
| Warning: fsck not present, so skipping root file[    8.337554] XFS (sda2): Mounting V5 Filesystem
| [    8.392151] XFS (sda2): Ending clean mount
| done.
| Begin: Running /scripts/local-bottom ... done.
| Begin: Running /scripts/init-bottom ... done.
| [    8.540708] systemd[1]: systemd 250.3-2 running in system mode …
| [   12.207227] random: crng init done

I have to note that on this system there is an initramfs but all drivers
are built-in (you see sd, net _after_ "Unpacking initramfs" but there
are no drivers to load).

> Thanks,
> 	Dominik

Sebastian


* Re: [PATCH] random: ensure mix_interrupt_randomness() is consistent
  2022-02-11 10:48           ` Jason A. Donenfeld
@ 2022-02-11 14:51             ` Sebastian Andrzej Siewior
  2022-02-11 16:19               ` Jason A. Donenfeld
  0 siblings, 1 reply; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-11 14:51 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: LKML, Linux Crypto Mailing List, Thomas Gleixner,
	Theodore Ts'o, Dominik Brodowski, Sultan Alsawaf

On 2022-02-11 11:48:15 [+0100], Jason A. Donenfeld wrote:
> Hi Sebastian,
Hi,

> On Fri, Feb 11, 2022 at 9:16 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> > But I'm trying to avoid the migrate_disable(), so:
> > To close the racy with losing the workqueue bit, wouldn't it be
> > sufficient to set it to zero via atomic_cmpxchg()? Also if the counter
> > before the memcpy() and after (at cmpxchg time) didn't change then the
> > pool wasn't modified. So basically
> >
> >  do {
> >         counter = atomic_read(&fast_pool->count); // no need to cast
> >         memcpy(pool, fast_pool->pool_long, ARRAY_SIZE(pool));
> >     } while (atomic_cmpxchg(&fast_pool->count, counter, 0) != counter);
> >
> >
> > then it also shouldn't matter if we are _accidentally_ on the wrong CPU.
> 
> This won't work. If we're executing on a different CPU, the CPU
> mutating the pool won't necessarily update the count at the right
> time. This isn't actually a seqlock or something like that. Rather, it

But it is atomic, isn't it?

> depends on running on the same CPU, where the interrupting irq handler
> runs in full before giving control back, so that count and pool are
> either both updated or not at all. Making this work across CPUs makes
> things a lot more complicated and I'd rather not do that.

but this isn't the rule, is it? It runs on the same CPU so we should
observe the update in IRQ context and the worker should observe the
counter _and_ pool update.

And cross-CPU isn't the rule. We only re-do the loop if
- an interrupt came in on the local CPU between atomic_read() and
  atomic_cmpxchg().

- the worker was migrated due to CPU hotplug and we managed to properly
  reset the counter back to 0.

> Actually, though, a nicer fix would be to just disable local
> interrupts for that *2 word copy*. That's a tiny period of time. If
> you permit me, that seems nicer. But if you don't like that, I'll keep
> that loop.

Here, I don't mind but I don't think it is needed.

> Unfortunately, though, I think disabling migration is required. Sultan
> (CC'd) found that these workqueues can migrate even midway through
> running. And generally the whole idea is to keep this on the *same*
> CPU so that we don't have to introduce locks and synchronization.

They can't. Your workqueue is not unbound _and_ you specify a specific
CPU instead of WORK_CPU_UNBOUND (or an offlined CPU).
The only way it can migrate is if the CPU goes down while the worker is
running (or before it had a chance to run, I think), which forces the
scheduler to break the worker's CPU affinity and move it to another CPU.

> I'll add comments around the acquire/release. The remaining question
> I believe is: would you prefer disabing irqs during the 2 word memcpy,
> or this counter double read loop?

I would prefer the cmpxchg, in case the worker, however unlikely, gets
moved to another CPU and we would lose that SCHED bit. That is why we
switched to atomics, I think. Otherwise, if the updates were only ever
local, we could disable interrupts during the update.
But I don't mind disabling interrupts for that copy.

> Jason

Sebastian


* Re: [PATCH] random: ensure mix_interrupt_randomness() is consistent
  2022-02-11 14:51             ` Sebastian Andrzej Siewior
@ 2022-02-11 16:19               ` Jason A. Donenfeld
  2022-02-11 16:24                 ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 17+ messages in thread
From: Jason A. Donenfeld @ 2022-02-11 16:19 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: LKML, Linux Crypto Mailing List, Thomas Gleixner,
	Theodore Ts'o, Dominik Brodowski, Sultan Alsawaf

Hi Sebastian,

On Fri, Feb 11, 2022 at 3:51 PM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
> > Unfortunately, though, I think disabling migration is required. Sultan
> > (CC'd) found that these workqueues can migrate even midway through
> > running. And generally the whole idea is to keep this on the *same*
> > CPU so that we don't have to introduce locks and synchronization.
>
> They can't. Your workqueue is not unbound _and_ you specify a specific
> CPU instead of WORK_CPU_UNBOUND (or an offlined CPU).
> The only way it can migrate is if the CPU goes down while the worker is
> running (or before it had a chance I think) which forces the scheduler
> to break its (worker's) CPU affinity and move it to another CPU.

Right, but the CPU could come back up while the worker is running on
the wrong CPU, and then kaboom. Anyway, the migrate_disable() window
is very, very small - a few instructions at most with no loops. I
think it'll be fine.

> > I'll add comments around the acquire/release. The remaining question
> > I believe is: would you prefer disabing irqs during the 2 word memcpy,
> > or this counter double read loop?
>
> I would prefer the cmpxchg

I'll do the cmpxchg and send you a v+1. Sorry it wasn't in the last
one. It only now clicked with me what that code would look like, after
I stepped away from the screen for a bit to defrobulate my brains.
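
Roughly what I now have in mind, following your suggestion - only a
sketch using the field names from the earlier patch, not the final
version:

	int count;

	/* Retry the copy if the count was bumped while we were reading
	 * the pool; a successful cmpxchg also clears the count and the
	 * FAST_POOL_MIX_INFLIGHT bit in one go. */
	do {
		count = atomic_read(&fast_pool->count);
		memcpy(pool, fast_pool->pool_long, sizeof(pool));
	} while (atomic_cmpxchg(&fast_pool->count, count, 0) != count);
	fast_pool->last = jiffies;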

Jason


* Re: [PATCH] random: ensure mix_interrupt_randomness() is consistent
  2022-02-11 16:19               ` Jason A. Donenfeld
@ 2022-02-11 16:24                 ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-11 16:24 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: LKML, Linux Crypto Mailing List, Thomas Gleixner,
	Theodore Ts'o, Dominik Brodowski, Sultan Alsawaf

On 2022-02-11 17:19:21 [+0100], Jason A. Donenfeld wrote:
> Hi Sebastian,
Hi,

> I'll do the cmpxchg and send you a v+1. Sorry it wasn't in the last
> one. It only now clicked with me what that code would look like, after
> I stepped away from the screen for a bit to defrobulate my brains.

No worries. As much as I liked my first series, I like even more where
this is heading ;)

> Jason

Sebastian


* Re: [PATCH v4 1/2] random: remove batched entropy locking
  2022-02-09 12:56 ` [PATCH v4 1/2] random: remove batched entropy locking Jason A. Donenfeld
  2022-02-10 16:24   ` Sebastian Andrzej Siewior
@ 2022-02-21  2:30   ` Eric Biggers
  1 sibling, 0 replies; 17+ messages in thread
From: Eric Biggers @ 2022-02-21  2:30 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, linux-crypto, Sebastian Andrzej Siewior,
	Thomas Gleixner, Peter Zijlstra, Theodore Ts'o,
	Sultan Alsawaf, Jonathan Neuschäfer, Andy Lutomirski

On Wed, Feb 09, 2022 at 01:56:43PM +0100, Jason A. Donenfeld wrote:
> Rather than use spinlocks to protect batched entropy, we can instead
> disable interrupts locally, since we're dealing with per-cpu data, and
> manage resets with a basic generation counter. At the same time, we
> can't quite do this on PREEMPT_RT, where we still want spinlocks-as-
> mutexes semantics. So we use a local_lock_t, which provides the right
> behavior for each. Because this is a per-cpu lock, that generation
> counter is still doing the necessary CPU-to-CPU communication.
> 
> This should improve performance a bit. It will also fix the linked splat
> that Jonathan received with a PROVE_RAW_LOCK_NESTING=y.
> 
> Suggested-by: Andy Lutomirski <luto@kernel.org>
> Reported-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
> Tested-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
> Link: https://lore.kernel.org/lkml/YfMa0QgsjCVdRAvJ@latitude/
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: Sultan Alsawaf <sultan@kerneltoast.com>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
>  drivers/char/random.c | 55 ++++++++++++++++++++++---------------------
>  1 file changed, 28 insertions(+), 27 deletions(-)

Reviewed-by: Eric Biggers <ebiggers@google.com>

- Eric

