* [RFC PATCH] SRCU: More efficient reader counts.
@ 2016-11-17 22:03 Lance Roy
  2016-11-18 14:08 ` Paul E. McKenney
  0 siblings, 1 reply; 8+ messages in thread
From: Lance Roy @ 2016-11-17 22:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: paulmck, mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers,
	josh, tglx, peterz, rostedt, dhowells, edumazet, dvhart,
	fweisbec, oleg, bobby.prani, ldr709

SRCU uses two per-cpu counters: a nesting counter to count the number of
active critical sections, and a sequence counter to ensure that the nesting
counters don't change while they are being added together in
srcu_readers_active_idx_check().

This patch instead uses per-cpu lock and unlock counters. Because both
counters only increase and srcu_readers_active_idx_check() reads the unlock
counter before the lock counter, this achieves the same end without having
to increment two different counters in srcu_read_lock(). This also saves a
smp_mb() in srcu_readers_active_idx_check().

Possible bug: There is no guarantee that the lock counter won't overflow
during srcu_readers_active_idx_check(), as there are no memory barriers
around srcu_flip() (see comment in srcu_readers_active_idx_check() for
details). However, this problem was already present before this patch.
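
For illustration only, not part of the patch: the scheme above can be modeled
by a small userspace toy program. All names below are made up, the program is
single-threaded, and the C11 fences only mark where the kernel code places
smp_mb(); they are not claimed to make the toy itself race-free.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPU 4

struct toy_counts {
	unsigned long lock_count[2];
	unsigned long unlock_count[2];
};

static struct toy_counts toy_cpu[NCPU];
static int toy_completed;

static int toy_read_lock(int cpu)
{
	int idx = toy_completed & 0x1;

	toy_cpu[cpu].lock_count[idx]++;
	atomic_thread_fence(memory_order_seq_cst);	/* where smp_mb() B sits */
	return idx;
}

static void toy_read_unlock(int cpu, int idx)
{
	atomic_thread_fence(memory_order_seq_cst);	/* where smp_mb() C sits */
	toy_cpu[cpu].unlock_count[idx]++;
}

static bool toy_readers_gone(int idx)
{
	unsigned long locks = 0, unlocks = 0;
	int cpu;

	/* Sum the unlock counters first... */
	for (cpu = 0; cpu < NCPU; cpu++)
		unlocks += toy_cpu[cpu].unlock_count[idx];
	atomic_thread_fence(memory_order_seq_cst);	/* where smp_mb() A sits */
	/* ...then the lock counters, so no counted unlock lacks its lock. */
	for (cpu = 0; cpu < NCPU; cpu++)
		locks += toy_cpu[cpu].lock_count[idx];
	return locks == unlocks;
}

int main(void)
{
	int idx = toy_read_lock(0);

	printf("reader active, gone? %d\n", toy_readers_gone(idx));	/* prints 0 */
	toy_read_unlock(0, idx);
	printf("reader exited, gone? %d\n", toy_readers_gone(idx));	/* prints 1 */
	return 0;
}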

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Lance Roy <ldr709@gmail.com>
---
 include/linux/srcu.h    |   4 +-
 kernel/rcu/rcutorture.c |  20 ++++++++-
 kernel/rcu/srcu.c       | 116 ++++++++++++++++++------------------------------
 3 files changed, 63 insertions(+), 77 deletions(-)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index dc8eb63..0caea34 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -34,8 +34,8 @@
 #include <linux/workqueue.h>
 
 struct srcu_struct_array {
-	unsigned long c[2];
-	unsigned long seq[2];
+	unsigned long lock_count[2];
+	unsigned long unlock_count[2];
 };
 
 struct rcu_batch {
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bf08fee..2450c61 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -555,10 +555,26 @@ static void srcu_torture_stats(void)
 	pr_alert("%s%s per-CPU(idx=%d):",
 		 torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
+		unsigned long l0, l1;
+		unsigned long u0, u1;
 		long c0, c1;
+		struct srcu_struct_array *counts =
+			per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
 
-		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
-		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
+		u0 = counts->unlock_count[!idx];
+		u1 = counts->unlock_count[idx];
+
+		/*
+		 * Make sure that a lock is always counted if the corresponding
+		 * unlock is counted.
+		 */
+		smp_rmb();
+
+		l0 = counts->lock_count[!idx];
+		l1 = counts->lock_count[idx];
+
+		c0 = (long)(l0 - u0);
+		c1 = (long)(l1 - u1);
 		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
 	}
 	pr_cont("\n");
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd5..38e9aae 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -141,34 +141,38 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
- * Returns approximate total of the readers' ->seq[] values for the
+ * Returns approximate total of the readers' ->lock_count[] values for the
  * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		struct srcu_struct_array *cpu_counts =
+			per_cpu_ptr(sp->per_cpu_ref, cpu);
+		t = READ_ONCE(cpu_counts->lock_count[idx]);
 		sum += t;
 	}
 	return sum;
 }
 
 /*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
+ * Returns approximate total of the readers' ->unlock_count[] values for the
+ * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+		struct srcu_struct_array *cpu_counts =
+			per_cpu_ptr(sp->per_cpu_ref, cpu);
+		t = READ_ONCE(cpu_counts->unlock_count[idx]);
 		sum += t;
 	}
 	return sum;
@@ -176,79 +180,42 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 
 /*
  * Return true if the number of pre-existing readers is determined to
- * be stably zero.  An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement.  This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
+ * be zero.
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
-	unsigned long seq;
+	unsigned long unlocks;
 
-	seq = srcu_readers_seq_idx(sp, idx);
+	unlocks = srcu_readers_unlock_idx(sp, idx);
 
 	/*
-	 * The following smp_mb() A pairs with the smp_mb() B located in
-	 * __srcu_read_lock().  This pairing ensures that if an
-	 * __srcu_read_lock() increments its counter after the summation
-	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
-	 * critical section will see any changes made prior to the start
-	 * of the current SRCU grace period.
+	 * Make sure that a lock is always counted if the corresponding unlock
+	 * is counted. Needs to be a smp_mb() as the read side may contain a
+	 * read from a variable that is written to before the synchronize_srcu()
+	 * in the write side. In this case smp_mb()s A and B act like the store
+	 * buffering pattern.
 	 *
-	 * Also, if the above call to srcu_readers_seq_idx() saw the
-	 * increment of ->seq[], then the call to srcu_readers_active_idx()
-	 * must see the increment of ->c[].
+	 * This smp_mb() also pairs with smp_mb() C to prevent writes after the
+	 * synchronize_srcu() from being executed before the grace period ends.
 	 */
 	smp_mb(); /* A */
 
 	/*
-	 * Note that srcu_readers_active_idx() can incorrectly return
-	 * zero even though there is a pre-existing reader throughout.
-	 * To see this, suppose that task A is in a very long SRCU
-	 * read-side critical section that started on CPU 0, and that
-	 * no other reader exists, so that the sum of the counters
-	 * is equal to one.  Then suppose that task B starts executing
-	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
-	 * task C starts reading on CPU 0, so that its increment is not
-	 * summed, but finishes reading on CPU 2, so that its decrement
-	 * -is- summed.  Then when task B completes its sum, it will
-	 * incorrectly get zero, despite the fact that task A has been
-	 * in its SRCU read-side critical section the whole time.
-	 *
-	 * We therefore do a validation step should srcu_readers_active_idx()
-	 * return zero.
-	 */
-	if (srcu_readers_active_idx(sp, idx) != 0)
-		return false;
-
-	/*
-	 * The remainder of this function is the validation step.
-	 * The following smp_mb() D pairs with the smp_mb() C in
-	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
-	 * by srcu_readers_active_idx() above, then any destructive
-	 * operation performed after the grace period will happen after
-	 * the corresponding SRCU read-side critical section.
+	 * If the locks are the same as the unlocks, then there must have
+	 * been no readers on this index at some time in between. This does not
+	 * mean that there are no more readers, as one could have read the
+	 * current index but not yet have incremented the lock counter.
 	 *
-	 * Note that there can be at most NR_CPUS worth of readers using
-	 * the old index, which is not enough to overflow even a 32-bit
-	 * integer.  (Yes, this does mean that systems having more than
-	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
-	 * the sum of the ->seq[] counters cannot possibly overflow.
-	 * Therefore, the only way that the return values of the two
-	 * calls to srcu_readers_seq_idx() can be equal is if there were
-	 * no increments of the corresponding rank of ->seq[] counts
-	 * in the interim.  But the missed-increment scenario laid out
-	 * above includes an increment of the ->seq[] counter by
-	 * the corresponding __srcu_read_lock().  Therefore, if this
-	 * scenario occurs, the return values from the two calls to
-	 * srcu_readers_seq_idx() will differ, and thus the validation
-	 * step below suffices.
+	 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
+	 * increments of ->lock_count[] since the unlocks were counted, meaning
+	 * that this could return true even if there are still active readers.
+	 * Since there are no memory barriers around srcu_flip(), the CPU is not
+	 * required to increment ->completed before running
+	 * srcu_readers_unlock_idx(), which means that there could be an
+	 * arbitrarily large number of critical sections that execute after
+	 * srcu_readers_unlock_idx() but use the old value of ->completed.
 	 */
-	smp_mb(); /* D */
-
-	return srcu_readers_seq_idx(sp, idx) == seq;
+	return srcu_readers_lock_idx(sp, idx) == unlocks;
 }
 
 /**
@@ -266,8 +233,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+		struct srcu_struct_array *cpu_counts =
+			per_cpu_ptr(sp->per_cpu_ref, cpu);
+		sum += READ_ONCE(cpu_counts->lock_count[0]);
+		sum += READ_ONCE(cpu_counts->lock_count[1]);
+		sum -= READ_ONCE(cpu_counts->unlock_count[0]);
+		sum -= READ_ONCE(cpu_counts->unlock_count[1]);
 	}
 	return sum;
 }
@@ -298,9 +269,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
 	int idx;
 
 	idx = READ_ONCE(sp->completed) & 0x1;
-	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
+	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
-	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
 	return idx;
 }
 EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -314,7 +284,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
 	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	this_cpu_dec(sp->per_cpu_ref->c[idx]);
+	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
@@ -349,7 +319,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
 
 /*
  * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays.  This allows
+ * use the other rank of the ->(un)lock_count[] arrays.  This allows
  * us to wait for pre-existing readers in a starvation-free manner.
  */
 static void srcu_flip(struct srcu_struct *sp)
-- 
2.9.0


* Re: [RFC PATCH] SRCU: More efficient reader counts.
  2016-11-17 22:03 [RFC PATCH] SRCU: More efficient reader counts Lance Roy
@ 2016-11-18 14:08 ` Paul E. McKenney
  2016-11-18 14:27   ` Mathieu Desnoyers
  2016-11-18 20:33   ` Lance Roy
  0 siblings, 2 replies; 8+ messages in thread
From: Paul E. McKenney @ 2016-11-18 14:08 UTC (permalink / raw)
  To: Lance Roy
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, dvhart, fweisbec, oleg, bobby.prani

On Thu, Nov 17, 2016 at 02:03:35PM -0800, Lance Roy wrote:
> SRCU uses two per-cpu counters: a nesting counter to count the number of
> active critical sections, and a sequence counter to ensure that the nesting
> counters don't change while they are being added together in
> srcu_readers_active_idx_check().
> 
> This patch instead uses per-cpu lock and unlock counters. Because both
> counters only increase and srcu_readers_active_idx_check() reads the unlock
> counter before the lock counter, this achieves the same end without having
> to increment two different counters in srcu_read_lock(). This also saves a
> smp_mb() in srcu_readers_active_idx_check().
> 
> Possible bug: There is no guarantee that the lock counter won't overflow
> during srcu_readers_active_idx_check(), as there are no memory barriers
> around srcu_flip() (see comment in srcu_readers_active_idx_check() for
> details). However, this problem was already present before this patch.

This patch differs from the previous one in a few (good) code-style
changes, comment changes, and of course the commit log.  Good.  Once
discussion converges, I will apply the agreed-upon commit, which might
well be this one.

However, let's first take a look at the overflow issue.

If a given program could have ULONG_MAX or more readers at any given
time, there would of course be overflow.  However, each read must have
an srcu_read_lock() outstanding, and the resulting four-byte return
value must be stored somewhere.  Because the full address space is at
most ULONG_MAX, the maximum number of outstanding readers is at most
ULONG_MAX/4, even in the degenerate case where a single CPU/task invokes
srcu_read_lock() in a tight loop.  And even this assumes that the entire
address space can somehow be devoted to srcu_read_lock() return values.
ULONG_MAX/4 is therefore a hard upper bound on the number of outstanding
SRCU readers.

Now srcu_readers_active_idx_check() checks for strict equality between
the number of locks and unlocks, so we can in theory tolerate ULONG_MAX-1
readers.  So, the question is whether ULONG_MAX/4 readers can result
in the updater seeing ULONG_MAX reads, due to memory misordering and
other issues.

Because there are no memory barriers surrounding srcu_flip(), the updater
could miss an extremely large number of srcu_read_unlock()s.  However,
each missed srcu_read_unlock() must have a corresponding srcu_read_lock(),
and there is a full memory barrier between the srcu_flip() and
the read of the lock count.  There is also a full barrier between any
srcu_read_lock()'s increment of the lock count and that CPU's/task's next
srcu_read_lock()'s fetch of the index.  Therefore, if the updater misses
counting a given srcu_read_lock(), that CPU's/task's next srcu_read_lock()
must see the new value of the index.  Because srcu_read_lock() disables
preemption across the index fetch and the lock increment, there can be at
most NR_CPUS-1 srcu_read_lock() calls that missed the recent srcu_flip()'s
update to the index.  (I said NR_CPUS earlier, but Mathieu is correct
in pointing out that srcu_flip() has to run somewhere.)

The maximum number of locks that the updater can see is therefore:

o	ULONG_MAX/4 for a full set of missed srcu_read_unlock()s.

o	ULONG_MAX/4 for a full set of srcu_read_lock()s.

o	NR_CPUS-1 for a full set of subsequent srcu_read_lock()s that
	missed the flip.

This totals to ULONG_MAX/2+NR_CPUS-1.  So as long as there are no more
than ULONG_MAX/2 CPUs, we should be good.  And given that the biggest
system I have hard evidence of is 4K CPUs, we have ample headroom
compared to the ~2G value of ULONG_MAX/2, even on 32-bit systems.
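
(Illustrative arithmetic only, not from the original mail: a quick userspace
check of that worst case, taking NR_CPUS as the 4K figure above.)

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long nr_cpus = 4096;			/* biggest system cited */
	unsigned long missed_unlocks = ULONG_MAX / 4;	/* first item above */
	unsigned long new_locks = ULONG_MAX / 4;	/* second item above */
	unsigned long stale_locks = nr_cpus - 1;	/* third item above */
	unsigned long worst = missed_unlocks + new_locks + stale_locks;

	/* Roughly ULONG_MAX/2 + NR_CPUS - 1: comfortably below ULONG_MAX. */
	printf("worst case %lu vs ULONG_MAX %lu\n", worst, ULONG_MAX);
	return 0;
}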

So, what am I missing here?

							Thanx, Paul

> Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Signed-off-by: Lance Roy <ldr709@gmail.com>
> ---
>  include/linux/srcu.h    |   4 +-
>  kernel/rcu/rcutorture.c |  20 ++++++++-
>  kernel/rcu/srcu.c       | 116 ++++++++++++++++++------------------------------
>  3 files changed, 63 insertions(+), 77 deletions(-)
> 
> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> index dc8eb63..0caea34 100644
> --- a/include/linux/srcu.h
> +++ b/include/linux/srcu.h
> @@ -34,8 +34,8 @@
>  #include <linux/workqueue.h>
> 
>  struct srcu_struct_array {
> -	unsigned long c[2];
> -	unsigned long seq[2];
> +	unsigned long lock_count[2];
> +	unsigned long unlock_count[2];
>  };
> 
>  struct rcu_batch {
> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> index bf08fee..2450c61 100644
> --- a/kernel/rcu/rcutorture.c
> +++ b/kernel/rcu/rcutorture.c
> @@ -555,10 +555,26 @@ static void srcu_torture_stats(void)
>  	pr_alert("%s%s per-CPU(idx=%d):",
>  		 torture_type, TORTURE_FLAG, idx);
>  	for_each_possible_cpu(cpu) {
> +		unsigned long l0, l1;
> +		unsigned long u0, u1;
>  		long c0, c1;
> +		struct srcu_struct_array *counts =
> +			per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
> 
> -		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
> -		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
> +		u0 = counts->unlock_count[!idx];
> +		u1 = counts->unlock_count[idx];
> +
> +		/*
> +		 * Make sure that a lock is always counted if the corresponding
> +		 * unlock is counted.
> +		 */
> +		smp_rmb();
> +
> +		l0 = counts->lock_count[!idx];
> +		l1 = counts->lock_count[idx];
> +
> +		c0 = (long)(l0 - u0);
> +		c1 = (long)(l1 - u1);
>  		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
>  	}
>  	pr_cont("\n");
> diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
> index 9b9cdd5..38e9aae 100644
> --- a/kernel/rcu/srcu.c
> +++ b/kernel/rcu/srcu.c
> @@ -141,34 +141,38 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
>  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
> 
>  /*
> - * Returns approximate total of the readers' ->seq[] values for the
> + * Returns approximate total of the readers' ->lock_count[] values for the
>   * rank of per-CPU counters specified by idx.
>   */
> -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
> +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
>  {
>  	int cpu;
>  	unsigned long sum = 0;
>  	unsigned long t;
> 
>  	for_each_possible_cpu(cpu) {
> -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
> +		struct srcu_struct_array *cpu_counts =
> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> +		t = READ_ONCE(cpu_counts->lock_count[idx]);
>  		sum += t;
>  	}
>  	return sum;
>  }
> 
>  /*
> - * Returns approximate number of readers active on the specified rank
> - * of the per-CPU ->c[] counters.
> + * Returns approximate total of the readers' ->unlock_count[] values for the
> + * rank of per-CPU counters specified by idx.
>   */
> -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
>  {
>  	int cpu;
>  	unsigned long sum = 0;
>  	unsigned long t;
> 
>  	for_each_possible_cpu(cpu) {
> -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
> +		struct srcu_struct_array *cpu_counts =
> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> +		t = READ_ONCE(cpu_counts->unlock_count[idx]);
>  		sum += t;
>  	}
>  	return sum;
> @@ -176,79 +180,42 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> 
>  /*
>   * Return true if the number of pre-existing readers is determined to
> - * be stably zero.  An example unstable zero can occur if the call
> - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
> - * but due to task migration, sees the corresponding __srcu_read_unlock()
> - * decrement.  This can happen because srcu_readers_active_idx() takes
> - * time to sum the array, and might in fact be interrupted or preempted
> - * partway through the summation.
> + * be zero.
>   */
>  static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
>  {
> -	unsigned long seq;
> +	unsigned long unlocks;
> 
> -	seq = srcu_readers_seq_idx(sp, idx);
> +	unlocks = srcu_readers_unlock_idx(sp, idx);
> 
>  	/*
> -	 * The following smp_mb() A pairs with the smp_mb() B located in
> -	 * __srcu_read_lock().  This pairing ensures that if an
> -	 * __srcu_read_lock() increments its counter after the summation
> -	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
> -	 * critical section will see any changes made prior to the start
> -	 * of the current SRCU grace period.
> +	 * Make sure that a lock is always counted if the corresponding unlock
> +	 * is counted. Needs to be a smp_mb() as the read side may contain a
> +	 * read from a variable that is written to before the synchronize_srcu()
> +	 * in the write side. In this case smp_mb()s A and B act like the store
> +	 * buffering pattern.
>  	 *
> -	 * Also, if the above call to srcu_readers_seq_idx() saw the
> -	 * increment of ->seq[], then the call to srcu_readers_active_idx()
> -	 * must see the increment of ->c[].
> +	 * This smp_mb() also pairs with smp_mb() C to prevent writes after the
> +	 * synchronize_srcu() from being executed before the grace period ends.
>  	 */
>  	smp_mb(); /* A */
> 
>  	/*
> -	 * Note that srcu_readers_active_idx() can incorrectly return
> -	 * zero even though there is a pre-existing reader throughout.
> -	 * To see this, suppose that task A is in a very long SRCU
> -	 * read-side critical section that started on CPU 0, and that
> -	 * no other reader exists, so that the sum of the counters
> -	 * is equal to one.  Then suppose that task B starts executing
> -	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
> -	 * task C starts reading on CPU 0, so that its increment is not
> -	 * summed, but finishes reading on CPU 2, so that its decrement
> -	 * -is- summed.  Then when task B completes its sum, it will
> -	 * incorrectly get zero, despite the fact that task A has been
> -	 * in its SRCU read-side critical section the whole time.
> -	 *
> -	 * We therefore do a validation step should srcu_readers_active_idx()
> -	 * return zero.
> -	 */
> -	if (srcu_readers_active_idx(sp, idx) != 0)
> -		return false;
> -
> -	/*
> -	 * The remainder of this function is the validation step.
> -	 * The following smp_mb() D pairs with the smp_mb() C in
> -	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
> -	 * by srcu_readers_active_idx() above, then any destructive
> -	 * operation performed after the grace period will happen after
> -	 * the corresponding SRCU read-side critical section.
> +	 * If the locks are the same as the unlocks, then there must have
> +	 * been no readers on this index at some time in between. This does not
> +	 * mean that there are no more readers, as one could have read the
> +	 * current index but not yet have incremented the lock counter.
>  	 *
> -	 * Note that there can be at most NR_CPUS worth of readers using
> -	 * the old index, which is not enough to overflow even a 32-bit
> -	 * integer.  (Yes, this does mean that systems having more than
> -	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
> -	 * the sum of the ->seq[] counters cannot possibly overflow.
> -	 * Therefore, the only way that the return values of the two
> -	 * calls to srcu_readers_seq_idx() can be equal is if there were
> -	 * no increments of the corresponding rank of ->seq[] counts
> -	 * in the interim.  But the missed-increment scenario laid out
> -	 * above includes an increment of the ->seq[] counter by
> -	 * the corresponding __srcu_read_lock().  Therefore, if this
> -	 * scenario occurs, the return values from the two calls to
> -	 * srcu_readers_seq_idx() will differ, and thus the validation
> -	 * step below suffices.
> +	 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
> +	 * increments of ->lock_count[] since the unlocks were counted, meaning
> +	 * that this could return true even if there are still active readers.
> +	 * Since there are no memory barriers around srcu_flip(), the CPU is not
> +	 * required to increment ->completed before running
> +	 * srcu_readers_unlock_idx(), which means that there could be an
> +	 * arbitrarily large number of critical sections that execute after
> +	 * srcu_readers_unlock_idx() but use the old value of ->completed.
>  	 */
> -	smp_mb(); /* D */
> -
> -	return srcu_readers_seq_idx(sp, idx) == seq;
> +	return srcu_readers_lock_idx(sp, idx) == unlocks;
>  }
> 
>  /**
> @@ -266,8 +233,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
>  	unsigned long sum = 0;
> 
>  	for_each_possible_cpu(cpu) {
> -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
> -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
> +		struct srcu_struct_array *cpu_counts =
> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> +		sum += READ_ONCE(cpu_counts->lock_count[0]);
> +		sum += READ_ONCE(cpu_counts->lock_count[1]);
> +		sum -= READ_ONCE(cpu_counts->unlock_count[0]);
> +		sum -= READ_ONCE(cpu_counts->unlock_count[1]);
>  	}
>  	return sum;
>  }
> @@ -298,9 +269,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
>  	int idx;
> 
>  	idx = READ_ONCE(sp->completed) & 0x1;
> -	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
> +	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
>  	smp_mb(); /* B */  /* Avoid leaking the critical section. */
> -	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
>  	return idx;
>  }
>  EXPORT_SYMBOL_GPL(__srcu_read_lock);
> @@ -314,7 +284,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
>  void __srcu_read_unlock(struct srcu_struct *sp, int idx)
>  {
>  	smp_mb(); /* C */  /* Avoid leaking the critical section. */
> -	this_cpu_dec(sp->per_cpu_ref->c[idx]);
> +	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
>  }
>  EXPORT_SYMBOL_GPL(__srcu_read_unlock);
> 
> @@ -349,7 +319,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
> 
>  /*
>   * Increment the ->completed counter so that future SRCU readers will
> - * use the other rank of the ->c[] and ->seq[] arrays.  This allows
> + * use the other rank of the ->(un)lock_count[] arrays.  This allows
>   * us to wait for pre-existing readers in a starvation-free manner.
>   */
>  static void srcu_flip(struct srcu_struct *sp)
> -- 
> 2.9.0
> 


* Re: [RFC PATCH] SRCU: More efficient reader counts.
  2016-11-18 14:08 ` Paul E. McKenney
@ 2016-11-18 14:27   ` Mathieu Desnoyers
  2016-11-18 14:47     ` Paul E. McKenney
  2016-11-18 20:33   ` Lance Roy
  1 sibling, 1 reply; 8+ messages in thread
From: Mathieu Desnoyers @ 2016-11-18 14:27 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: ldr709, linux-kernel, Ingo Molnar, Lai Jiangshan, dipankar,
	Andrew Morton, Josh Triplett, Thomas Gleixner, Peter Zijlstra,
	rostedt, David Howells, Eric Dumazet, dvhart, fweisbec,
	Oleg Nesterov, bobby prani

----- On Nov 18, 2016, at 9:08 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:

> On Thu, Nov 17, 2016 at 02:03:35PM -0800, Lance Roy wrote:
>> SRCU uses two per-cpu counters: a nesting counter to count the number of
>> active critical sections, and a sequence counter to ensure that the nesting
>> counters don't change while they are being added together in
>> srcu_readers_active_idx_check().
>> 
>> This patch instead uses per-cpu lock and unlock counters. Because both
>> counters only increase and srcu_readers_active_idx_check() reads the unlock
>> counter before the lock counter, this achieves the same end without having
>> to increment two different counters in srcu_read_lock(). This also saves a
>> smp_mb() in srcu_readers_active_idx_check().
>> 
>> Possible bug: There is no guarantee that the lock counter won't overflow
>> during srcu_readers_active_idx_check(), as there are no memory barriers
>> around srcu_flip() (see comment in srcu_readers_active_idx_check() for
>> details). However, this problem was already present before this patch.
> 
> This patch differs from the previous one in a few (good) code-style
> changes, comment changes, and of course the commit log.  Good.  Once
> discussion converges, I will apply the agreed-upon commit, which might
> well be this one.
> 
> However, let's first take a look at the overflow issue.
> 
> If a given program could have ULONG_MAX or more readers at any given
> time, there would of course be overflow.  However, each read must have
> an srcu_read_lock() outstanding, and the resulting four-byte return
> value must be stored somewhere.  Because the full address space is at
> most ULONG_MAX, the maximum number of outstanding readers is at most
> ULONG_MAX/4, even in the degenerate case where a single CPU/task invokes
> srcu_read_lock() in a tight loop.  And even this assumes that the entire
> address space can somehow be devoted to srcu_read_lock() return values.
> ULONG_MAX/4 is therefore a hard upper bound on the number of outstanding
> SRCU readers.

The loop taking srcu_read_lock() seems a bit far-fetched.

More realistically, we could move this bound even lower
if we can expect each thread to only nest srcu up to a limited
amount (e.g. 128). The realistic limit on the number of SRCU readers
then becomes bounded by the maximum number of threads multiplied
by the srcu max nesting count.
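
(Hypothetical numbers, just to make that bound concrete:)

#include <stdio.h>

int main(void)
{
	unsigned long max_threads = 1UL << 22;	/* say, ~4M threads */
	unsigned long max_nesting = 128;	/* the example limit above */

	/* About 2^29 readers, far below ULONG_MAX/4 even on 32-bit. */
	printf("bound: %lu readers\n", max_threads * max_nesting);
	return 0;
}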

> 
> Now srcu_readers_active_idx_check() checks for strict equality between
> the number of locks and unlocks, so we can in theory tolerate ULONG_MAX-1
> readers.  So, the question is whether ULONG_MAX/4 readers can result
> in the updater seeing ULONG_MAX reads, due to memory misordering and
> other issues.
> 
> Because there are no memory barriers surrounding srcu_flip(), the updater
> could miss an extremely large number of srcu_read_unlock()s.  However,
> each missed srcu_read_unlock() must have a corresponding srcu_read_lock(),
> and there is a full memory barrier between the srcu_flip() and
> the read of the lock count.  There is also a full barrier between any
> srcu_read_lock()'s increment of the lock count and that CPU's/task's next
> srcu_read_lock()'s fetch of the index.  Therefore, if the updater misses
> counting a given srcu_read_lock(), that CPU's/task's next srcu_read_lock()
> must see the new value of the index.  Because srcu_read_lock() disables
> preemption across the index fetch and the lock increment, there can be at
> most NR_CPUS-1 srcu_read_lock() calls that missed the recent srcu_flip()'s
> update to the index.  (I said NR_CPUS earlier, but Mathieu is correct
> in pointing out that srcu_flip() has to run somewhere.)
> 
> The maximum number of locks that the updater can see is therefore:
> 
> o	ULONG_MAX/4 for a full set of missed srcu_read_unlock()s.
> 
> o	ULONG_MAX/4 for a full set of srcu_read_lock()s.
> 
> o	NR_CPUS-1 for a full set of subsequent srcu_read_lock()s that
>	missed the flip.
> 
> This totals to ULONG_MAX/2+NR_CPUS-1.  So as long as there are no more
> than ULONG_MAX/2 CPUs, we should be good.  And given that the biggest
> system I have hard evidence of is 4K CPUs, we have ample headroom
> compared to the ~2G value of ULONG_MAX/2, even on 32-bit systems.
> 
> So, what am I missing here?

Your analysis makes sense.

An alternative approach would be to document some limits on the allowed
nesting count for a given SRCU domain that should be expected from a thread,
and use that as an even lower upper bound on the number of concurrent
SRCU readers, which would allow us to increase the number of supported
CPUs accordingly on 32-bit systems.

Thanks,

Mathieu

> 
>							Thanx, Paul
> 
>> Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>> Signed-off-by: Lance Roy <ldr709@gmail.com>
>> ---
>>  include/linux/srcu.h    |   4 +-
>>  kernel/rcu/rcutorture.c |  20 ++++++++-
>>  kernel/rcu/srcu.c       | 116 ++++++++++++++++++------------------------------
>>  3 files changed, 63 insertions(+), 77 deletions(-)
>> 
>> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
>> index dc8eb63..0caea34 100644
>> --- a/include/linux/srcu.h
>> +++ b/include/linux/srcu.h
>> @@ -34,8 +34,8 @@
>>  #include <linux/workqueue.h>
>> 
>>  struct srcu_struct_array {
>> -	unsigned long c[2];
>> -	unsigned long seq[2];
>> +	unsigned long lock_count[2];
>> +	unsigned long unlock_count[2];
>>  };
>> 
>>  struct rcu_batch {
>> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
>> index bf08fee..2450c61 100644
>> --- a/kernel/rcu/rcutorture.c
>> +++ b/kernel/rcu/rcutorture.c
>> @@ -555,10 +555,26 @@ static void srcu_torture_stats(void)
>>  	pr_alert("%s%s per-CPU(idx=%d):",
>>  		 torture_type, TORTURE_FLAG, idx);
>>  	for_each_possible_cpu(cpu) {
>> +		unsigned long l0, l1;
>> +		unsigned long u0, u1;
>>  		long c0, c1;
>> +		struct srcu_struct_array *counts =
>> +			per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
>> 
>> -		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
>> -		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
>> +		u0 = counts->unlock_count[!idx];
>> +		u1 = counts->unlock_count[idx];
>> +
>> +		/*
>> +		 * Make sure that a lock is always counted if the corresponding
>> +		 * unlock is counted.
>> +		 */
>> +		smp_rmb();
>> +
>> +		l0 = counts->lock_count[!idx];
>> +		l1 = counts->lock_count[idx];
>> +
>> +		c0 = (long)(l0 - u0);
>> +		c1 = (long)(l1 - u1);
>>  		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
>>  	}
>>  	pr_cont("\n");
>> diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
>> index 9b9cdd5..38e9aae 100644
>> --- a/kernel/rcu/srcu.c
>> +++ b/kernel/rcu/srcu.c
>> @@ -141,34 +141,38 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
>>  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
>> 
>>  /*
>> - * Returns approximate total of the readers' ->seq[] values for the
>> + * Returns approximate total of the readers' ->lock_count[] values for the
>>   * rank of per-CPU counters specified by idx.
>>   */
>> -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
>> +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
>>  {
>>  	int cpu;
>>  	unsigned long sum = 0;
>>  	unsigned long t;
>> 
>>  	for_each_possible_cpu(cpu) {
>> -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
>> +		struct srcu_struct_array *cpu_counts =
>> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
>> +		t = READ_ONCE(cpu_counts->lock_count[idx]);
>>  		sum += t;
>>  	}
>>  	return sum;
>>  }
>> 
>>  /*
>> - * Returns approximate number of readers active on the specified rank
>> - * of the per-CPU ->c[] counters.
>> + * Returns approximate total of the readers' ->unlock_count[] values for the
>> + * rank of per-CPU counters specified by idx.
>>   */
>> -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
>> +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
>>  {
>>  	int cpu;
>>  	unsigned long sum = 0;
>>  	unsigned long t;
>> 
>>  	for_each_possible_cpu(cpu) {
>> -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
>> +		struct srcu_struct_array *cpu_counts =
>> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
>> +		t = READ_ONCE(cpu_counts->unlock_count[idx]);
>>  		sum += t;
>>  	}
>>  	return sum;
>> @@ -176,79 +180,42 @@ static unsigned long srcu_readers_active_idx(struct
>> srcu_struct *sp, int idx)
>> 
>>  /*
>>   * Return true if the number of pre-existing readers is determined to
>> - * be stably zero.  An example unstable zero can occur if the call
>> - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
>> - * but due to task migration, sees the corresponding __srcu_read_unlock()
>> - * decrement.  This can happen because srcu_readers_active_idx() takes
>> - * time to sum the array, and might in fact be interrupted or preempted
>> - * partway through the summation.
>> + * be zero.
>>   */
>>  static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
>>  {
>> -	unsigned long seq;
>> +	unsigned long unlocks;
>> 
>> -	seq = srcu_readers_seq_idx(sp, idx);
>> +	unlocks = srcu_readers_unlock_idx(sp, idx);
>> 
>>  	/*
>> -	 * The following smp_mb() A pairs with the smp_mb() B located in
>> -	 * __srcu_read_lock().  This pairing ensures that if an
>> -	 * __srcu_read_lock() increments its counter after the summation
>> -	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
>> -	 * critical section will see any changes made prior to the start
>> -	 * of the current SRCU grace period.
>> +	 * Make sure that a lock is always counted if the corresponding unlock
>> +	 * is counted. Needs to be a smp_mb() as the read side may contain a
>> +	 * read from a variable that is written to before the synchronize_srcu()
>> +	 * in the write side. In this case smp_mb()s A and B act like the store
>> +	 * buffering pattern.
>>  	 *
>> -	 * Also, if the above call to srcu_readers_seq_idx() saw the
>> -	 * increment of ->seq[], then the call to srcu_readers_active_idx()
>> -	 * must see the increment of ->c[].
>> +	 * This smp_mb() also pairs with smp_mb() C to prevent writes after the
>> +	 * synchronize_srcu() from being executed before the grace period ends.
>>  	 */
>>  	smp_mb(); /* A */
>> 
>>  	/*
>> -	 * Note that srcu_readers_active_idx() can incorrectly return
>> -	 * zero even though there is a pre-existing reader throughout.
>> -	 * To see this, suppose that task A is in a very long SRCU
>> -	 * read-side critical section that started on CPU 0, and that
>> -	 * no other reader exists, so that the sum of the counters
>> -	 * is equal to one.  Then suppose that task B starts executing
>> -	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
>> -	 * task C starts reading on CPU 0, so that its increment is not
>> -	 * summed, but finishes reading on CPU 2, so that its decrement
>> -	 * -is- summed.  Then when task B completes its sum, it will
>> -	 * incorrectly get zero, despite the fact that task A has been
>> -	 * in its SRCU read-side critical section the whole time.
>> -	 *
>> -	 * We therefore do a validation step should srcu_readers_active_idx()
>> -	 * return zero.
>> -	 */
>> -	if (srcu_readers_active_idx(sp, idx) != 0)
>> -		return false;
>> -
>> -	/*
>> -	 * The remainder of this function is the validation step.
>> -	 * The following smp_mb() D pairs with the smp_mb() C in
>> -	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
>> -	 * by srcu_readers_active_idx() above, then any destructive
>> -	 * operation performed after the grace period will happen after
>> -	 * the corresponding SRCU read-side critical section.
>> +	 * If the locks are the same as the unlocks, then there must have
>> +	 * been no readers on this index at some time in between. This does not
>> +	 * mean that there are no more readers, as one could have read the
>> +	 * current index but not yet have incremented the lock counter.
>>  	 *
>> -	 * Note that there can be at most NR_CPUS worth of readers using
>> -	 * the old index, which is not enough to overflow even a 32-bit
>> -	 * integer.  (Yes, this does mean that systems having more than
>> -	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
>> -	 * the sum of the ->seq[] counters cannot possibly overflow.
>> -	 * Therefore, the only way that the return values of the two
>> -	 * calls to srcu_readers_seq_idx() can be equal is if there were
>> -	 * no increments of the corresponding rank of ->seq[] counts
>> -	 * in the interim.  But the missed-increment scenario laid out
>> -	 * above includes an increment of the ->seq[] counter by
>> -	 * the corresponding __srcu_read_lock().  Therefore, if this
>> -	 * scenario occurs, the return values from the two calls to
>> -	 * srcu_readers_seq_idx() will differ, and thus the validation
>> -	 * step below suffices.
>> +	 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
>> +	 * increments of ->lock_count[] since the unlocks were counted, meaning
>> +	 * that this could return true even if there are still active readers.
>> +	 * Since there are no memory barriers around srcu_flip(), the CPU is not
>> +	 * required to increment ->completed before running
>> +	 * srcu_readers_unlock_idx(), which means that there could be an
>> +	 * arbitrarily large number of critical sections that execute after
>> +	 * srcu_readers_unlock_idx() but use the old value of ->completed.
>>  	 */
>> -	smp_mb(); /* D */
>> -
>> -	return srcu_readers_seq_idx(sp, idx) == seq;
>> +	return srcu_readers_lock_idx(sp, idx) == unlocks;
>>  }
>> 
>>  /**
>> @@ -266,8 +233,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
>>  	unsigned long sum = 0;
>> 
>>  	for_each_possible_cpu(cpu) {
>> -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
>> -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
>> +		struct srcu_struct_array *cpu_counts =
>> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
>> +		sum += READ_ONCE(cpu_counts->lock_count[0]);
>> +		sum += READ_ONCE(cpu_counts->lock_count[1]);
>> +		sum -= READ_ONCE(cpu_counts->unlock_count[0]);
>> +		sum -= READ_ONCE(cpu_counts->unlock_count[1]);
>>  	}
>>  	return sum;
>>  }
>> @@ -298,9 +269,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
>>  	int idx;
>> 
>>  	idx = READ_ONCE(sp->completed) & 0x1;
>> -	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
>> +	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
>>  	smp_mb(); /* B */  /* Avoid leaking the critical section. */
>> -	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
>>  	return idx;
>>  }
>>  EXPORT_SYMBOL_GPL(__srcu_read_lock);
>> @@ -314,7 +284,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
>>  void __srcu_read_unlock(struct srcu_struct *sp, int idx)
>>  {
>>  	smp_mb(); /* C */  /* Avoid leaking the critical section. */
>> -	this_cpu_dec(sp->per_cpu_ref->c[idx]);
>> +	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
>>  }
>>  EXPORT_SYMBOL_GPL(__srcu_read_unlock);
>> 
>> @@ -349,7 +319,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx,
>> int trycount)
>> 
>>  /*
>>   * Increment the ->completed counter so that future SRCU readers will
>> - * use the other rank of the ->c[] and ->seq[] arrays.  This allows
>> + * use the other rank of the ->(un)lock_count[] arrays.  This allows
>>   * us to wait for pre-existing readers in a starvation-free manner.
>>   */
>>  static void srcu_flip(struct srcu_struct *sp)
>> --
>> 2.9.0

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH] SRCU: More efficient reader counts.
  2016-11-18 14:27   ` Mathieu Desnoyers
@ 2016-11-18 14:47     ` Paul E. McKenney
  0 siblings, 0 replies; 8+ messages in thread
From: Paul E. McKenney @ 2016-11-18 14:47 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: ldr709, linux-kernel, Ingo Molnar, Lai Jiangshan, dipankar,
	Andrew Morton, Josh Triplett, Thomas Gleixner, Peter Zijlstra,
	rostedt, David Howells, Eric Dumazet, dvhart, fweisbec,
	Oleg Nesterov, bobby prani

On Fri, Nov 18, 2016 at 02:27:08PM +0000, Mathieu Desnoyers wrote:
> ----- On Nov 18, 2016, at 9:08 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:
> 
> > On Thu, Nov 17, 2016 at 02:03:35PM -0800, Lance Roy wrote:
> >> SRCU uses two per-cpu counters: a nesting counter to count the number of
> >> active critical sections, and a sequence counter to ensure that the nesting
> >> counters don't change while they are being added together in
> >> srcu_readers_active_idx_check().
> >> 
> >> This patch instead uses per-cpu lock and unlock counters. Because both
> >> counters only increase and srcu_readers_active_idx_check() reads the unlock
> >> counter before the lock counter, this achieves the same end without having
> >> to increment two different counters in srcu_read_lock(). This also saves a
> >> smp_mb() in srcu_readers_active_idx_check().
> >> 
> >> Possible bug: There is no guarantee that the lock counter won't overflow
> >> during srcu_readers_active_idx_check(), as there are no memory barriers
> >> around srcu_flip() (see comment in srcu_readers_active_idx_check() for
> >> details). However, this problem was already present before this patch.
> > 
> > This patch differs from the previous one in a few (good) code-style
> > changes, comment changes, and of course the commit log.  Good.  Once
> > discussion converges, I will apply the agreed-upon commit, which might
> > well be this one.
> > 
> > However, let's first take a look at the overflow issue.
> > 
> > If a given program could have ULONG_MAX or more readers at any given
> > time, there would of course be overflow.  However, each read must have
> > an srcu_read_lock() outstanding, and the resulting four-byte return
> > value must be stored somewhere.  Because the full address space is at
> > most ULONG_MAX, the maximum number of outstanding readers is at most
> > ULONG_MAX/4, even in the degenerate case where a single CPU/task invokes
> > srcu_read_lock() in a tight loop.  And even this assumes that the entire
> > address space can somehow be devoted to srcu_read_lock() return values.
> > ULONG_MAX/4 is therefore a hard upper bound on the number of outstanding
> > SRCU readers.
> 
> The loop taking srcu_read_lock() seems a bit far-fetched.

True, but it is a good worst-case scenario, so it is worth seeing where
it goes.

> More realistically, we could move this bound even lower
> if we can expect each thread to only nest srcu up to a limited
> amount (e.g. 128). The realistic limit on the number of SRCU readers
> then becomes bounded by the maximum number of threads multiplied
> by the srcu max nesting count.

A per-thread limit would make sense.  The thought is to enforce this
limit using lockdep or some such?  (Without some sort of enforcement,
setting a limit isn't really helpful.)

> > Now srcu_readers_active_idx_check() checks for strict equality between
> > the number of locks and unlocks, so we can in theory tolerate ULONG_MAX-1
> > readers.  So, the question is whether ULONG_MAX/4 readers can result
> > in the updater seeing ULONG_MAX reads, due to memory misordering and
> > other issues.
> > 
> > Because there are no memory barriers surrounding srcu_flip(), the updater
> > could miss an extremely large number of srcu_read_unlock()s.  However,
> > each missed srcu_read_unlock() must have a corresponding srcu_read_lock(),
> > and there is a full memory barrier between the srcu_flip() and
> > the read of the lock count.  There is also a full barrier between any
> > srcu_read_lock()'s increment of the lock count and that CPU's/task's next
> > srcu_read_lock()'s fetch of the index.  Therefore, if the updater misses
> > counting a given srcu_read_lock(), that CPU's/task's next srcu_read_lock()
> > must see the new value of the index.  Because srcu_read_lock() disables
> > preemption across the index fetch and the lock increment, there can be at
> > most NR_CPUS-1 srcu_read_lock() calls that missed the recent srcu_flip()'s
> > update to the index.  (I said NR_CPUS earlier, but Mathieu is correct
> > in pointing out that srcu_flip() has to run somewhere.)
> > 
> > The maximum number of locks that the updater can see is therefore:
> > 
> > o	ULONG_MAX/4 for a full set of missed srcu_read_unlock()s.
> > 
> > o	ULONG_MAX/4 for a full set of srcu_read_lock()s.
> > 
> > o	NR_CPUS-1 for a full set of subsequent srcu_read_lock()s that
> >	missed the flip.
> > 
> > This totals to ULONG_MAX/2+NR_CPUS-1.  So as long as there are no more
> > than ULONG_MAX/2 CPUs, we should be good.  And given that the biggest
> > system I have hard evidence of is 4K CPUs, we have ample headroom
> > compared to the ~2G value of ULONG_MAX/2, even on 32-bit systems.
> > 
> > So, what am I missing here?
> 
> Your analysis makes sense.

OK, so what is it that both of us are missing here?  ;-)

> An alternative approach would be to document some limits on the allowed
> nesting count for a given SRCU domain that should be expected from a thread,
> and use that as an even lower upper bound on the number of concurrent
> SRCU readers, which would allow us to increase the number of supported
> CPUs accordingly on 32-bit systems.

Do you really believe that there will be a 32-bit system with more than
even one million CPUs anytime soon?  Because the code can tolerate a bit
more than two billion.  ;-)

That said, I have no problem with an enforced limit on srcu_read_lock()
nesting as a debugging aid.  But don't we already have a limit inherent
in lockdep's limited number of array entries for tracking acquisitions?
Currently 2000 rather than 128, but seems close enough.  It looks to
me that lockdep dumps out the acquisitions when this limit is exceeded,
so the information would be there.

Thoughts?

							Thanx, Paul

> Thanks,
> 
> Mathieu
> 
> > 
> >							Thanx, Paul
> > 
> >> Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> >> Signed-off-by: Lance Roy <ldr709@gmail.com>
> >> ---
> >>  include/linux/srcu.h    |   4 +-
> >>  kernel/rcu/rcutorture.c |  20 ++++++++-
> >>  kernel/rcu/srcu.c       | 116 ++++++++++++++++++------------------------------
> >>  3 files changed, 63 insertions(+), 77 deletions(-)
> >> 
> >> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> >> index dc8eb63..0caea34 100644
> >> --- a/include/linux/srcu.h
> >> +++ b/include/linux/srcu.h
> >> @@ -34,8 +34,8 @@
> >>  #include <linux/workqueue.h>
> >> 
> >>  struct srcu_struct_array {
> >> -	unsigned long c[2];
> >> -	unsigned long seq[2];
> >> +	unsigned long lock_count[2];
> >> +	unsigned long unlock_count[2];
> >>  };
> >> 
> >>  struct rcu_batch {
> >> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> >> index bf08fee..2450c61 100644
> >> --- a/kernel/rcu/rcutorture.c
> >> +++ b/kernel/rcu/rcutorture.c
> >> @@ -555,10 +555,26 @@ static void srcu_torture_stats(void)
> >>  	pr_alert("%s%s per-CPU(idx=%d):",
> >>  		 torture_type, TORTURE_FLAG, idx);
> >>  	for_each_possible_cpu(cpu) {
> >> +		unsigned long l0, l1;
> >> +		unsigned long u0, u1;
> >>  		long c0, c1;
> >> +		struct srcu_struct_array *counts =
> >> +			per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
> >> 
> >> -		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
> >> -		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
> >> +		u0 = counts->unlock_count[!idx];
> >> +		u1 = counts->unlock_count[idx];
> >> +
> >> +		/*
> >> +		 * Make sure that a lock is always counted if the corresponding
> >> +		 * unlock is counted.
> >> +		 */
> >> +		smp_rmb();
> >> +
> >> +		l0 = counts->lock_count[!idx];
> >> +		l1 = counts->lock_count[idx];
> >> +
> >> +		c0 = (long)(l0 - u0);
> >> +		c1 = (long)(l1 - u1);
> >>  		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
> >>  	}
> >>  	pr_cont("\n");
> >> diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
> >> index 9b9cdd5..38e9aae 100644
> >> --- a/kernel/rcu/srcu.c
> >> +++ b/kernel/rcu/srcu.c
> >> @@ -141,34 +141,38 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
> >>  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
> >> 
> >>  /*
> >> - * Returns approximate total of the readers' ->seq[] values for the
> >> + * Returns approximate total of the readers' ->lock_count[] values for the
> >>   * rank of per-CPU counters specified by idx.
> >>   */
> >> -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
> >> +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
> >>  {
> >>  	int cpu;
> >>  	unsigned long sum = 0;
> >>  	unsigned long t;
> >> 
> >>  	for_each_possible_cpu(cpu) {
> >> -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
> >> +		struct srcu_struct_array *cpu_counts =
> >> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> >> +		t = READ_ONCE(cpu_counts->lock_count[idx]);
> >>  		sum += t;
> >>  	}
> >>  	return sum;
> >>  }
> >> 
> >>  /*
> >> - * Returns approximate number of readers active on the specified rank
> >> - * of the per-CPU ->c[] counters.
> >> + * Returns approximate total of the readers' ->unlock_count[] values for the
> >> + * rank of per-CPU counters specified by idx.
> >>   */
> >> -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> >> +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
> >>  {
> >>  	int cpu;
> >>  	unsigned long sum = 0;
> >>  	unsigned long t;
> >> 
> >>  	for_each_possible_cpu(cpu) {
> >> -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
> >> +		struct srcu_struct_array *cpu_counts =
> >> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> >> +		t = READ_ONCE(cpu_counts->unlock_count[idx]);
> >>  		sum += t;
> >>  	}
> >>  	return sum;
> >> @@ -176,79 +180,42 @@ static unsigned long srcu_readers_active_idx(struct
> >> srcu_struct *sp, int idx)
> >> 
> >>  /*
> >>   * Return true if the number of pre-existing readers is determined to
> >> - * be stably zero.  An example unstable zero can occur if the call
> >> - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
> >> - * but due to task migration, sees the corresponding __srcu_read_unlock()
> >> - * decrement.  This can happen because srcu_readers_active_idx() takes
> >> - * time to sum the array, and might in fact be interrupted or preempted
> >> - * partway through the summation.
> >> + * be zero.
> >>   */
> >>  static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
> >>  {
> >> -	unsigned long seq;
> >> +	unsigned long unlocks;
> >> 
> >> -	seq = srcu_readers_seq_idx(sp, idx);
> >> +	unlocks = srcu_readers_unlock_idx(sp, idx);
> >> 
> >>  	/*
> >> -	 * The following smp_mb() A pairs with the smp_mb() B located in
> >> -	 * __srcu_read_lock().  This pairing ensures that if an
> >> -	 * __srcu_read_lock() increments its counter after the summation
> >> -	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
> >> -	 * critical section will see any changes made prior to the start
> >> -	 * of the current SRCU grace period.
> >> +	 * Make sure that a lock is always counted if the corresponding unlock
> >> +	 * is counted. Needs to be a smp_mb() as the read side may contain a
> >> +	 * read from a variable that is written to before the synchronize_srcu()
> >> +	 * in the write side. In this case smp_mb()s A and B act like the store
> >> +	 * buffering pattern.
> >>  	 *
> >> -	 * Also, if the above call to srcu_readers_seq_idx() saw the
> >> -	 * increment of ->seq[], then the call to srcu_readers_active_idx()
> >> -	 * must see the increment of ->c[].
> >> +	 * This smp_mb() also pairs with smp_mb() C to prevent writes after the
> >> +	 * synchronize_srcu() from being executed before the grace period ends.
> >>  	 */
> >>  	smp_mb(); /* A */
> >> 
> >>  	/*
> >> -	 * Note that srcu_readers_active_idx() can incorrectly return
> >> -	 * zero even though there is a pre-existing reader throughout.
> >> -	 * To see this, suppose that task A is in a very long SRCU
> >> -	 * read-side critical section that started on CPU 0, and that
> >> -	 * no other reader exists, so that the sum of the counters
> >> -	 * is equal to one.  Then suppose that task B starts executing
> >> -	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
> >> -	 * task C starts reading on CPU 0, so that its increment is not
> >> -	 * summed, but finishes reading on CPU 2, so that its decrement
> >> -	 * -is- summed.  Then when task B completes its sum, it will
> >> -	 * incorrectly get zero, despite the fact that task A has been
> >> -	 * in its SRCU read-side critical section the whole time.
> >> -	 *
> >> -	 * We therefore do a validation step should srcu_readers_active_idx()
> >> -	 * return zero.
> >> -	 */
> >> -	if (srcu_readers_active_idx(sp, idx) != 0)
> >> -		return false;
> >> -
> >> -	/*
> >> -	 * The remainder of this function is the validation step.
> >> -	 * The following smp_mb() D pairs with the smp_mb() C in
> >> -	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
> >> -	 * by srcu_readers_active_idx() above, then any destructive
> >> -	 * operation performed after the grace period will happen after
> >> -	 * the corresponding SRCU read-side critical section.
> >> +	 * If the locks are the same as the unlocks, then there must have
> >> +	 * been no readers on this index at some time in between. This does not
> >> +	 * mean that there are no more readers, as one could have read the
> >> +	 * current index but not yet have incremented the lock counter.
> >>  	 *
> >> -	 * Note that there can be at most NR_CPUS worth of readers using
> >> -	 * the old index, which is not enough to overflow even a 32-bit
> >> -	 * integer.  (Yes, this does mean that systems having more than
> >> -	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
> >> -	 * the sum of the ->seq[] counters cannot possibly overflow.
> >> -	 * Therefore, the only way that the return values of the two
> >> -	 * calls to srcu_readers_seq_idx() can be equal is if there were
> >> -	 * no increments of the corresponding rank of ->seq[] counts
> >> -	 * in the interim.  But the missed-increment scenario laid out
> >> -	 * above includes an increment of the ->seq[] counter by
> >> -	 * the corresponding __srcu_read_lock().  Therefore, if this
> >> -	 * scenario occurs, the return values from the two calls to
> >> -	 * srcu_readers_seq_idx() will differ, and thus the validation
> >> -	 * step below suffices.
> >> +	 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
> >> +	 * increments of ->lock_count[] since the unlocks were counted, meaning
> >> +	 * that this could return true even if there are still active readers.
> >> +	 * Since there are no memory barriers around srcu_flip(), the CPU is not
> >> +	 * required to increment ->completed before running
> >> +	 * srcu_readers_unlock_idx(), which means that there could be an
> >> +	 * arbitrarily large number of critical sections that execute after
> >> +	 * srcu_readers_unlock_idx() but use the old value of ->completed.
> >>  	 */
> >> -	smp_mb(); /* D */
> >> -
> >> -	return srcu_readers_seq_idx(sp, idx) == seq;
> >> +	return srcu_readers_lock_idx(sp, idx) == unlocks;
> >>  }
> >> 
> >>  /**
> >> @@ -266,8 +233,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
> >>  	unsigned long sum = 0;
> >> 
> >>  	for_each_possible_cpu(cpu) {
> >> -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
> >> -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
> >> +		struct srcu_struct_array *cpu_counts =
> >> +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> >> +		sum += READ_ONCE(cpu_counts->lock_count[0]);
> >> +		sum += READ_ONCE(cpu_counts->lock_count[1]);
> >> +		sum -= READ_ONCE(cpu_counts->unlock_count[0]);
> >> +		sum -= READ_ONCE(cpu_counts->unlock_count[1]);
> >>  	}
> >>  	return sum;
> >>  }
> >> @@ -298,9 +269,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
> >>  	int idx;
> >> 
> >>  	idx = READ_ONCE(sp->completed) & 0x1;
> >> -	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
> >> +	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
> >>  	smp_mb(); /* B */  /* Avoid leaking the critical section. */
> >> -	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
> >>  	return idx;
> >>  }
> >>  EXPORT_SYMBOL_GPL(__srcu_read_lock);
> >> @@ -314,7 +284,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
> >>  void __srcu_read_unlock(struct srcu_struct *sp, int idx)
> >>  {
> >>  	smp_mb(); /* C */  /* Avoid leaking the critical section. */
> >> -	this_cpu_dec(sp->per_cpu_ref->c[idx]);
> >> +	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
> >>  }
> >>  EXPORT_SYMBOL_GPL(__srcu_read_unlock);
> >> 
> >> @@ -349,7 +319,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx,
> >> int trycount)
> >> 
> >>  /*
> >>   * Increment the ->completed counter so that future SRCU readers will
> >> - * use the other rank of the ->c[] and ->seq[] arrays.  This allows
> >> + * use the other rank of the ->(un)lock_count[] arrays.  This allows
> >>   * us to wait for pre-existing readers in a starvation-free manner.
> >>   */
> >>  static void srcu_flip(struct srcu_struct *sp)
> >> --
> >> 2.9.0
> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] SRCU: More efficient reader counts.
  2016-11-18 14:08 ` Paul E. McKenney
  2016-11-18 14:27   ` Mathieu Desnoyers
@ 2016-11-18 20:33   ` Lance Roy
  2016-11-18 23:13     ` Paul E. McKenney
  1 sibling, 1 reply; 8+ messages in thread
From: Lance Roy @ 2016-11-18 20:33 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, dvhart, fweisbec, oleg, bobby.prani

On Fri, 18 Nov 2016 06:08:45 -0800
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> However, let's first take a look at the overflow issue.
> 
> If a given program could have ULONG_MAX or more readers at any given
> time, there would of course be overflow.  However, each read must have
> an srcu_read_lock() outstanding, and the resulting four-byte return
> value must be stored somewhere.  Because the full address space is at
> most ULONG_MAX, the maximum number of outstanding readers is at most
> ULONG_MAX/4, even in the degenerate case where a single CPU/task invokes
> srcu_read_lock() in a tight loop.  And even this assumes that the entire
> address space can somehow be devoted to srcu_read_lock() return values.
> ULONG_MAX/4 is therefore a hard upper bound on the number of outstanding
> SRCU readers.
I agree that there should be at most ULONG_MAX/4 active readers at a time, as
long as the compiler doesn't do something crazy like noticing that
srcu_read_lock() always returns 0 or 1 and then packing the return values into
smaller variables.

> Now srcu_readers_active_idx_check() checks for strict equality between
> the number of locks and unlocks, so we can in theory tolerate ULONG_MAX-1
> readers.  So, the question is whether ULONG_MAX/4 readers can result
> in the updater seeing ULONG_MAX reads, due to memory misordering and
> other issues.
> 
> Because there are no memory barriers surrounding srcu_flip(), the updater
> could miss an extremely large number of srcu_read_unlock()s.  However,
> each missed srcu_read_unlock() must have a corresponding srcu_read_lock(),
> and there is a full memory barrier between the srcu_flip() and
> the read of the lock count.  There is also a full barrier between any
> srcu_read_lock()'s increment of the lock count and that CPU's/task's next
> srcu_read_lock()'s fetch of the index.  Therefore, if the updater misses
> counting a given srcu_read_lock(), that CPU's/task's next srcu_read_lock()
> must see the new value of the index.  Because srcu_read_lock() disables
> preemption across the index fetch and the lock increment, there can be at
> most NR_CPUS-1 srcu_read_lock() calls that missed the recent srcu_flip()'s
> update to the index.  (I said NR_CPUS earlier, but Mathieu is correct
> in pointing out that srcu_flip() has to run somewhere.)
The trouble is that disabling preemption is not enough to ensure that there
is at most one srcu_read_lock() call per CPU that missed the srcu_flip().

Define a reader to be an SRCU lock+unlock pair. A reader is called active if it
has incremented ->lock_count[] but hasn't incremented ->unlock_count[] yet, and
completed if it has incremented ->unlock_count[]. I think that we only want to
limit the number of active readers and the number of CPUs. In particular, I
don't think there should be a limit on the rate of completion of read side
critical sections.

The goal of srcu_readers_active_idx_check() is to verify that there were zero
active readers on the inactive index at some time during its execution. To do
this, it totals the unlock counts, executes a memory barrier, totals the lock
counts, and takes the difference. This difference counts the readers that are
active when srcu_readers_lock_idx() gets to their CPU, plus the readers that
completed after srcu_readers_unlock_idx() and before srcu_readers_lock_idx().
If the true (infinite precision) value of the difference is zero, then there
were no active readers at some point while srcu_readers_lock_idx() is running.
However, the difference is only stored in a long, so there is a potential for
overflow if too many readers complete during srcu_readers_active_idx_check().
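
In code, that check looks roughly like the following (a sketch modeled on the
patch's srcu_readers_unlock_idx()/srcu_readers_lock_idx() helpers, not the
exact kernel source):

static bool srcu_readers_quiesced_sketch(struct srcu_struct *sp, int idx)
{
	unsigned long unlocks = 0, locks = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		unlocks += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->unlock_count[idx]);

	smp_mb(); /* Read the unlock counts before the lock counts. */

	for_each_possible_cpu(cpu)
		locks += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->lock_count[idx]);

	/*
	 * The difference counts readers still active when their CPU was
	 * visited plus readers that completed between the two sums, so it
	 * can wrap if on the order of ULONG_MAX readers complete in between.
	 */
	return (long)(locks - unlocks) == 0;
}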

For example, let's say there are three threads, each running on their own CPU:

int data, flag;
struct srcu_struct *sp = /* ... */;

Thread 0:
	data = 1;
	synchronize_srcu(sp);
	flag = 1;

Thread 1:
	int data_r, flag_r;
	int idx = srcu_read_lock(sp);
	data_r = data;
	flag_r = flag;
	srcu_read_unlock(sp, idx);
	BUG_ON(flag_r == 1 && data_r == 0);

Thread 2:
	while (true) {
		int idx = srcu_read_lock(sp);
		srcu_read_unlock(sp, idx);
	}

Let's take the following execution order. Thread 1 increments
the CPU 1 version of sp->lock_count[0], sets idx to zero, and loads data (0)
into data_r. Thread 0 then sets data to be 1, verifies that there are no
readers on index 1, and increments sp->completed, but the CPU actually doesn't
perform the last operation, putting it off until the next memory barrier. Thread
0 then calls srcu_readers_active_idx_check() on index 0, which runs
srcu_readers_unlock_idx() (returning 0). Right after srcu_readers_unlock_idx()
completes, thread 2 starts running. Since Thread 0 hasn't actually incremented
sp->completed in any way that is visible to thread 2, srcu_read_lock() will
still return 0. Thread 2 can then run for ULONG_MAX iterations, setting
the CPU 2 version of sp->unlock_count[0] to ULONG_MAX. CPU 0 then finally gets
around to incrementing sp->completed, runs its memory barrier, and then reads
the lock counts: 1, 0, ULONG_MAX. The total of ULONG_MAX+1 will overflow to 0
and compare equal with the earlier unlock count. Thread 0 will then think that the
grace period is over and set flag to one. Thread 1 can then read flag (1) into
flag_r and run srcu_read_unlock(). The BUG_ON statement will then fail.
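
Spelling out the final comparison as plain unsigned arithmetic (sketch):

	unsigned long unlocks = 0;			/* summed before the flip became visible */
	unsigned long locks   = 1UL + 0UL + ULONG_MAX;	/* CPU 1 + CPU 0 + CPU 2, wraps to 0 */
	/* locks == unlocks, so the check wrongly reports that the grace period can end. */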

Although ULONG_MAX readers completed during srcu_readers_active_idx_check(),
there were at most 2 active readers at any time, so this program doesn't run
into any limit.

I hope that was clear enough.

Thanks,
Lance

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] SRCU: More efficient reader counts.
  2016-11-18 20:33   ` Lance Roy
@ 2016-11-18 23:13     ` Paul E. McKenney
  2016-11-19  0:54       ` Lance Roy
  0 siblings, 1 reply; 8+ messages in thread
From: Paul E. McKenney @ 2016-11-18 23:13 UTC (permalink / raw)
  To: Lance Roy
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, dvhart, fweisbec, oleg, bobby.prani

On Fri, Nov 18, 2016 at 12:33:00PM -0800, Lance Roy wrote:
> On Fri, 18 Nov 2016 06:08:45 -0800
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> > However, let's first take a look at the overflow issue.
> > 
> > If a given program could have ULONG_MAX or more readers at any given
> > time, there would of course be overflow.  However, each read must have
> > an srcu_read_lock() outstanding, and the resulting four-byte return
> > value must be stored somewhere.  Because the full address space is at
> > most ULONG_MAX, the maximum number of outstanding readers is at most
> > ULONG_MAX/4, even in the degenerate case where a single CPU/task invokes
> > srcu_read_lock() in a tight loop.  And even this assumes that the entire
> > address space can somehow be devoted to srcu_read_lock() return values.
> > ULONG_MAX/4 is therefore a hard upper bound on the number of outstanding
> > SRCU readers.
> I agree that there should be at most ULONG_MAX/4 active readers at a time, as
> long as the compiler doesn't do something crazy like noticing that
> srcu_read_lock() always returns 0 or 1 and then packing the return values into
> smaller variables.
> 
> > Now srcu_readers_active_idx_check() checks for strict equality between
> > the number of locks and unlocks, so we can in theory tolerate ULONG_MAX-1
> > readers.  So, the question is whether ULONG_MAX/4 readers can result
> > in the updater seeing ULONG_MAX reads, due to memory misordering and
> > other issues.
> > 
> > Because there are no memory barriers surrounding srcu_flip(), the updater
> > could miss an extremely large number of srcu_read_unlock()s.  However,
> > each missed srcu_read_unlock() must have a corresponding srcu_read_lock(),
> > and there is a full memory barrier between the srcu_flip() and
> > the read of the lock count.  There is also a full barrier between any
> > srcu_read_lock()'s increment of the lock count and that CPU's/task's next
> > srcu_read_lock()'s fetch of the index.  Therefore, if the updater misses
> > counting a given srcu_read_lock(), that CPU's/task's next srcu_read_lock()
> > must see the new value of the index.  Because srcu_read_lock() disables
> > preemption across the index fetch and the lock increment, there can be at
> > most NR_CPUS-1 srcu_read_lock() calls that missed the recent srcu_flip()'s
> > update to the index.  (I said NR_CPUS earlier, but Mathieu is correct
> > in pointing out that srcu_flip() has to run somewhere.)
> The trouble is that disabling preemption is not enough to ensure that there
> is at most one srcu_read_lock() call per CPU that missed the srcu_flip().
> 
> Define a reader to be an SRCU lock+unlock pair. A reader is called active if it
> has incremented ->lock_count[] but hasn't incremented ->unlock_count[] yet, and
> completed if it has incremented ->unlock_count[]. I think that we only want to
> limit the number of active readers and the number of CPUs. In particular, I
> don't think there should be a limit on the rate of completion of read side
> critical sections.
> 
> The goal of srcu_readers_active_idx_check() is to verify that there were zero
> active readers on the inactive index at some time during its execution. To do
> this, it totals the unlock counts, executes a memory barrier, totals the lock
> counts, and takes the difference. This difference counts the readers that are
> active when srcu_readers_lock_idx() gets to their CPU, plus the readers that
> completed after srcu_readers_unlock_idx() and before srcu_readers_lock_idx().
> If the true (infinite precision) value of the difference is zero, then there
> were no active readers at some point while srcu_readers_lock_idx() is running.
> However, the difference is only stored in a long, so there is a potential for
> overflow if too many readers complete during srcu_readers_active_idx_check().
> 
> For example, let's say there are three threads, each running on their own CPU:
> 
> int data, flag;
> struct srcu_struct *sp = /* ... */;
> 
> Thread 0:
> 	data = 1;
> 	synchronize_srcu(sp);
> 	flag = 1;
> 
> Thread 1:
> 	int data_r, flag_r;
> 	int idx = srcu_read_lock(sp);
> 	data_r = data;
> 	flag_r = flag;
> 	srcu_read_unlock(sp, idx);
> 	BUG_ON(flag_r == 1 && data_r == 0);
> 
> Thread 2:
> 	while (true) {
> 		int idx = srcu_read_lock(sp);
> 		srcu_read_unlock(sp, idx);
> 	}
> 
> Let's take the following execution order. Thread 1 increments
> the CPU 1 version of sp->lock_count[0], sets idx to zero, and loads data (0)
> into data_r. Thread 0 then sets data to be 1, verifies that there are no
> readers on index 1, and increments sp->completed, but the CPU actually doesn't
> perform the last operation, putting it off until the next memory barrier. Thread
> 0 then calls srcu_readers_active_idx_check() on index 0, which runs
> srcu_readers_unlock_idx() (returning 0). Right after srcu_readers_unlock_idx()
> completes, thread 2 starts running. Since Thread 0 hasn't actually incremented
> sp->completed in any way that is visible to thread 2, srcu_read_lock() will
> still return 0. Thread 2 can then run for ULONG_MAX iterations, setting
> the CPU 2 version of sp->unlock_count[0] to ULONG_MAX. CPU 0 then finally gets
> around to incrementing sp->completed, runs its memory barrier, and then reads
> the lock counts: 1, 0, ULONG_MAX. The total of ULONG_MAX+1 will overflow to 0
> and compare equal with the earlier unlock count. Thread 0 will then think that the
> grace period is over and set flag to one. Thread 1 can then read flag (1) into
> flag_r and run srcu_read_unlock(). The BUG_ON statement will then fail.
> 
> Although ULONG_MAX readers completed during srcu_readers_active_idx_check(),
> there were at most 2 active readers at any time, so this program doesn't run
> into any limit.
> 
> I hope that was clear enough.

Indeed it is!

So adding a full memory barrier immediately after the srcu_flip() should
prevent this, because if the updater failed to see an unlock increment,
the second following lock for that CPU/task would be guaranteed to see
the flip.  Or am I still missing something?
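
In other words, something like this (just a sketch, and assuming srcu_flip()
is still nothing more than the ->completed increment):

static void srcu_flip(struct srcu_struct *sp)
{
	sp->completed++;

	/*
	 * Make sure that if this updater missed a reader's unlock, that
	 * reader's next srcu_read_lock() sees the new value of ->completed.
	 */
	smp_mb();
}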

Is there a sequence of events that requires a full memory barrier
before the srcu_flip()?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] SRCU: More efficient reader counts.
  2016-11-18 23:13     ` Paul E. McKenney
@ 2016-11-19  0:54       ` Lance Roy
  2016-11-19 10:42         ` Paul E. McKenney
  0 siblings, 1 reply; 8+ messages in thread
From: Lance Roy @ 2016-11-19  0:54 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, dvhart, fweisbec, oleg, bobby.prani

On Fri, 18 Nov 2016 15:13:45 -0800
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> On Fri, Nov 18, 2016 at 12:33:00PM -0800, Lance Roy wrote:  
> > The trouble is that disabling preemption is not enough to ensure that there
> > is at most one srcu_read_lock() call per CPU that missed the srcu_flip().
> > 
> > Define a reader to be an SRCU lock+unlock pair. A reader is called active
> > if it has incremented ->lock_count[] but hasn't incremented  
> > ->unlock_count[] yet, and completed if it has incremented
> > ->unlock_count[]. I think that we only want to limit the number of active
> > readers and the number of CPUs. In particular, I don't think there should
> > be a limit on the rate of completion of read side critical sections.
> > 
> > The goal of srcu_readers_active_idx_check() is to verify that there were
> > zero active readers on the inactive index at some time during its
> > execution. To do this, it totals the unlock counts, executes a memory
> > barrier, totals the lock counts, and takes the difference. This difference
> > counts the readers that are active when srcu_readers_lock_idx() gets to
> > their CPU, plus the readers that completed after srcu_readers_unlock_idx()
> > and before srcu_readers_lock_idx(). If the true (infinite precision) value
> > of the difference is zero, then there were no active readers at some point
> > while srcu_readers_lock_idx() is running. However, the difference is only
> > stored in a long, so there is a potential for overflow if too many readers
> > complete during srcu_readers_active_idx_check().
> > 
> > For example, let's say there are three threads, each running on their own
> > CPU:
> > 
> > int data, flag;
> > struct srcu_struct *sp = /* ... */;
> > 
> > Thread 0:
> > 	data = 1;
> > 	synchronize_srcu(sp);
> > 	flag = 1;
> > 
> > Thread 1:
> > 	int data_r, flag_r;
> > 	int idx = srcu_read_lock(sp);
> > 	data_r = data;
> > 	flag_r = flag;
> > 	srcu_read_unlock(sp, idx);
> > 	BUG_ON(flag_r == 1 && data_r == 0);
> > 
> > Thread 2:
> > 	while (true) {
> > 		int idx = srcu_read_lock(sp);
> > 		srcu_read_unlock(sp, idx);
> > 	}
> > 
> > Let's take the following execution order. Thread 1 increments
> > the CPU 1 version of sp->lock_count[0], sets idx to zero, and loads data (0)
> > into data_r. Thread 0 then sets data to be 1, verifies that there are no
> > readers on index 1, and increments sp->completed, but the CPU actually
> > doesn't perform the last operation, putting it off until the next memory
> > barrier. Thread 0 then calls srcu_readers_active_idx_check() on index 0,
> > which runs srcu_readers_unlock_idx() (returning 0). Right after
> > srcu_readers_unlock_idx() completes, thread 2 starts running. Since Thread
> > 0 hasn't actually incremented sp->completed in any way that is visible to
> > thread 2, srcu_read_lock() will still return 0. Thread 2 can then run for
> > ULONG_MAX iterations, setting the CPU 2 version of sp->unlock_count[0] to
> > ULONG_MAX. CPU 0 then finally gets around to incrementing sp->completed,
> > runs its memory barrier, and then reads the lock counts: 1, 0, ULONG_MAX.
> > The total of ULONG_MAX+1 will overflow to 0 and compare equal with the earlier
> > unlock count. Thread 0 will then think that the grace period is over and
> > set flag to one. Thread 1 can then read flag (1) into flag_r and run
> > srcu_read_unlock(). The BUG_ON statement will then fail.
> > 
> > Although ULONG_MAX readers completed during srcu_readers_active_idx_check(),
> > there were at most 2 active readers at any time, so this program doesn't run
> > into any limit.
> > 
> > I hope that was clear enough.    
> 
> Indeed it is!
> 
> So adding a full memory barrier immediately after the srcu_flip() should
> prevent this, because if the updater failed to see an unlock increment,
> the second following lock for that CPU/task would be guaranteed to see
> the flip.  Or am I still missing something?  
Yes, adding a full memory barrier after srcu_flip() prevents this problem.

> Is there a sequence of events that requires a full memory barrier
> before the srcu_flip()?  
I am now unsure if a memory barrier before srcu_flip() is necessary. I thought
that it would be needed to prevent the CPU from performing the increment early,
but I have just noticed that srcu_advance_batches() will return early if the
first try_check_zero() fails, creating a control dependency. I think that this
control dependency should be enough to prevent the problem from occurring.
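
That is, the relevant part of the update-side flow would look roughly like
this (the shape of srcu_advance_batches() below is paraphrased, not exact):

	if (!try_check_zero(sp, idx, trycount))
		return;	/* loads of the inactive-index counters */

	/*
	 * Control dependency: srcu_flip() runs only if the scan above
	 * succeeded, so the ->completed increment cannot be committed
	 * before those loads.
	 */
	srcu_flip(sp);
	smp_mb();	/* the barrier proposed above */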

One interesting thing about this version is that there is only an address
dependency between the load of ->completed and the increment of ->lock_count[].
This means that without an smp_read_barrier_depends() between the two, a reader
could use the new value of ->completed to increment ->lock_count[], before
actually reading ->completed. (I was surprised that this ordering doesn't come
for free, as it seems like violating it would require speculative writes.) It
doesn't matter much if the dependency barrier is actually there, because
leaving it out just means at most 1 more spurious increment per CPU.
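
Concretely, the placement I have in mind is something like this (a sketch of
the patched __srcu_read_lock(), not a change I am proposing as-is):

int __srcu_read_lock(struct srcu_struct *sp)
{
	int idx;

	idx = READ_ONCE(sp->completed) & 0x1;
	smp_read_barrier_depends(); /* Order the ->completed load before the
				     * dependent ->lock_count[] increment. */
	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
	smp_mb(); /* B */  /* Avoid leaking the critical section. */
	return idx;
}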

I think that with the added synchronization in srcu_flip() there would be at
most 3 * NR_CPUS readers that start and complete inside one
srcu_readers_active_idx_check(). With the read side dependency barrier this
would drop to 2 * NR_CPUS.

Thanks,
Lance

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] SRCU: More efficient reader counts.
  2016-11-19  0:54       ` Lance Roy
@ 2016-11-19 10:42         ` Paul E. McKenney
  0 siblings, 0 replies; 8+ messages in thread
From: Paul E. McKenney @ 2016-11-19 10:42 UTC (permalink / raw)
  To: Lance Roy
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, dvhart, fweisbec, oleg, bobby.prani

On Fri, Nov 18, 2016 at 04:54:42PM -0800, Lance Roy wrote:
> On Fri, 18 Nov 2016 15:13:45 -0800
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> > On Fri, Nov 18, 2016 at 12:33:00PM -0800, Lance Roy wrote:  
> > > The trouble is that disabling preemption is not enough to ensure that there
> > > is at most one srcu_read_lock() call per CPU that missed the srcu_flip().
> > > 
> > > Define a reader to be an SRCU lock+unlock pair. A reader is called active
> > > if it has incremented ->lock_count[] but hasn't incremented  
> > > ->unlock_count[] yet, and completed if it has incremented
> > > ->unlock_count[]. I think that we only want to limit the number of active
> > > readers and the number of CPUs. In particular, I don't think there should
> > > be a limit on the rate of completion of read side critical sections.
> > > 
> > > The goal of srcu_readers_active_idx_check() is to verify that there were
> > > zero active readers on the inactive index at some time during its
> > > execution. To do this, it totals the unlock counts, executes a memory
> > > barrier, totals the lock counts, and takes the difference. This difference
> > > counts the readers that are active when srcu_readers_lock_idx() gets to
> > > their CPU, plus the readers that completed after srcu_readers_unlock_idx()
> > > and before srcu_readers_lock_idx(). If the true (infinite precision) value
> > > of the difference is zero, then there were no active readers at some point
> > > while srcu_readers_lock_idx() is running. However, the difference is only
> > > stored in a long, so there is a potential for overflow if too many readers
> > > complete during srcu_readers_active_idx_check().
> > > 
> > > For example, let's say there are three threads, each running on their own
> > > CPU:
> > > 
> > > int data, flag;
> > > struct srcu_struct *sp = /* ... */;
> > > 
> > > Thread 0:
> > > 	data = 1;
> > > 	synchronize_srcu(sp);
> > > 	flag = 1;
> > > 
> > > Thread 1:
> > > 	int data_r, flag_r;
> > > 	int idx = srcu_read_lock(sp);
> > > 	data_r = data;
> > > 	flag_r = flag;
> > > 	srcu_read_unlock(sp, idx);
> > > 	BUG_ON(flag_r == 1 && data_r == 0);
> > > 
> > > Thread 2:
> > > 	while (true) {
> > > 		int idx = srcu_read_lock(sp);
> > > 		srcu_read_unlock(sp, idx);
> > > 	}
> > > 
> > > Let's take the following execution order. Thread 1 increments
> > > the CPU 1 version of sp->lock_count[0], sets idx to zero, and loads data (0)
> > > into data_r. Thread 0 then sets data to be 1, verifies that there are no
> > > readers on index 1, and increments sp->completed, but the CPU actually
> > > doesn't perform the last operation, putting it off until the next memory
> > > barrier. Thread 0 then calls srcu_readers_active_idx_check() on index 0,
> > > which runs srcu_readers_unlock_idx() (returning 0). Right after
> > > srcu_readers_unlock_idx() completes, thread 2 starts running. Since Thread
> > > 0 hasn't actually incremented sp->completed in any way that is visible to
> > > thread 2, srcu_read_lock() will still return 0. Thread 2 can then run for
> > > ULONG_MAX iterations, setting the CPU 2 version of sp->unlock_count[0] to
> > > ULONG_MAX. CPU 0 then finally gets around to incrementing sp->completed,
> > > runs its memory barrier, and then reads the lock counts: 1, 0, ULONG_MAX.
> > > The total of ULONG_MAX+1 will overflow to 0 and compare equal with the earlier
> > > unlock count. Thread 0 will then think that the grace period is over and
> > > set flag to one. Thread 1 can then read flag (1) into flag_r and run
> > > srcu_read_unlock(). The BUG_ON statement will then fail.
> > > 
> > > Although ULONG_MAX readers completed during srcu_readers_active_idx_check(),
> > > there were at most 2 active readers at any time, so this program doesn't run
> > > into any limit.
> > > 
> > > I hope that was clear enough.    
> > 
> > Indeed it is!
> > 
> > So adding a full memory barrier immediately after the srcu_flip() should
> > prevent this, because if the updater failed to see an unlock increment,
> > the second following lock for that CPU/task would be guaranteed to see
> > the flip.  Or am I still missing something?  
> Yes, adding a full memory barrier after srcu_flip() prevents this problem.
> 
> > Is there a sequence of events that requires a full memory barrier
> > before the srcu_flip()?  
> 
> I am now unsure if a memory barrier before srcu_flip() is necessary. I thought
> that it would be needed to prevent the CPU from performing the increment early,
> but I have just noticed that srcu_advance_batches() will return early if the
> first try_check_zero() fails, creating a control dependency. I think that this
> control dependency should be enough to prevent the problem from occurring.

It might well be.  What would be a good way to prove it one way or the
other?  (If there is much doubt, adding a memory barrier to this slowpath
is not the worst thing in the world.)

> One interesting thing about this version is that there is only an address
> dependency between the load of ->completed and the increment of ->lock_count[].
> This means that without an smp_read_barrier_depends() between the two, a reader
> could use the new value of ->completed to increment ->lock_count[], before
> actually reading ->completed. (I was surprised that this ordering doesn't come
> for free, as it seems like violating it would require speculative writes.) It
> doesn't matter much if the dependency barrier is actually there, because
> leaving it out just means at most 1 more spurious increment per CPU.

There is an address dependency, but if I follow you correctly, this address
dependency is carried through an array index, that is to say through an
integer.  Carrying a dependency through an integer is very dangerous because
the compiler has all sorts of optimizations that can break integer
dependency chains.  So we should not rely on this.  If this ordering
is critically important, the loads should be converted to acquire
loads or whatever is best to preserve this ordering.
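
For example, something along these lines (again only a sketch of the patched
__srcu_read_lock(), with the plain READ_ONCE() replaced by an acquire load):

int __srcu_read_lock(struct srcu_struct *sp)
{
	int idx;

	idx = smp_load_acquire(&sp->completed) & 0x1; /* Orders the dependent
						       * ->lock_count[] increment. */
	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
	smp_mb(); /* B */  /* Avoid leaking the critical section. */
	return idx;
}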

> I think that with the added synchronization in srcu_flip() there would be at
> most 3 * NR_CPUS readers that start and complete inside one
> srcu_readers_active_idx_check(). With the read side dependency barrier this
> would drop to 2 * NR_CPUS.

I have a hard time getting excited about the difference between
2 * NR_CPUS and 3 * NR_CPUS.  ;-)

So it sounds like it would be good to add the memory barrier after
the srcu_flip().  This would be fine as a separate patch.  Would
you like to put this patch together, or would you prefer that I
do so?

Either way, it looks like the ordering comments need further upgrades.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-11-19 10:42 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-17 22:03 [RFC PATCH] SRCU: More efficient reader counts Lance Roy
2016-11-18 14:08 ` Paul E. McKenney
2016-11-18 14:27   ` Mathieu Desnoyers
2016-11-18 14:47     ` Paul E. McKenney
2016-11-18 20:33   ` Lance Roy
2016-11-18 23:13     ` Paul E. McKenney
2016-11-19  0:54       ` Lance Roy
2016-11-19 10:42         ` Paul E. McKenney
