From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: Boqun Feng <boqun.feng@gmail.com>
Cc: linux-kernel@vger.kernel.org, mingo@kernel.org,
	jiangshanlai@gmail.com, dipankar@in.ibm.com,
	akpm@linux-foundation.org, mathieu.desnoyers@efficios.com,
	josh@joshtriplett.org, tglx@linutronix.de, peterz@infradead.org,
	rostedt@goodmis.org, dhowells@redhat.com, edumazet@google.com,
	dvhart@linux.intel.com, fweisbec@gmail.com, oleg@redhat.com,
	bobby.prani@gmail.com, ldr709@gmail.com
Subject: Re: [PATCH RFC tip/core/rcu] SRCU rewrite
Date: Tue, 15 Nov 2016 06:37:00 -0800
Message-ID: <20161115143700.GZ4127@linux.vnet.ibm.com>
In-Reply-To: <20161115014445.GC12110@tardis.cn.ibm.com>

On Tue, Nov 15, 2016 at 09:44:45AM +0800, Boqun Feng wrote:
> On Mon, Nov 14, 2016 at 10:36:36AM -0800, Paul E. McKenney wrote:
> > SRCU uses two per-cpu counters: a nesting counter to count the number of
> > active critical sections, and a sequence counter to ensure that the nesting
> > counters don't change while they are being added together in
> > srcu_readers_active_idx_check().
> > 
> > This patch instead uses per-cpu lock and unlock counters. Because both
> > counters only increase and srcu_readers_active_idx_check() reads the unlock
> > counter before the lock counter, this achieves the same end without having
> > to increment two different counters in srcu_read_lock(). This also saves a
> > smp_mb() in srcu_readers_active_idx_check().
> > 
> > A possible problem with this patch is that it can only handle
> > ULONG_MAX - NR_CPUS simultaneous readers, whereas the old version could
> > handle up to ULONG_MAX.
> > 
> > Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > Signed-off-by: Lance Roy <ldr709@gmail.com>
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > [ paulmck: Queued for 4.12, that is, merge window after this coming one. ]
> > 
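For reference, the core of the new check can be sketched as a minimal
user-space analogue in C (the names, NR_FAKE_CPUS, and the use of GCC
atomic builtins are illustrative assumptions here, not the kernel code
in the patch below):

    #include <stdbool.h>

    #define NR_FAKE_CPUS 4

    struct fake_srcu_array {
            unsigned long lock_count[2];
            unsigned long unlock_count[2];
    };

    static struct fake_srcu_array fake_ref[NR_FAKE_CPUS];

    /* Sum the unlocks first, then the locks, with a full fence between. */
    static bool fake_readers_gone(int idx)
    {
            unsigned long locks = 0, unlocks = 0;
            int cpu;

            for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
                    unlocks += __atomic_load_n(&fake_ref[cpu].unlock_count[idx],
                                               __ATOMIC_RELAXED);

            /* Plays the role of smp_mb() A in the patch below. */
            __atomic_thread_fence(__ATOMIC_SEQ_CST);

            for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
                    locks += __atomic_load_n(&fake_ref[cpu].lock_count[idx],
                                             __ATOMIC_RELAXED);

            /*
             * Because the unlocks were read first, locks == unlocks
             * implies every counted lock was matched by a counted
             * unlock, so no pre-existing reader remains on this index.
             */
            return locks == unlocks;
    }
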
> > diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> > index dc8eb63c6568..0caea34d8c5f 100644
> > --- a/include/linux/srcu.h
> > +++ b/include/linux/srcu.h
> > @@ -34,8 +34,8 @@
> >  #include <linux/workqueue.h>
> >  
> >  struct srcu_struct_array {
> > -	unsigned long c[2];
> > -	unsigned long seq[2];
> > +	unsigned long lock_count[2];
> > +	unsigned long unlock_count[2];
> >  };
> >  
> >  struct rcu_batch {
> > diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> > index 87c51225ceec..6e4fd7680c70 100644
> > --- a/kernel/rcu/rcutorture.c
> > +++ b/kernel/rcu/rcutorture.c
> > @@ -564,10 +564,24 @@ static void srcu_torture_stats(void)
> >  	pr_alert("%s%s per-CPU(idx=%d):",
> >  		 torture_type, TORTURE_FLAG, idx);
> >  	for_each_possible_cpu(cpu) {
> > +		unsigned long l0, l1;
> > +		unsigned long u0, u1;
> >  		long c0, c1;
> > +		struct srcu_struct_array* counts =
> > +			per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
> >  
> > -		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
> > -		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
> > +		u0 = counts->unlock_count[!idx];
> > +		u1 = counts->unlock_count[idx];
> > +
> > +		/* Make sure that a lock is always counted if the corresponding
> > +		   unlock is counted. */
> > +		smp_rmb();
> > +
> > +		l0 = counts->lock_count[!idx];
> > +		l1 = counts->lock_count[idx];
> > +
> > +		c0 = (long)(l0 - u0);
> > +		c1 = (long)(l1 - u1);
> >  		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
> >  	}
> >  	pr_cont("\n");
> > diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
> > index 9b9cdd549caa..edfdfadec821 100644
> > --- a/kernel/rcu/srcu.c
> > +++ b/kernel/rcu/srcu.c
> > @@ -141,34 +141,38 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
> >  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
> >  
> >  /*
> > - * Returns approximate total of the readers' ->seq[] values for the
> > + * Returns approximate total of the readers' ->lock_count[] values for the
> >   * rank of per-CPU counters specified by idx.
> >   */
> > -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
> > +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
> >  {
> >  	int cpu;
> >  	unsigned long sum = 0;
> >  	unsigned long t;
> >  
> >  	for_each_possible_cpu(cpu) {
> > -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
> > +		struct srcu_struct_array* cpu_counts =
> > +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> > +		t = READ_ONCE(cpu_counts->lock_count[idx]);
> >  		sum += t;
> >  	}
> >  	return sum;
> >  }
> >  
> >  /*
> > - * Returns approximate number of readers active on the specified rank
> > - * of the per-CPU ->c[] counters.
> > + * Returns approximate total of the readers' ->unlock_count[] values for the
> > + * rank of per-CPU counters specified by idx.
> >   */
> > -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> > +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
> >  {
> >  	int cpu;
> >  	unsigned long sum = 0;
> >  	unsigned long t;
> >  
> >  	for_each_possible_cpu(cpu) {
> > -		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
> > +		struct srcu_struct_array* cpu_counts =
> > +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> > +		t = READ_ONCE(cpu_counts->unlock_count[idx]);
> >  		sum += t;
> >  	}
> >  	return sum;
> > @@ -176,79 +180,43 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> >  
> >  /*
> >   * Return true if the number of pre-existing readers is determined to
> > - * be stably zero.  An example unstable zero can occur if the call
> > - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
> > - * but due to task migration, sees the corresponding __srcu_read_unlock()
> > - * decrement.  This can happen because srcu_readers_active_idx() takes
> > - * time to sum the array, and might in fact be interrupted or preempted
> > - * partway through the summation.
> > + * be zero.
> >   */
> >  static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
> >  {
> > -	unsigned long seq;
> > +	unsigned long unlocks;
> >  
> > -	seq = srcu_readers_seq_idx(sp, idx);
> > +	unlocks = srcu_readers_unlock_idx(sp, idx);
> >  
> >  	/*
> > -	 * The following smp_mb() A pairs with the smp_mb() B located in
> > -	 * __srcu_read_lock().  This pairing ensures that if an
> > -	 * __srcu_read_lock() increments its counter after the summation
> > -	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
> > -	 * critical section will see any changes made prior to the start
> > -	 * of the current SRCU grace period.
> > +	 * Make sure that a lock is always counted if the corresponding unlock
> > +	 * is counted. Needs to be a smp_mb() as the read side may contain a
> > +	 * read from a variable that is written to before the synchronize_srcu()
> > +	 * in the write side. In this case smp_mb()s A and B act like the store
> > +	 * buffering pattern.
> >  	 *
> > -	 * Also, if the above call to srcu_readers_seq_idx() saw the
> > -	 * increment of ->seq[], then the call to srcu_readers_active_idx()
> > -	 * must see the increment of ->c[].
> > +	 * This smp_mb() also pairs with smp_mb() C to prevent writes after the
> > +	 * synchronize_srcu() from being executed before the grace period ends.
> >  	 */
> >  	smp_mb(); /* A */
> >  
> >  	/*
> > -	 * Note that srcu_readers_active_idx() can incorrectly return
> > -	 * zero even though there is a pre-existing reader throughout.
> > -	 * To see this, suppose that task A is in a very long SRCU
> > -	 * read-side critical section that started on CPU 0, and that
> > -	 * no other reader exists, so that the sum of the counters
> > -	 * is equal to one.  Then suppose that task B starts executing
> > -	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
> > -	 * task C starts reading on CPU 0, so that its increment is not
> > -	 * summed, but finishes reading on CPU 2, so that its decrement
> > -	 * -is- summed.  Then when task B completes its sum, it will
> > -	 * incorrectly get zero, despite the fact that task A has been
> > -	 * in its SRCU read-side critical section the whole time.
> > -	 *
> > -	 * We therefore do a validation step should srcu_readers_active_idx()
> > -	 * return zero.
> > -	 */
> > -	if (srcu_readers_active_idx(sp, idx) != 0)
> > -		return false;
> > -
> > -	/*
> > -	 * The remainder of this function is the validation step.
> > -	 * The following smp_mb() D pairs with the smp_mb() C in
> > -	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
> > -	 * by srcu_readers_active_idx() above, then any destructive
> > -	 * operation performed after the grace period will happen after
> > -	 * the corresponding SRCU read-side critical section.
> > +	 * If the locks are the same as the unlocks, then there must have
> > +	 * been no readers on this index at some time in between. This does not
> > +	 * mean that there are no more readers, as one could have read the
> > +	 * current index but not yet have incremented the lock counter.
> >  	 *
> > -	 * Note that there can be at most NR_CPUS worth of readers using
> > -	 * the old index, which is not enough to overflow even a 32-bit
> > -	 * integer.  (Yes, this does mean that systems having more than
> > -	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
> > -	 * the sum of the ->seq[] counters cannot possibly overflow.
> > -	 * Therefore, the only way that the return values of the two
> > -	 * calls to srcu_readers_seq_idx() can be equal is if there were
> > -	 * no increments of the corresponding rank of ->seq[] counts
> > -	 * in the interim.  But the missed-increment scenario laid out
> > -	 * above includes an increment of the ->seq[] counter by
> > -	 * the corresponding __srcu_read_lock().  Therefore, if this
> > -	 * scenario occurs, the return values from the two calls to
> > -	 * srcu_readers_seq_idx() will differ, and thus the validation
> > -	 * step below suffices.
> > +	 * Note that there can be at most NR_CPUS worth of readers using the old
> > +	 * index that haven't incremented ->lock_count[] yet.  Therefore, the
> > +	 * sum of the ->lock_count[]s cannot increment enough times to overflow
> > +	 * and end up equal to the sum of the ->unlock_count[]s, as long as there
> > +	 * are at most ULONG_MAX - NR_CPUS readers at a time.  (Yes, this does
> > +	 * mean that systems having more than a billion or so CPUs need to be
> > +	 * 64-bit systems.)  Therefore, the only way that the return values of
> > +	 * the two calls to srcu_readers_(un)lock_idx() can be equal is if there
> > +	 * are no active readers using this index.
> >  	 */
> > -	smp_mb(); /* D */
> > -
> > -	return srcu_readers_seq_idx(sp, idx) == seq;
> > +	return srcu_readers_lock_idx(sp, idx) == unlocks;
> >  }
> >  
> >  /**
> > @@ -266,8 +234,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
> >  	unsigned long sum = 0;
> >  
> >  	for_each_possible_cpu(cpu) {
> > -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
> > -		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
> > +		struct srcu_struct_array* cpu_counts =
> > +			per_cpu_ptr(sp->per_cpu_ref, cpu);
> > +		sum += READ_ONCE(cpu_counts->lock_count[0]);
> > +		sum += READ_ONCE(cpu_counts->lock_count[1]);
> > +		sum -= READ_ONCE(cpu_counts->unlock_count[0]);
> > +		sum -= READ_ONCE(cpu_counts->unlock_count[1]);
> >  	}
> >  	return sum;
> >  }
> > @@ -298,9 +270,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
> >  	int idx;
> >  
> >  	idx = READ_ONCE(sp->completed) & 0x1;
> > -	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
> > +	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
> >  	smp_mb(); /* B */  /* Avoid leaking the critical section. */
> > -	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
> >  	return idx;
> 
> __srcu_read_lock() used to be called with preemption disabled. I guess
> the reason was that we had two percpu variables to increase. So with
> only one percpu variable now, could we remove the preempt_{dis,en}able() in
> srcu_read_lock() and use this_cpu_inc() here?

Quite possibly...

							Thanx, Paul
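
A sketch of the variant Boqun is suggesting (hypothetical code; this
thread does not settle whether it was adopted):

    /*
     * Hypothetical reader with the preempt_disable()/enable() pair in
     * srcu_read_lock() dropped: this_cpu_inc() is safe against
     * preemption and interrupts by itself, so a single increment no
     * longer needs the caller to pin the task to one CPU.
     */
    static inline int srcu_read_lock_sketch(struct srcu_struct *sp)
    {
            int idx;

            idx = READ_ONCE(sp->completed) & 0x1;
            this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
            smp_mb(); /* B */  /* Avoid leaking the critical section. */
            return idx;
    }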

> Regards,
> Boqun
> 
> >  }
> >  EXPORT_SYMBOL_GPL(__srcu_read_lock);
> > @@ -314,7 +285,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
> >  void __srcu_read_unlock(struct srcu_struct *sp, int idx)
> >  {
> >  	smp_mb(); /* C */  /* Avoid leaking the critical section. */
> > -	this_cpu_dec(sp->per_cpu_ref->c[idx]);
> > +	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
> >  }
> >  EXPORT_SYMBOL_GPL(__srcu_read_unlock);
> >  
> > @@ -349,7 +320,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
> >  
> >  /*
> >   * Increment the ->completed counter so that future SRCU readers will
> > - * use the other rank of the ->c[] and ->seq[] arrays.  This allows
> > + * use the other rank of the ->(un)lock_count[] arrays.  This allows
> >   * us to wait for pre-existing readers in a starvation-free manner.
> >   */
> >  static void srcu_flip(struct srcu_struct *sp)
> > 

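As an aside on the store-buffering remark in the comment above
smp_mb() A: the pairing of barriers A and B is the classic SB litmus
shape, sketched here with illustrative variable names:

    int x;                    /* datum stored before synchronize_srcu() */
    unsigned long lock_count; /* stand-in for the summed ->lock_count[] */

    void reader(int *r1)
    {
            WRITE_ONCE(lock_count, lock_count + 1); /* __srcu_read_lock() */
            smp_mb(); /* B */
            *r1 = READ_ONCE(x);     /* read inside the critical section */
    }

    void updater(unsigned long *r2)
    {
            WRITE_ONCE(x, 1);       /* store preceding synchronize_srcu() */
            smp_mb(); /* A */
            *r2 = READ_ONCE(lock_count); /* srcu_readers_lock_idx() */
    }

    /*
     * The outcome *r1 == 0 && *r2 == 0 is forbidden: with both full
     * barriers, either the updater's sum sees the reader's increment,
     * or the reader sees the updater's store.
     */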