From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Mon, 14 Nov 2016 10:36:36 -0800
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: mingo@kernel.org, jiangshanlai@gmail.com, dipankar@in.ibm.com,
	akpm@linux-foundation.org, mathieu.desnoyers@efficios.com,
	josh@joshtriplett.org, tglx@linutronix.de, peterz@infradead.org,
	rostedt@goodmis.org, dhowells@redhat.com, edumazet@google.com,
	dvhart@linux.intel.com, fweisbec@gmail.com, oleg@redhat.com,
	bobby.prani@gmail.com, ldr709@gmail.com
Subject: [PATCH RFC tip/core/rcu] SRCU rewrite
Reply-To: paulmck@linux.vnet.ibm.com
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.5.21 (2010-09-15)
Message-Id: <20161114183636.GA28589@linux.vnet.ibm.com>
X-Mailing-List: linux-kernel@vger.kernel.org

SRCU uses two per-cpu counters: a nesting counter to count the number
of active critical sections, and a sequence counter to ensure that the
nesting counters don't change while they are being added together in
srcu_readers_active_idx_check().

This patch instead uses per-cpu lock and unlock counters.  Because both
counters only increase and srcu_readers_active_idx_check() reads the
unlock counter before the lock counter, this achieves the same end
without having to increment two different counters in srcu_read_lock().
This also saves a smp_mb() in srcu_readers_active_idx_check().

A possible problem with this patch is that it can only handle
ULONG_MAX - NR_CPUS simultaneous readers, whereas the old version could
handle up to ULONG_MAX.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Lance Roy <ldr709@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Queued for 4.12, that is, merge window after this coming one. ]
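As a rough aid to review, the counting scheme can be modeled in ordinary
user-space C.  The sketch below is not part of the patch, and NR_FAKE_CPUS,
reader(), and readers_gone() are made-up names; it only illustrates the order
of operations: a reader bumps a lock counter before its critical section and
an unlock counter after it, while the grace-period side sums the unlock
counters first, executes a full barrier, and then sums the lock counters,
declaring the index quiescent when the two sums match.

/* Toy user-space model of the lock/unlock counting scheme (not kernel code). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_FAKE_CPUS 4

static atomic_ulong lock_count[NR_FAKE_CPUS];
static atomic_ulong unlock_count[NR_FAKE_CPUS];

/* Reader on "cpu": count the lock, run the critical section, count the unlock. */
static void reader(int cpu)
{
	atomic_fetch_add_explicit(&lock_count[cpu], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* plays the role of smp_mb() B */
	/* ... read-side critical section ... */
	atomic_thread_fence(memory_order_seq_cst);	/* plays the role of smp_mb() C */
	atomic_fetch_add_explicit(&unlock_count[cpu], 1, memory_order_relaxed);
}

/* Grace-period side: sum unlocks first, then locks.  Equality means every
 * reader counted in lock_count[] has also been counted in unlock_count[],
 * so no pre-existing reader remains. */
static bool readers_gone(void)
{
	unsigned long unlocks = 0, locks = 0;
	int cpu;

	for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		unlocks += atomic_load_explicit(&unlock_count[cpu], memory_order_relaxed);

	atomic_thread_fence(memory_order_seq_cst);	/* plays the role of smp_mb() A */

	for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		locks += atomic_load_explicit(&lock_count[cpu], memory_order_relaxed);

	return locks == unlocks;
}

int main(void)
{
	reader(0);
	reader(2);
	printf("readers gone: %d\n", readers_gone());	/* prints 1: both readers finished */
	return 0;
}

(The real code keeps two counter pairs, indexed by ->completed & 0x1, so that
new readers can be steered to the other index while the old one drains; the
sketch collapses that to a single index for brevity.)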
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index dc8eb63c6568..0caea34d8c5f 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -34,8 +34,8 @@
 #include <linux/workqueue.h>
 
 struct srcu_struct_array {
-	unsigned long c[2];
-	unsigned long seq[2];
+	unsigned long lock_count[2];
+	unsigned long unlock_count[2];
 };
 
 struct rcu_batch {
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 87c51225ceec..6e4fd7680c70 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -564,10 +564,24 @@ static void srcu_torture_stats(void)
 	pr_alert("%s%s per-CPU(idx=%d):",
 		 torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
+		unsigned long l0, l1;
+		unsigned long u0, u1;
 		long c0, c1;
+		struct srcu_struct_array* counts =
+			per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
 
-		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
-		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
+		u0 = counts->unlock_count[!idx];
+		u1 = counts->unlock_count[idx];
+
+		/* Make sure that a lock is always counted if the corresponding
+		   unlock is counted. */
+		smp_rmb();
+
+		l0 = counts->lock_count[!idx];
+		l1 = counts->lock_count[idx];
+
+		c0 = (long)(l0 - u0);
+		c1 = (long)(l1 - u1);
 		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
 	}
 	pr_cont("\n");
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd549caa..edfdfadec821 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -141,34 +141,38 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
- * Returns approximate total of the readers' ->seq[] values for the
+ * Returns approximate total of the readers' ->lock_count[] values for the
  * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		struct srcu_struct_array* cpu_counts =
+			per_cpu_ptr(sp->per_cpu_ref, cpu);
+		t = READ_ONCE(cpu_counts->lock_count[idx]);
 		sum += t;
 	}
 	return sum;
 }
 
 /*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
+ * Returns approximate total of the readers' ->unlock_count[] values for the
+ * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+		struct srcu_struct_array* cpu_counts =
+			per_cpu_ptr(sp->per_cpu_ref, cpu);
+		t = READ_ONCE(cpu_counts->unlock_count[idx]);
 		sum += t;
 	}
 	return sum;
@@ -176,79 +180,43 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 /*
  * Return true if the number of pre-existing readers is determined to
- * be stably zero.  An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement.  This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
+ * be zero.
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
-	unsigned long seq;
+	unsigned long unlocks;
 
-	seq = srcu_readers_seq_idx(sp, idx);
+	unlocks = srcu_readers_unlock_idx(sp, idx);
 
 	/*
-	 * The following smp_mb() A pairs with the smp_mb() B located in
-	 * __srcu_read_lock().  This pairing ensures that if an
-	 * __srcu_read_lock() increments its counter after the summation
-	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
-	 * critical section will see any changes made prior to the start
-	 * of the current SRCU grace period.
+	 * Make sure that a lock is always counted if the corresponding unlock
+	 * is counted. Needs to be a smp_mb() as the read side may contain a
+	 * read from a variable that is written to before the synchronize_srcu()
+	 * in the write side. In this case smp_mb()s A and B act like the store
+	 * buffering pattern.
 	 *
-	 * Also, if the above call to srcu_readers_seq_idx() saw the
-	 * increment of ->seq[], then the call to srcu_readers_active_idx()
-	 * must see the increment of ->c[].
+	 * This smp_mb() also pairs with smp_mb() C to prevent writes after the
+	 * synchronize_srcu() from being executed before the grace period ends.
 	 */
 	smp_mb(); /* A */
 
 	/*
-	 * Note that srcu_readers_active_idx() can incorrectly return
-	 * zero even though there is a pre-existing reader throughout.
-	 * To see this, suppose that task A is in a very long SRCU
-	 * read-side critical section that started on CPU 0, and that
-	 * no other reader exists, so that the sum of the counters
-	 * is equal to one.  Then suppose that task B starts executing
-	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
-	 * task C starts reading on CPU 0, so that its increment is not
-	 * summed, but finishes reading on CPU 2, so that its decrement
-	 * -is- summed.  Then when task B completes its sum, it will
-	 * incorrectly get zero, despite the fact that task A has been
-	 * in its SRCU read-side critical section the whole time.
-	 *
-	 * We therefore do a validation step should srcu_readers_active_idx()
-	 * return zero.
-	 */
-	if (srcu_readers_active_idx(sp, idx) != 0)
-		return false;
-
-	/*
-	 * The remainder of this function is the validation step.
-	 * The following smp_mb() D pairs with the smp_mb() C in
-	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
-	 * by srcu_readers_active_idx() above, then any destructive
-	 * operation performed after the grace period will happen after
-	 * the corresponding SRCU read-side critical section.
+	 * If the locks are the same as the unlocks, then there must have
+	 * been no readers on this index at some time in between. This does not
+	 * mean that there are no more readers, as one could have read the
+	 * current index but not have incremented the lock counter yet.
 	 *
-	 * Note that there can be at most NR_CPUS worth of readers using
-	 * the old index, which is not enough to overflow even a 32-bit
-	 * integer.  (Yes, this does mean that systems having more than
-	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
-	 * the sum of the ->seq[] counters cannot possibly overflow.
-	 * Therefore, the only way that the return values of the two
-	 * calls to srcu_readers_seq_idx() can be equal is if there were
-	 * no increments of the corresponding rank of ->seq[] counts
-	 * in the interim.  But the missed-increment scenario laid out
-	 * above includes an increment of the ->seq[] counter by
-	 * the corresponding __srcu_read_lock().  Therefore, if this
-	 * scenario occurs, the return values from the two calls to
-	 * srcu_readers_seq_idx() will differ, and thus the validation
-	 * step below suffices.
+	 * Note that there can be at most NR_CPUS worth of readers using the old
+	 * index that haven't incremented ->lock_count[] yet.  Therefore, the
+	 * sum of the ->lock_count[]s cannot increment enough times to overflow
+	 * and end up equal to the sum of the ->unlock_count[]s, as long as there
+	 * are at most ULONG_MAX - NR_CPUS readers at a time.  (Yes, this does
+	 * mean that systems having more than a billion or so CPUs need to be
+	 * 64-bit systems.)  Therefore, the only way that the return values of
+	 * the two calls to srcu_readers_(un)lock_idx() can be equal is if there
+	 * are no active readers using this index.
 	 */
-	smp_mb(); /* D */
-
-	return srcu_readers_seq_idx(sp, idx) == seq;
+	return srcu_readers_lock_idx(sp, idx) == unlocks;
 }
 
 /**
@@ -266,8 +234,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+		struct srcu_struct_array* cpu_counts =
+			per_cpu_ptr(sp->per_cpu_ref, cpu);
+		sum += READ_ONCE(cpu_counts->lock_count[0]);
+		sum += READ_ONCE(cpu_counts->lock_count[1]);
+		sum -= READ_ONCE(cpu_counts->unlock_count[0]);
+		sum -= READ_ONCE(cpu_counts->unlock_count[1]);
 	}
 	return sum;
 }
@@ -298,9 +270,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
 {
 	int idx;
 
 	idx = READ_ONCE(sp->completed) & 0x1;
-	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
+	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
-	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
 	return idx;
 }
 EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -314,7 +285,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
 	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	this_cpu_dec(sp->per_cpu_ref->c[idx]);
+	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
@@ -349,7 +320,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
 
 /*
  * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays.  This allows
+ * use the other rank of the ->(un)lock_count[] arrays.  This allows
  * us to wait for pre-existing readers in a starvation-free manner.
  */
 static void srcu_flip(struct srcu_struct *sp)
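For reviewers who want to see why smp_mb() A and B are described as a
store-buffering pattern, here is a rough user-space analogy.  It is not part
of the patch; the variables x and y and the pthread scaffolding are purely
illustrative.  The updater's store to x stands for a write made before
synchronize_srcu(), its load of y stands for summing the ->lock_count[]s, and
the reader's store to y and load of x stand for the __srcu_read_lock()
increment and the read-side access.

/* Toy store-buffering demo (not kernel code); build with: cc -pthread sb.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int x, y;	/* x: data guarded by SRCU; y: stand-in for lock_count */
static int r0, r1;

static void *sb_updater(void *arg)
{
	atomic_store_explicit(&x, 1, memory_order_relaxed);	/* store before synchronize_srcu() */
	atomic_thread_fence(memory_order_seq_cst);		/* plays the role of smp_mb() A */
	r0 = atomic_load_explicit(&y, memory_order_relaxed);	/* "sum the lock counters" */
	return arg;
}

static void *sb_reader(void *arg)
{
	atomic_store_explicit(&y, 1, memory_order_relaxed);	/* "lock_count[idx]++" */
	atomic_thread_fence(memory_order_seq_cst);		/* plays the role of smp_mb() B */
	r1 = atomic_load_explicit(&x, memory_order_relaxed);	/* read the guarded data */
	return arg;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, sb_updater, NULL);
	pthread_create(&t1, NULL, sb_reader, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);

	/* With both fences present, r0 == 0 && r1 == 0 is forbidden: either
	   the updater observes the reader's increment, or the reader observes
	   the updater's store (or both). */
	printf("r0=%d r1=%d\n", r0, r1);
	return 0;
}

Without the two fences, the r0 == 0 && r1 == 0 outcome becomes possible even
on x86, which in the SRCU analogy is exactly the case where the grace period
would end while a pre-existing reader could still see pre-update data.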