From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752921AbbJFQ3u (ORCPT ); Tue, 6 Oct 2015 12:29:50 -0400 Received: from e34.co.us.ibm.com ([32.97.110.152]:44120 "EHLO e34.co.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751812AbbJFQ3q (ORCPT ); Tue, 6 Oct 2015 12:29:46 -0400 X-IBM-Helo: d03dlp03.boulder.ibm.com X-IBM-MailFrom: paulmck@linux.vnet.ibm.com X-IBM-RcptTo: linux-kernel@vger.kernel.org From: "Paul E. McKenney" To: linux-kernel@vger.kernel.org Cc: mingo@kernel.org, jiangshanlai@gmail.com, dipankar@in.ibm.com, akpm@linux-foundation.org, mathieu.desnoyers@efficios.com, josh@joshtriplett.org, tglx@linutronix.de, peterz@infradead.org, rostedt@goodmis.org, dhowells@redhat.com, edumazet@google.com, dvhart@linux.intel.com, fweisbec@gmail.com, oleg@redhat.com, bobby.prani@gmail.com, "Paul E. McKenney" Subject: [PATCH tip/core/rcu 05/18] rcu: Move synchronize_sched_expedited() to combining tree Date: Tue, 6 Oct 2015 09:29:24 -0700 Message-Id: <1444148977-14108-5-git-send-email-paulmck@linux.vnet.ibm.com> X-Mailer: git-send-email 2.5.2 In-Reply-To: <1444148977-14108-1-git-send-email-paulmck@linux.vnet.ibm.com> References: <20151006162907.GA12020@linux.vnet.ibm.com> <1444148977-14108-1-git-send-email-paulmck@linux.vnet.ibm.com> X-TM-AS-MML: disable X-Content-Scanned: Fidelis XPS MAILER x-cbid: 15100616-0017-0000-0000-00000E6A7A76 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Currently, synchronize_sched_expedited() uses a single global counter to track the number of remaining context switches that the current expedited grace period must wait on. This is problematic on large systems, where the resulting memory contention can be pathological. This commit therefore makes synchronize_sched_expedited() instead use the combining tree in the same manner as synchronize_rcu_expedited(), keeping memory contention down to a dull roar. This commit creates a temporary function sync_sched_exp_select_cpus() that is very similar to sync_rcu_exp_select_cpus(). A later commit will consolidate these two functions, which becomes possible when synchronize_sched_expedited() switches from stop_one_cpu_nowait() to smp_call_function_single(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 123 ++++++++++++++++++++++++++++++++++++------------------ kernel/rcu/tree.h | 1 - 2 files changed, 82 insertions(+), 42 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6e92bf4337bd..d2cdcada6fe0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3642,19 +3642,77 @@ static int synchronize_sched_expedited_cpu_stop(void *data) struct rcu_data *rdp = data; struct rcu_state *rsp = rdp->rsp; - /* We are here: If we are last, do the wakeup. */ - rdp->exp_done = true; - if (atomic_dec_and_test(&rsp->expedited_need_qs)) - wake_up(&rsp->expedited_wq); + /* Report the quiescent state. */ + rcu_report_exp_rdp(rsp, rdp, true); return 0; } +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_sched_exp_select_cpus(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + struct rcu_data *rdp; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + cpu_is_offline(cpu) || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; + rdp = per_cpu_ptr(rsp->rda, cpu); + stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, + rdp, &rdp->exp_stop_work); + mask_ofl_ipi &= ~mask; + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + static void synchronize_sched_expedited_wait(struct rcu_state *rsp) { int cpu; unsigned long jiffies_stall; unsigned long jiffies_start; - struct rcu_data *rdp; + unsigned long mask; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; jiffies_stall = rcu_jiffies_till_stall_check(); @@ -3663,33 +3721,36 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = wait_event_interruptible_timeout( rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs), + sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); if (ret > 0) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ wait_event(rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs)); + sync_rcu_preempt_exp_done(rnp_root)); return; } pr_err("INFO: %s detected expedited stalls on CPUs: {", rsp->name); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - pr_cont(" %d", cpu); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + pr_cont(" %d", cpu); + } + mask <<= 1; } pr_cont(" } %lu jiffies s: %lu\n", jiffies - jiffies_start, rsp->expedited_sequence); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - dump_cpu_task(cpu); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } } jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; } @@ -3713,7 +3774,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) */ void synchronize_sched_expedited(void) { - int cpu; unsigned long s; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; @@ -3736,27 +3796,8 @@ void synchronize_sched_expedited(void) } rcu_exp_gp_seq_start(rsp); - - /* Stop each CPU that is online, non-idle, and not us. */ - atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ - for_each_online_cpu(cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdp->exp_done = false; - - /* Skip our CPU and any idle CPUs. */ - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - continue; - atomic_inc(&rsp->expedited_need_qs); - stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rdp, &rdp->exp_stop_work); - } - - /* Remove extra count and, if necessary, wait for CPUs to stop. */ - if (!atomic_dec_and_test(&rsp->expedited_need_qs)) - synchronize_sched_expedited_wait(rsp); + sync_sched_exp_select_cpus(rsp); + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a57f25ecca58..efe361c764ab 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -383,7 +383,6 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - bool exp_done; /* Expedited QS for this CPU? */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU -- 2.5.2