LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Cc: Sasha Levin <levinsasha928@gmail.com>,
	Ingo Molnar <mingo@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	"linux-kernel@vger.kernel.org List"
	<linux-kernel@vger.kernel.org>
Subject: Re: [PATCH 3/4] sched: WARN when migrating to an offline CPU
Date: Fri, 29 Sep 2017 13:11:26 +0200
Message-ID: <20170929111126.xdqcgye7iyvphb6a@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20170928114249.qddcf4pntvuttswd@hirez.programming.kicks-ass.net>

On Thu, Sep 28, 2017 at 01:42:49PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 28, 2017 at 11:03:10AM +0000, Levin, Alexander (Sasha Levin) wrote:
> > On Thu, Sep 28, 2017 at 12:35:41PM +0200, Peter Zijlstra wrote:
> > >On Thu, Sep 28, 2017 at 02:14:15AM -0700, Sasha Levin wrote:
> 
> > >> [2035565360.446794] Unregister pv shared memory for cpu 2
> > >> [2035565360.467930] numa_remove_cpu cpu 2 node 2: mask now 6
> > >> [2035565360.471431] ------------[ cut here ]------------
> > >> [2035565360.472548] WARNING: CPU: 2 PID: 24 at kernel/sched/core.c:1178 set_task_cpu (kernel/sched/core.c:1157)
> > >> [2035565360.473840] Modules linked in:
> > >> [2035565360.474632] CPU: 2 PID: 24 Comm: migration/2 Not tainted 4.14.0-rc2-next-20170927+ #252
> > >
> > >Urgh, weird. That really shouldn't happen. Can you easily reproduce?
> > 
> > Looks like yes. Seems like it's enough to stress CPU hotplug + trinity.
> 
> OK, I'll see if I can reproduce building kernels and hotplug stress.
> Otherwise I'll try and cook up some debug patches for you.

I can't seem to trigger it :-(

Can you please run with the below patch and:

  # echo 1 > /proc/sys/kernel/traceoff_on_warning

---
 kernel/sched/core.c |  3 +++
 kernel/sched/fair.c | 10 ++++++++++
 2 files changed, 13 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..c613f7756981 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5592,6 +5592,7 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq_flags rf;
 
 	set_cpu_active(cpu, true);
+	trace_printk("active: %d mask: %*pbl\n", cpu, cpumask_pr_args(cpu_active_mask));
 
 	if (sched_smp_initialized) {
 		sched_domains_numa_masks_set(cpu);
@@ -5624,6 +5625,7 @@ int sched_cpu_deactivate(unsigned int cpu)
 	int ret;
 
 	set_cpu_active(cpu, false);
+	trace_printk("not-active: %d mask: %*pbl\n", cpu, cpumask_pr_args(cpu_active_mask));
 	/*
 	 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
 	 * users of this state to go away such that all new such users will
@@ -5632,6 +5634,7 @@ int sched_cpu_deactivate(unsigned int cpu)
 	 * Do sync before park smpboot threads to take care the rcu boost case.
 	 */
 	synchronize_rcu_mult(call_rcu, call_rcu_sched);
+	trace_printk("rcu-sync: %d\n", cpu);
 
 	if (!sched_smp_initialized)
 		return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..cb8f43a59f33 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8150,8 +8150,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
+
 	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 
+	trace_printk("dst_cpu: %d cpus: %*pbl\n", this_cpu, cpumask_pr_args(cpus));
+
 	schedstat_inc(sd->lb_count[idle]);
 
 redo:
@@ -8248,6 +8251,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
 			env.dst_cpu	 = env.new_dst_cpu;
+
+			trace_printk("dst_cpu: %d\n", env.dst_cpu);
+
 			env.flags	&= ~LBF_DST_PINNED;
 			env.loop	 = 0;
 			env.loop_break	 = sched_nr_migrate_break;
@@ -8465,6 +8471,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 
 	update_blocked_averages(this_cpu);
 	rcu_read_lock();
+	trace_printk("rcu-read-lock: %d\n", this_cpu);
 	for_each_domain(this_cpu, sd) {
 		int continue_balancing = 1;
 		u64 t0, domain_cost;
@@ -8500,6 +8507,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 		if (pulled_task || this_rq->nr_running > 0)
 			break;
 	}
+	trace_printk("rcu-read-unlock: %d\n", this_cpu);
 	rcu_read_unlock();
 
 	raw_spin_lock(&this_rq->lock);
@@ -8790,6 +8798,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 	update_blocked_averages(cpu);
 
 	rcu_read_lock();
+	trace_printk("rcu-read-lock: %d\n", cpu);
 	for_each_domain(cpu, sd) {
 		/*
 		 * Decay the newidle max times here because this is a regular
@@ -8853,6 +8862,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 		rq->max_idle_balance_cost =
 			max((u64)sysctl_sched_migration_cost, max_cost);
 	}
+	trace_printk("rcu-read-unlock: %d\n", cpu);
 	rcu_read_unlock();
 
 	/*

  reply index

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-07 15:03 [PATCH 0/4] sched: Fix some load-balancer vs hotplug holes Peter Zijlstra
2017-09-07 15:03 ` [PATCH 1/4] sched/fair: Avoid newidle balance for !active CPUs Peter Zijlstra
2017-09-12 18:04   ` [tip:sched/urgent] " tip-bot for Peter Zijlstra
2017-09-07 15:03 ` [PATCH 2/4] sched/fair: Plug hole between hotplug and active load_balance Peter Zijlstra
2017-09-12 18:05   ` [tip:sched/urgent] sched/fair: Plug hole between hotplug and active_load_balance() tip-bot for Peter Zijlstra
2017-09-07 15:03 ` [PATCH 3/4] sched: WARN when migrating to an offline CPU Peter Zijlstra
2017-09-12 18:05   ` [tip:sched/urgent] sched/core: WARN() " tip-bot for Peter Zijlstra
2017-09-28  9:14   ` [PATCH 3/4] sched: WARN " Sasha Levin
2017-09-28 10:35     ` Peter Zijlstra
2017-09-28 11:03       ` Levin, Alexander (Sasha Levin)
2017-09-28 11:42         ` Peter Zijlstra
2017-09-29 11:11           ` Peter Zijlstra [this message]
2017-10-07  2:07             ` Levin, Alexander (Sasha Levin)
2017-10-07  9:15               ` Peter Zijlstra
     [not found]                 ` <20171007174327.ky6g5viokxg5ysdm@sasha-lappy>
2017-10-09  8:04                   ` Peter Zijlstra
2017-10-10  1:18                     ` Levin, Alexander (Sasha Levin)
2017-09-07 15:03 ` [PATCH 4/4] sched/debug: Add debugfs knob for "sched_debug" Peter Zijlstra
2017-09-12 18:05   ` [tip:sched/urgent] " tip-bot for Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170929111126.xdqcgye7iyvphb6a@hirez.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=alexander.levin@verizon.com \
    --cc=levinsasha928@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git