linux-kernel.vger.kernel.org archive mirror
From: Tao Zhou <tao.zhou@linux.dev>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Eugene Syromiatnikov <esyr@redhat.com>,
	joel@joelfernandes.org, chris.hyser@oracle.com,
	joshdon@google.com, mingo@kernel.org, vincent.guittot@linaro.org,
	valentin.schneider@arm.com, mgorman@suse.de,
	linux-kernel@vger.kernel.org, tglx@linutronix.de,
	Christian Brauner <christian.brauner@ubuntu.com>,
	ldv@strace.io, tao.zhou@linux.dev
Subject: Re: [PATCH] sched: Fix Core-wide rq->lock for uninitialized CPUs
Date: Thu, 19 Aug 2021 23:50:54 +0800	[thread overview]
Message-ID: <YR593iy7cxWUbphb@geo.homenetwork> (raw)
In-Reply-To: <YR473ZGeKqMs6kw+@hirez.programming.kicks-ass.net>

On Thu, Aug 19, 2021 at 01:09:17PM +0200, Peter Zijlstra wrote:

> On Wed, Aug 18, 2021 at 01:17:34AM +0200, Eugene Syromiatnikov wrote:
> > On Tue, Aug 17, 2021 at 05:52:43PM +0200, Peter Zijlstra wrote:
> > > Urgh... lemme guess, your HP BIOS is funny and reports more possible
> > > CPUs than you actually have resulting in cpu_possible_mask !=
> > > cpu_online_mask. Alternatively, you booted with nr_cpus= or something
> > > daft like that.
> > 
> > Yep, it seems to be the case:
> > 
> >     # cat /sys/devices/system/cpu/possible
> >     0-7
> >     # cat /sys/devices/system/cpu/online
> >     0-3
> > 
> 
> I think the below should work... can you please verify?
> 
> ---
> Subject: sched: Fix Core-wide rq->lock for uninitialized CPUs
> 
> Eugene tripped over the case where rq_lock(), as called in a
> for_each_possible_cpu() loop came apart because rq->core hadn't been
> setup yet.
> 
> This is a somewhat unusual, but valid case.
> 
> Rework things such that rq->core is initialized to point at itself. IOW
> initialize each CPU as a single threaded Core. CPU online will then join
> the new CPU (thread) to an existing Core where needed.
> 
> For completeness sake, have CPU offline fully undo the state so as to
> not presume the topology will match the next time it comes online.
> 
> Fixes: 9edeaea1bc45 ("sched: Core-wide rq->lock")
> Reported-by: Eugene Syromiatnikov <esyr@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c  | 143 +++++++++++++++++++++++++++++++++++++++++----------
>  kernel/sched/sched.h |   2 +-
>  2 files changed, 118 insertions(+), 27 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0f14e09a4f99..21d633971fcf 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex);
>  static atomic_t sched_core_count;
>  static struct cpumask sched_core_mask;
>  
> +static void sched_core_lock(int cpu, unsigned long *flags)
> +{
> +	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
> +	int t, i = 0;
> +
> +	local_irq_save(*flags);
> +	for_each_cpu(t, smt_mask)
> +		raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
> +}
> +
> +static void sched_core_unlock(int cpu, unsigned long *flags)
> +{
> +	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
> +	int t;
> +
> +	for_each_cpu(t, smt_mask)
> +		raw_spin_unlock(&cpu_rq(t)->__lock);
> +	local_irq_restore(*flags);
> +}
> +
>  static void __sched_core_flip(bool enabled)
>  {
> -	int cpu, t, i;
> +	unsigned long flags;
> +	int cpu, t;
>  
>  	cpus_read_lock();
>  
> @@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled)
>  	for_each_cpu(cpu, &sched_core_mask) {
>  		const struct cpumask *smt_mask = cpu_smt_mask(cpu);
>  
> -		i = 0;
> -		local_irq_disable();
> -		for_each_cpu(t, smt_mask) {
> -			/* supports up to SMT8 */
> -			raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
> -		}
> +		sched_core_lock(cpu, &flags);
>  
>  		for_each_cpu(t, smt_mask)
>  			cpu_rq(t)->core_enabled = enabled;
>  
> -		for_each_cpu(t, smt_mask)
> -			raw_spin_unlock(&cpu_rq(t)->__lock);
> -		local_irq_enable();
> +		sched_core_unlock(cpu, &flags);
>  
>  		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
>  	}
> @@ -5979,35 +5993,109 @@ void queue_core_balance(struct rq *rq)
>  	queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
>  }
>  
> -static inline void sched_core_cpu_starting(unsigned int cpu)
> +static void sched_core_cpu_starting(unsigned int cpu)
>  {
>  	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
> -	struct rq *rq, *core_rq = NULL;
> -	int i;
> +	struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
> +	unsigned long flags;
> +	int t;
>  
> -	core_rq = cpu_rq(cpu)->core;
> +	sched_core_lock(cpu, &flags);
>  
> -	if (!core_rq) {
> -		for_each_cpu(i, smt_mask) {
> -			rq = cpu_rq(i);
> -			if (rq->core && rq->core == rq)
> -				core_rq = rq;
> +	WARN_ON_ONCE(rq->core != rq);
> +
> +	/* if we're the first, we'll be our own leader */
> +	if (cpumask_weight(smt_mask) == 1)
> +		goto unlock;
> +
> +	/* find the leader */
> +	for_each_cpu(t, smt_mask) {
> +		if (t == cpu)
> +			continue;
> +		rq = cpu_rq(t);
> +		if (rq->core == rq) {
> +			core_rq = rq;
> +			break;
>  		}
> +	}
>  
> -		if (!core_rq)
> -			core_rq = cpu_rq(cpu);
> +	if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
> +		goto unlock;
>  
> -		for_each_cpu(i, smt_mask) {
> -			rq = cpu_rq(i);
> +	/* install and validate core_rq */
> +	for_each_cpu(t, smt_mask) {
> +		rq = cpu_rq(t);
>  
> -			WARN_ON_ONCE(rq->core && rq->core != core_rq);
> +		if (t == cpu)
>  			rq->core = core_rq;
> -		}
> +
> +		WARN_ON_ONCE(rq->core != core_rq);
>  	}

Yoh, Peter!

I have read this no more than ten times, but stayed with it long enough
to notice something I am not sure about. You are not only fixing the
problem, you also change how an rq's core is selected. The original code
wrote the same core_rq into every rq of the SMT mask; here only the
onlining CPU's rq is written, so the ->core pointers within one mask are
no longer forced onto a single rq (see the trimmed loops just below).
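
Side by side, the two install loops as I read them (condensed from the
hunk quoted above, nothing new on my side):

    /* old: every sibling's ->core is (re)written to the chosen core_rq */
    for_each_cpu(i, smt_mask) {
            rq = cpu_rq(i);
            WARN_ON_ONCE(rq->core && rq->core != core_rq);
            rq->core = core_rq;
    }

    /* new: only the onlining CPU is written; siblings are only checked */
    for_each_cpu(t, smt_mask) {
            rq = cpu_rq(t);
            if (t == cpu)
                    rq->core = core_rq;
            WARN_ON_ONCE(rq->core != core_rq);
    }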

For example (SMT3): rq1 --> rq0, rq0 --> rq2, rq2 --> rq2 (the
/* whoopsie */ case); written out as assignments just below.
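
In assignment form (just restating my arrows, purely hypothetical, not
something I have actually traced on a machine):

    cpu_rq(1)->core = cpu_rq(0);    /* rq1 --> rq0 */
    cpu_rq(0)->core = cpu_rq(2);    /* rq0 --> rq2 */
    cpu_rq(2)->core = cpu_rq(2);    /* rq2 --> rq2, the "whoopsie" spot */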

And the core can also change on deactivate.
I am not sure I am getting all of this right, but my guess is that it is
more efficient this way, since it avoids lock contention when picking a
task.
Again, I do not know this code well; these are just some thoughts.

> +
> +unlock:
> +	sched_core_unlock(cpu, &flags);
>  }
> +
> +static void sched_core_cpu_deactivate(unsigned int cpu)
> +{
> +	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
> +	struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
> +	unsigned long flags;
> +	int t;
> +
> +	sched_core_lock(cpu, &flags);
> +
> +	/* if we're the last man standing, nothing to do */
> +	if (cpumask_weight(smt_mask) == 1) {
> +		WARN_ON_ONCE(rq->core != rq);
> +		goto unlock;
> +	}
> +
> +	/* if we're not the leader, nothing to do */
> +	if (rq->core != rq)
> +		goto unlock;
> +
> +	/* find a new leader */
> +	for_each_cpu(t, smt_mask) {
> +		if (t == cpu)
> +			continue;
> +		core_rq = cpu_rq(t);
> +		break;
> +	}
> +
> +	if (WARN_ON_ONCE(!core_rq)) /* impossible */
> +		goto unlock;
> +
> +	/* copy the shared state to the new leader */
> +	core_rq->core_task_seq      = rq->core_task_seq;
> +	core_rq->core_pick_seq      = rq->core_pick_seq;
> +	core_rq->core_cookie        = rq->core_cookie;
> +	core_rq->core_forceidle     = rq->core_forceidle;
> +	core_rq->core_forceidle_seq = rq->core_forceidle_seq;
> +
> +	/* install new leader */
> +	for_each_cpu(t, smt_mask) {
> +		rq = cpu_rq(t);
> +		rq->core = core_rq;
> +	}
> +
> +unlock:
> +	sched_core_unlock(cpu, &flags);
> +}
> +
> +static inline void sched_core_cpu_dying(unsigned int cpu)
> +{
> +	struct rq *rq = cpu_rq(cpu);
> +
> +	if (rq->core != rq)
> +		rq->core = rq;
> +}
> +
>  #else /* !CONFIG_SCHED_CORE */
>  
>  static inline void sched_core_cpu_starting(unsigned int cpu) {}
> +static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
> +static inline void sched_core_cpu_dying(unsigned int cpu) {}
>  
>  static struct task_struct *
>  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> @@ -9009,6 +9097,8 @@ int sched_cpu_deactivate(unsigned int cpu)
>  	 */
>  	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
>  		static_branch_dec_cpuslocked(&sched_smt_present);
> +
> +	sched_core_cpu_deactivate(cpu);
>  #endif
>  
>  	if (!sched_smp_initialized)
> @@ -9113,6 +9203,7 @@ int sched_cpu_dying(unsigned int cpu)
>  	calc_load_migrate(rq);
>  	update_max_interval();
>  	hrtick_clear(rq);
> +	sched_core_cpu_dying(cpu);
>  	return 0;
>  }
>  #endif
> @@ -9324,7 +9415,7 @@ void __init sched_init(void)
>  		atomic_set(&rq->nr_iowait, 0);
>  
>  #ifdef CONFIG_SCHED_CORE
> -		rq->core = NULL;
> +		rq->core = rq;
>  		rq->core_pick = NULL;
>  		rq->core_enabled = 0;
>  		rq->core_tree = RB_ROOT;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index a9a660c6e08a..e6347c88c467 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1103,7 +1103,7 @@ struct rq {
>  	unsigned int		core_sched_seq;
>  	struct rb_root		core_tree;
>  
> -	/* shared state */
> +	/* shared state -- careful with sched_core_cpu_deactivate() */
>  	unsigned int		core_task_seq;
>  	unsigned int		core_pick_seq;
>  	unsigned long		core_cookie;


Thanks,
Tao

Thread overview: 103+ messages
2021-04-22 12:04 [PATCH 00/19] sched: Core Scheduling Peter Zijlstra
2021-04-22 12:05 ` [PATCH 01/19] sched/fair: Add a few assertions Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-05-13  8:56     ` Ning, Hongyu
2021-04-22 12:05 ` [PATCH 02/19] sched: Provide raw_spin_rq_*lock*() helpers Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 03/19] sched: Wrap rq::lock access Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 04/19] sched: Prepare for Core-wide rq->lock Peter Zijlstra
2021-04-24  1:22   ` Josh Don
2021-04-26  8:31     ` Peter Zijlstra
2021-04-26 22:21       ` Josh Don
2021-04-27 17:10         ` Don Hiatt
2021-04-27 23:35           ` Josh Don
2021-04-28  1:03             ` Aubrey Li
2021-04-28  6:05               ` Aubrey Li
2021-04-28 10:57                 ` Aubrey Li
2021-04-28 16:41                   ` Don Hiatt
2021-04-29 20:48                     ` Josh Don
2021-04-29 21:09                       ` Don Hiatt
2021-04-29 23:22                         ` Josh Don
2021-04-30 16:18                           ` Don Hiatt
2021-04-30  8:26                         ` Aubrey Li
2021-04-28 16:04             ` Don Hiatt
2021-04-27 23:30         ` Josh Don
2021-04-28  9:13           ` Peter Zijlstra
2021-04-28 10:35             ` Aubrey Li
2021-04-28 11:03               ` Peter Zijlstra
2021-04-28 14:18                 ` Paul E. McKenney
2021-04-29 20:11             ` Josh Don
2021-05-03 19:17               ` Peter Zijlstra
2021-04-28  7:13         ` Peter Zijlstra
2021-04-28  6:02   ` Aubrey Li
2021-04-29  8:03   ` Aubrey Li
2021-04-29 20:39     ` Josh Don
2021-04-30  8:20       ` Aubrey Li
2021-04-30  8:48         ` Josh Don
2021-04-30 14:15           ` Aubrey Li
2021-05-04  7:38       ` Peter Zijlstra
2021-05-05 16:20         ` Don Hiatt
2021-05-06 10:25           ` Peter Zijlstra
2021-05-07  9:50   ` [PATCH v2 " Peter Zijlstra
2021-05-08  8:07     ` Aubrey Li
2021-05-12  9:07       ` Peter Zijlstra
2021-04-22 12:05 ` [PATCH 05/19] sched: " Peter Zijlstra
2021-05-07  9:50   ` [PATCH v2 " Peter Zijlstra
2021-05-12 10:28     ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 06/19] sched: Optimize rq_lockp() usage Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 07/19] sched: Allow sched_core_put() from atomic context Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 08/19] sched: Introduce sched_class::pick_task() Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 09/19] sched: Basic tracking of matching tasks Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 10/19] sched: Add core wide task selection and scheduling Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 11/19] sched/fair: Fix forced idle sibling starvation corner case Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Vineeth Pillai
2021-04-22 12:05 ` [PATCH 12/19] sched: Fix priority inversion of cookied task with sibling Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Joel Fernandes (Google)
2021-04-22 12:05 ` [PATCH 13/19] sched/fair: Snapshot the min_vruntime of CPUs on force idle Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Joel Fernandes (Google)
2021-04-22 12:05 ` [PATCH 14/19] sched: Trivial forced-newidle balancer Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 15/19] sched: Migration changes for core scheduling Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Aubrey Li
2021-04-22 12:05 ` [PATCH 16/19] sched: Trivial core scheduling cookie management Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 17/19] sched: Inherit task cookie on fork() Peter Zijlstra
2021-05-10 16:06   ` Joel Fernandes
2021-05-10 16:22     ` Chris Hyser
2021-05-10 20:47       ` Joel Fernandes
2021-05-10 21:38         ` Chris Hyser
2021-05-12  9:05           ` Peter Zijlstra
2021-05-12 20:20             ` Josh Don
2021-05-12 21:07               ` Don Hiatt
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 18/19] sched: prctl() core-scheduling interface Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Chris Hyser
2021-06-14 23:36   ` [PATCH 18/19] " Josh Don
2021-06-15 11:31     ` Joel Fernandes
2021-08-05 16:53   ` Eugene Syromiatnikov
2021-08-05 17:00     ` Peter Zijlstra
2021-08-17 15:15   ` Eugene Syromiatnikov
2021-08-17 15:52     ` Peter Zijlstra
2021-08-17 23:17       ` Eugene Syromiatnikov
2021-08-19 11:09         ` [PATCH] sched: Fix Core-wide rq->lock for uninitialized CPUs Peter Zijlstra
2021-08-19 15:50           ` Tao Zhou [this message]
2021-08-19 16:19           ` Eugene Syromiatnikov
2021-08-20  0:18           ` Josh Don
2021-08-20 10:02             ` Peter Zijlstra
2021-08-23  9:07           ` [tip: sched/urgent] " tip-bot2 for Peter Zijlstra
2021-04-22 12:05 ` [PATCH 19/19] kselftest: Add test for core sched prctl interface Peter Zijlstra
2021-05-12 10:28   ` [tip: sched/core] " tip-bot2 for Chris Hyser
2021-04-22 16:43 ` [PATCH 00/19] sched: Core Scheduling Don Hiatt
2021-04-22 17:29   ` Peter Zijlstra
2021-04-30  6:47 ` Ning, Hongyu
2021-05-06 10:29   ` Peter Zijlstra
2021-05-06 12:53     ` Ning, Hongyu
2021-05-07 18:02 ` Joel Fernandes
2021-05-10 16:16 ` Vincent Guittot
2021-05-11  7:00   ` Vincent Guittot
