* [PATCH 0/2] sched: migrate_disable() preparations
@ 2020-09-11  8:17 Peter Zijlstra
  2020-09-11  8:17 ` [PATCH 1/2] sched: Fix balance_callback() Peter Zijlstra
  2020-09-11  8:17 ` [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
  0 siblings, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2020-09-11  8:17 UTC (permalink / raw)
  To: mingo, vincent.guittot, tglx
  Cc: linux-kernel, dietmar.eggemann, rostedt, bristot, swood,
	valentin.schneider, peterz

Hi!

These two patches are the result of Thomas pestering me with migrate_disable()
patches for upstream. The first one is a cleanup/fix for the existing
balance_callback machinery. The second (ab)uses the context_switch() tail
invocation of balance_callbacks() to push away 'undesirables' during CPU
hotplug after we've marked the CPU as !active.

With this in place, Thomas can do his horrible migrate_disable() thing ;-)



* [PATCH 1/2] sched: Fix balance_callback()
  2020-09-11  8:17 [PATCH 0/2] sched: migrate_disable() preparations Peter Zijlstra
@ 2020-09-11  8:17 ` Peter Zijlstra
  2020-09-11 12:17   ` Valentin Schneider
  2020-09-11  8:17 ` [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
  1 sibling, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2020-09-11  8:17 UTC (permalink / raw)
  To: mingo, vincent.guittot, tglx
  Cc: linux-kernel, dietmar.eggemann, rostedt, bristot, swood,
	valentin.schneider, peterz

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme;
balance_callback() was called after rq->lock was dropped, which means
another CPU can interleave and touch the callback list.

Rework code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.
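
For reference, producers hang deferred work off rq->balance_callback from
inside an rq->lock section via queue_balance_callback(). A sketch of the
pattern, modeled on the RT class in kernel/sched/rt.c (the per-cpu head and
the has_pushable_tasks() check come from that file, not from this patch):

	static DEFINE_PER_CPU(struct callback_head, rt_push_head);

	static inline void rt_queue_push_tasks(struct rq *rq)
	{
		if (!has_pushable_tasks(rq))
			return;

		/* push_rt_tasks(rq) runs at the end of the current rq->lock section */
		queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu),
				       push_rt_tasks);
	}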

Reported-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  119 ++++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    2 
 2 files changed, 77 insertions(+), 44 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3489,6 +3489,69 @@ static inline void finish_task(struct ta
 #endif
 }
 
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	void (*func)(struct rq *rq);
+	struct callback_head *next;
+
+	lockdep_assert_held(&rq->lock);
+
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+
+		func(rq);
+	}
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	struct callback_head *head = rq->balance_callback;
+
+	lockdep_assert_held(&rq->lock);
+	if (head)
+		rq->balance_callback = NULL;
+
+	return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+	do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	unsigned long flags;
+
+	if (unlikely(head)) {
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		do_balance_callbacks(rq, head);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -3514,6 +3577,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+	__balance_callbacks(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -3655,43 +3719,6 @@ static struct rq *finish_task_switch(str
 	return rq;
 }
 
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
-	struct callback_head *head, *next;
-	void (*func)(struct rq *rq);
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	head = rq->balance_callback;
-	rq->balance_callback = NULL;
-	while (head) {
-		func = (void (*)(struct rq *))head->func;
-		next = head->next;
-		head->next = NULL;
-		head = next;
-
-		func(rq);
-	}
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
-	if (unlikely(rq->balance_callback))
-		__balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -3711,7 +3738,6 @@ asmlinkage __visible void schedule_tail(
 	 */
 
 	rq = finish_task_switch(prev);
-	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -4527,10 +4553,11 @@ static void __sched notrace __schedule(b
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-		rq_unlock_irq(rq, &rf);
-	}
 
-	balance_callback(rq);
+		rq_unpin_lock(rq, &rf);
+		__balance_callbacks(rq);
+		raw_spin_unlock_irq(&rq->lock);
+	}
 }
 
 void __noreturn do_task_dead(void)
@@ -4941,9 +4968,11 @@ void rt_mutex_setprio(struct task_struct
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
-	__task_rq_unlock(rq, &rf);
 
-	balance_callback(rq);
+	rq_unpin_lock(rq, &rf);
+	__balance_callbacks(rq);
+	raw_spin_unlock(&rq->lock);
+
 	preempt_enable();
 }
 #else
@@ -5217,6 +5246,7 @@ static int __sched_setscheduler(struct t
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int new_effective_prio, policy = attr->sched_policy;
 	const struct sched_class *prev_class;
+	struct callback_head *head;
 	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5455,6 +5485,7 @@ static int __sched_setscheduler(struct t
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
+	head = splice_balance_callbacks(rq);
 	task_rq_unlock(rq, p, &rf);
 
 	if (pi) {
@@ -5463,7 +5494,7 @@ static int __sched_setscheduler(struct t
 	}
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
-	balance_callback(rq);
+	balance_callbacks(rq, head);
 	preempt_enable();
 
 	return 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1220,6 +1220,8 @@ static inline void rq_pin_lock(struct rq
 #ifdef CONFIG_SCHED_DEBUG
 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 	rf->clock_update_flags = 0;
+
+	SCHED_WARN_ON(rq->balance_callback);
 #endif
 }
 




* [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-11  8:17 [PATCH 0/2] sched: migrate_disable() preparations Peter Zijlstra
  2020-09-11  8:17 ` [PATCH 1/2] sched: Fix balance_callback() Peter Zijlstra
@ 2020-09-11  8:17 ` Peter Zijlstra
  2020-09-11 12:17   ` Valentin Schneider
  2020-09-16 10:18   ` Sebastian Andrzej Siewior
  1 sibling, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2020-09-11  8:17 UTC (permalink / raw)
  To: mingo, vincent.guittot, tglx
  Cc: linux-kernel, dietmar.eggemann, rostedt, bristot, swood,
	valentin.schneider, peterz

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is run (as one of the very first steps) from the cpu-hotplug
task which is a per-cpu kthread and completion of the hotplug
operation only requires such tasks.

This contraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the unlikely(rq->balance_callbacks) test at the tail of
context_switch with an unlikely(rq->balance_work); the fast path is
not affected.
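
The classification relied on here is the existing is_per_cpu_kthread()
helper, which at this point in the tree looks roughly like this (sketch):

	static inline bool is_per_cpu_kthread(struct task_struct *p)
	{
		if (!(p->flags & PF_KTHREAD))
			return false;

		if (p->nr_cpus_allowed != 1)
			return false;

		return true;
	}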

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  103 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    5 ++
 2 files changed, 106 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3513,8 +3513,10 @@ static inline struct callback_head *spli
 	struct callback_head *head = rq->balance_callback;
 
 	lockdep_assert_held(&rq->lock);
-	if (head)
+	if (head) {
 		rq->balance_callback = NULL;
+		rq->balance_flags &= ~BALANCE_WORK;
+	}
 
 	return head;
 }
@@ -3569,6 +3571,8 @@ prepare_lock_switch(struct rq *rq, struc
 #endif
 }
 
+static bool balance_push(struct rq *rq);
+
 static inline void finish_lock_switch(struct rq *rq)
 {
 	/*
@@ -3577,7 +3581,16 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	if (unlikely(rq->balance_flags)) {
+		/*
+		 * Run the balance_callbacks, except on hotplug
+		 * when we need to push the current task away.
+		 */
+		if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+		    !(rq->balance_flags & BALANCE_PUSH) ||
+		    !balance_push(rq))
+			__balance_callbacks(rq);
+	}
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6836,6 +6849,87 @@ static void migrate_tasks(struct rq *dea
 
 	rq->stop = stop;
 }
+
+static int __balance_push_stop(void *arg)
+{
+	struct task_struct *p = arg;
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+	int cpu;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	rq_lock(rq, &rf);
+
+	if (task_rq(p) == rq && task_on_rq_queued(p)) {
+		cpu = select_fallback_rq(rq->cpu, p);
+		rq = __migrate_task(rq, &rf, p, cpu);
+	}
+
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static bool balance_push(struct rq *rq)
+{
+	struct task_struct *push_task = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+	/*
+	 * Both the cpu-hotplug and stop task are in this class and are
+	 * required to complete the hotplug process.
+	 */
+	if (is_per_cpu_kthread(push_task))
+		return false;
+
+	get_task_struct(push_task);
+	/*
+	 * Temporarily drop rq->lock such that we can wake-up the stop task.
+	 * Both preemption and IRQs are still disabled.
+	 */
+	raw_spin_unlock(&rq->lock);
+	stop_one_cpu_nowait(rq->cpu, __balance_push_stop, push_task,
+			    this_cpu_ptr(&push_work));
+	/*
+	 * At this point need_resched() is true and we'll take the loop in
+	 * schedule(). The next pick is obviously going to be the stop task
+	 * which is_per_cpu_kthread() and will push this task away.
+	 */
+	raw_spin_lock(&rq->lock);
+
+	return true;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (on)
+		rq->balance_flags |= BALANCE_PUSH;
+	else
+		rq->balance_flags &= ~BALANCE_PUSH;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline bool balance_push(struct rq *rq)
+{
+	return false;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6921,6 +7015,8 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
 	 */
 	synchronize_rcu();
 
+	balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -6981,6 +7079,7 @@ int sched_cpu_deactivate(unsigned int cp
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
+		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_flags;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
@@ -1384,6 +1385,9 @@ init_numa_balancing(unsigned long clone_
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK	0x01
+#define BALANCE_PUSH	0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1397,6 +1401,7 @@ queue_balance_callback(struct rq *rq,
 	head->func = (void (*)(struct callback_head *))func;
 	head->next = rq->balance_callback;
 	rq->balance_callback = head;
+	rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \




* Re: [PATCH 1/2] sched: Fix balance_callback()
  2020-09-11  8:17 ` [PATCH 1/2] sched: Fix balance_callback() Peter Zijlstra
@ 2020-09-11 12:17   ` Valentin Schneider
  2020-09-11 12:25     ` peterz
  0 siblings, 1 reply; 13+ messages in thread
From: Valentin Schneider @ 2020-09-11 12:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood


On 11/09/20 09:17, Peter Zijlstra wrote:
> The intent of balance_callback() has always been to delay executing
> balancing operations until the end of the current rq->lock section.
> This is because balance operations must often drop rq->lock, and that
> isn't safe in general.
>
> However, as noted by Scott, there were a few holes in that scheme;
> balance_callback() was called after rq->lock was dropped, which means
> another CPU can interleave and touch the callback list.
>

So that can be, say, __schedule() tail racing with some setprio; what's the
worst that can (currently) happen here? Something like, say, two consecutive
enqueues of push_rt_tasks() to the callback list?

> Rework code to call the balance callbacks before dropping rq->lock
> where possible, and otherwise splice the balance list onto a local
> stack.
>
> This guarantees that the balance list must be empty when we take
> rq->lock. IOW, we'll only ever run our own balance callbacks.
>

Makes sense to me.

Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>

> Reported-by: Scott Wood <swood@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

[...]

> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1220,6 +1220,8 @@ static inline void rq_pin_lock(struct rq
>  #ifdef CONFIG_SCHED_DEBUG
>       rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
>       rf->clock_update_flags = 0;
> +
> +	SCHED_WARN_ON(rq->balance_callback);

Clever!

>  #endif
>  }
>


* Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-11  8:17 ` [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-09-11 12:17   ` Valentin Schneider
  2020-09-11 12:29     ` peterz
  2020-09-16 10:18   ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 13+ messages in thread
From: Valentin Schneider @ 2020-09-11 12:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood, Vincent Donnefort


(+Cc VincentD, who's been looking at some of this)

On 11/09/20 09:17, Peter Zijlstra wrote:
> In preparation for migrate_disable(), make sure only per-cpu kthreads
> are allowed to run on !active CPUs.
>
> This is run (as one of the very first steps) from the cpu-hotplug
> task which is a per-cpu kthread and completion of the hotplug
> operation only requires such tasks.
>
> This contraint enables the migrate_disable() implementation to wait

s/contraint/constraint/

> for completion of all migrate_disable regions on this CPU at hotplug
> time without fear of any new ones starting.
>
> This replaces the unlikely(rq->balance_callbacks) test at the tail of
> context_switch with an unlikely(rq->balance_work); the fast path is
> not affected.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c  |  103 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/sched.h |    5 ++
>  2 files changed, 106 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6836,6 +6849,87 @@ static void migrate_tasks(struct rq *dea
>
>       rq->stop = stop;
>  }
> +
> +static int __balance_push_stop(void *arg)

__balance_push_cpu_stop()? _cpu_stop() seems to be the usual suffix.

> +{
> +	struct task_struct *p = arg;
> +	struct rq *rq = this_rq();
> +	struct rq_flags rf;
> +	int cpu;
> +
> +	raw_spin_lock_irq(&p->pi_lock);
> +	rq_lock(rq, &rf);
> +
> +	if (task_rq(p) == rq && task_on_rq_queued(p)) {
> +		cpu = select_fallback_rq(rq->cpu, p);
> +		rq = __migrate_task(rq, &rf, p, cpu);
> +	}
> +
> +	rq_unlock(rq, &rf);
> +	raw_spin_unlock_irq(&p->pi_lock);
> +
> +	put_task_struct(p);
> +
> +	return 0;
> +}
> +
> +static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
> +
> +/*
> + * Ensure we only run per-cpu kthreads once the CPU goes !active.
> + */
> +static bool balance_push(struct rq *rq)
> +{
> +	struct task_struct *push_task = rq->curr;
> +
> +	lockdep_assert_held(&rq->lock);
> +	SCHED_WARN_ON(rq->cpu != smp_processor_id());
> +
> +	/*
> +	 * Both the cpu-hotplug and stop task are in this class and are
> +	 * required to complete the hotplug process.

Nit: s/class/case/? I can't not read "class" as "sched_class", and
those two are *not* in the same sched_class, and confusion ensues.

> +	 */
> +	if (is_per_cpu_kthread(push_task))
> +		return false;
> +
> +	get_task_struct(push_task);
> +	/*
> +	 * Temporarily drop rq->lock such that we can wake-up the stop task.
> +	 * Both preemption and IRQs are still disabled.
> +	 */
> +	raw_spin_unlock(&rq->lock);
> +	stop_one_cpu_nowait(rq->cpu, __balance_push_stop, push_task,
> +			    this_cpu_ptr(&push_work));
> +	/*
> +	 * At this point need_resched() is true and we'll take the loop in
> +	 * schedule(). The next pick is obviously going to be the stop task
> +	 * which is_per_cpu_kthread() and will push this task away.
> +	 */
> +	raw_spin_lock(&rq->lock);
> +
> +	return true;
> +}
> +
> @@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
>        */
>       synchronize_rcu();
>
> +	balance_push_set(cpu, true);
> +

IIUC this is going to make every subsequent finish_lock_switch()
migrate the switched-to task if it isn't a pcpu kthread. So this is going
to lead to a dance of

switch_to(<task0>) -> switch_to(<stopper>) -> switch_to(<task1>) -> switch_to(<stopper>) ...

Wouldn't it be better to batch all those in a migrate_tasks() sibling that
skips pcpu kthreads?

>  #ifdef CONFIG_SCHED_SMT
>       /*
>        * When going down, decrement the number of cores with SMT present.


* Re: [PATCH 1/2] sched: Fix balance_callback()
  2020-09-11 12:17   ` Valentin Schneider
@ 2020-09-11 12:25     ` peterz
  2020-09-11 13:27       ` Valentin Schneider
  0 siblings, 1 reply; 13+ messages in thread
From: peterz @ 2020-09-11 12:25 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood

On Fri, Sep 11, 2020 at 01:17:02PM +0100, Valentin Schneider wrote:
> On 11/09/20 09:17, Peter Zijlstra wrote:
> > The intent of balance_callback() has always been to delay executing
> > balancing operations until the end of the current rq->lock section.
> > This is because balance operations must often drop rq->lock, and that
> > isn't safe in general.
> >
> > However, as noted by Scott, there were a few holes in that scheme;
> > balance_callback() was called after rq->lock was dropped, which means
> > another CPU can interleave and touch the callback list.
> >
> 
> So that can be, say, __schedule() tail racing with some setprio; what's the
> worst that can (currently) happen here? Something like, say, two consecutive
> enqueues of push_rt_tasks() to the callback list?

Yeah, but that isn't in fact the case I worry most about.

What can happen (and what I've spotted once before) is that someone
attempts to enqueue a balance_callback from a rq->lock region that
doesn't handle the calls.

Currently that 'works', that is, it will get run _eventually_. But
ideally we'd want that to not work and issue a WARN. We want the
callbacks to be timely.

So basically all of these machinations were in order to add the WARN :-)
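
To illustrate the failure mode, a hypothetical sketch (some_head and
some_balance_fn are made-up names):

	raw_spin_lock(&rq->lock);
	queue_balance_callback(rq, &per_cpu(some_head, rq->cpu), some_balance_fn);
	raw_spin_unlock(&rq->lock);	/* nothing runs the callback here */

The queued work then lingers on rq->balance_callback until some later,
unrelated rq->lock section happens to run it. With the SCHED_WARN_ON() in
rq_pin_lock() such a leftover now triggers a warning as soon as the next
section pins the lock.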


* Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-11 12:17   ` Valentin Schneider
@ 2020-09-11 12:29     ` peterz
  2020-09-11 13:48       ` Valentin Schneider
  0 siblings, 1 reply; 13+ messages in thread
From: peterz @ 2020-09-11 12:29 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood, Vincent Donnefort

On Fri, Sep 11, 2020 at 01:17:45PM +0100, Valentin Schneider wrote:

> > @@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
> >        */
> >       synchronize_rcu();
> >
> > +	balance_push_set(cpu, true);
> > +
> 
> IIUC this is going to make every subsequent finish_lock_switch()
> migrate the switched-to task if it isn't a pcpu kthread. So this is going
> to lead to a dance of
> 
> switch_to(<task0>) -> switch_to(<stopper>) -> switch_to(<task1>) -> switch_to(<stopper>) ...
> 
> Wouldn't it be better to batch all those in a migrate_tasks() sibling that
> skips pcpu kthreads?

That's 'difficult', this is hotplug, performance is not a consideration.

Basically we don't have an iterator for the runqueues, so finding these
tasks is hard.


* Re: [PATCH 1/2] sched: Fix balance_callback()
  2020-09-11 12:25     ` peterz
@ 2020-09-11 13:27       ` Valentin Schneider
  0 siblings, 0 replies; 13+ messages in thread
From: Valentin Schneider @ 2020-09-11 13:27 UTC (permalink / raw)
  To: peterz
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood


On 11/09/20 13:25, peterz@infradead.org wrote:
> On Fri, Sep 11, 2020 at 01:17:02PM +0100, Valentin Schneider wrote:
>> So that can be, say, __schedule() tail racing with some setprio; what's the
>> worst that can (currently) happen here? Something like, say, two consecutive
>> enqueues of push_rt_tasks() to the callback list?
>
> Yeah, but that isn't in fact the case I worry most about.
>
> What can happen (and what I've spotted once before) is that someone
> attempts to enqueue a balance_callback from a rq->lock region that
> doesn't handle the calls.
>
> Currently that 'works', that is, it will get run _eventually_. But
> ideally we'd want that to not work and issue a WARN. We want the
> callbacks to be timely.
>
> So basically all of these machinations were in order to add the WARN :-)

Makes sense, thanks!


* Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-11 12:29     ` peterz
@ 2020-09-11 13:48       ` Valentin Schneider
  0 siblings, 0 replies; 13+ messages in thread
From: Valentin Schneider @ 2020-09-11 13:48 UTC (permalink / raw)
  To: peterz
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood, Vincent Donnefort


On 11/09/20 13:29, peterz@infradead.org wrote:
> On Fri, Sep 11, 2020 at 01:17:45PM +0100, Valentin Schneider wrote:
>
>> > @@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
>> >        */
>> >       synchronize_rcu();
>> >
>> > +	balance_push_set(cpu, true);
>> > +
>>
>> IIUC this is going to make every subsequent finish_lock_switch()
>> migrate the switched-to task if it isn't a pcpu kthread. So this is going
>> to lead to a dance of
>>
>> switch_to(<task0>) -> switch_to(<stopper>) -> switch_to(<task1>) -> switch_to(<stopper>) ...
>>
>> Wouldn't it be better to batch all those in a migrate_tasks() sibling that
>> skips pcpu kthreads?
>
> That's 'difficult', this is hotplug, performance is not a consideration.
>

Aye aye.

The reason I'm trying to care about this is (don't wince too hard) for that
CPU pause thing [1]. Vincent's been the one doing all the actual work so I
should let him pitch in, but TL;DR if we could "relatively quickly" migrate
all !pcpu tasks after clearing the active bit, we could stop hotplug there
and have our "CPU pause" thing.

[1]: https://lwn.net/Articles/820825/

> Basically we don't have an iterator for the runqueues, so finding these
> tasks is hard.

Mmph right, we'd pretty much have to "enjoy" iterating & skipping pcpu
threads every __pick_migrate_task() call...


* Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-11  8:17 ` [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
  2020-09-11 12:17   ` Valentin Schneider
@ 2020-09-16 10:18   ` Sebastian Andrzej Siewior
  2020-09-16 12:10     ` peterz
  1 sibling, 1 reply; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-09-16 10:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood, valentin.schneider

On 2020-09-11 10:17:47 [+0200], Peter Zijlstra wrote:
> In preparation for migrate_disable(), make sure only per-cpu kthreads
> are allowed to run on !active CPUs.
> 
> This is run (as one of the very first steps) from the cpu-hotplug
> task which is a per-cpu kthread and completion of the hotplug
> operation only requires such tasks.
> 
> This contraint enables the migrate_disable() implementation to wait
> for completion of all migrate_disable regions on this CPU at hotplug
> time without fear of any new ones starting.
> 
> This replaces the unlikely(rq->balance_callbacks) test at the tail of
> context_switch with an unlikely(rq->balance_work); the fast path is
> not affected.

With this on top of -rc5 I get:

[   42.678670] process 1816 (hackbench) no longer affine to cpu2
[   42.678684] process 1817 (hackbench) no longer affine to cpu2
[   42.710502] ------------[ cut here ]------------
[   42.711505] rq->clock_update_flags < RQCF_ACT_SKIP
[   42.711514] WARNING: CPU: 2 PID: 19 at kernel/sched/sched.h:1141 update_curr+0xe3/0x3f0
[   42.714005] Modules linked in:
[   42.714582] CPU: 2 PID: 19 Comm: migration/2 Not tainted 5.9.0-rc5+ #107
[   42.715836] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
[   42.717367] RIP: 0010:update_curr+0xe3/0x3f0
[   42.718212] Code: 09 00 00 01 0f 87 6c ff ff ff 80 3d 52 bd 59 01 00 0f 85 5f ff ff ff 48 c7 c7 f8 7e 2b 82 c6 05 3e bd 59 01 01 e8 bc 9a fb ff <0f> 0b e9 45 ff ff ff 8b 05 d8 d4 59 01 48 8d 6b 80 8b 15 ba 5a 5b
[   42.721839] RSP: 0018:ffffc900000e3d60 EFLAGS: 00010086
[   42.722827] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[   42.724166] RDX: ffff888275fa4540 RSI: ffffffff810f37e6 RDI: ffffffff82463940
[   42.725547] RBP: ffff888276ca9600 R08: 0000000000000000 R09: 0000000000000026
[   42.726925] R10: 0000000000000046 R11: 0000000000000046 R12: ffff888276ca9580
[   42.728268] R13: ffff888276ca9580 R14: ffff888276ca9600 R15: ffff88826aa5c5c0
[   42.729659] FS:  0000000000000000(0000) GS:ffff888276c80000(0000) knlGS:0000000000000000
[   42.731186] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   42.732272] CR2: 00007f7952603f90 CR3: 000000026aa56000 CR4: 00000000003506e0
[   42.733657] Call Trace:
[   42.734136]  dequeue_task_fair+0xfa/0x6f0
[   42.734899]  do_set_cpus_allowed+0xbb/0x1c0
[   42.735692]  cpuset_cpus_allowed_fallback+0x73/0x1a0
[   42.736635]  select_fallback_rq+0x129/0x160
[   42.737450]  __balance_push_stop+0x132/0x230
[   42.738291]  ? migration_cpu_stop+0x1f0/0x1f0
[   42.739118]  cpu_stopper_thread+0x76/0x100
[   42.739897]  ? smpboot_thread_fn+0x21/0x1f0
[   42.740691]  smpboot_thread_fn+0x106/0x1f0
[   42.741487]  ? __smpboot_create_thread.part.0+0x100/0x100
[   42.742539]  kthread+0x135/0x150
[   42.743153]  ? kthread_create_worker_on_cpu+0x60/0x60
[   42.744106]  ret_from_fork+0x22/0x30
[   42.744792] irq event stamp: 100
[   42.745433] hardirqs last  enabled at (99): [<ffffffff81a8d46f>] _raw_spin_unlock_irq+0x1f/0x30
[   42.747116] hardirqs last disabled at (100): [<ffffffff81a8d27e>] _raw_spin_lock_irq+0x3e/0x40
[   42.748748] softirqs last  enabled at (0): [<ffffffff81077b75>] copy_process+0x665/0x1b00
[   42.750353] softirqs last disabled at (0): [<0000000000000000>] 0x0
[   42.751552] ---[ end trace d98bb30ad2616d58 ]---

That comes due to DEQUEUE_NOCLOCK in do_set_cpus_allowed(). And then
there is this:

[   42.752454] ======================================================
[   42.752455] WARNING: possible circular locking dependency detected
[   42.752455] 5.9.0-rc5+ #107 Not tainted
[   42.752455] ------------------------------------------------------
[   42.752456] migration/2/19 is trying to acquire lock:
[   42.752456] ffffffff824639f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xa/0x30
[   42.752458] but task is already holding lock:
[   42.752458] ffff888276ca9598 (&rq->lock){-.-.}-{2:2}, at: __balance_push_stop+0x50/0x230
[   42.752460] which lock already depends on the new lock.
[   42.752460] the existing dependency chain (in reverse order) is:
[   42.752461] -> #2 (&rq->lock){-.-.}-{2:2}:
[   42.752462]        _raw_spin_lock+0x27/0x40
[   42.752462]        task_fork_fair+0x30/0x1b0
[   42.752462]        sched_fork+0x10b/0x280
[   42.752463]        copy_process+0x702/0x1b00
[   42.752464]        _do_fork+0x5a/0x450
[   42.752464]        kernel_thread+0x50/0x70
[   42.752465]        rest_init+0x19/0x240
[   42.752465]        start_kernel+0x548/0x56a
[   42.752466]        secondary_startup_64+0xa4/0xb0

[   42.752467] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[   42.752468]        _raw_spin_lock_irqsave+0x36/0x50
[   42.752469]        try_to_wake_up+0x4e/0x720
[   42.752469]        up+0x3b/0x50
[   42.752470]        __up_console_sem+0x29/0x60
[   42.752470]        console_unlock+0x390/0x690
[   42.752471]        con_font_op+0x2ec/0x480
[   42.752471]        vt_ioctl+0x4d1/0x16e0
[   42.752471]        tty_ioctl+0x3e2/0x9a0
[   42.752472]        __x64_sys_ioctl+0x81/0x9a
[   42.752472]        do_syscall_64+0x33/0x40
[   42.752472]        entry_SYSCALL_64_after_hwframe+0x44/0xa9

[   42.752473] -> #0 ((console_sem).lock){-.-.}-{2:2}:
[   42.752474]        __lock_acquire+0x11af/0x2010
[   42.752474]        lock_acquire+0xb7/0x400
[   42.752474]        _raw_spin_lock_irqsave+0x36/0x50
[   42.752474]        down_trylock+0xa/0x30
[   42.752475]        __down_trylock_console_sem+0x23/0x90
[   42.752475]        vprintk_emit+0xf6/0x380
[   42.752476]        printk+0x53/0x6a
[   42.752476]        __warn_printk+0x42/0x84
[   42.752476]        update_curr+0xe3/0x3f0
[   42.752477]        dequeue_task_fair+0xfa/0x6f0
[   42.752477]        do_set_cpus_allowed+0xbb/0x1c0
[   42.752478]        cpuset_cpus_allowed_fallback+0x73/0x1a0
[   42.752479]        select_fallback_rq+0x129/0x160
[   42.752479]        __balance_push_stop+0x132/0x230
[   42.752480]        cpu_stopper_thread+0x76/0x100
[   42.752480]        smpboot_thread_fn+0x106/0x1f0
[   42.752480]        kthread+0x135/0x150
[   42.752480]        ret_from_fork+0x22/0x30

[   42.752481] other info that might help us debug this:
[   42.752482] Chain exists of:
[   42.752482]   (console_sem).lock --> &p->pi_lock --> &rq->lock
[   42.752483]  Possible unsafe locking scenario:
[   42.752484]        CPU0                    CPU1
[   42.752484]        ----                    ----
[   42.752484]   lock(&rq->lock);
[   42.752485]                                lock(&p->pi_lock);
[   42.752486]                                lock(&rq->lock);
[   42.752486]   lock((console_sem).lock);
[   42.752487]  *** DEADLOCK ***
[   42.752488] 3 locks held by migration/2/19:
[   42.752488]  #0: ffff88826aa5d0f8 (&p->pi_lock){-.-.}-{2:2}, at: __balance_push_stop+0x48/0x230
[   42.752489]  #1: ffff888276ca9598 (&rq->lock){-.-.}-{2:2}, at: __balance_push_stop+0x50/0x230
[   42.752490]  #2: ffffffff82478e00 (rcu_read_lock){....}-{1:2}, at: cpuset_cpus_allowed_fallback+0x0/0x1a0
[   42.752491] stack backtrace:
[   42.752492] CPU: 2 PID: 19 Comm: migration/2 Not tainted 5.9.0-rc5+ #107
[   42.752492] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
[   42.752492] Call Trace:
[   42.752493]  dump_stack+0x77/0xa0
[   42.752493]  check_noncircular+0x156/0x170
[   42.752493]  __lock_acquire+0x11af/0x2010
[   42.752493]  ? __lock_acquire+0x1692/0x2010
[   42.752493]  lock_acquire+0xb7/0x400
[   42.752494]  ? down_trylock+0xa/0x30
[   42.752494]  ? log_store.constprop.0+0x1a4/0x250
[   42.752494]  ? printk+0x53/0x6a
[   42.752494]  _raw_spin_lock_irqsave+0x36/0x50
[   42.752495]  ? down_trylock+0xa/0x30
[   42.752495]  down_trylock+0xa/0x30
[   42.752495]  ? printk+0x53/0x6a
[   42.752495]  __down_trylock_console_sem+0x23/0x90
[   42.752495]  vprintk_emit+0xf6/0x380
[   42.752496]  printk+0x53/0x6a
[   42.752496]  __warn_printk+0x42/0x84
[   42.752496]  update_curr+0xe3/0x3f0
[   42.752496]  dequeue_task_fair+0xfa/0x6f0
[   42.752497]  do_set_cpus_allowed+0xbb/0x1c0
[   42.752497]  cpuset_cpus_allowed_fallback+0x73/0x1a0
[   42.752497]  select_fallback_rq+0x129/0x160
[   42.752497]  __balance_push_stop+0x132/0x230
[   42.752497]  ? migration_cpu_stop+0x1f0/0x1f0
[   42.752498]  cpu_stopper_thread+0x76/0x100
[   42.752498]  ? smpboot_thread_fn+0x21/0x1f0
[   42.752498]  smpboot_thread_fn+0x106/0x1f0
[   42.752498]  ? __smpboot_create_thread.part.0+0x100/0x100
[   42.752499]  kthread+0x135/0x150
[   42.752499]  ? kthread_create_worker_on_cpu+0x60/0x60
[   42.752499]  ret_from_fork+0x22/0x30
[   42.878095] numa_remove_cpu cpu 2 node 0: mask now 0-1,3-7
[   42.879977] smpboot: CPU 2 is now offline

Sebastian


* Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-16 10:18   ` Sebastian Andrzej Siewior
@ 2020-09-16 12:10     ` peterz
  2020-09-16 13:58       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 13+ messages in thread
From: peterz @ 2020-09-16 12:10 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood, valentin.schneider

On Wed, Sep 16, 2020 at 12:18:45PM +0200, Sebastian Andrzej Siewior wrote:
> With this on top of -rc5 I get:
> 
> [   42.678670] process 1816 (hackbench) no longer affine to cpu2
> [   42.678684] process 1817 (hackbench) no longer affine to cpu2
> [   42.710502] ------------[ cut here ]------------
> [   42.711505] rq->clock_update_flags < RQCF_ACT_SKIP
> [   42.711514] WARNING: CPU: 2 PID: 19 at kernel/sched/sched.h:1141 update_curr+0xe3/0x3f0

> [   42.736635]  select_fallback_rq+0x129/0x160
> [   42.737450]  __balance_push_stop+0x132/0x230

Duh.. I wonder why I didn't see that.. anyway, I think the below version
ought to cure that.
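
For context, the *_NOCLOCK convention that warning enforces is roughly:
whoever holds rq->lock updates the clock once in that section before any
NOCLOCK path runs, e.g. (sketch):

	rq_lock(rq, &rf);
	update_rq_clock(rq);	/* once per rq->lock section */
	/* DEQUEUE_NOCLOCK / ENQUEUE_NOCLOCK users, such as the
	 * do_set_cpus_allowed() path above, may now skip the clock update. */
	rq_unlock(rq, &rf);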

> That comes due to DEQUEUE_NOCLOCK in do_set_cpus_allowed(). And then
> there is this:
> 
> [   42.752454] ======================================================
> [   42.752455] WARNING: possible circular locking dependency detected
> [   42.752455] 5.9.0-rc5+ #107 Not tainted
> [   42.752455] ------------------------------------------------------
> [   42.752456] migration/2/19 is trying to acquire lock:
> [   42.752456] ffffffff824639f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xa/0x30
> [   42.752458] but task is already holding lock:
> [   42.752458] ffff888276ca9598 (&rq->lock){-.-.}-{2:2}, at: __balance_push_stop+0x50/0x230
> [   42.752460] which lock already depends on the new lock.

Yeah, that's the known issue with printk()...


---
Subject: sched/hotplug: Ensure only per-cpu kthreads run during hotplug
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri Sep 11 09:54:27 CEST 2020

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is run (as one of the very first steps) from the cpu-hotplug
task which is a per-cpu kthread and completion of the hotplug
operation only requires such tasks.

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the unlikely(rq->balance_callbacks) test at the tail of
context_switch with an unlikely(rq->balance_work); the fast path is
not affected.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  114 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    5 ++
 2 files changed, 117 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3513,8 +3513,10 @@ static inline struct callback_head *spli
 	struct callback_head *head = rq->balance_callback;
 
 	lockdep_assert_held(&rq->lock);
-	if (head)
+	if (head) {
 		rq->balance_callback = NULL;
+		rq->balance_flags &= ~BALANCE_WORK;
+	}
 
 	return head;
 }
@@ -3535,6 +3537,22 @@ static inline void balance_callbacks(str
 	}
 }
 
+static bool balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+	if (unlikely(rq->balance_flags)) {
+		/*
+		 * Run the balance_callbacks, except on hotplug
+		 * when we need to push the current task away.
+		 */
+		if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+		    !(rq->balance_flags & BALANCE_PUSH) ||
+		    !balance_push(rq))
+			__balance_callbacks(rq);
+	}
+}
+
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
@@ -3550,6 +3568,10 @@ static inline void balance_callbacks(str
 {
 }
 
+static inline void balance_switch(struct rq *rq)
+{
+}
+
 #endif
 
 static inline void
@@ -3577,7 +3599,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	balance_switch(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6836,6 +6858,89 @@ static void migrate_tasks(struct rq *dea
 
 	rq->stop = stop;
 }
+
+static int __balance_push_cpu_stop(void *arg)
+{
+	struct task_struct *p = arg;
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+	int cpu;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	rq_lock(rq, &rf);
+
+	update_rq_clock();
+
+	if (task_rq(p) == rq && task_on_rq_queued(p)) {
+		cpu = select_fallback_rq(rq->cpu, p);
+		rq = __migrate_task(rq, &rf, p, cpu);
+	}
+
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static bool balance_push(struct rq *rq)
+{
+	struct task_struct *push_task = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+	/*
+	 * Both the cpu-hotplug and stop task are in this case and are
+	 * required to complete the hotplug process.
+	 */
+	if (is_per_cpu_kthread(push_task))
+		return false;
+
+	get_task_struct(push_task);
+	/*
+	 * Temporarily drop rq->lock such that we can wake-up the stop task.
+	 * Both preemption and IRQs are still disabled.
+	 */
+	raw_spin_unlock(&rq->lock);
+	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+			    this_cpu_ptr(&push_work));
+	/*
+	 * At this point need_resched() is true and we'll take the loop in
+	 * schedule(). The next pick is obviously going to be the stop task
+	 * which is_per_cpu_kthread() and will push this task away.
+	 */
+	raw_spin_lock(&rq->lock);
+
+	return true;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (on)
+		rq->balance_flags |= BALANCE_PUSH;
+	else
+		rq->balance_flags &= ~BALANCE_PUSH;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline bool balance_push(struct rq *rq)
+{
+	return false;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6921,6 +7026,8 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -6968,6 +7075,8 @@ int sched_cpu_deactivate(unsigned int cp
 	 */
 	synchronize_rcu();
 
+	balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -6981,6 +7090,7 @@ int sched_cpu_deactivate(unsigned int cp
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
+		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_flags;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
@@ -1384,6 +1385,9 @@ init_numa_balancing(unsigned long clone_
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK	0x01
+#define BALANCE_PUSH	0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1397,6 +1401,7 @@ queue_balance_callback(struct rq *rq,
 	head->func = (void (*)(struct callback_head *))func;
 	head->next = rq->balance_callback;
 	rq->balance_callback = head;
+	rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \


* Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-16 12:10     ` peterz
@ 2020-09-16 13:58       ` Sebastian Andrzej Siewior
  2020-09-16 14:07         ` peterz
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-09-16 13:58 UTC (permalink / raw)
  To: peterz
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood, valentin.schneider

On 2020-09-16 14:10:20 [+0200], peterz@infradead.org wrote:

squeeze that in please:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a4fe22b8b8418..bed3cd28af578 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6866,7 +6866,7 @@ static int __balance_push_cpu_stop(void *arg)
 	raw_spin_lock_irq(&p->pi_lock);
 	rq_lock(rq, &rf);
 
-	update_rq_clock();
+	update_rq_clock(rq);
 
 	if (task_rq(p) == rq && task_on_rq_queued(p)) {
 		cpu = select_fallback_rq(rq->cpu, p);


and count me in :)

Sebastian


* Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-16 13:58       ` Sebastian Andrzej Siewior
@ 2020-09-16 14:07         ` peterz
  0 siblings, 0 replies; 13+ messages in thread
From: peterz @ 2020-09-16 14:07 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: mingo, vincent.guittot, tglx, linux-kernel, dietmar.eggemann,
	rostedt, bristot, swood, valentin.schneider

On Wed, Sep 16, 2020 at 03:58:17PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-09-16 14:10:20 [+0200], peterz@infradead.org wrote:
> 
> squeeze that in please:
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a4fe22b8b8418..bed3cd28af578 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6866,7 +6866,7 @@ static int __balance_push_cpu_stop(void *arg)
>  	raw_spin_lock_irq(&p->pi_lock);
>  	rq_lock(rq, &rf);
>  
> -	update_rq_clock();
> +	update_rq_clock(rq);
>  
>  	if (task_rq(p) == rq && task_on_rq_queued(p)) {
>  		cpu = select_fallback_rq(rq->cpu, p);
> 
> 
> and count me in :)

Duh... /me goes in search of the brown paper bag -- again!
