linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 00/19] sched: Migrate disable support
@ 2020-10-23 10:11 Peter Zijlstra
  2020-10-23 10:11 ` [PATCH v4 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
                   ` (20 more replies)
  0 siblings, 21 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:11 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Hi,

The fourth version of migrate_disable() for PREEMPT_RT.

Two changes since last time:

 - fixes !SMP builds (bigeasy)
 - TLA+ validation of migrate_disable() vs sched_setaffinity() (valsch)

The latter in particular resulted in significant changes to patch #10. Huge
thanks to Valentin.

---
 fs/proc/array.c               |    4 
 include/linux/cpuhotplug.h    |    1 
 include/linux/cpumask.h       |    6 
 include/linux/preempt.h       |   69 +++
 include/linux/sched.h         |    5 
 include/linux/sched/hotplug.h |    2 
 include/linux/stop_machine.h  |    5 
 include/trace/events/sched.h  |   12 
 kernel/cpu.c                  |    9 
 kernel/sched/core.c           |  925 +++++++++++++++++++++++++++++++++---------
 kernel/sched/cpudeadline.c    |    4 
 kernel/sched/cpupri.c         |    4 
 kernel/sched/deadline.c       |   47 +-
 kernel/sched/rt.c             |   81 ++-
 kernel/sched/sched.h          |   59 ++
 kernel/stop_machine.c         |   23 -
 kernel/workqueue.c            |    4 
 lib/cpumask.c                 |   18 
 lib/dump_stack.c              |    2 
 lib/smp_processor_id.c        |    5 
 20 files changed, 1056 insertions(+), 229 deletions(-)


^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 01/19] stop_machine: Add function and caller debug info
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
@ 2020-10-23 10:11 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 02/19] sched: Fix balance_callback() Peter Zijlstra
                   ` (19 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:11 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Crashes in stop-machine are hard to connect to the calling code; record the
stopper function and its caller, and print both from dump_stack_print_info()
to help with that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/stop_machine.h |    5 +++++
 kernel/stop_machine.c        |   23 ++++++++++++++++++++---
 lib/dump_stack.c             |    2 ++
 3 files changed, 27 insertions(+), 3 deletions(-)

--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
+	unsigned long		caller;
 	void			*arg;
 	struct cpu_stop_done	*done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
 
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else	/* CONFIG_SMP */
 
 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(u
 	return false;
 }
 
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif	/* CONFIG_SMP */
 
 /*
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -42,11 +42,23 @@ struct cpu_stopper {
 	struct list_head	works;		/* list of pending works */
 
 	struct cpu_stop_work	stop_work;	/* for stop_cpus */
+	unsigned long		caller;
+	cpu_stop_fn_t		fn;
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
 static bool stop_machine_initialized = false;
 
+void print_stop_info(const char *log_lvl, struct task_struct *task)
+{
+	struct cpu_stopper *stopper = this_cpu_ptr(&cpu_stopper);
+
+	if (task != stopper->thread)
+		return;
+
+	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
+}
+
 /* static data for stop_cpus */
 static DEFINE_MUTEX(stop_cpus_mutex);
 static bool stop_cpus_in_progress;
@@ -123,7 +135,7 @@ static bool cpu_stop_queue_work(unsigned
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 {
 	struct cpu_stop_done done;
-	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
 
 	cpu_stop_init_done(&done, 1);
 	if (!cpu_stop_queue_work(cpu, &work))
@@ -331,7 +343,8 @@ int stop_two_cpus(unsigned int cpu1, uns
 	work1 = work2 = (struct cpu_stop_work){
 		.fn = multi_cpu_stop,
 		.arg = &msdata,
-		.done = &done
+		.done = &done,
+		.caller = _RET_IP_,
 	};
 
 	cpu_stop_init_done(&done, 2);
@@ -367,7 +380,7 @@ int stop_two_cpus(unsigned int cpu1, uns
 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			struct cpu_stop_work *work_buf)
 {
-	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
 	return cpu_stop_queue_work(cpu, work_buf);
 }
 
@@ -487,6 +500,8 @@ static void cpu_stopper_thread(unsigned
 		int ret;
 
 		/* cpu stop callbacks must not sleep, make in_atomic() == T */
+		stopper->caller = work->caller;
+		stopper->fn = fn;
 		preempt_count_inc();
 		ret = fn(arg);
 		if (done) {
@@ -495,6 +510,8 @@ static void cpu_stopper_thread(unsigned
 			cpu_stop_signal_done(done);
 		}
 		preempt_count_dec();
+		stopper->fn = NULL;
+		stopper->caller = 0;
 		WARN_ONCE(preempt_count(),
 			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
 		goto repeat;
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -12,6 +12,7 @@
 #include <linux/atomic.h>
 #include <linux/kexec.h>
 #include <linux/utsname.h>
+#include <linux/stop_machine.h>
 
 static char dump_stack_arch_desc_str[128];
 
@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *l
 		       log_lvl, dump_stack_arch_desc_str);
 
 	print_worker_info(log_lvl, current);
+	print_stop_info(log_lvl, current);
 }
 
 /**



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 02/19] sched: Fix balance_callback()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
  2020-10-23 10:11 ` [PATCH v4 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
                   ` (18 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme;
balance_callback() was called after rq->lock was dropped, which means
another CPU can interleave and touch the callback list.

Rework code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.

Reported-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  119 ++++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    3 +
 2 files changed, 78 insertions(+), 44 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3494,6 +3494,69 @@ static inline void finish_task(struct ta
 #endif
 }
 
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	void (*func)(struct rq *rq);
+	struct callback_head *next;
+
+	lockdep_assert_held(&rq->lock);
+
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+
+		func(rq);
+	}
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	struct callback_head *head = rq->balance_callback;
+
+	lockdep_assert_held(&rq->lock);
+	if (head)
+		rq->balance_callback = NULL;
+
+	return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+	do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	unsigned long flags;
+
+	if (unlikely(head)) {
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		do_balance_callbacks(rq, head);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -3519,6 +3582,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+	__balance_callbacks(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -3660,43 +3724,6 @@ static struct rq *finish_task_switch(str
 	return rq;
 }
 
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
-	struct callback_head *head, *next;
-	void (*func)(struct rq *rq);
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	head = rq->balance_callback;
-	rq->balance_callback = NULL;
-	while (head) {
-		func = (void (*)(struct rq *))head->func;
-		next = head->next;
-		head->next = NULL;
-		head = next;
-
-		func(rq);
-	}
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
-	if (unlikely(rq->balance_callback))
-		__balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -3716,7 +3743,6 @@ asmlinkage __visible void schedule_tail(
 	 */
 
 	rq = finish_task_switch(prev);
-	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -4532,10 +4558,11 @@ static void __sched notrace __schedule(b
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-		rq_unlock_irq(rq, &rf);
-	}
 
-	balance_callback(rq);
+		rq_unpin_lock(rq, &rf);
+		__balance_callbacks(rq);
+		raw_spin_unlock_irq(&rq->lock);
+	}
 }
 
 void __noreturn do_task_dead(void)
@@ -4946,9 +4973,11 @@ void rt_mutex_setprio(struct task_struct
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
-	__task_rq_unlock(rq, &rf);
 
-	balance_callback(rq);
+	rq_unpin_lock(rq, &rf);
+	__balance_callbacks(rq);
+	raw_spin_unlock(&rq->lock);
+
 	preempt_enable();
 }
 #else
@@ -5222,6 +5251,7 @@ static int __sched_setscheduler(struct t
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int new_effective_prio, policy = attr->sched_policy;
 	const struct sched_class *prev_class;
+	struct callback_head *head;
 	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5460,6 +5490,7 @@ static int __sched_setscheduler(struct t
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
+	head = splice_balance_callbacks(rq);
 	task_rq_unlock(rq, p, &rf);
 
 	if (pi) {
@@ -5468,7 +5499,7 @@ static int __sched_setscheduler(struct t
 	}
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
-	balance_callback(rq);
+	balance_callbacks(rq, head);
 	preempt_enable();
 
 	return 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1221,6 +1221,9 @@ static inline void rq_pin_lock(struct rq
 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 	rf->clock_update_flags = 0;
 #endif
+#ifdef CONFIG_SMP
+	SCHED_WARN_ON(rq->balance_callback);
+#endif
 }
 
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
  2020-10-23 10:11 ` [PATCH v4 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 02/19] sched: Fix balance_callback() Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 04/19] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
                   ` (17 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is run (as one of the very first steps) from the cpu-hotplug
task, which is a per-cpu kthread, and completion of the hotplug
operation only requires such tasks.

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the rq->balance_callback test at the tail of
context_switch() with a likely(!rq->balance_flags) test; the fast path is
not affected.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  114 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    7 ++-
 2 files changed, 118 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3509,8 +3509,10 @@ static inline struct callback_head *spli
 	struct callback_head *head = rq->balance_callback;
 
 	lockdep_assert_held(&rq->lock);
-	if (head)
+	if (head) {
 		rq->balance_callback = NULL;
+		rq->balance_flags &= ~BALANCE_WORK;
+	}
 
 	return head;
 }
@@ -3531,6 +3533,21 @@ static inline void balance_callbacks(str
 	}
 }
 
+static void balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+	if (likely(!rq->balance_flags))
+		return;
+
+	if (rq->balance_flags & BALANCE_PUSH) {
+		balance_push(rq);
+		return;
+	}
+
+	__balance_callbacks(rq);
+}
+
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
@@ -3546,6 +3563,10 @@ static inline void balance_callbacks(str
 {
 }
 
+static inline void balance_switch(struct rq *rq)
+{
+}
+
 #endif
 
 static inline void
@@ -3573,7 +3594,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	balance_switch(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6831,6 +6852,90 @@ static void migrate_tasks(struct rq *dea
 
 	rq->stop = stop;
 }
+
+static int __balance_push_cpu_stop(void *arg)
+{
+	struct task_struct *p = arg;
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+	int cpu;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	rq_lock(rq, &rf);
+
+	update_rq_clock(rq);
+
+	if (task_rq(p) == rq && task_on_rq_queued(p)) {
+		cpu = select_fallback_rq(rq->cpu, p);
+		rq = __migrate_task(rq, &rf, p, cpu);
+	}
+
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static void balance_push(struct rq *rq)
+{
+	struct task_struct *push_task = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+	/*
+	 * Both the cpu-hotplug and stop task are in this case and are
+	 * required to complete the hotplug process.
+	 */
+	if (is_per_cpu_kthread(push_task))
+		return;
+
+	get_task_struct(push_task);
+	/*
+	 * Temporarily drop rq->lock such that we can wake-up the stop task.
+	 * Both preemption and IRQs are still disabled.
+	 */
+	raw_spin_unlock(&rq->lock);
+	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+			    this_cpu_ptr(&push_work));
+	/*
+	 * At this point need_resched() is true and we'll take the loop in
+	 * schedule(). The next pick is obviously going to be the stop task
+	 * which is_per_cpu_kthread() and will push this task away.
+	 */
+	raw_spin_lock(&rq->lock);
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (on)
+		rq->balance_flags |= BALANCE_PUSH;
+	else
+		rq->balance_flags &= ~BALANCE_PUSH;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline void balance_push(struct rq *rq)
+{
+}
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6916,6 +7021,8 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -6963,6 +7070,8 @@ int sched_cpu_deactivate(unsigned int cp
 	 */
 	synchronize_rcu();
 
+	balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -6976,6 +7085,7 @@ int sched_cpu_deactivate(unsigned int cp
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
+		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_flags;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
@@ -1385,6 +1386,9 @@ init_numa_balancing(unsigned long clone_
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK	0x01
+#define BALANCE_PUSH	0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1392,12 +1396,13 @@ queue_balance_callback(struct rq *rq,
 {
 	lockdep_assert_held(&rq->lock);
 
-	if (unlikely(head->next))
+	if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
 		return;
 
 	head->func = (void (*)(struct callback_head *))func;
 	head->next = rq->balance_callback;
 	rq->balance_callback = head;
+	rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 04/19] sched/core: Wait for tasks being pushed away on hotplug
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Thomas Gleixner
  2020-10-23 10:12 ` [PATCH v4 05/19] workqueue: Manually break affinity " Peter Zijlstra
                   ` (16 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Thomas Gleixner <tglx@linutronix.de>

RT kernels need to ensure that all tasks which are not per CPU kthreads
have left the outgoing CPU to guarantee that no tasks are force migrated
within a migrate disabled section.

There is also some desire to (ab)use fine grained CPU hotplug control to
clear a CPU from active state to force migrate tasks which are not per CPU
kthreads away for power control purposes.

Add a mechanism which waits until all tasks which should leave the CPU
after the CPU active flag is cleared have moved to a different online CPU.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   40 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    4 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6894,8 +6894,21 @@ static void balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task))
+	if (is_per_cpu_kthread(push_task)) {
+		/*
+		 * If this is the idle task on the outgoing CPU try to wake
+		 * up the hotplug control thread which might wait for the
+		 * last task to vanish. The rcuwait_active() check is
+		 * accurate here because the waiter is pinned on this CPU
+		 * and can't obviously be running in parallel.
+		 */
+		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+			raw_spin_unlock(&rq->lock);
+			rcuwait_wake_up(&rq->hotplug_wait);
+			raw_spin_lock(&rq->lock);
+		}
 		return;
+	}
 
 	get_task_struct(push_task);
 	/*
@@ -6926,6 +6939,20 @@ static void balance_push_set(int cpu, bo
 	rq_unlock_irqrestore(rq, &rf);
 }
 
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+	struct rq *rq = this_rq();
+
+	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+			   TASK_UNINTERRUPTIBLE);
+}
+
 #else
 
 static inline void balance_push(struct rq *rq)
@@ -6936,6 +6963,10 @@ static inline void balance_push_set(int
 {
 }
 
+static inline void balance_hotplug_wait(void)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -7090,6 +7121,10 @@ int sched_cpu_deactivate(unsigned int cp
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+
+	/* Wait for all non per CPU kernel threads to vanish. */
+	balance_hotplug_wait();
+
 	return 0;
 }
 
@@ -7330,6 +7365,9 @@ void __init sched_init(void)
 
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+		rcuwait_init(&rq->hotplug_wait);
+#endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1004,6 +1004,10 @@ struct rq {
 
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	struct rcuwait		hotplug_wait;
+#endif
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 05/19] workqueue: Manually break affinity on hotplug
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 04/19] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 06/19] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
                   ` (15 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Don't rely on the scheduler to force break affinity for us -- it will
stop doing that for per-cpu-kthreads.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c |    4 ++++
 1 file changed, 4 insertions(+)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4905,6 +4905,10 @@ static void unbind_workers(int cpu)
 		pool->flags |= POOL_DISASSOCIATED;
 
 		raw_spin_unlock_irq(&pool->lock);
+
+		for_each_pool_worker(worker, pool)
+			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+
 		mutex_unlock(&wq_pool_attach_mutex);
 
 		/*



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 06/19] sched/hotplug: Consolidate task migration on CPU unplug
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (4 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 05/19] workqueue: Manually break affinity " Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Thomas Gleixner
  2020-10-23 10:12 ` [PATCH v4 07/19] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
                   ` (14 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Thomas Gleixner <tglx@linutronix.de>

With the new mechanism which kicks tasks off the outgoing CPU at the end of
schedule() the situation on an outgoing CPU right before the stopper thread
brings it down completely is:

 - All user tasks and all unbound kernel threads have either been migrated
   away or are not running, and the next wakeup will move them to an online CPU.

 - All per CPU kernel threads, except the CPU hotplug thread and the stopper
   thread, have either been unbound or parked by the responsible CPU hotplug
   callback.

That means that at the last step before the stopper thread is invoked the
cpu hotplug thread is the last legitimate running task on the outgoing
CPU.

Add a final wait step right before the stopper thread is kicked which
ensures that any still running tasks on the way to park or on the way to
kick themselves off the CPU are either sleeping or gone.

This allows removing the migrate_tasks() crutch in sched_cpu_dying(). If
sched_cpu_dying() detects that there is still another running task aside from
the stopper thread then it will explode with the appropriate fireworks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpuhotplug.h    |    1 
 include/linux/sched/hotplug.h |    2 
 kernel/cpu.c                  |    9 ++
 kernel/sched/core.c           |  154 +++++++++---------------------------------
 4 files changed, 46 insertions(+), 120 deletions(-)

--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -151,6 +151,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned i
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty	NULL
 # define sched_cpu_dying	NULL
 #endif
 
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states
 		.name			= "ap:online",
 	},
 	/*
-	 * Handled on controll processor until the plugged processor manages
+	 * Handled on control processor until the plugged processor manages
 	 * this itself.
 	 */
 	[CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states
 		.teardown.single	= takedown_cpu,
 		.cant_stop		= true,
 	},
+
+	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
+		.name			= "sched:waitempty",
+		.startup.single		= NULL,
+		.teardown.single	= sched_cpu_wait_empty,
+	},
+
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6759,120 +6759,6 @@ void idle_task_exit(void)
 	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
-	long delta = calc_load_fold_active(rq, 1);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-}
-
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
-	const struct sched_class *class;
-	struct task_struct *next;
-
-	for_each_class(class) {
-		next = class->pick_next_task(rq);
-		if (next) {
-			next->sched_class->put_prev_task(rq, next);
-			return next;
-		}
-	}
-
-	/* The idle class should always have a runnable task */
-	BUG();
-}
-
-/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
-{
-	struct rq *rq = dead_rq;
-	struct task_struct *next, *stop = rq->stop;
-	struct rq_flags orf = *rf;
-	int dest_cpu;
-
-	/*
-	 * Fudge the rq selection such that the below task selection loop
-	 * doesn't get stuck on the currently eligible stop task.
-	 *
-	 * We're currently inside stop_machine() and the rq is either stuck
-	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
-	 * either way we should never end up calling schedule() until we're
-	 * done here.
-	 */
-	rq->stop = NULL;
-
-	/*
-	 * put_prev_task() and pick_next_task() sched
-	 * class method both need to have an up-to-date
-	 * value of rq->clock[_task]
-	 */
-	update_rq_clock(rq);
-
-	for (;;) {
-		/*
-		 * There's this thread running, bail when that's the only
-		 * remaining thread:
-		 */
-		if (rq->nr_running == 1)
-			break;
-
-		next = __pick_migrate_task(rq);
-
-		/*
-		 * Rules for changing task_struct::cpus_mask are holding
-		 * both pi_lock and rq->lock, such that holding either
-		 * stabilizes the mask.
-		 *
-		 * Drop rq->lock is not quite as disastrous as it usually is
-		 * because !cpu_active at this point, which means load-balance
-		 * will not interfere. Also, stop-machine.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&next->pi_lock);
-		rq_relock(rq, rf);
-
-		/*
-		 * Since we're inside stop-machine, _nothing_ should have
-		 * changed the task, WARN if weird stuff happened, because in
-		 * that case the above rq->lock drop is a fail too.
-		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
-			raw_spin_unlock(&next->pi_lock);
-			continue;
-		}
-
-		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-		rq = __migrate_task(rq, rf, next, dest_cpu);
-		if (rq != dead_rq) {
-			rq_unlock(rq, rf);
-			rq = dead_rq;
-			*rf = orf;
-			rq_relock(rq, rf);
-		}
-		raw_spin_unlock(&next->pi_lock);
-	}
-
-	rq->stop = stop;
-}
-
 static int __balance_push_cpu_stop(void *arg)
 {
 	struct task_struct *p = arg;
@@ -7128,10 +7014,6 @@ int sched_cpu_deactivate(unsigned int cp
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
-
-	/* Wait for all non per CPU kernel threads to vanish. */
-	balance_hotplug_wait();
-
 	return 0;
 }
 
@@ -7151,6 +7033,41 @@ int sched_cpu_starting(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do its job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+	balance_hotplug_wait();
+	return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+	long delta = calc_load_fold_active(rq, 1);
+
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7164,7 +7081,6 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 07/19] sched: Fix hotplug vs CPU bandwidth control
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (5 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 06/19] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 08/19] sched: Massage set_cpus_allowed() Peter Zijlstra
                   ` (13 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Since we now migrate tasks away before DYING, we should also move
bandwidth unthrottle, otherwise we can gain tasks from unthrottle
after we expect all tasks to be gone already.

Also; it looks like the RT balancers don't respect cpu_active() and
instead rely on rq->online in part, complete this. This too requires
we do set_rq_offline() earlier to match the cpu_active() semantics.
(The bigger patch is to convert RT to cpu_active() entirely)

Since set_rq_online() is called from sched_cpu_activate(), place
set_rq_offline() in sched_cpu_deactivate().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   14 ++++++++++----
 kernel/sched/deadline.c |    5 +----
 kernel/sched/rt.c       |    5 +----
 3 files changed, 12 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6979,6 +6979,8 @@ int sched_cpu_activate(unsigned int cpu)
 
 int sched_cpu_deactivate(unsigned int cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
 	int ret;
 
 	set_cpu_active(cpu, false);
@@ -6993,6 +6995,14 @@ int sched_cpu_deactivate(unsigned int cp
 
 	balance_push_set(cpu, true);
 
+	rq_lock_irqsave(rq, &rf);
+	if (rq->rd) {
+		update_rq_clock(rq);
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+	rq_unlock_irqrestore(rq, &rf);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -7074,10 +7084,6 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	if (rq->rd) {
-		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-		set_rq_offline(rq);
-	}
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -543,7 +543,7 @@ static int push_dl_task(struct rq *rq);
 
 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 {
-	return dl_task(prev);
+	return rq->online && dl_task(prev);
 }
 
 static DEFINE_PER_CPU(struct callback_head, dl_push_head);
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	return rq->online && rq->rt.highest_prio.curr > prev->prio;
 }
 
 static inline int rt_overloaded(struct rq *rq)



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 08/19] sched: Massage set_cpus_allowed()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (6 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 07/19] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 09/19] sched: Add migrate_disable() Peter Zijlstra
                   ` (12 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Thread a u32 flags word through the *set_cpus_allowed*() callchain.
This will allow adding behavioural tweaks for future users.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   28 ++++++++++++++++++----------
 kernel/sched/deadline.c |    5 +++--
 kernel/sched/sched.h    |    7 +++++--
 3 files changed, 26 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1828,13 +1828,14 @@ static int migration_cpu_stop(void *data
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
  */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	struct rq *rq = task_rq(p);
 	bool queued, running;
@@ -1855,7 +1856,7 @@ void do_set_cpus_allowed(struct task_str
 	if (running)
 		put_prev_task(rq, p);
 
-	p->sched_class->set_cpus_allowed(p, new_mask);
+	p->sched_class->set_cpus_allowed(p, new_mask, flags);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1863,6 +1864,11 @@ void do_set_cpus_allowed(struct task_str
 		set_next_task(rq, p);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask, 0);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1873,7 +1879,8 @@ void do_set_cpus_allowed(struct task_str
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask, bool check)
+				  const struct cpumask *new_mask,
+				  u32 flags)
 {
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	unsigned int dest_cpu;
@@ -1895,7 +1902,7 @@ static int __set_cpus_allowed_ptr(struct
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
 	 */
-	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1914,7 +1921,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	do_set_cpus_allowed(p, new_mask);
+	__do_set_cpus_allowed(p, new_mask, flags);
 
 	if (p->flags & PF_KTHREAD) {
 		/*
@@ -1951,7 +1958,7 @@ static int __set_cpus_allowed_ptr(struct
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	return __set_cpus_allowed_ptr(p, new_mask, false);
+	return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
@@ -2410,7 +2417,8 @@ void sched_set_stop_task(int cpu, struct
 #else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-					 const struct cpumask *new_mask, bool check)
+					 const struct cpumask *new_mask,
+					 u32 flags)
 {
 	return set_cpus_allowed_ptr(p, new_mask);
 }
@@ -6025,7 +6033,7 @@ long sched_setaffinity(pid_t pid, const
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -6608,7 +6616,7 @@ void init_idle(struct task_struct *idle,
 	 *
 	 * And since this is boot we can forgo the serialization.
 	 */
-	set_cpus_allowed_common(idle, cpumask_of(cpu));
+	set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
 #endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2280,7 +2280,8 @@ static void task_woken_dl(struct rq *rq,
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
-				const struct cpumask *new_mask)
+				const struct cpumask *new_mask,
+				u32 flags)
 {
 	struct root_domain *src_rd;
 	struct rq *rq;
@@ -2309,7 +2310,7 @@ static void set_cpus_allowed_dl(struct t
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	set_cpus_allowed_common(p, new_mask);
+	set_cpus_allowed_common(p, new_mask, flags);
 }
 
 /* Assumes rq->lock is held */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1806,7 +1806,8 @@ struct sched_class {
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
-				 const struct cpumask *newmask);
+				 const struct cpumask *newmask,
+				 u32 flags);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
@@ -1899,7 +1900,9 @@ extern void update_group_capacity(struct
 
 extern void trigger_load_balance(struct rq *rq);
 
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+#define SCA_CHECK		0x01
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
 



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 09/19] sched: Add migrate_disable()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (7 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 08/19] sched: Massage set_cpus_allowed() Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
                   ` (11 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Add the base migrate_disable() support (under protest).

While migrate_disable() is (currently) required for PREEMPT_RT, it is
also one of the biggest flaws in the system.

Notably this is just the base implementation, it is broken vs
sched_setaffinity() and hotplug, both solved in additional patches for
ease of review.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   65 +++++++++++++++++++++++++++
 include/linux/sched.h   |    3 +
 kernel/sched/core.c     |  112 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |    6 +-
 lib/smp_processor_id.c  |    5 ++
 5 files changed, 183 insertions(+), 8 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,6 +322,69 @@ static inline void preempt_notifier_init
 
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+/*
+ * Migrate-Disable and why it is (strongly) undesired.
+ *
+ * The premise of the Real-Time schedulers we have on Linux
+ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+ * concurrently, provided there are sufficient runnable tasks, also known as
+ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+ *
+ * The correctness of various scheduling models depends on this, but it is
+ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+ * preempt_disable() implies an immediate priority ceiling, preemptible
+ * migrate_disable() allows nesting.
+ *
+ * The worst case is that all tasks preempt one another in a migrate_disable()
+ * region and stack on a single CPU. This then reduces the available bandwidth
+ * to a single CPU. And since Real-Time schedulability theory considers the
+ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+ * (instantly solving the SMP analysis problem).
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migrate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on arbitrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depend on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
+ *
+ */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#elif defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+
+#else /* !CONFIG_PREEMPT_RT */
+
 /**
  * migrate_disable - Prevent migration of the current task
  *
@@ -352,4 +415,6 @@ static __always_inline void migrate_enab
 	preempt_enable();
 }
 
+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
+
 #endif /* __LINUX_PREEMPT_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -714,6 +714,9 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	int				migration_disabled;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1696,6 +1696,61 @@ void check_preempt_curr(struct rq *rq, s
 
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_PREEMPT_RT
+
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask,
+				  u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+	if (likely(!p->migration_disabled))
+		return;
+
+	if (p->cpus_ptr != &p->cpus_mask)
+		return;
+
+	/*
+	 * Violates locking rules! see comment in __do_set_cpus_allowed().
+	 */
+	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+	if (current->migration_disabled++)
+		return;
+
+	barrier();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+	struct task_struct *p = current;
+
+	if (--p->migration_disabled)
+		return;
+
+	barrier();
+
+	if (p->cpus_ptr == &p->cpus_mask)
+		return;
+
+	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return p->migration_disabled;
+}
+
+#endif
+
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1705,7 +1760,7 @@ static inline bool is_cpu_allowed(struct
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1826,6 +1881,11 @@ static int migration_cpu_stop(void *data
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
+	if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+		p->cpus_ptr = new_mask;
+		return;
+	}
+
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
@@ -1836,7 +1896,22 @@ __do_set_cpus_allowed(struct task_struct
 	struct rq *rq = task_rq(p);
 	bool queued, running;
 
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * This here violates the locking rules for affinity, since we're only
+	 * supposed to change these variables while holding both rq->lock and
+	 * p->pi_lock.
+	 *
+	 * HOWEVER, it magically works, because ttwu() is the only code that
+	 * accesses these variables under p->pi_lock and only does so after
+	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+	 * before finish_task().
+	 *
+	 * XXX do further audits, this smells like something putrid.
+	 */
+	if (flags & SCA_MIGRATE_DISABLE)
+		SCHED_WARN_ON(!p->on_cpu);
+	else
+		lockdep_assert_held(&p->pi_lock);
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
@@ -1887,9 +1962,14 @@ static int __set_cpus_allowed_ptr(struct
 	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD) {
+	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs
+		 * Kernel threads are allowed on online && !active CPUs.
+		 *
+		 * Specifically, migration_disabled() tasks must not fail the
+		 * cpumask_any_and_distribute() pick below, esp. so on
+		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
+		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
 		 */
 		cpu_valid_mask = cpu_online_mask;
 	}
@@ -1903,7 +1983,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_mask, new_mask))
+	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
 	/*
@@ -1995,6 +2075,8 @@ void set_task_cpu(struct task_struct *p,
 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
 	 */
 	WARN_ON_ONCE(!cpu_online(new_cpu));
+
+	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2325,6 +2407,12 @@ static int select_fallback_rq(int cpu, s
 			}
 			fallthrough;
 		case possible:
+			/*
+			 * XXX When called from select_task_rq() we only
+			 * hold p->pi_lock and again violate locking order.
+			 *
+			 * More yuck to audit.
+			 */
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
 			break;
@@ -2359,7 +2447,7 @@ int select_task_rq(struct task_struct *p
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1)
+	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(p->cpus_ptr);
@@ -2421,6 +2509,17 @@ static inline int __set_cpus_allowed_ptr
 
 #endif /* CONFIG_SMP */
 
+#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return false;
+}
+
+#endif
+
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
@@ -4570,6 +4669,7 @@ static void __sched notrace __schedule(b
 		 */
 		++*switch_count;
 
+		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1902,14 +1902,16 @@ static inline bool sched_fair_runnable(s
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_next_task_idle(struct rq *rq);
 
+#define SCA_CHECK		0x01
+#define SCA_MIGRATE_DISABLE	0x02
+#define SCA_MIGRATE_ENABLE	0x04
+
 #ifdef CONFIG_SMP
 
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
 
-#define SCA_CHECK		0x01
-
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(c
 	if (current->nr_cpus_allowed == 1)
 		goto out;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	if (current->migration_disabled)
+		goto out;
+#endif
+
 	/*
 	 * It is valid to assume CPU-locality during early bootup:
 	 */



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (8 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 09/19] sched: Add migrate_disable() Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-11-12 16:38   ` [PATCH v4 10/19] " Qian Cai
  2020-10-23 10:12 ` [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
                   ` (10 subsequent siblings)
  20 siblings, 2 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Concurrent migrate_disable() and set_cpus_allowed_ptr() has
interesting features. We rely on set_cpus_allowed_ptr() to not return
until the task runs inside the provided mask. This expectation is
exported to userspace.

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

	preempt_disable();
	migrate_disable();
	...
	migrate_enable();
	preempt_enable();

And:

	raw_spin_lock(&B);
	spin_unlock(&A);

this means that when migrate_enable() must restore the affinity
mask, it cannot wait for completion thereof. Luck will have it that
that is exactly the case where there is a pending
set_cpus_allowed_ptr(), so let that provide storage for the async stop
machine.

Much thanks to Valentin who used TLA+ most effectively and found lots of
'interesting' cases.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |    1 
 kernel/sched/core.c   |  234 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 205 insertions(+), 30 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,7 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 	int				migration_disabled;
 #endif
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1732,15 +1732,26 @@ void migrate_enable(void)
 {
 	struct task_struct *p = current;
 
-	if (--p->migration_disabled)
+	if (p->migration_disabled > 1) {
+		p->migration_disabled--;
 		return;
+	}
 
+	/*
+	 * Ensure stop_task runs either before or after this, and that
+	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+	 */
+	preempt_disable();
+	if (p->cpus_ptr != &p->cpus_mask)
+		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
+	 * regular cpus_mask, otherwise things that race (eg.
+	 * select_fallback_rq) get confused.
+	 */
 	barrier();
-
-	if (p->cpus_ptr == &p->cpus_mask)
-		return;
-
-	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	p->migration_disabled = 0;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
@@ -1805,8 +1816,16 @@ static struct rq *move_queued_task(struc
 }
 
 struct migration_arg {
-	struct task_struct *task;
-	int dest_cpu;
+	struct task_struct		*task;
+	int				dest_cpu;
+	struct set_affinity_pending	*pending;
+};
+
+struct set_affinity_pending {
+	refcount_t		refs;
+	struct completion	done;
+	struct cpu_stop_work	stop_work;
+	struct migration_arg	arg;
 };
 
 /*
@@ -1838,16 +1857,19 @@ static struct rq *__migrate_task(struct
  */
 static int migration_cpu_stop(void *data)
 {
+	struct set_affinity_pending *pending;
 	struct migration_arg *arg = data;
 	struct task_struct *p = arg->task;
+	int dest_cpu = arg->dest_cpu;
 	struct rq *rq = this_rq();
+	bool complete = false;
 	struct rq_flags rf;
 
 	/*
 	 * The original target CPU might have gone down and we might
 	 * be on another CPU but it doesn't matter.
 	 */
-	local_irq_disable();
+	local_irq_save(rf.flags);
 	/*
 	 * We need to explicitly wake pending tasks before running
 	 * __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1857,21 +1879,83 @@ static int migration_cpu_stop(void *data
 
 	raw_spin_lock(&p->pi_lock);
 	rq_lock(rq, &rf);
+
+	pending = p->migration_pending;
 	/*
 	 * If task_rq(p) != rq, it cannot be migrated here, because we're
 	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
 	 * we're holding p->pi_lock.
 	 */
 	if (task_rq(p) == rq) {
+		if (is_migration_disabled(p))
+			goto out;
+
+		if (pending) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
+
+		/* migrate_enable() --  we must not race against SCA */
+		if (dest_cpu < 0) {
+			/*
+			 * When this was migrate_enable() but we no longer
+			 * have a @pending, a concurrent SCA 'fixed' things
+			 * and we should be valid again. Nothing to do.
+			 */
+			if (!pending) {
+				WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+				goto out;
+			}
+
+			dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+		}
+
 		if (task_on_rq_queued(p))
-			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+			rq = __migrate_task(rq, &rf, p, dest_cpu);
 		else
-			p->wake_cpu = arg->dest_cpu;
+			p->wake_cpu = dest_cpu;
+
+	} else if (dest_cpu < 0) {
+		/*
+		 * This happens when we get migrated between migrate_enable()'s
+		 * preempt_enable() and scheduling the stopper task. At that
+		 * point we're a regular task again and not current anymore.
+		 *
+		 * A !PREEMPT kernel has a giant hole here, which makes it far
+		 * more likely.
+		 */
+
+		/*
+		 * When this was migrate_enable() but we no longer have an
+		 * @pending, a concurrent SCA 'fixed' things and we should be
+		 * valid again. Nothing to do.
+		 */
+		if (!pending) {
+			WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+			goto out;
+		}
+
+		/*
+		 * When migrate_enable() hits a rq mis-match we can't reliably
+		 * determine is_migration_disabled() and so have to chase after
+		 * it.
+		 */
+		task_rq_unlock(rq, p, &rf);
+		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+		return 0;
 	}
-	rq_unlock(rq, &rf);
-	raw_spin_unlock(&p->pi_lock);
+out:
+	task_rq_unlock(rq, p, &rf);
+
+	if (complete)
+		complete_all(&pending->done);
+
+	/* For pending->{arg,stop_work} */
+	pending = arg->pending;
+	if (pending && refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
 
-	local_irq_enable();
 	return 0;
 }
 
@@ -1941,6 +2025,110 @@ void do_set_cpus_allowed(struct task_str
 }
 
 /*
+ * This function is wildly self concurrent, consider at least 3 times.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+			    int dest_cpu, unsigned int flags)
+{
+	struct set_affinity_pending my_pending = { }, *pending = NULL;
+	struct migration_arg arg = {
+		.task = p,
+		.dest_cpu = dest_cpu,
+	};
+	bool complete = false;
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		pending = p->migration_pending;
+		if (pending) {
+			refcount_inc(&pending->refs);
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+		if (complete)
+			goto do_complete;
+
+		return 0;
+	}
+
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		/* serialized by p->pi_lock */
+		if (!p->migration_pending) {
+			refcount_set(&my_pending.refs, 1);
+			init_completion(&my_pending.done);
+			p->migration_pending = &my_pending;
+		} else {
+			pending = p->migration_pending;
+			refcount_inc(&pending->refs);
+		}
+	}
+	pending = p->migration_pending;
+	/*
+	 * - !MIGRATE_ENABLE:
+	 *   we'll have installed a pending if there wasn't one already.
+	 *
+	 * - MIGRATE_ENABLE:
+	 *   we're here because the current CPU isn't matching anymore,
+	 *   the only way that can happen is because of a concurrent
+	 *   set_cpus_allowed_ptr() call, which should then still be
+	 *   pending completion.
+	 *
+	 * Either way, we really should have a @pending here.
+	 */
+	if (WARN_ON_ONCE(!pending))
+		return -EINVAL;
+
+	if (flags & SCA_MIGRATE_ENABLE) {
+
+		refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+		task_rq_unlock(rq, p, rf);
+
+		pending->arg = (struct migration_arg) {
+			.task = p,
+			.dest_cpu = -1,
+			.pending = pending,
+		};
+
+		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+
+		return 0;
+	}
+
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+		task_rq_unlock(rq, p, rf);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+	} else {
+
+		if (!is_migration_disabled(p)) {
+			if (task_on_rq_queued(p))
+				rq = move_queued_task(rq, rf, p, dest_cpu);
+
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+do_complete:
+		if (complete)
+			complete_all(&pending->done);
+	}
+
+	wait_for_completion(&pending->done);
+
+	if (refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
+
+	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+	return 0;
+}
+
+/*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
  * is removed from the allowed bitmask.
@@ -2009,23 +2197,8 @@ static int __set_cpus_allowed_ptr(struct
 			p->nr_cpus_allowed != 1);
 	}
 
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
+	return affine_move_task(rq, p, &rf, dest_cpu, flags);
 
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &rf);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		return 0;
-	} else if (task_on_rq_queued(p)) {
-		/*
-		 * OK, since we're going to drop the lock immediately
-		 * afterwards anyway.
-		 */
-		rq = move_queued_task(rq, &rf, p, dest_cpu);
-	}
 out:
 	task_rq_unlock(rq, p, &rf);
 
@@ -3205,6 +3378,7 @@ static void __sched_fork(unsigned long c
 	init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
+	p->migration_pending = NULL;
 #endif
 }
 



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (9 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-10-29 16:27   ` Valentin Schneider
                     ` (2 more replies)
  2020-10-23 10:12 ` [PATCH v4 12/19] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
                   ` (9 subsequent siblings)
  20 siblings, 3 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Thomas Gleixner <tglx@linutronix.de>

On CPU unplug, tasks which are in a migrate-disabled region cannot be pushed
to a different CPU until they have returned to a migratable state.

Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   36 ++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |    4 ++++
 2 files changed, 34 insertions(+), 6 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1725,10 +1725,17 @@ static void migrate_disable_switch(struc
 
 void migrate_disable(void)
 {
-	if (current->migration_disabled++)
+	struct task_struct *p = current;
+
+	if (p->migration_disabled) {
+		p->migration_disabled++;
 		return;
+	}
 
-	barrier();
+	preempt_disable();
+	this_rq()->nr_pinned++;
+	p->migration_disabled = 1;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
@@ -1755,6 +1762,7 @@ void migrate_enable(void)
 	 */
 	barrier();
 	p->migration_disabled = 0;
+	this_rq()->nr_pinned--;
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1764,6 +1772,11 @@ static inline bool is_migration_disabled
 	return p->migration_disabled;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return rq->nr_pinned;
+}
+
 #endif
 
 /*
@@ -2634,6 +2647,11 @@ static inline bool is_migration_disabled
 	return false;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return false;
+}
+
 #endif
 
 static void
@@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task)) {
+	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
 		/*
 		 * If this is the idle task on the outgoing CPU try to wake
 		 * up the hotplug control thread which might wait for the
 		 * last task to vanish. The rcuwait_active() check is
 		 * accurate here because the waiter is pinned on this CPU
 		 * and can't obviously be running in parallel.
+		 *
+		 * On RT kernels this also has to check whether there are
+		 * pinned and scheduled out tasks on the runqueue. They
+		 * need to leave the migrate disabled section first.
 		 */
-		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+		if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+		    rcuwait_active(&rq->hotplug_wait)) {
 			raw_spin_unlock(&rq->lock);
 			rcuwait_wake_up(&rq->hotplug_wait);
 			raw_spin_lock(&rq->lock);
@@ -7063,7 +7086,8 @@ static void balance_hotplug_wait(void)
 {
 	struct rq *rq = this_rq();
 
-	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+	rcuwait_wait_event(&rq->hotplug_wait,
+			   rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
 			   TASK_UNINTERRUPTIBLE);
 }
 
@@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	BUG_ON(rq->nr_running != 1);
+	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
 	rq_unlock_irqrestore(rq, &rf);
 
 	calc_load_migrate(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
 #endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	unsigned int		nr_pinned;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 12/19] sched,rt: Use cpumask_any*_distribute()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (10 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 13/19] sched,rt: Use the full cpumask for balancing Peter Zijlstra
                   ` (8 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Replace a bunch of cpumask_any*() instances with
cpumask_any*_distribute(). By injecting this little bit of randomness into
CPU selection, we reduce the chance that two competing balance operations
working off the same lowest_mask pick the same CPU.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpumask.h |    6 ++++++
 kernel/sched/deadline.c |    6 +++---
 kernel/sched/rt.c       |    6 +++---
 lib/cpumask.c           |   18 ++++++++++++++++++
 4 files changed, 30 insertions(+), 6 deletions(-)

--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distri
 	return cpumask_next_and(-1, src1p, src2p);
 }
 
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1981,8 +1981,8 @@ static int find_later_rq(struct task_str
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(later_mask,
-							sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(later_mask,
+							      sched_domain_span(sd));
 			/*
 			 * Last chance: if a CPU being in both later_mask
 			 * and current sd span is valid, that becomes our
@@ -2004,7 +2004,7 @@ static int find_later_rq(struct task_str
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(later_mask);
+	cpu = cpumask_any_distribute(later_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_st
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(lowest_mask,
-						     sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(lowest_mask,
+							      sched_domain_span(sd));
 			if (best_cpu < nr_cpu_ids) {
 				rcu_read_unlock();
 				return best_cpu;
@@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_st
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(lowest_mask);
+	cpu = cpumask_any_distribute(lowest_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const str
 	return next;
 }
 EXPORT_SYMBOL(cpumask_any_and_distribute);
+
+int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	int next, prev;
+
+	/* NOTE: our first selection will skip 0. */
+	prev = __this_cpu_read(distribute_cpu_mask_prev);
+
+	next = cpumask_next(prev, srcp);
+	if (next >= nr_cpu_ids)
+		next = cpumask_first(srcp);
+
+	if (next < nr_cpu_ids)
+		__this_cpu_write(distribute_cpu_mask_prev, next);
+
+	return next;
+}
+EXPORT_SYMBOL(cpumask_any_distribute);



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 13/19] sched,rt: Use the full cpumask for balancing
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (11 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 12/19] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
                   ` (7 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

We want migrate_disable() tasks to get PULLs in order for them to PUSH
away the higher priority task.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/cpudeadline.c |    4 ++--
 kernel/sched/cpupri.c      |    4 ++--
 kernel/sched/deadline.c    |    4 ++--
 kernel/sched/rt.c          |    4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
 		unsigned long cap, max_cap = 0;
 		int cpu, max_cpu = -1;
 
@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct
 
 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
-		if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
+		if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
 		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
 			if (later_mask)
 				cpumask_set_cpu(best_cpu, later_mask);
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -73,11 +73,11 @@ static inline int __cpupri_find(struct c
 	if (skip)
 		return 0;
 
-	if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
 		return 0;
 
 	if (lowest_mask) {
-		cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
 
 		/*
 		 * We have to ensure that we have at least one bit
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1888,7 +1888,7 @@ static void task_fork_dl(struct task_str
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 	return 0;
 }
@@ -2038,7 +2038,7 @@ static struct rq *find_lock_later_rq(str
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1658,7 +1658,7 @@ static void put_prev_task_rt(struct rq *
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 
 	return 0;
@@ -1811,7 +1811,7 @@ static struct rq *find_lock_lowest_rq(st
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (12 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 13/19] sched,rt: Use the full cpumask for balancing Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-10-29 16:27   ` Valentin Schneider
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
                   ` (6 subsequent siblings)
  20 siblings, 2 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

There's a valid ->pi_lock recursion issue where the actual PI code
tries to wake up the stop task. Make lockdep aware so it doesn't
complain about this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   15 +++++++++++++++
 1 file changed, 15 insertions(+)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2602,6 +2602,7 @@ int select_task_rq(struct task_struct *p
 
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
+	static struct lock_class_key stop_pi_lock;
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
 
@@ -2617,6 +2618,20 @@ void sched_set_stop_task(int cpu, struct
 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 
 		stop->sched_class = &stop_sched_class;
+
+		/*
+		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+		 * adjust the effective priority of a task. As a result,
+		 * rt_mutex_setprio() can trigger (RT) balancing operations,
+		 * which can then trigger wakeups of the stop thread to push
+		 * around the current task.
+		 *
+		 * The stop task itself will never be part of the PI-chain, it
+		 * never blocks, therefore that ->pi_lock recursion is safe.
+		 * Tell lockdep about this by placing the stop->pi_lock in its
+		 * own class.
+		 */
+		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
 	}
 
 	cpu_rq(cpu)->stop = stop;



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (13 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-12-26 13:54   ` [PATCH v4 15/19] " Qais Yousef
  2020-10-23 10:12 ` [PATCH v4 16/19] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
                   ` (5 subsequent siblings)
  20 siblings, 2 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

In order to minimize the interference of migrate_disable() on lower
priority tasks, which can be deprived of runtime due to being stuck
below a higher priority task, teach the RT/DL balancers to push away
these higher priority tasks when a lower priority task gets selected
to run on a freshly demoted CPU (pull).

This adds migration interference to the higher priority task, but
restores bandwidth to the system that would otherwise be irrevocably lost.
Without this it would be possible to have all tasks on the system
stuck on a single CPU, each task preempted in a migrate_disable()
section with a single high priority task running.

This way we can still approximate running the M highest priority tasks
on the system.

Migrating the top task away is (of course) still subject to
migrate_disable() too, which means the lower task is subject to an
interference equivalent to the worst case migrate_disable() section.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   38 +++++++++++++++------------
 include/linux/sched.h   |    3 +-
 kernel/sched/core.c     |   67 ++++++++++++++++++++++++++++++++++++++++--------
 kernel/sched/deadline.c |   29 +++++++++++++++-----
 kernel/sched/rt.c       |   63 ++++++++++++++++++++++++++++++++++++---------
 kernel/sched/sched.h    |   32 ++++++++++++++++++++++
 6 files changed, 185 insertions(+), 47 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -325,24 +325,28 @@ static inline void preempt_notifier_init
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 
 /*
- * Migrate-Disable and why it is (strongly) undesired.
+ * Migrate-Disable and why it is undesired.
  *
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority task, which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing its available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
  *
  *
  * The reason we have it anyway.
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -716,8 +716,9 @@ struct task_struct {
 	cpumask_t			cpus_mask;
 	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
-	int				migration_disabled;
+	unsigned short			migration_disabled;
 #endif
+	unsigned short			migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1763,11 +1763,6 @@ void migrate_enable(void)
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return p->migration_disabled;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return rq->nr_pinned;
@@ -1974,6 +1969,49 @@ static int migration_cpu_stop(void *data
 	return 0;
 }
 
+int push_cpu_stop(void *arg)
+{
+	struct rq *lowest_rq = NULL, *rq = this_rq();
+	struct task_struct *p = arg;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+
+	if (task_rq(p) != rq)
+		goto out_unlock;
+
+	if (is_migration_disabled(p)) {
+		p->migration_flags |= MDF_PUSH;
+		goto out_unlock;
+	}
+
+	p->migration_flags &= ~MDF_PUSH;
+
+	if (p->sched_class->find_lock_rq)
+		lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+	if (!lowest_rq)
+		goto out_unlock;
+
+	// XXX validate p is still the highest prio task
+	if (task_rq(p) == rq) {
+		deactivate_task(rq, p, 0);
+		set_task_cpu(p, lowest_rq->cpu);
+		activate_task(lowest_rq, p, 0);
+		resched_curr(lowest_rq);
+	}
+
+	double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+	rq->push_busy = false;
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+	return 0;
+}
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
@@ -2054,6 +2092,14 @@ static int affine_move_task(struct rq *r
 
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		struct task_struct *push_task = NULL;
+
+		if ((flags & SCA_MIGRATE_ENABLE) &&
+		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+			rq->push_busy = true;
+			push_task = get_task_struct(p);
+		}
+
 		pending = p->migration_pending;
 		if (pending) {
 			refcount_inc(&pending->refs);
@@ -2062,6 +2108,11 @@ static int affine_move_task(struct rq *r
 		}
 		task_rq_unlock(rq, p, rf);
 
+		if (push_task) {
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    p, &rq->push_work);
+		}
+
 		if (complete)
 			goto do_complete;
 
@@ -2098,6 +2149,7 @@ static int affine_move_task(struct rq *r
 	if (flags & SCA_MIGRATE_ENABLE) {
 
 		refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+		p->migration_flags &= ~MDF_PUSH;
 		task_rq_unlock(rq, p, rf);
 
 		pending->arg = (struct migration_arg) {
@@ -2716,11 +2768,6 @@ static inline int __set_cpus_allowed_ptr
 
 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return false;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return false;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task))
+		return 0;
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq)
 static void pull_dl_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, cpu;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
@@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this
 			continue;
 
 		/* Might drop this_rq->lock */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
-			dmin = p->dl.deadline;
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				dmin = p->dl.deadline;
+				resched = true;
+			}
 
 			/* Is there any other task even earlier? */
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
 	.task_woken		= task_woken_dl,
+	.find_lock_rq		= find_lock_later_rq,
 #endif
 
 	.task_tick		= task_tick_dl,
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pus
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
@@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task)) {
+		struct task_struct *push_task = NULL;
+		int cpu;
+
+		if (!pull || rq->push_busy)
+			return 0;
+
+		cpu = find_lowest_rq(rq->curr);
+		if (cpu == -1 || cpu == rq->cpu)
+			return 0;
+
+		/*
+		 * Given we found a CPU with lower priority than @next_task,
+		 * therefore it should be running. However we cannot migrate it
+		 * to this other CPU, instead attempt to push the current
+		 * running task on this CPU away.
+		 */
+		push_task = get_push_task(rq);
+		if (push_task) {
+			raw_spin_unlock(&rq->lock);
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    push_task, &rq->push_work);
+			raw_spin_lock(&rq->lock);
+		}
+
+		return 0;
+	}
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -1927,12 +1955,10 @@ static int push_rt_task(struct rq *rq)
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
-	ret = 1;
-
 	resched_curr(lowest_rq);
+	ret = 1;
 
 	double_unlock_balance(rq, lowest_rq);
-
 out:
 	put_task_struct(next_task);
 
@@ -1942,7 +1968,7 @@ static int push_rt_task(struct rq *rq)
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
-	while (push_rt_task(rq))
+	while (push_rt_task(rq, false))
 		;
 }
 
@@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_w
 	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_tasks(rq);
+		while (push_rt_task(rq, true))
+			;
 		raw_spin_unlock(&rq->lock);
 	}
 
@@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this
 {
 	int this_cpu = this_rq->cpu, cpu;
 	bool resched = false;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	struct rq *src_rq;
 	int rt_overload_count = rt_overloaded(this_rq);
 
@@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this
 		 * double_lock_balance, and another CPU could
 		 * alter this_rq
 		 */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				resched = true;
+			}
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
@@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class
 	.rq_offline             = rq_offline_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
+	.find_lock_rq		= find_lock_lowest_rq,
 #endif
 
 	.task_tick		= task_tick_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1057,6 +1057,8 @@ struct rq {
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	unsigned int		nr_pinned;
 #endif
+	unsigned int		push_busy;
+	struct cpu_stop_work	push_work;
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+#define MDF_PUSH	0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	return p->migration_disabled;
+#else
+	return false;
+#endif
+}
 
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
@@ -1823,6 +1835,8 @@ struct sched_class {
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
+
+	struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
 #endif
 
 	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1918,6 +1932,24 @@ extern void trigger_load_balance(struct
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+	struct task_struct *p = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (rq->push_busy)
+		return NULL;
+
+	if (p->nr_cpus_allowed == 1)
+		return NULL;
+
+	rq->push_busy = true;
+	return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
 #endif
 
 #ifdef CONFIG_CPU_IDLE



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 16/19] sched/proc: Print accurate cpumask vs migrate_disable()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (14 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 17/19] sched: Add migrate_disable() tracepoints Peter Zijlstra
                   ` (4 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

Ensure /proc/*/status doesn't print 'random' cpumasks due to
migrate_disable().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 fs/proc/array.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -382,9 +382,9 @@ static inline void task_context_switch_c
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 17/19] sched: Add migrate_disable() tracepoints
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (15 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 16/19] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-10-29 16:27   ` Valentin Schneider
  2020-10-23 10:12 ` [PATCH v4 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Peter Zijlstra
                   ` (3 subsequent siblings)
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

XXX write a tracer:

 - 'migrate_disable() -> migrate_enable()' time in task_sched_runtime()
 - 'migrate_pull -> sched-in' time in task_sched_runtime()

The first will give worst case for the second, which is the actual
interference experienced by the task due to migration constraints of
migrate_disable().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/trace/events/sched.h |   12 ++++++++++++
 kernel/sched/core.c          |    4 ++++
 kernel/sched/deadline.c      |    1 +
 kernel/sched/rt.c            |    8 +++++++-
 4 files changed, 24 insertions(+), 1 deletion(-)

--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -646,6 +646,18 @@ DECLARE_TRACE(sched_update_nr_running_tp
 	TP_PROTO(struct rq *rq, int change),
 	TP_ARGS(rq, change));
 
+DECLARE_TRACE(sched_migrate_disable_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
+DECLARE_TRACE(sched_migrate_enable_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
+DECLARE_TRACE(sched_migrate_pull_tp,
+	      TP_PROTO(struct task_struct *p),
+	      TP_ARGS(p));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1732,6 +1732,8 @@ void migrate_disable(void)
 		return;
 	}
 
+	trace_sched_migrate_disable_tp(p);
+
 	preempt_disable();
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
@@ -1764,6 +1766,8 @@ void migrate_enable(void)
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
 	preempt_enable();
+
+	trace_sched_migrate_enable_tp(p);
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2245,6 +2245,7 @@ static void pull_dl_task(struct rq *this
 				goto skip;
 
 			if (is_migration_disabled(p)) {
+				trace_sched_migrate_pull_tp(p);
 				push_task = get_push_task(src_rq);
 			} else {
 				deactivate_task(src_rq, p, 0);
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1877,7 +1877,12 @@ static int push_rt_task(struct rq *rq, b
 		struct task_struct *push_task = NULL;
 		int cpu;
 
-		if (!pull || rq->push_busy)
+		if (!pull)
+			return 0;
+
+		trace_sched_migrate_pull_tp(next_task);
+
+		if (rq->push_busy)
 			return 0;
 
 		cpu = find_lowest_rq(rq->curr);
@@ -2223,6 +2228,7 @@ static void pull_rt_task(struct rq *this
 				goto skip;
 
 			if (is_migration_disabled(p)) {
+				trace_sched_migrate_pull_tp(p);
 				push_task = get_push_task(src_rq);
 			} else {
 				deactivate_task(src_rq, p, 0);



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (16 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 17/19] sched: Add migrate_disable() tracepoints Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-10-23 10:12 ` [PATCH v4 19/19] sched: Comment affine_move_task() Peter Zijlstra
                   ` (2 subsequent siblings)
  20 siblings, 0 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Valentin Schneider <valentin.schneider@arm.com>

  migrate_disable();
  set_cpus_allowed_ptr(current, {something excluding task_cpu(current)});
  affine_move_task(); <-- never returns

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201013140116.26651-1-valentin.schneider@arm.com
---
 kernel/sched/core.c |   13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2182,8 +2182,17 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
-		goto out;
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		if (cpumask_equal(&p->cpus_mask, new_mask))
+			goto out;
+
+		if (WARN_ON_ONCE(p == current &&
+				 is_migration_disabled(p) &&
+				 !cpumask_test_cpu(task_cpu(p), new_mask))) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
 
 	/*
 	 * Picking a ~random cpu helps in cases where we are changing affinity



^ permalink raw reply	[flat|nested] 81+ messages in thread

* [PATCH v4 19/19] sched: Comment affine_move_task()
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (17 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Peter Zijlstra
@ 2020-10-23 10:12 ` Peter Zijlstra
  2020-10-29 16:27   ` Valentin Schneider
  2020-10-29 19:03 ` [PATCH v4 00/19] sched: Migrate disable support Valentin Schneider
  2020-11-09 16:39 ` Daniel Bristot de Oliveira
  20 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-23 10:12 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

From: Valentin Schneider <valentin.schneider@arm.com>


Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201013140116.26651-2-valentin.schneider@arm.com
---
 kernel/sched/core.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2082,7 +2082,75 @@ void do_set_cpus_allowed(struct task_str
 }
 
 /*
- * This function is wildly self concurrent, consider at least 3 times.
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *                                <blocks>
+ *     <resumes>
+ *     migrate_enable();
+ *       __set_cpus_allowed_ptr();
+ *       <wakes local stopper>
+ *                         `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * cancels the need for an active migration. Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0            P1                             P2
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                        set_cpus_allowed_ptr(P0, [1]);
+ *                          <blocks>
+ *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
+ *                                                         <signal completion>
+ *                          <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded an uninstallion of
+ * p->migration_pending done with p->pi_lock held.
  */
 static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
 			    int dest_cpu, unsigned int flags)
@@ -2126,6 +2194,7 @@ static int affine_move_task(struct rq *r
 	if (!(flags & SCA_MIGRATE_ENABLE)) {
 		/* serialized by p->pi_lock */
 		if (!p->migration_pending) {
+			/* Install the request */
 			refcount_set(&my_pending.refs, 1);
 			init_completion(&my_pending.done);
 			p->migration_pending = &my_pending;
@@ -2169,7 +2238,11 @@ static int affine_move_task(struct rq *r
 	}
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
-
+		/*
+		 * Lessen races (and headaches) by delegating
+		 * is_migration_disabled(p) checks to the stopper, which will
+		 * run on the same CPU as said p.
+		 */
 		task_rq_unlock(rq, p, rf);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 
@@ -2194,6 +2267,10 @@ static int affine_move_task(struct rq *r
 	if (refcount_dec_and_test(&pending->refs))
 		wake_up_var(&pending->refs);
 
+	/*
+	 * Block the original owner of &pending until all subsequent callers
+	 * have seen the completion and decremented the refcount
+	 */
 	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
 
 	return 0;



^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-23 10:12 ` [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
@ 2020-10-29 16:27   ` Valentin Schneider
  2020-10-29 17:34     ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Thomas Gleixner
  2020-11-13 15:06   ` [PATCH v4 11/19] " Qian Cai
  2 siblings, 1 reply; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 16:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 23/10/20 11:12, Peter Zijlstra wrote:
> @@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
>        * Both the cpu-hotplug and stop task are in this case and are
>        * required to complete the hotplug process.
>        */
> -	if (is_per_cpu_kthread(push_task)) {
> +	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {

is_migration_disabled(p) implies rq_has_pinned_tasks(task_rq(p)), right?

So having a "simple"

  if (is_migration_disabled(push_task))
        return;

would help simpletons like me trying to read through this.

>               /*
>                * If this is the idle task on the outgoing CPU try to wake
>                * up the hotplug control thread which might wait for the
>                * last task to vanish. The rcuwait_active() check is
>                * accurate here because the waiter is pinned on this CPU
>                * and can't obviously be running in parallel.
> +		 *
> +		 * On RT kernels this also has to check whether there are
> +		 * pinned and scheduled out tasks on the runqueue. They
> +		 * need to leave the migrate disabled section first.
>                */
> -		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
> +		if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
> +		    rcuwait_active(&rq->hotplug_wait)) {
>                       raw_spin_unlock(&rq->lock);
>                       rcuwait_wake_up(&rq->hotplug_wait);
>                       raw_spin_lock(&rq->lock);

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion
  2020-10-23 10:12 ` [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
@ 2020-10-29 16:27   ` Valentin Schneider
  2020-10-29 17:38     ` Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
  1 sibling, 1 reply; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 16:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 23/10/20 11:12, Peter Zijlstra wrote:
> @@ -2617,6 +2618,20 @@ void sched_set_stop_task(int cpu, struct
>               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
>
>               stop->sched_class = &stop_sched_class;
> +
> +		/*
> +		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
> +		 * adjust the effective priority of a task. As a result,
> +		 * rt_mutex_setprio() can trigger (RT) balancing operations,
> +		 * which can then trigger wakeups of the stop thread to push
> +		 * around the current task.
> +		 *
> +		 * The stop task itself will never be part of the PI-chain, it
> +		 * never blocks, therefore that ->pi_lock recursion is safe.

Isn't it that the stopper task can only run when preemption is re-enabled,
and the ->pi_lock is dropped before then?

If we were to have an SCA-like function that would kick the stopper but
"forget" to release the pi_lock, then we would very much like lockdep to
complain, right? Or is that something else entirely?

> +		 * Tell lockdep about this by placing the stop->pi_lock in its
> +		 * own class.
> +		 */
> +		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
>       }
>
>       cpu_rq(cpu)->stop = stop;

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 17/19] sched: Add migrate_disable() tracepoints
  2020-10-23 10:12 ` [PATCH v4 17/19] sched: Add migrate_disable() tracepoints Peter Zijlstra
@ 2020-10-29 16:27   ` Valentin Schneider
  2020-10-29 17:43     ` Peter Zijlstra
  0 siblings, 1 reply; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 16:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 23/10/20 11:12, Peter Zijlstra wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1732,6 +1732,8 @@ void migrate_disable(void)
>               return;
>       }
>
> +	trace_sched_migrate_disable_tp(p);
> +
>       preempt_disable();
>       this_rq()->nr_pinned++;
>       p->migration_disabled = 1;
> @@ -1764,6 +1766,8 @@ void migrate_enable(void)
>       p->migration_disabled = 0;
>       this_rq()->nr_pinned--;
>       preempt_enable();
> +
> +	trace_sched_migrate_enable_tp(p);

Don't you want those directly after the ->migration_disabled write?
esp. for migrate_enable(), if that preempt_enable() leads to a context
switch then the disable->enable deltas won't reflect the kernel view.

That delta may indeed include the time it took to run the stopper and
fix the task's affinity on migrate_enable(), but it could include all
sorts of other higher-priority tasks.

>  }
>  EXPORT_SYMBOL_GPL(migrate_enable);

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 19/19] sched: Comment affine_move_task()
  2020-10-23 10:12 ` [PATCH v4 19/19] sched: Comment affine_move_task() Peter Zijlstra
@ 2020-10-29 16:27   ` Valentin Schneider
  2020-10-29 17:44     ` Peter Zijlstra
  0 siblings, 1 reply; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 16:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 23/10/20 11:12, Peter Zijlstra wrote:
> + * (1) In the cases covered above. There is one more where the completion is
> + * signaled within affine_move_task() itself: when a subsequent affinity request
> + * cancels the need for an active migration. Consider:
> + *
> + *     Initial conditions: P0->cpus_mask = [0, 1]
> + *
> + *     P0@CPU0            P1                             P2
> + *
> + *     migrate_disable();
> + *     <preempted>
> + *                        set_cpus_allowed_ptr(P0, [1]);
> + *                          <blocks>
> + *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
> + *                                                         <signal completion>
> + *                          <awakes>
> + *
> + * Note that the above is safe vs a concurrent migrate_enable(), as any
> + * pending affinity completion is preceded an uninstallion of
> + * p->migration_pending done with p->pi_lock held.

I too must have been thinking too much about ponies lately.

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0a3f9fd3b061..d8c85f180b09 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2147,7 +2147,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  *                          <awakes>
  *
  * Note that the above is safe vs a concurrent migrate_enable(), as any
- * pending affinity completion is preceded an uninstallion of
+ * pending affinity completion is preceded by an uninstallation of
  * p->migration_pending done with p->pi_lock held.
  */
 static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-29 16:27   ` Valentin Schneider
@ 2020-10-29 17:34     ` Peter Zijlstra
  2020-10-29 17:55       ` Valentin Schneider
  0 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-29 17:34 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, Oct 29, 2020 at 04:27:09PM +0000, Valentin Schneider wrote:
> 
> On 23/10/20 11:12, Peter Zijlstra wrote:
> > @@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
> >        * Both the cpu-hotplug and stop task are in this case and are
> >        * required to complete the hotplug process.
> >        */
> > -	if (is_per_cpu_kthread(push_task)) {
> > +	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
> 
> is_migration_disabled(p) implies rq_has_pinned_tasks(task_rq(p)), right?

In general, no, in this particular case, yes. Specifically you need
migrate_disable() + schedule() in order to get nr_pinned incremented.

We just happen to run at the tail end of schedule(), so yeah, here it
works.

> So having a "simple"
> 
>   if (is_migration_disabled(push_task))
>         return;
> 
> would help simpletons like me trying to read through this.

Can do I suppose, although I'm no sure what, if anything that helps,
because then we needs yet another comment explaining things.

I ended up with the below. Is that an improvement?

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d7d5b7b9c99..c9c69511ece4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7226,11 +7226,19 @@ static void balance_push(struct rq *rq)
 	lockdep_assert_held(&rq->lock);
 	SCHED_WARN_ON(rq->cpu != smp_processor_id());
 
+	/*
+	 * When migrate_disable(), we'll also have nr_pinned incremented due to
+	 * this being the tail end of schedule(). Therefore we do not need to wake
+	 * the hotplug_wait and go straight to jail^Wexit.
+	 */
+	if (is_migration_disabled(push_task))
+		return;
+
 	/*
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+	if (is_per_cpu_kthread(push_task)) {
 		/*
 		 * If this is the idle task on the outgoing CPU try to wake
 		 * up the hotplug control thread which might wait for the


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion
  2020-10-29 16:27   ` Valentin Schneider
@ 2020-10-29 17:38     ` Peter Zijlstra
  2020-10-29 18:09       ` Valentin Schneider
  0 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-29 17:38 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, Oct 29, 2020 at 04:27:16PM +0000, Valentin Schneider wrote:
> 
> On 23/10/20 11:12, Peter Zijlstra wrote:
> > @@ -2617,6 +2618,20 @@ void sched_set_stop_task(int cpu, struct
> >               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
> >
> >               stop->sched_class = &stop_sched_class;
> > +
> > +		/*
> > +		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
> > +		 * adjust the effective priority of a task. As a result,
> > +		 * rt_mutex_setprio() can trigger (RT) balancing operations,
> > +		 * which can then trigger wakeups of the stop thread to push
> > +		 * around the current task.
> > +		 *
> > +		 * The stop task itself will never be part of the PI-chain, it
> > +		 * never blocks, therefore that ->pi_lock recursion is safe.
> 
> Isn't it that the stopper task can only run when preemption is re-enabled,
> and the ->pi_lock is dropped before then?
> 
> If we were to have an SCA-like function that would kick the stopper but
> "forget" to release the pi_lock, then we would very much like lockdep to
> complain, right? Or is that something else entirely?

You've forgotten the other, and original, purpose of ->pi_lock, guarding
the actual PI chain. Please have a look at rt_mutex_adjust_prio_chain()
and its comment.

But no, this isn't about running, this is about doing an actual wakeup
(of the stopper task) while holding an ->pi_lock instance (guaranteed
not the stopper task's). And since wakeup will take ->pi_lock, lockdep
will get all whiny about ->pi_lock self recursion.

> > +		 * Tell lockdep about this by placing the stop->pi_lock in its
> > +		 * own class.
> > +		 */
> > +		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
> >       }
> >
> >       cpu_rq(cpu)->stop = stop;

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 17/19] sched: Add migrate_disable() tracepoints
  2020-10-29 16:27   ` Valentin Schneider
@ 2020-10-29 17:43     ` Peter Zijlstra
  2020-10-29 17:56       ` Valentin Schneider
  0 siblings, 1 reply; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-29 17:43 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, Oct 29, 2020 at 04:27:26PM +0000, Valentin Schneider wrote:
> 
> On 23/10/20 11:12, Peter Zijlstra wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1732,6 +1732,8 @@ void migrate_disable(void)
> >               return;
> >       }
> >
> > +	trace_sched_migrate_disable_tp(p);
> > +
> >       preempt_disable();
> >       this_rq()->nr_pinned++;
> >       p->migration_disabled = 1;
> > @@ -1764,6 +1766,8 @@ void migrate_enable(void)
> >       p->migration_disabled = 0;
> >       this_rq()->nr_pinned--;
> >       preempt_enable();
> > +
> > +	trace_sched_migrate_enable_tp(p);
> 
> Don't you want those directly after the ->migration_disabled write?
> esp. for migrate_enable(), if that preempt_enable() leads to a context
> switch then the disable->enable deltas won't reflect the kernel view.
> 
> That delta may indeed include the time it took to run the stopper and
> fix the task's affinity on migrate_enable(), but it could include all
> sorts of other higher-priority tasks.

I can put them in the preempt_disable() section I suppose, but these
tracers should be looking at task_sched_runtime(), not walltime, and
then the preemption doesn't matter.

Also, a distinct lack of actual users atm.. :/

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 19/19] sched: Comment affine_move_task()
  2020-10-29 16:27   ` Valentin Schneider
@ 2020-10-29 17:44     ` Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-29 17:44 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, Oct 29, 2020 at 04:27:45PM +0000, Valentin Schneider wrote:
> I too must have been thinking too much about ponies lately.

> - * pending affinity completion is preceded an uninstallion of
> + * pending affinity completion is preceded by an uninstallation of

Hah!, took me a few tries to figure out wth 'by' had to do with ponies.
Reading be hard ;-)

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-29 17:34     ` Peter Zijlstra
@ 2020-10-29 17:55       ` Valentin Schneider
  0 siblings, 0 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 17:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 29/10/20 17:34, Peter Zijlstra wrote:
> On Thu, Oct 29, 2020 at 04:27:09PM +0000, Valentin Schneider wrote:
[...]
> Can do I suppose, although I'm no sure what, if anything that helps,
> because then we needs yet another comment explaining things.
>
> I ended up with the below. Is that an improvement?

I'm leaning towards "yes", but YMMV.

>
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3d7d5b7b9c99..c9c69511ece4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7226,11 +7226,19 @@ static void balance_push(struct rq *rq)
>  	lockdep_assert_held(&rq->lock);
>  	SCHED_WARN_ON(rq->cpu != smp_processor_id());
>  
> +	/*
> +	 * When migrate_disable(), we'll also have nr_pinned incremented due to
> +	 * this being the tail end of schedule(). Therefore we do not need to wake
> +	 * the hotplug_wait and go straight to jail^Wexit.
> +	 */
> +	if (is_migration_disabled(push_task))
> +		return;
> +
>  	/*
>  	 * Both the cpu-hotplug and stop task are in this case and are
>  	 * required to complete the hotplug process.
>  	 */
> -	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
> +	if (is_per_cpu_kthread(push_task)) {
>  		/*
>  		 * If this is the idle task on the outgoing CPU try to wake
>  		 * up the hotplug control thread which might wait for the


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 17/19] sched: Add migrate_disable() tracepoints
  2020-10-29 17:43     ` Peter Zijlstra
@ 2020-10-29 17:56       ` Valentin Schneider
  2020-10-29 17:59         ` Peter Zijlstra
  0 siblings, 1 reply; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 17:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 29/10/20 17:43, Peter Zijlstra wrote:
> On Thu, Oct 29, 2020 at 04:27:26PM +0000, Valentin Schneider wrote:
>> Don't you want those directly after the ->migration_disabled write?
>> esp. for migrate_enable(), if that preempt_enable() leads to a context
>> switch then the disable->enable deltas won't reflect the kernel view.
>> 
>> That delta may indeed include the time it took to run the stopper and
>> fix the task's affinity on migrate_enable(), but it could include all
>> sorts of other higher-priority tasks.
>
> I can put them in the preempt_disable() section I suppose, but these
> tracers should be looking at task_sched_runtime(), not walltime, and
> then the preemption doesn't matter.
>

True. I was thinking of how to process it downstream, and the first thing
that came to mind was that rd->overutilized flag which we do monitor
fairly closely; however that is system-wide while migrate_disable() is
task-specific.

> Also, a distinct lack of actual users atm.. :/

If you'd rather ditch this one altogether until someone asks for it, that
also works for me.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 17/19] sched: Add migrate_disable() tracepoints
  2020-10-29 17:56       ` Valentin Schneider
@ 2020-10-29 17:59         ` Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-10-29 17:59 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, Oct 29, 2020 at 05:56:12PM +0000, Valentin Schneider wrote:
> 
> On 29/10/20 17:43, Peter Zijlstra wrote:
> > On Thu, Oct 29, 2020 at 04:27:26PM +0000, Valentin Schneider wrote:
> >> Don't you want those directly after the ->migration_disabled write?
> >> esp. for migrate_enable(), if that preempt_enable() leads to a context
> >> switch then the disable->enable deltas won't reflect the kernel view.
> >> 
> >> That delta may indeed include the time it took to run the stopper and
> >> fix the task's affinity on migrate_enable(), but it could include all
> >> sorts of other higher-priority tasks.
> >
> > I can put them in the preempt_disable() section I suppose, but these
> > tracers should be looking at task_sched_runtime(), not walltime, and
> > then the preemption doesn't matter.
> >
> 
> True. I was thinking of how to process it downstream, and the first thing
> that came to mind was that rd->overutilized flag which we do monitor
> fairly closely; however that is system-wide while migrate_disable() is
> task-specific.
> 
> > Also, a distinct lack of actual users atm.. :/
> 
> If you'd rather ditch this one altogether until someone asks for it, that
> also works for me.

Yeah, I can pull this patch until we get someone that actually needs it.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion
  2020-10-29 17:38     ` Peter Zijlstra
@ 2020-10-29 18:09       ` Valentin Schneider
  0 siblings, 0 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 18:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 29/10/20 17:38, Peter Zijlstra wrote:
> On Thu, Oct 29, 2020 at 04:27:16PM +0000, Valentin Schneider wrote:
>> 
>> On 23/10/20 11:12, Peter Zijlstra wrote:
>> > @@ -2617,6 +2618,20 @@ void sched_set_stop_task(int cpu, struct
>> >               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
>> >
>> >               stop->sched_class = &stop_sched_class;
>> > +
>> > +		/*
>> > +		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
>> > +		 * adjust the effective priority of a task. As a result,
>> > +		 * rt_mutex_setprio() can trigger (RT) balancing operations,
>> > +		 * which can then trigger wakeups of the stop thread to push
>> > +		 * around the current task.
>> > +		 *
>> > +		 * The stop task itself will never be part of the PI-chain, it
>> > +		 * never blocks, therefore that ->pi_lock recursion is safe.
>> 
>> Isn't it that the stopper task can only run when preemption is re-enabled,
>> and the ->pi_lock is dropped before then?
>> 
>> If we were to have an SCA-like function that would kick the stopper but
>> "forget" to release the pi_lock, then we would very much like lockdep to
>> complain, right? Or is that something else entirely?
>
> You've forgotten the other, and original, purpose of ->pi_lock, guarding
> the actual PI chain. Please have a look at rt_mutex_adjust_prio_chain()
> and its comment.
>
> But no, this isn't about running, this is about doing an actual wakeup
> (of the stopper task) while holding an ->pi_lock instance (guaranteed
> not the stopper task's). And since wakeup will take ->pi_lock, lockdep
> will get all whiny about ->pi_lock self recursion.
>

Gotcha. Thanks, and apologies for the noise.

>> > +		 * Tell lockdep about this by placing the stop->pi_lock in its
>> > +		 * own class.
>> > +		 */
>> > +		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
>> >       }
>> >
>> >       cpu_rq(cpu)->stop = stop;


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 00/19] sched: Migrate disable support
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (18 preceding siblings ...)
  2020-10-23 10:12 ` [PATCH v4 19/19] sched: Comment affine_move_task() Peter Zijlstra
@ 2020-10-29 19:03 ` Valentin Schneider
  2020-11-09 16:39 ` Daniel Bristot de Oliveira
  20 siblings, 0 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-10-29 19:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210


On 23/10/20 11:11, Peter Zijlstra wrote:
> Hi,
>
> The fourth version of migrate_disable() for PREEMPT_RT.
>
> Two changes since last time:
>
>  - fixes !SMP builds (bigeasy)
>  - TLA+ validation of migrate_disable() vs sched_setaffinity() (valsch)
>
> Esp. that latter resulted in significant changes to patch #10. Huge thanks to
> Valentin.
>

I've been poking at that model some more; there's always going to be some
gap between what is being modelled vs what the code actually does, but
I'm mostly happy with the state it is in now.


So far it's been 1 victim task looping around a Migration-Disabled region
while some other threads fight over changing its affinity. The victim task
can also be preempted by some higher-priority task. I didn't make the
victim task block between Migration-Disabled regions.

Last run was 2 CPUs and 2 affinity-changing threads, I'll try to bump that
a bit but it is already somewhat lengthy to run (~10h).

In terms of what this has been checking
- Forward progress (no process forever blocked on some condition / lock)
- Refcounts always >= 0 (hit a few -1's in earlier versions)
- Affinity is respected
  I've "implemented" that as:

    If 'p' is running on a CPU, then that CPU must be in p->cpus_mask when
      !p->migration_pending && p->pi_lock is not held


I don't think I have much else to add, so feel free to add

  Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>

To patches 01-14, 16-17.

The RT side of patch 15 looked fine to me, but I didn't spend much time on
DL. We might still want to steal those .pick_task() from coresched to
implement that XXX in push_cpu_stop(), but AIUI this is more of a
performance thing than a correctness one.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 00/19] sched: Migrate disable support
  2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
                   ` (19 preceding siblings ...)
  2020-10-29 19:03 ` [PATCH v4 00/19] sched: Migrate disable support Valentin Schneider
@ 2020-11-09 16:39 ` Daniel Bristot de Oliveira
  20 siblings, 0 replies; 81+ messages in thread
From: Daniel Bristot de Oliveira @ 2020-11-09 16:39 UTC (permalink / raw)
  To: Peter Zijlstra, tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, vincent.donnefort, tj, ouwen210

On 10/23/20 12:11 PM, Peter Zijlstra wrote:
> Hi,
> 
> The fourth version of migrate_disable() for PREEMPT_RT.
> 
> Two changes since last time:
> 
>  - fixes !SMP builds (bigeasy)
>  - TLA+ validation of migrate_disable() vs sched_setaffinity() (valsch)
> 
> Esp. that latter resulted in significant changes to patch #10. Huge thanks to
> Valentin.

While I will still work on this, testing more and trying to
analyze the effects from the (rt) scheduling perspective, I
do not see any other issues (kudos to Valentin, impressive
review).

Feel free to add:

Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>

Thanks!
-- Daniel


^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched/proc: Print accurate cpumask vs migrate_disable()
  2020-10-23 10:12 ` [PATCH v4 16/19] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     86fbcd3b4ba2c3e19daf705bc13d90fb53aab648
Gitweb:        https://git.kernel.org/tip/86fbcd3b4ba2c3e19daf705bc13d90fb53aab648
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Mon, 05 Oct 2020 12:49:16 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:01 +01:00

sched/proc: Print accurate cpumask vs migrate_disable()

Ensure /proc/*/status doesn't print 'random' cpumasks due to
migrate_disable().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102347.593984734@infradead.org
---
 fs/proc/array.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65ec202..7052441 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -382,9 +382,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(task->cpus_ptr));
+		   cpumask_pr_args(&task->cpus_mask));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-23 10:12 ` [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  2020-12-26 13:54   ` [PATCH v4 15/19] " Qais Yousef
  1 sibling, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Daniel Bristot de Oliveira, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     a7c81556ec4d341dfdbf2cc478ead89d73e474a7
Gitweb:        https://git.kernel.org/tip/a7c81556ec4d341dfdbf2cc478ead89d73e474a7
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Mon, 28 Sep 2020 17:06:07 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:01 +01:00

sched: Fix migrate_disable() vs rt/dl balancing

In order to minimize the interference of migrate_disable() on lower
priority tasks, which can be deprived of runtime due to being stuck
below a higher priority task, teach the RT/DL balancers to push away
these higher priority tasks when a lower priority task gets selected
to run on a freshly demoted CPU (pull).

This adds migration interference to the higher priority task, but
restores bandwidth to the system that would otherwise be irrevocably lost.
Without this it would be possible to have all tasks on the system
stuck on a single CPU, each task preempted in a migrate_disable()
section with a single high priority task running.

This way we can still approximate running the M highest priority tasks
on the system.

Migrating the top task away is (of course) still subject to
migrate_disable() too, which means the lower task is subject to an
interference equivalent to the worst case migrate_disable() section.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102347.499155098@infradead.org
---
 include/linux/preempt.h | 40 +++++++++++++-----------
 include/linux/sched.h   |  3 +-
 kernel/sched/core.c     | 67 ++++++++++++++++++++++++++++++++++------
 kernel/sched/deadline.c | 29 ++++++++++++-----
 kernel/sched/rt.c       | 63 ++++++++++++++++++++++++++++++--------
 kernel/sched/sched.h    | 32 +++++++++++++++++++-
 6 files changed, 186 insertions(+), 48 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 97ba7c9..8b43922 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 
 /*
- * Migrate-Disable and why it is (strongly) undesired.
- *
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * Migrate-Disable and why it is undesired.
+ *
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete, thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority task, which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing its available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
  *
  *
  * The reason we have it anyway.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 90a0c92..3af9d52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -716,8 +716,9 @@ struct task_struct {
 	cpumask_t			cpus_mask;
 	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
-	int				migration_disabled;
+	unsigned short			migration_disabled;
 #endif
+	unsigned short			migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9ce2fc7..e92d785 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1763,11 +1763,6 @@ void migrate_enable(void)
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return p->migration_disabled;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return rq->nr_pinned;
@@ -1972,6 +1967,49 @@ out:
 	return 0;
 }
 
+int push_cpu_stop(void *arg)
+{
+	struct rq *lowest_rq = NULL, *rq = this_rq();
+	struct task_struct *p = arg;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+
+	if (task_rq(p) != rq)
+		goto out_unlock;
+
+	if (is_migration_disabled(p)) {
+		p->migration_flags |= MDF_PUSH;
+		goto out_unlock;
+	}
+
+	p->migration_flags &= ~MDF_PUSH;
+
+	if (p->sched_class->find_lock_rq)
+		lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+	if (!lowest_rq)
+		goto out_unlock;
+
+	// XXX validate p is still the highest prio task
+	if (task_rq(p) == rq) {
+		deactivate_task(rq, p, 0);
+		set_task_cpu(p, lowest_rq->cpu);
+		activate_task(lowest_rq, p, 0);
+		resched_curr(lowest_rq);
+	}
+
+	double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+	rq->push_busy = false;
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+	return 0;
+}
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
@@ -2052,6 +2090,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		struct task_struct *push_task = NULL;
+
+		if ((flags & SCA_MIGRATE_ENABLE) &&
+		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+			rq->push_busy = true;
+			push_task = get_task_struct(p);
+		}
+
 		pending = p->migration_pending;
 		if (pending) {
 			refcount_inc(&pending->refs);
@@ -2060,6 +2106,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 		}
 		task_rq_unlock(rq, p, rf);
 
+		if (push_task) {
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    p, &rq->push_work);
+		}
+
 		if (complete)
 			goto do_complete;
 
@@ -2098,6 +2149,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 	if (flags & SCA_MIGRATE_ENABLE) {
 
 		refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+		p->migration_flags &= ~MDF_PUSH;
 		task_rq_unlock(rq, p, rf);
 
 		pending->arg = (struct migration_arg) {
@@ -2716,11 +2768,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 
 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return false;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return false;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3d3fd83..eed2e44 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task))
+		return 0;
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq)
 static void pull_dl_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, cpu;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
@@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this_rq)
 			continue;
 
 		/* Might drop this_rq->lock */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this_rq)
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
-			dmin = p->dl.deadline;
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				dmin = p->dl.deadline;
+				resched = true;
+			}
 
 			/* Is there any other task even earlier? */
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
 	.task_woken		= task_woken_dl,
+	.find_lock_rq		= find_lock_later_rq,
 #endif
 
 	.task_tick		= task_tick_dl,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index cf63346..c592e47 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
@@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task)) {
+		struct task_struct *push_task = NULL;
+		int cpu;
+
+		if (!pull || rq->push_busy)
+			return 0;
+
+		cpu = find_lowest_rq(rq->curr);
+		if (cpu == -1 || cpu == rq->cpu)
+			return 0;
+
+		/*
+		 * Given we found a CPU with lower priority than @next_task,
+		 * therefore it should be running. However we cannot migrate it
+		 * to this other CPU, instead attempt to push the current
+		 * running task on this CPU away.
+		 */
+		push_task = get_push_task(rq);
+		if (push_task) {
+			raw_spin_unlock(&rq->lock);
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    push_task, &rq->push_work);
+			raw_spin_lock(&rq->lock);
+		}
+
+		return 0;
+	}
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -1927,12 +1955,10 @@ retry:
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
-	ret = 1;
-
 	resched_curr(lowest_rq);
+	ret = 1;
 
 	double_unlock_balance(rq, lowest_rq);
-
 out:
 	put_task_struct(next_task);
 
@@ -1942,7 +1968,7 @@ out:
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
-	while (push_rt_task(rq))
+	while (push_rt_task(rq, false))
 		;
 }
 
@@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_work *work)
 	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_tasks(rq);
+		while (push_rt_task(rq, true))
+			;
 		raw_spin_unlock(&rq->lock);
 	}
 
@@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, cpu;
 	bool resched = false;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	struct rq *src_rq;
 	int rt_overload_count = rt_overloaded(this_rq);
 
@@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this_rq)
 		 * double_lock_balance, and another CPU could
 		 * alter this_rq
 		 */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this_rq)
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				resched = true;
+			}
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
@@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this_rq)
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class
 	.rq_offline             = rq_offline_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
+	.find_lock_rq		= find_lock_lowest_rq,
 #endif
 
 	.task_tick		= task_tick_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 42de140..56992aa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1057,6 +1057,8 @@ struct rq {
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	unsigned int		nr_pinned;
 #endif
+	unsigned int		push_busy;
+	struct cpu_stop_work	push_work;
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+#define MDF_PUSH	0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	return p->migration_disabled;
+#else
+	return false;
+#endif
+}
 
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
@@ -1823,6 +1835,8 @@ struct sched_class {
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
+
+	struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
 #endif
 
 	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1918,6 +1932,24 @@ extern void trigger_load_balance(struct rq *rq);
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+	struct task_struct *p = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (rq->push_busy)
+		return NULL;
+
+	if (p->nr_cpus_allowed == 1)
+		return NULL;
+
+	rq->push_busy = true;
+	return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
 #endif
 
 #ifdef CONFIG_CPU_IDLE

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched,rt: Use the full cpumask for balancing
  2020-10-23 10:12 ` [PATCH v4 13/19] sched,rt: Use the full cpumask for balancing Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     95158a89dd50035b4ff5b8aa913854166b50fe6d
Gitweb:        https://git.kernel.org/tip/95158a89dd50035b4ff5b8aa913854166b50fe6d
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 01 Oct 2020 16:05:39 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:00 +01:00

sched,rt: Use the full cpumask for balancing

We want migrate_disable() tasks to get PULLs in order for them to PUSH
away the higher priority task.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102347.310519774@infradead.org
---
 kernel/sched/cpudeadline.c | 4 ++--
 kernel/sched/cpupri.c      | 4 ++--
 kernel/sched/deadline.c    | 4 ++--
 kernel/sched/rt.c          | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8cb06c8..ceb03d7 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
 		unsigned long cap, max_cap = 0;
 		int cpu, max_cpu = -1;
 
@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 
 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
-		if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
+		if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
 		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
 			if (later_mask)
 				cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 0033731..11c4df2 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -73,11 +73,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
 	if (skip)
 		return 0;
 
-	if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
 		return 0;
 
 	if (lowest_mask) {
-		cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
 
 		/*
 		 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 206a070..3d3fd83 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1912,7 +1912,7 @@ static void task_fork_dl(struct task_struct *p)
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 	return 0;
 }
@@ -2062,7 +2062,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2525a1b..cf63346 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1658,7 +1658,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, p->cpus_ptr))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 
 	return 0;
@@ -1811,7 +1811,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
+				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched, lockdep: Annotate ->pi_lock recursion
  2020-10-23 10:12 ` [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
  2020-10-29 16:27   ` Valentin Schneider
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     ded467dc83ac7173f1532bb0faa25022ff8769e5
Gitweb:        https://git.kernel.org/tip/ded467dc83ac7173f1532bb0faa25022ff8769e5
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 01 Oct 2020 16:13:01 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:01 +01:00

sched, lockdep: Annotate ->pi_lock recursion

There's a valid ->pi_lock recursion issue where the actual PI code
tries to wake up the stop task. Make lockdep aware so it doesn't
complain about this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102347.406912197@infradead.org
---
 kernel/sched/core.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6ea593c..9ce2fc7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2658,6 +2658,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
+	static struct lock_class_key stop_pi_lock;
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
 
@@ -2673,6 +2674,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 
 		stop->sched_class = &stop_sched_class;
+
+		/*
+		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+		 * adjust the effective priority of a task. As a result,
+		 * rt_mutex_setprio() can trigger (RT) balancing operations,
+		 * which can then trigger wakeups of the stop thread to push
+		 * around the current task.
+		 *
+		 * The stop task itself will never be part of the PI-chain, it
+		 * never blocks, therefore that ->pi_lock recursion is safe.
+		 * Tell lockdep about this by placing the stop->pi_lock in its
+		 * own class.
+		 */
+		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
 	}
 
 	cpu_rq(cpu)->stop = stop;

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-23 10:12 ` [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
  2020-10-29 16:27   ` Valentin Schneider
@ 2020-11-11  8:23   ` tip-bot2 for Thomas Gleixner
  2020-11-13 15:06   ` [PATCH v4 11/19] " Qian Cai
  2 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Thomas Gleixner @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     3015ef4b98f53fe7eba4f5f82f562c0e074d213c
Gitweb:        https://git.kernel.org/tip/3015ef4b98f53fe7eba4f5f82f562c0e074d213c
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Wed, 26 Aug 2020 14:08:10 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:00 +01:00

sched/core: Make migrate disable and CPU hotplug cooperative

On CPU unplug, tasks which are in a migrate disabled region cannot be pushed
to a different CPU until they have returned to a migratable state.

Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102347.067278757@infradead.org
---
 kernel/sched/core.c  | 36 ++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |  4 ++++
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0efc1e4..6ea593c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1721,10 +1721,17 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
 
 void migrate_disable(void)
 {
-	if (current->migration_disabled++)
+	struct task_struct *p = current;
+
+	if (p->migration_disabled) {
+		p->migration_disabled++;
 		return;
+	}
 
-	barrier();
+	preempt_disable();
+	this_rq()->nr_pinned++;
+	p->migration_disabled = 1;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
@@ -1751,6 +1758,7 @@ void migrate_enable(void)
 	 */
 	barrier();
 	p->migration_disabled = 0;
+	this_rq()->nr_pinned--;
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1760,6 +1768,11 @@ static inline bool is_migration_disabled(struct task_struct *p)
 	return p->migration_disabled;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return rq->nr_pinned;
+}
+
 #endif
 
 /*
@@ -2693,6 +2706,11 @@ static inline bool is_migration_disabled(struct task_struct *p)
 	return false;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return false;
+}
+
 #endif
 
 static void
@@ -7066,15 +7084,20 @@ static void balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task)) {
+	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
 		/*
 		 * If this is the idle task on the outgoing CPU try to wake
 		 * up the hotplug control thread which might wait for the
 		 * last task to vanish. The rcuwait_active() check is
 		 * accurate here because the waiter is pinned on this CPU
 		 * and can't obviously be running in parallel.
+		 *
+		 * On RT kernels this also has to check whether there are
+		 * pinned and scheduled out tasks on the runqueue. They
+		 * need to leave the migrate disabled section first.
 		 */
-		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+		if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+		    rcuwait_active(&rq->hotplug_wait)) {
 			raw_spin_unlock(&rq->lock);
 			rcuwait_wake_up(&rq->hotplug_wait);
 			raw_spin_lock(&rq->lock);
@@ -7121,7 +7144,8 @@ static void balance_hotplug_wait(void)
 {
 	struct rq *rq = this_rq();
 
-	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+	rcuwait_wait_event(&rq->hotplug_wait,
+			   rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
 			   TASK_UNINTERRUPTIBLE);
 }
 
@@ -7366,7 +7390,7 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	BUG_ON(rq->nr_running != 1);
+	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
 	rq_unlock_irqrestore(rq, &rf);
 
 	calc_load_migrate(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72d8e47..42de140 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
 #endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	unsigned int		nr_pinned;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched,rt: Use cpumask_any*_distribute()
  2020-10-23 10:12 ` [PATCH v4 12/19] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     14e292f8d45380c519a83d9b0f37089a17eedcdf
Gitweb:        https://git.kernel.org/tip/14e292f8d45380c519a83d9b0f37089a17eedcdf
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 01 Oct 2020 15:54:14 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:00 +01:00

sched,rt: Use cpumask_any*_distribute()

Replace a bunch of cpumask_any*() instances with
cpumask_any*_distribute(). By injecting this little bit of randomness
into CPU selection, we reduce the chance that two competing balance
operations working off the same lowest_mask pick the same CPU.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102347.190759694@infradead.org
---
 include/linux/cpumask.h |  6 ++++++
 kernel/sched/deadline.c |  6 +++---
 kernel/sched/rt.c       |  6 +++---
 lib/cpumask.c           | 18 ++++++++++++++++++
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index f0d895d..383684e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
 	return cpumask_next_and(-1, src1p, src2p);
 }
 
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e97c7c2..206a070 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2002,8 +2002,8 @@ static int find_later_rq(struct task_struct *task)
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(later_mask,
-							sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(later_mask,
+							      sched_domain_span(sd));
 			/*
 			 * Last chance: if a CPU being in both later_mask
 			 * and current sd span is valid, that becomes our
@@ -2025,7 +2025,7 @@ static int find_later_rq(struct task_struct *task)
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(later_mask);
+	cpu = cpumask_any_distribute(later_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 40a4663..2525a1b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_struct *task)
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(lowest_mask,
-						     sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(lowest_mask,
+							      sched_domain_span(sd));
 			if (best_cpu < nr_cpu_ids) {
 				rcu_read_unlock();
 				return best_cpu;
@@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_struct *task)
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(lowest_mask);
+	cpu = cpumask_any_distribute(lowest_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 85da6ab..3592402 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
 	return next;
 }
 EXPORT_SYMBOL(cpumask_any_and_distribute);
+
+int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	int next, prev;
+
+	/* NOTE: our first selection will skip 0. */
+	prev = __this_cpu_read(distribute_cpu_mask_prev);
+
+	next = cpumask_next(prev, srcp);
+	if (next >= nr_cpu_ids)
+		next = cpumask_first(srcp);
+
+	if (next < nr_cpu_ids)
+		__this_cpu_write(distribute_cpu_mask_prev, next);
+
+	return next;
+}
+EXPORT_SYMBOL(cpumask_any_distribute);

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched: Add migrate_disable()
  2020-10-23 10:12 ` [PATCH v4 09/19] sched: Add migrate_disable() Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     af449901b84c98cbd84a0113223ba3bcfcb12a26
Gitweb:        https://git.kernel.org/tip/af449901b84c98cbd84a0113223ba3bcfcb12a26
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 17 Sep 2020 10:38:30 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:59 +01:00

sched: Add migrate_disable()

Add the base migrate_disable() support (under protest).

While migrate_disable() is (currently) required for PREEMPT_RT, it is
also one of the biggest flaws in the system.

Notably this is just the base implementation, it is broken vs
sched_setaffinity() and hotplug, both solved in additional patches for
ease of review.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.818170844@infradead.org
---
 include/linux/preempt.h |  65 +++++++++++++++++++++++-
 include/linux/sched.h   |   3 +-
 kernel/sched/core.c     | 112 ++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |   6 +-
 lib/smp_processor_id.c  |   5 ++-
 5 files changed, 183 insertions(+), 8 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7d9c1c0..97ba7c9 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,6 +322,69 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+/*
+ * Migrate-Disable and why it is (strongly) undesired.
+ *
+ * The premise of the Real-Time schedulers we have on Linux
+ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+ * concurrently, provided there are sufficient runnable tasks, also known as
+ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+ *
+ * The correctness of various scheduling models depends on this, but it is
+ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+ * preempt_disable() implies an immediate priority ceiling, preemptible
+ * migrate_disable() allows nesting.
+ *
+ * The worst case is that all tasks preempt one another in a migrate_disable()
+ * region and stack on a single CPU. This then reduces the available bandwidth
+ * to a single CPU. And since Real-Time schedulability theory considers the
+ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+ * (instantly solving the SMP analysis problem).
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migrate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on arbitrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depend on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
+ *
+ */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#elif defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+
+#else /* !CONFIG_PREEMPT_RT */
+
 /**
  * migrate_disable - Prevent migration of the current task
  *
@@ -352,4 +415,6 @@ static __always_inline void migrate_enable(void)
 	preempt_enable();
 }
 
+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
+
 #endif /* __LINUX_PREEMPT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 063cd12..0732356 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -714,6 +714,9 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	int				migration_disabled;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 396accb..6a3f1c2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1696,6 +1696,61 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_PREEMPT_RT
+
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask,
+				  u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+	if (likely(!p->migration_disabled))
+		return;
+
+	if (p->cpus_ptr != &p->cpus_mask)
+		return;
+
+	/*
+	 * Violates locking rules! see comment in __do_set_cpus_allowed().
+	 */
+	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+	if (current->migration_disabled++)
+		return;
+
+	barrier();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+	struct task_struct *p = current;
+
+	if (--p->migration_disabled)
+		return;
+
+	barrier();
+
+	if (p->cpus_ptr == &p->cpus_mask)
+		return;
+
+	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return p->migration_disabled;
+}
+
+#endif
+
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1705,7 +1760,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1826,6 +1881,11 @@ static int migration_cpu_stop(void *data)
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
+	if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+		p->cpus_ptr = new_mask;
+		return;
+	}
+
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
@@ -1836,7 +1896,22 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
 	struct rq *rq = task_rq(p);
 	bool queued, running;
 
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * This here violates the locking rules for affinity, since we're only
+	 * supposed to change these variables while holding both rq->lock and
+	 * p->pi_lock.
+	 *
+	 * HOWEVER, it magically works, because ttwu() is the only code that
+	 * accesses these variables under p->pi_lock and only does so after
+	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+	 * before finish_task().
+	 *
+	 * XXX do further audits, this smells like something putrid.
+	 */
+	if (flags & SCA_MIGRATE_DISABLE)
+		SCHED_WARN_ON(!p->on_cpu);
+	else
+		lockdep_assert_held(&p->pi_lock);
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
@@ -1887,9 +1962,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD) {
+	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs
+		 * Kernel threads are allowed on online && !active CPUs.
+		 *
+		 * Specifically, migration_disabled() tasks must not fail the
+		 * cpumask_any_and_distribute() pick below, esp. so on
+		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
+		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
 		 */
 		cpu_valid_mask = cpu_online_mask;
 	}
@@ -1903,7 +1983,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_mask, new_mask))
+	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
 	/*
@@ -1995,6 +2075,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
 	 */
 	WARN_ON_ONCE(!cpu_online(new_cpu));
+
+	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2325,6 +2407,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			}
 			fallthrough;
 		case possible:
+			/*
+			 * XXX When called from select_task_rq() we only
+			 * hold p->pi_lock and again violate locking order.
+			 *
+			 * More yuck to audit.
+			 */
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
 			break;
@@ -2359,7 +2447,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1)
+	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(p->cpus_ptr);
@@ -2421,6 +2509,17 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 
 #endif /* CONFIG_SMP */
 
+#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return false;
+}
+
+#endif
+
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
@@ -4570,6 +4669,7 @@ static void __sched notrace __schedule(bool preempt)
 		 */
 		++*switch_count;
 
+		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0420d80..72d8e47 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1902,14 +1902,16 @@ static inline bool sched_fair_runnable(struct rq *rq)
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_next_task_idle(struct rq *rq);
 
+#define SCA_CHECK		0x01
+#define SCA_MIGRATE_DISABLE	0x02
+#define SCA_MIGRATE_ENABLE	0x04
+
 #ifdef CONFIG_SMP
 
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
 
-#define SCA_CHECK		0x01
-
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 525222e..faaa927 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
 	if (current->nr_cpus_allowed == 1)
 		goto out;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	if (current->migration_disabled)
+		goto out;
+#endif
+
 	/*
 	 * It is valid to assume CPU-locality during early bootup:
 	 */

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-23 10:12 ` [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  2020-11-12 16:38   ` [PATCH v4 10/19] " Qian Cai
  1 sibling, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     6d337eab041d56bb8f0e7794f39906c21054c512
Gitweb:        https://git.kernel.org/tip/6d337eab041d56bb8f0e7794f39906c21054c512
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 18 Sep 2020 17:24:31 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:39:00 +01:00

sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

Concurrent migrate_disable() and set_cpus_allowed_ptr() has
interesting features. We rely on set_cpus_allowed_ptr() to not return
until the task runs inside the provided mask. This expectation is
exported to userspace.

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

	preempt_disable();
	migrate_disable();
	...
	migrate_enable();
	preempt_enable();

And:

	raw_spin_lock(&B);
	spin_unlock(&A);

this means that when migrate_enable() must restore the affinity
mask, it cannot wait for completion thereof. Luck will have it that
that is exactly the case where there is a pending
set_cpus_allowed_ptr(), so let that provide storage for the async stop
machine.

Many thanks to Valentin, who used TLA+ most effectively and found lots of
'interesting' cases.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.921768277@infradead.org
---
 include/linux/sched.h |   1 +-
 kernel/sched/core.c   | 236 +++++++++++++++++++++++++++++++++++------
 2 files changed, 207 insertions(+), 30 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0732356..90a0c92 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -714,6 +714,7 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 	int				migration_disabled;
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6a3f1c2..0efc1e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1732,15 +1732,26 @@ void migrate_enable(void)
 {
 	struct task_struct *p = current;
 
-	if (--p->migration_disabled)
+	if (p->migration_disabled > 1) {
+		p->migration_disabled--;
 		return;
+	}
 
+	/*
+	 * Ensure stop_task runs either before or after this, and that
+	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+	 */
+	preempt_disable();
+	if (p->cpus_ptr != &p->cpus_mask)
+		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
+	 * regular cpus_mask, otherwise things that race (eg.
+	 * select_fallback_rq) get confused.
+	 */
 	barrier();
-
-	if (p->cpus_ptr == &p->cpus_mask)
-		return;
-
-	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	p->migration_disabled = 0;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
@@ -1805,8 +1816,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 }
 
 struct migration_arg {
-	struct task_struct *task;
-	int dest_cpu;
+	struct task_struct		*task;
+	int				dest_cpu;
+	struct set_affinity_pending	*pending;
+};
+
+struct set_affinity_pending {
+	refcount_t		refs;
+	struct completion	done;
+	struct cpu_stop_work	stop_work;
+	struct migration_arg	arg;
 };
 
 /*
@@ -1838,16 +1857,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
  */
 static int migration_cpu_stop(void *data)
 {
+	struct set_affinity_pending *pending;
 	struct migration_arg *arg = data;
 	struct task_struct *p = arg->task;
+	int dest_cpu = arg->dest_cpu;
 	struct rq *rq = this_rq();
+	bool complete = false;
 	struct rq_flags rf;
 
 	/*
 	 * The original target CPU might have gone down and we might
 	 * be on another CPU but it doesn't matter.
 	 */
-	local_irq_disable();
+	local_irq_save(rf.flags);
 	/*
 	 * We need to explicitly wake pending tasks before running
 	 * __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1857,21 +1879,83 @@ static int migration_cpu_stop(void *data)
 
 	raw_spin_lock(&p->pi_lock);
 	rq_lock(rq, &rf);
+
+	pending = p->migration_pending;
 	/*
 	 * If task_rq(p) != rq, it cannot be migrated here, because we're
 	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
 	 * we're holding p->pi_lock.
 	 */
 	if (task_rq(p) == rq) {
+		if (is_migration_disabled(p))
+			goto out;
+
+		if (pending) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
+
+		/* migrate_enable() --  we must not race against SCA */
+		if (dest_cpu < 0) {
+			/*
+			 * When this was migrate_enable() but we no longer
+			 * have a @pending, a concurrent SCA 'fixed' things
+			 * and we should be valid again. Nothing to do.
+			 */
+			if (!pending) {
+				WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+				goto out;
+			}
+
+			dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+		}
+
 		if (task_on_rq_queued(p))
-			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+			rq = __migrate_task(rq, &rf, p, dest_cpu);
 		else
-			p->wake_cpu = arg->dest_cpu;
+			p->wake_cpu = dest_cpu;
+
+	} else if (dest_cpu < 0) {
+		/*
+		 * This happens when we get migrated between migrate_enable()'s
+		 * preempt_enable() and scheduling the stopper task. At that
+		 * point we're a regular task again and not current anymore.
+		 *
+		 * A !PREEMPT kernel has a giant hole here, which makes it far
+		 * more likely.
+		 */
+
+		/*
+		 * When this was migrate_enable() but we no longer have an
+		 * @pending, a concurrent SCA 'fixed' things and we should be
+		 * valid again. Nothing to do.
+		 */
+		if (!pending) {
+			WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+			goto out;
+		}
+
+		/*
+		 * When migrate_enable() hits a rq mis-match we can't reliably
+		 * determine is_migration_disabled() and so have to chase after
+		 * it.
+		 */
+		task_rq_unlock(rq, p, &rf);
+		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+		return 0;
 	}
-	rq_unlock(rq, &rf);
-	raw_spin_unlock(&p->pi_lock);
+out:
+	task_rq_unlock(rq, p, &rf);
+
+	if (complete)
+		complete_all(&pending->done);
+
+	/* For pending->{arg,stop_work} */
+	pending = arg->pending;
+	if (pending && refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
 
-	local_irq_enable();
 	return 0;
 }
 
@@ -1941,6 +2025,112 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 }
 
 /*
+ * This function is wildly self concurrent, consider at least 3 times.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+			    int dest_cpu, unsigned int flags)
+{
+	struct set_affinity_pending my_pending = { }, *pending = NULL;
+	struct migration_arg arg = {
+		.task = p,
+		.dest_cpu = dest_cpu,
+	};
+	bool complete = false;
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		pending = p->migration_pending;
+		if (pending) {
+			refcount_inc(&pending->refs);
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+		if (complete)
+			goto do_complete;
+
+		return 0;
+	}
+
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		/* serialized by p->pi_lock */
+		if (!p->migration_pending) {
+			refcount_set(&my_pending.refs, 1);
+			init_completion(&my_pending.done);
+			p->migration_pending = &my_pending;
+		} else {
+			pending = p->migration_pending;
+			refcount_inc(&pending->refs);
+		}
+	}
+	pending = p->migration_pending;
+	/*
+	 * - !MIGRATE_ENABLE:
+	 *   we'll have installed a pending if there wasn't one already.
+	 *
+	 * - MIGRATE_ENABLE:
+	 *   we're here because the current CPU isn't matching anymore,
+	 *   the only way that can happen is because of a concurrent
+	 *   set_cpus_allowed_ptr() call, which should then still be
+	 *   pending completion.
+	 *
+	 * Either way, we really should have a @pending here.
+	 */
+	if (WARN_ON_ONCE(!pending)) {
+		task_rq_unlock(rq, p, rf);
+		return -EINVAL;
+	}
+
+	if (flags & SCA_MIGRATE_ENABLE) {
+
+		refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+		task_rq_unlock(rq, p, rf);
+
+		pending->arg = (struct migration_arg) {
+			.task = p,
+			.dest_cpu = -1,
+			.pending = pending,
+		};
+
+		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+
+		return 0;
+	}
+
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+		task_rq_unlock(rq, p, rf);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+	} else {
+
+		if (!is_migration_disabled(p)) {
+			if (task_on_rq_queued(p))
+				rq = move_queued_task(rq, rf, p, dest_cpu);
+
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+do_complete:
+		if (complete)
+			complete_all(&pending->done);
+	}
+
+	wait_for_completion(&pending->done);
+
+	if (refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
+
+	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+	return 0;
+}
+
+/*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
  * is removed from the allowed bitmask.
@@ -2009,23 +2199,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 			p->nr_cpus_allowed != 1);
 	}
 
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
+	return affine_move_task(rq, p, &rf, dest_cpu, flags);
 
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &rf);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		return 0;
-	} else if (task_on_rq_queued(p)) {
-		/*
-		 * OK, since we're going to drop the lock immediately
-		 * afterwards anyway.
-		 */
-		rq = move_queued_task(rq, &rf, p, dest_cpu);
-	}
 out:
 	task_rq_unlock(rq, p, &rf);
 
@@ -3205,6 +3380,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
+	p->migration_pending = NULL;
 #endif
 }
 

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched: Massage set_cpus_allowed()
  2020-10-23 10:12 ` [PATCH v4 08/19] sched: Massage set_cpus_allowed() Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     9cfc3e18adb0362533e911bf3ce6ec8c821cfccc
Gitweb:        https://git.kernel.org/tip/9cfc3e18adb0362533e911bf3ce6ec8c821cfccc
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 16 Sep 2020 14:59:08 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:59 +01:00

sched: Massage set_cpus_allowed()

Thread a u32 flags word through the *set_cpus_allowed*() callchain.
This will allow adding behavioural tweaks for future users.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.729082820@infradead.org
---
 kernel/sched/core.c     | 28 ++++++++++++++++++----------
 kernel/sched/deadline.c |  5 +++--
 kernel/sched/sched.h    |  7 +++++--
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dcb88a0..396accb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1824,13 +1824,14 @@ static int migration_cpu_stop(void *data)
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
  */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	struct rq *rq = task_rq(p);
 	bool queued, running;
@@ -1851,7 +1852,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (running)
 		put_prev_task(rq, p);
 
-	p->sched_class->set_cpus_allowed(p, new_mask);
+	p->sched_class->set_cpus_allowed(p, new_mask, flags);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1859,6 +1860,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		set_next_task(rq, p);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask, 0);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1869,7 +1875,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask, bool check)
+				  const struct cpumask *new_mask,
+				  u32 flags)
 {
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	unsigned int dest_cpu;
@@ -1891,7 +1898,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
 	 */
-	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1910,7 +1917,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		goto out;
 	}
 
-	do_set_cpus_allowed(p, new_mask);
+	__do_set_cpus_allowed(p, new_mask, flags);
 
 	if (p->flags & PF_KTHREAD) {
 		/*
@@ -1947,7 +1954,7 @@ out:
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	return __set_cpus_allowed_ptr(p, new_mask, false);
+	return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
@@ -2406,7 +2413,8 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
 #else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-					 const struct cpumask *new_mask, bool check)
+					 const struct cpumask *new_mask,
+					 u32 flags)
 {
 	return set_cpus_allowed_ptr(p, new_mask);
 }
@@ -6006,7 +6014,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -6590,7 +6598,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	 *
 	 * And since this is boot we can forgo the serialization.
 	 */
-	set_cpus_allowed_common(idle, cpumask_of(cpu));
+	set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
 #endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 77880fe..e97c7c2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2301,7 +2301,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
-				const struct cpumask *new_mask)
+				const struct cpumask *new_mask,
+				u32 flags)
 {
 	struct root_domain *src_rd;
 	struct rq *rq;
@@ -2330,7 +2331,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	set_cpus_allowed_common(p, new_mask);
+	set_cpus_allowed_common(p, new_mask, flags);
 }
 
 /* Assumes rq->lock is held */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c6f707a..0420d80 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1814,7 +1814,8 @@ struct sched_class {
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
-				 const struct cpumask *newmask);
+				 const struct cpumask *newmask,
+				 u32 flags);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
@@ -1907,7 +1908,9 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
 
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+#define SCA_CHECK		0x01
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
 

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched/hotplug: Consolidate task migration on CPU unplug
  2020-10-23 10:12 ` [PATCH v4 06/19] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Thomas Gleixner
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Thomas Gleixner @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     1cf12e08bc4d50a76b80c42a3109c53d8794a0c9
Gitweb:        https://git.kernel.org/tip/1cf12e08bc4d50a76b80c42a3109c53d8794a0c9
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Wed, 16 Sep 2020 09:27:18 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:58 +01:00

sched/hotplug: Consolidate task migration on CPU unplug

With the new mechanism which kicks tasks off the outgoing CPU at the end of
schedule() the situation on an outgoing CPU right before the stopper thread
brings it down completely is:

 - All user tasks and all unbound kernel threads have either been migrated
   away or are not running and the next wakeup will move them to an online CPU.

 - All per CPU kernel threads, except the CPU hotplug thread and the stopper
   thread, have either been unbound or parked by the responsible CPU hotplug
   callback.

That means that at the last step before the stopper thread is invoked the
cpu hotplug thread is the last legitimate running task on the outgoing
CPU.

Add a final wait step right before the stopper thread is kicked which
ensures that any still running tasks on the way to park or on the way to
kick themselves off the CPU are either sleeping or gone.

This allows removing the migrate_tasks() crutch in sched_cpu_dying(). If
sched_cpu_dying() detects that there is still another running task aside from
the stopper thread then it will explode with the appropriate fireworks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.547163969@infradead.org
---
 include/linux/cpuhotplug.h    |   1 +-
 include/linux/sched/hotplug.h |   2 +-
 kernel/cpu.c                  |   9 +-
 kernel/sched/core.c           | 154 +++++++--------------------------
 4 files changed, 46 insertions(+), 120 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index bc56287..0042ef3 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,6 +152,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index 9a62ffd..412cdab 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty	NULL
 # define sched_cpu_dying	NULL
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ff2578..fa535ea 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.name			= "ap:online",
 	},
 	/*
-	 * Handled on controll processor until the plugged processor manages
+	 * Handled on control processor until the plugged processor manages
 	 * this itself.
 	 */
 	[CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.teardown.single	= takedown_cpu,
 		.cant_stop		= true,
 	},
+
+	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
+		.name			= "sched:waitempty",
+		.startup.single		= NULL,
+		.teardown.single	= sched_cpu_wait_empty,
+	},
+
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1093c4..6c89806 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6741,120 +6741,6 @@ void idle_task_exit(void)
 	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
-	long delta = calc_load_fold_active(rq, 1);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-}
-
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
-	const struct sched_class *class;
-	struct task_struct *next;
-
-	for_each_class(class) {
-		next = class->pick_next_task(rq);
-		if (next) {
-			next->sched_class->put_prev_task(rq, next);
-			return next;
-		}
-	}
-
-	/* The idle class should always have a runnable task */
-	BUG();
-}
-
-/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
-{
-	struct rq *rq = dead_rq;
-	struct task_struct *next, *stop = rq->stop;
-	struct rq_flags orf = *rf;
-	int dest_cpu;
-
-	/*
-	 * Fudge the rq selection such that the below task selection loop
-	 * doesn't get stuck on the currently eligible stop task.
-	 *
-	 * We're currently inside stop_machine() and the rq is either stuck
-	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
-	 * either way we should never end up calling schedule() until we're
-	 * done here.
-	 */
-	rq->stop = NULL;
-
-	/*
-	 * put_prev_task() and pick_next_task() sched
-	 * class method both need to have an up-to-date
-	 * value of rq->clock[_task]
-	 */
-	update_rq_clock(rq);
-
-	for (;;) {
-		/*
-		 * There's this thread running, bail when that's the only
-		 * remaining thread:
-		 */
-		if (rq->nr_running == 1)
-			break;
-
-		next = __pick_migrate_task(rq);
-
-		/*
-		 * Rules for changing task_struct::cpus_mask are holding
-		 * both pi_lock and rq->lock, such that holding either
-		 * stabilizes the mask.
-		 *
-		 * Drop rq->lock is not quite as disastrous as it usually is
-		 * because !cpu_active at this point, which means load-balance
-		 * will not interfere. Also, stop-machine.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&next->pi_lock);
-		rq_relock(rq, rf);
-
-		/*
-		 * Since we're inside stop-machine, _nothing_ should have
-		 * changed the task, WARN if weird stuff happened, because in
-		 * that case the above rq->lock drop is a fail too.
-		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
-			raw_spin_unlock(&next->pi_lock);
-			continue;
-		}
-
-		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-		rq = __migrate_task(rq, rf, next, dest_cpu);
-		if (rq != dead_rq) {
-			rq_unlock(rq, rf);
-			rq = dead_rq;
-			*rf = orf;
-			rq_relock(rq, rf);
-		}
-		raw_spin_unlock(&next->pi_lock);
-	}
-
-	rq->stop = stop;
-}
-
 static int __balance_push_cpu_stop(void *arg)
 {
 	struct task_struct *p = arg;
@@ -7123,10 +7009,6 @@ int sched_cpu_deactivate(unsigned int cpu)
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
-
-	/* Wait for all non per CPU kernel threads to vanish. */
-	balance_hotplug_wait();
-
 	return 0;
 }
 
@@ -7146,6 +7028,41 @@ int sched_cpu_starting(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+	balance_hotplug_wait();
+	return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+	long delta = calc_load_fold_active(rq, 1);
+
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7159,7 +7076,6 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched: Fix hotplug vs CPU bandwidth control
  2020-10-23 10:12 ` [PATCH v4 07/19] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     120455c514f7321981c907a01c543b05aff3f254
Gitweb:        https://git.kernel.org/tip/120455c514f7321981c907a01c543b05aff3f254
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 25 Sep 2020 16:42:31 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:59 +01:00

sched: Fix hotplug vs CPU bandwidth control

Since we now migrate tasks away before DYING, we should also move
bandwidth unthrottle, otherwise we can gain tasks from unthrottle
after we expect all tasks to be gone already.

Also, it looks like the RT balancers don't respect cpu_active() and
instead rely on rq->online in part, complete this. This too requires
we do set_rq_offline() earlier to match the cpu_active() semantics.
(The bigger patch is to convert RT to cpu_active() entirely)

Since set_rq_online() is called from sched_cpu_activate(), place
set_rq_offline() in sched_cpu_deactivate().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.639538965@infradead.org
---
 kernel/sched/core.c     | 14 ++++++++++----
 kernel/sched/deadline.c |  2 +-
 kernel/sched/rt.c       |  2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6c89806..dcb88a0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6977,6 +6977,8 @@ int sched_cpu_activate(unsigned int cpu)
 
 int sched_cpu_deactivate(unsigned int cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
 	int ret;
 
 	set_cpu_active(cpu, false);
@@ -6991,6 +6993,14 @@ int sched_cpu_deactivate(unsigned int cpu)
 
 	balance_push_set(cpu, true);
 
+	rq_lock_irqsave(rq, &rf);
+	if (rq->rd) {
+		update_rq_clock(rq);
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+	rq_unlock_irqrestore(rq, &rf);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -7072,10 +7082,6 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	if (rq->rd) {
-		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-		set_rq_offline(rq);
-	}
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f232305..77880fe 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -543,7 +543,7 @@ static int push_dl_task(struct rq *rq);
 
 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 {
-	return dl_task(prev);
+	return rq->online && dl_task(prev);
 }
 
 static DEFINE_PER_CPU(struct callback_head, dl_push_head);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 49ec096..40a4663 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq);
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	return rq->online && rq->rt.highest_prio.curr > prev->prio;
 }
 
 static inline int rt_overloaded(struct rq *rq)

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched/core: Wait for tasks being pushed away on hotplug
  2020-10-23 10:12 ` [PATCH v4 04/19] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Thomas Gleixner
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Thomas Gleixner @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     f2469a1fb43f85d243ce72638367fb6e15c33491
Gitweb:        https://git.kernel.org/tip/f2469a1fb43f85d243ce72638367fb6e15c33491
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Mon, 14 Sep 2020 14:47:28 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:58 +01:00

sched/core: Wait for tasks being pushed away on hotplug

RT kernels need to ensure that all tasks which are not per CPU kthreads
have left the outgoing CPU to guarantee that no tasks are force migrated
within a migrate disabled section.

There is also some desire to (ab)use fine grained CPU hotplug control to
clear a CPU from active state to force migrate tasks which are not per CPU
kthreads away for power control purposes.

Add a mechanism which waits until all tasks which should leave the CPU
after the CPU active flag is cleared have moved to a different online CPU.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.377836842@infradead.org
---
 kernel/sched/core.c  | 40 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |  4 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1f8bfc9..e1093c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6896,8 +6896,21 @@ static void balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task))
+	if (is_per_cpu_kthread(push_task)) {
+		/*
+		 * If this is the idle task on the outgoing CPU try to wake
+		 * up the hotplug control thread which might wait for the
+		 * last task to vanish. The rcuwait_active() check is
+		 * accurate here because the waiter is pinned on this CPU
+		 * and can't obviously be running in parallel.
+		 */
+		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+			raw_spin_unlock(&rq->lock);
+			rcuwait_wake_up(&rq->hotplug_wait);
+			raw_spin_lock(&rq->lock);
+		}
 		return;
+	}
 
 	get_task_struct(push_task);
 	/*
@@ -6928,6 +6941,20 @@ static void balance_push_set(int cpu, bool on)
 	rq_unlock_irqrestore(rq, &rf);
 }
 
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+	struct rq *rq = this_rq();
+
+	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+			   TASK_UNINTERRUPTIBLE);
+}
+
 #else
 
 static inline void balance_push(struct rq *rq)
@@ -6938,6 +6965,10 @@ static inline void balance_push_set(int cpu, bool on)
 {
 }
 
+static inline void balance_hotplug_wait(void)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -7092,6 +7123,10 @@ int sched_cpu_deactivate(unsigned int cpu)
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+
+	/* Wait for all non per CPU kernel threads to vanish. */
+	balance_hotplug_wait();
+
 	return 0;
 }
 
@@ -7332,6 +7367,9 @@ void __init sched_init(void)
 
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+		rcuwait_init(&rq->hotplug_wait);
+#endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a71ac84..c6f707a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1004,6 +1004,10 @@ struct rq {
 
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	struct rcuwait		hotplug_wait;
+#endif
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] workqueue: Manually break affinity on hotplug
  2020-10-23 10:12 ` [PATCH v4 05/19] workqueue: Manually break affinity " Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Tejun Heo, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     06249738a41a70f2201a148866899f84cbebc45e
Gitweb:        https://git.kernel.org/tip/06249738a41a70f2201a148866899f84cbebc45e
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 25 Sep 2020 15:45:11 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:58 +01:00

workqueue: Manually break affinity on hotplug

Don't rely on the scheduler to force break affinity for us -- it will
stop doing that for per-cpu-kthreads.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.464718669@infradead.org
---
 kernel/workqueue.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 437935e..c71da2a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4908,6 +4908,10 @@ static void unbind_workers(int cpu)
 		pool->flags |= POOL_DISASSOCIATED;
 
 		raw_spin_unlock_irq(&pool->lock);
+
+		for_each_pool_worker(worker, pool)
+			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+
 		mutex_unlock(&wq_pool_attach_mutex);
 
 		/*

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched: Fix balance_callback()
  2020-10-23 10:12 ` [PATCH v4 02/19] sched: Fix balance_callback() Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  2020-11-11 20:30     ` Paul Bolle
  0 siblings, 1 reply; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Scott Wood, Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     565790d28b1e33ee2f77bad5348b99f6dfc366fd
Gitweb:        https://git.kernel.org/tip/565790d28b1e33ee2f77bad5348b99f6dfc366fd
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Mon, 11 May 2020 14:13:00 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:57 +01:00

sched: Fix balance_callback()

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme;
balance_callback() was called after rq->lock was dropped, which means
another CPU can interleave and touch the callback list.

Rework code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.

Reported-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.203901269@infradead.org
---
 kernel/sched/core.c  | 119 ++++++++++++++++++++++++++----------------
 kernel/sched/sched.h |   3 +-
 2 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e24104..0196a3f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3485,6 +3485,69 @@ static inline void finish_task(struct task_struct *prev)
 #endif
 }
 
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	void (*func)(struct rq *rq);
+	struct callback_head *next;
+
+	lockdep_assert_held(&rq->lock);
+
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+
+		func(rq);
+	}
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	struct callback_head *head = rq->balance_callback;
+
+	lockdep_assert_held(&rq->lock);
+	if (head)
+		rq->balance_callback = NULL;
+
+	return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+	do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	unsigned long flags;
+
+	if (unlikely(head)) {
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		do_balance_callbacks(rq, head);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -3510,6 +3573,7 @@ static inline void finish_lock_switch(struct rq *rq)
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+	__balance_callbacks(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -3651,43 +3715,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	return rq;
 }
 
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
-	struct callback_head *head, *next;
-	void (*func)(struct rq *rq);
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	head = rq->balance_callback;
-	rq->balance_callback = NULL;
-	while (head) {
-		func = (void (*)(struct rq *))head->func;
-		next = head->next;
-		head->next = NULL;
-		head = next;
-
-		func(rq);
-	}
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
-	if (unlikely(rq->balance_callback))
-		__balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -3707,7 +3734,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	 */
 
 	rq = finish_task_switch(prev);
-	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -4523,10 +4549,11 @@ static void __sched notrace __schedule(bool preempt)
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-		rq_unlock_irq(rq, &rf);
-	}
 
-	balance_callback(rq);
+		rq_unpin_lock(rq, &rf);
+		__balance_callbacks(rq);
+		raw_spin_unlock_irq(&rq->lock);
+	}
 }
 
 void __noreturn do_task_dead(void)
@@ -4937,9 +4964,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
-	__task_rq_unlock(rq, &rf);
 
-	balance_callback(rq);
+	rq_unpin_lock(rq, &rf);
+	__balance_callbacks(rq);
+	raw_spin_unlock(&rq->lock);
+
 	preempt_enable();
 }
 #else
@@ -5213,6 +5242,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int new_effective_prio, policy = attr->sched_policy;
 	const struct sched_class *prev_class;
+	struct callback_head *head;
 	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5451,6 +5481,7 @@ change:
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
+	head = splice_balance_callbacks(rq);
 	task_rq_unlock(rq, p, &rf);
 
 	if (pi) {
@@ -5459,7 +5490,7 @@ change:
 	}
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
-	balance_callback(rq);
+	balance_callbacks(rq, head);
 	preempt_enable();
 
 	return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfc..738a00b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1221,6 +1221,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 	rf->clock_update_flags = 0;
 #endif
+#ifdef CONFIG_SMP
+	SCHED_WARN_ON(rq->balance_callback);
+#endif
 }
 
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-10-23 10:12 ` [PATCH v4 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     2558aacff8586699bcd248b406febb28b0a25de2
Gitweb:        https://git.kernel.org/tip/2558aacff8586699bcd248b406febb28b0a25de2
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 11 Sep 2020 09:54:27 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:57 +01:00

sched/hotplug: Ensure only per-cpu kthreads run during hotplug

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is run (as one of the very first steps) from the cpu-hotplug
task which is a per-cpu kthread and completion of the hotplug
operation only requires such tasks.

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the unlikely(rq->balance_callback) test at the tail of
context_switch with a single test of rq->balance_flags (which carries
BALANCE_WORK); the fast path is not affected.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.292709163@infradead.org
---
 kernel/sched/core.c  | 114 +++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |   7 ++-
 2 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0196a3f..1f8bfc9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3509,8 +3509,10 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
 	struct callback_head *head = rq->balance_callback;
 
 	lockdep_assert_held(&rq->lock);
-	if (head)
+	if (head) {
 		rq->balance_callback = NULL;
+		rq->balance_flags &= ~BALANCE_WORK;
+	}
 
 	return head;
 }
@@ -3531,6 +3533,21 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
 	}
 }
 
+static void balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+	if (likely(!rq->balance_flags))
+		return;
+
+	if (rq->balance_flags & BALANCE_PUSH) {
+		balance_push(rq);
+		return;
+	}
+
+	__balance_callbacks(rq);
+}
+
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
@@ -3546,6 +3563,10 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
 {
 }
 
+static inline void balance_switch(struct rq *rq)
+{
+}
+
 #endif
 
 static inline void
@@ -3573,7 +3594,7 @@ static inline void finish_lock_switch(struct rq *rq)
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	balance_switch(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6833,6 +6854,90 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 
 	rq->stop = stop;
 }
+
+static int __balance_push_cpu_stop(void *arg)
+{
+	struct task_struct *p = arg;
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+	int cpu;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	rq_lock(rq, &rf);
+
+	update_rq_clock(rq);
+
+	if (task_rq(p) == rq && task_on_rq_queued(p)) {
+		cpu = select_fallback_rq(rq->cpu, p);
+		rq = __migrate_task(rq, &rf, p, cpu);
+	}
+
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static void balance_push(struct rq *rq)
+{
+	struct task_struct *push_task = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+	/*
+	 * Both the cpu-hotplug and stop task are in this case and are
+	 * required to complete the hotplug process.
+	 */
+	if (is_per_cpu_kthread(push_task))
+		return;
+
+	get_task_struct(push_task);
+	/*
+	 * Temporarily drop rq->lock such that we can wake-up the stop task.
+	 * Both preemption and IRQs are still disabled.
+	 */
+	raw_spin_unlock(&rq->lock);
+	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+			    this_cpu_ptr(&push_work));
+	/*
+	 * At this point need_resched() is true and we'll take the loop in
+	 * schedule(). The next pick is obviously going to be the stop task
+	 * which is_per_cpu_kthread() and will push this task away.
+	 */
+	raw_spin_lock(&rq->lock);
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (on)
+		rq->balance_flags |= BALANCE_PUSH;
+	else
+		rq->balance_flags &= ~BALANCE_PUSH;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline void balance_push(struct rq *rq)
+{
+}
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6918,6 +7023,8 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -6965,6 +7072,8 @@ int sched_cpu_deactivate(unsigned int cpu)
 	 */
 	synchronize_rcu();
 
+	balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -6978,6 +7087,7 @@ int sched_cpu_deactivate(unsigned int cpu)
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
+		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 738a00b..a71ac84 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_flags;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
@@ -1385,6 +1386,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK	0x01
+#define BALANCE_PUSH	0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1392,12 +1396,13 @@ queue_balance_callback(struct rq *rq,
 {
 	lockdep_assert_held(&rq->lock);
 
-	if (unlikely(head->next))
+	if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
 		return;
 
 	head->func = (void (*)(struct callback_head *))func;
 	head->next = rq->balance_callback;
 	rq->balance_callback = head;
+	rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \

^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] stop_machine: Add function and caller debug info
  2020-10-23 10:11 ` [PATCH v4 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
@ 2020-11-11  8:23   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-11  8:23 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86,
	linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     a8b62fd0850503cf1e557d7e5a98d3f1f5c25eef
Gitweb:        https://git.kernel.org/tip/a8b62fd0850503cf1e557d7e5a98d3f1f5c25eef
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Mon, 21 Sep 2020 12:58:17 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Nov 2020 18:38:57 +01:00

stop_machine: Add function and caller debug info

Crashes in stop-machine are hard to connect to the calling code, add a
little something to help with that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201023102346.116513635@infradead.org
---
 include/linux/stop_machine.h |  5 +++++
 kernel/sched/core.c          |  1 +
 kernel/stop_machine.c        | 27 ++++++++++++++++++++++++---
 lib/dump_stack.c             |  2 ++
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 76d8b09..30577c3 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
+	unsigned long		caller;
 	void			*arg;
 	struct cpu_stop_done	*done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
 
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else	/* CONFIG_SMP */
 
 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
 	return false;
 }
 
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif	/* CONFIG_SMP */
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2003a7..5e24104 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6447,6 +6447,7 @@ void sched_show_task(struct task_struct *p)
 		(unsigned long)task_thread_info(p)->flags);
 
 	print_worker_info(KERN_INFO, p);
+	print_stop_info(KERN_INFO, p);
 	show_stack(p, NULL, KERN_INFO);
 	put_task_stack(p);
 }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 865bb02..3cf567c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -42,11 +42,27 @@ struct cpu_stopper {
 	struct list_head	works;		/* list of pending works */
 
 	struct cpu_stop_work	stop_work;	/* for stop_cpus */
+	unsigned long		caller;
+	cpu_stop_fn_t		fn;
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
 static bool stop_machine_initialized = false;
 
+void print_stop_info(const char *log_lvl, struct task_struct *task)
+{
+	/*
+	 * If @task is a stopper task, it cannot migrate and task_cpu() is
+	 * stable.
+	 */
+	struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task));
+
+	if (task != stopper->thread)
+		return;
+
+	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
+}
+
 /* static data for stop_cpus */
 static DEFINE_MUTEX(stop_cpus_mutex);
 static bool stop_cpus_in_progress;
@@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 {
 	struct cpu_stop_done done;
-	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
 
 	cpu_stop_init_done(&done, 1);
 	if (!cpu_stop_queue_work(cpu, &work))
@@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 	work1 = work2 = (struct cpu_stop_work){
 		.fn = multi_cpu_stop,
 		.arg = &msdata,
-		.done = &done
+		.done = &done,
+		.caller = _RET_IP_,
 	};
 
 	cpu_stop_init_done(&done, 2);
@@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			struct cpu_stop_work *work_buf)
 {
-	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
 	return cpu_stop_queue_work(cpu, work_buf);
 }
 
@@ -487,6 +504,8 @@ repeat:
 		int ret;
 
 		/* cpu stop callbacks must not sleep, make in_atomic() == T */
+		stopper->caller = work->caller;
+		stopper->fn = fn;
 		preempt_count_inc();
 		ret = fn(arg);
 		if (done) {
@@ -495,6 +514,8 @@ repeat:
 			cpu_stop_signal_done(done);
 		}
 		preempt_count_dec();
+		stopper->fn = NULL;
+		stopper->caller = 0;
 		WARN_ONCE(preempt_count(),
 			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
 		goto repeat;
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index a00ee6e..f5a33b6 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -12,6 +12,7 @@
 #include <linux/atomic.h>
 #include <linux/kexec.h>
 #include <linux/utsname.h>
+#include <linux/stop_machine.h>
 
 static char dump_stack_arch_desc_str[128];
 
@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl)
 		       log_lvl, dump_stack_arch_desc_str);
 
 	print_worker_info(log_lvl, current);
+	print_stop_info(log_lvl, current);
 }
 
 /**

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [tip: sched/core] sched: Fix balance_callback()
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
@ 2020-11-11 20:30     ` Paul Bolle
  2020-11-11 20:45       ` Peter Zijlstra
  0 siblings, 1 reply; 81+ messages in thread
From: Paul Bolle @ 2020-11-11 20:30 UTC (permalink / raw)
  To: linux-kernel, linux-tip-commits
  Cc: Scott Wood, Peter Zijlstra (Intel),
	Valentin Schneider, Daniel Bristot de Oliveira, x86

tip-bot2 for Peter Zijlstra schreef op wo 11-11-2020 om 08:23 [+0000]:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> [...]
> +static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
> +{
> +	void (*func)(struct rq *rq);
> +	struct callback_head *next;
> +
> +	lockdep_assert_held(&rq->lock);
> +
> +	while (head) {
> +		func = (void (*)(struct rq *))head->func;
> +		next = head->next;
> +		head->next = NULL;
> +		head = next;

Naive question: is there some subtle C-issue that is evaded here by setting
head->next to NULL prior to copying over it?

(I know this piece of code only got copied around in this patch and this is
therefore not something that this patch actually introduced.)

> +
> +		func(rq);
> +	}
> +}

Thanks,


Paul Bolle


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [tip: sched/core] sched: Fix balance_callback()
  2020-11-11 20:30     ` Paul Bolle
@ 2020-11-11 20:45       ` Peter Zijlstra
  0 siblings, 0 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-11-11 20:45 UTC (permalink / raw)
  To: Paul Bolle
  Cc: linux-kernel, linux-tip-commits, Scott Wood, Valentin Schneider,
	Daniel Bristot de Oliveira, x86

On Wed, Nov 11, 2020 at 09:30:42PM +0100, Paul Bolle wrote:
> tip-bot2 for Peter Zijlstra schreef op wo 11-11-2020 om 08:23 [+0000]:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > [...]
> > +static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
> > +{
> > +	void (*func)(struct rq *rq);
> > +	struct callback_head *next;
> > +
> > +	lockdep_assert_held(&rq->lock);
> > +
> > +	while (head) {
> > +		func = (void (*)(struct rq *))head->func;
> > +		next = head->next;
> > +		head->next = NULL;
> > +		head = next;
> 
> Naive question: is there some subtle C-issue that is evaded here by setting
> head->next to NULL prior to copying over it?
> 
> (I know this piece of code only got copied around in this patch and this is
> therefor not something that this patch actually introduced.)

It's like list_del_init(), it zeros the entry before unlinking it.
queue_balance_callback() relies on this.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-10-23 10:12 ` [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
@ 2020-11-12 16:38   ` Qian Cai
  2020-11-12 17:26     ` Valentin Schneider
  2020-11-20 12:34     ` [tip: sched/core] sched/core: Add missing completion for affine_move_task() waiters tip-bot2 for Valentin Schneider
  1 sibling, 2 replies; 81+ messages in thread
From: Qian Cai @ 2020-11-12 16:38 UTC (permalink / raw)
  To: Peter Zijlstra, tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Fri, 2020-10-23 at 12:12 +0200, Peter Zijlstra wrote:
> Concurrent migrate_disable() and set_cpus_allowed_ptr() has
> interesting features. We rely on set_cpus_allowed_ptr() to not return
> until the task runs inside the provided mask. This expectation is
> exported to userspace.
> 
> This means that any set_cpus_allowed_ptr() caller must wait until
> migrate_enable() allows migrations.
> 
> At the same time, we don't want migrate_enable() to schedule, due to
> patterns like:
> 
> 	preempt_disable();
> 	migrate_disable();
> 	...
> 	migrate_enable();
> 	preempt_enable();
> 
> And:
> 
> 	raw_spin_lock(&B);
> 	spin_unlock(&A);
> 
> this means that when migrate_enable() must restore the affinity
> mask, it cannot wait for completion thereof. Luck will have it that
> that is exactly the case where there is a pending
> set_cpus_allowed_ptr(), so let that provide storage for the async stop
> machine.
> 
> Much thanks to Valentin who used TLA+ most effective and found lots of
> 'interesting' cases.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/sched.h |    1 
>  kernel/sched/core.c   |  234 +++++++++++++++++++++++++++++++++++++++++++-----
> --
>  2 files changed, 205 insertions(+), 30 deletions(-)

Syscall fuzzing from an unprivileged user has started to trigger the
hung-task report below since this commit first appeared in linux-next today.
Does it ring any bells?

[12065.065837][ T1310] INFO: task trinity-c30:91730 blocked for more than 368 seconds.
[12065.073524][ T1310]       Tainted: G             L    5.10.0-rc3-next-20201112 #2
[12065.081076][ T1310] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12065.089648][ T1310] task:trinity-c30     state:D stack:26576 pid:91730 ppid: 82688 flags:0x00000000
[12065.098818][ T1310] Call trace:
[12065.101987][ T1310]  __switch_to+0xf0/0x1a8
[12065.106227][ T1310]  __schedule+0x6ec/0x1708
[12065.110505][ T1310]  schedule+0x1bc/0x3b0
[12065.114562][ T1310]  schedule_timeout+0x3c4/0x4c0
[12065.119275][ T1310]  wait_for_completion+0x13c/0x248
[12065.124257][ T1310]  affine_move_task+0x410/0x688
(inlined by) affine_move_task at kernel/sched/core.c:2261
[12065.129013][ T1310]  __set_cpus_allowed_ptr+0x1b4/0x370
[12065.134248][ T1310]  sched_setaffinity+0x4f0/0x7e8
[12065.139088][ T1310]  __arm64_sys_sched_setaffinity+0x1f4/0x2a0
[12065.144972][ T1310]  do_el0_svc+0x124/0x228
[12065.149165][ T1310]  el0_sync_handler+0x208/0x384
[12065.153876][ T1310]  el0_sync+0x140/0x180
[12065.157971][ T1310] 
[12065.157971][ T1310] Showing all locks held in the system:
[12065.166401][ T1310] 1 lock held by khungtaskd/1310:
[12065.171288][ T1310]  #0: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire.constprop.56+0x0/0x38
[12065.182210][ T1310] 4 locks held by trinity-main/82688:
[12065.187515][ T1310] 2 locks held by kworker/u513:3/82813:
[12065.192922][ T1310]  #0: ffff000000419d38 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x69c/0x18c8
[12065.203890][ T1310]  #1: ffff0000122bfd40 ((work_completion)(&buf->work)){+.+.}-{0:0}, at: __update_idle_core+0xa8/0x460
[12065.214916][ T1310] 1 lock held by trinity-c35/137168:
[12065.220061][ T1310]  #0: ffff0087ce767898 (&tty->ldisc_sem){++++}-{0:0}, at: ldsem_down_read+0x3c/0x48
[12065.229483][ T1310] 3 locks held by trinity-c61/137611:
[12065.234757][ T1310] 1 lock held by trinity-c7/137630:
[12065.239828][ T1310] 1 lock held by trinity-c57/137714:
[12065.242612][T137611] futex_wake_op: trinity-c61 tries to shift op by 1008; fix this program
[12065.245012][ T1310] 1 lock held by trinity-c52/137771:
[12065.258538][ T1310] 2 locks held by trinity-c42/137835:
[12065.263783][ T1310] 4 locks held by trinity-c22/137868:
[12065.269051][ T1310]  #0: ffff000e78503798 (&rq->lock){-.-.}-{2:2}, at: newidle_balance+0x92c/0xd78
[12065.278155][ T1310]  #1: ffff0087ce767930 (&tty->atomic_write_lock){+.+.}-{3:3}, at: tty_write_lock+0x30/0x58
[12065.288317][ T1310]  #2: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __mutex_lock+0x24c/0x1310
[12065.297592][ T1310]  #3: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: lock_page_memcg+0x98/0x240
[12065.307026][ T1310] 2 locks held by trinity-c34/137896:
[12065.312266][ T1310]  #0: ffff000e78463798 (&rq->lock){-.-.}-{2:2}, at: __schedule+0x22c/0x1708
[12065.321023][ T1310]  #1: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __update_idle_core+0xa8/0x460
[12065.330663][ T1310] 2 locks held by trinity-c43/137909:
[12065.335996][ T1310] 1 lock held by trinity-c24/137910:
[12065.341164][ T1310] 1 lock held by trinity-c1/137954:
[12065.346272][ T1310] 1 lock held by trinity-c49/138020:
[12065.351425][ T1310] 1 lock held by trinity-c10/138021:
[12065.356649][ T1310] 1 lock held by trinity-c32/138039:
[12065.361813][ T1310] 4 locks held by trinity-c36/138042:
[12065.367129][ T1310] 2 locks held by trinity-c14/138061:
[12065.372378][ T1310] 2 locks held by trinity-c38/138070:
[12065.377688][ T1310] 1 lock held by trinity-c50/138074:
[12065.382885][ T1310] 1 lock held by trinity-c12/138085:
[12065.388186][ T1310] 1 lock held by trinity-c4/138087:
[12065.393272][ T1310] 3 locks held by trinity-c6/138091:
[12065.398492][ T1310] 2 locks held by trinity-c48/138095:
[12065.403757][ T1310] 2 locks held by trinity-c62/138097:
[12065.409045][ T1310] 2 locks held by trinity-main/138107:
[12065.414441][ T1310] 1 lock held by modprobe/138108:
[12065.419351][ T1310] 
[12065.421560][ T1310] =============================================
[12065.421560][ T1310] 


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 16:38   ` [PATCH v4 10/19] " Qian Cai
@ 2020-11-12 17:26     ` Valentin Schneider
  2020-11-12 18:01       ` Qian Cai
  2020-11-12 18:35       ` Qian Cai
  2020-11-20 12:34     ` [tip: sched/core] sched/core: Add missing completion for affine_move_task() waiters tip-bot2 for Valentin Schneider
  1 sibling, 2 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-11-12 17:26 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210


On 12/11/20 16:38, Qian Cai wrote:
> Some syscall fuzzing from an unprivileged user starts to trigger this below
> since this commit first appeared in the linux-next today. Does it ring any
> bells?
>

What's the .config? I'm interested in
CONFIG_PREEMPT
CONFIG_PREEMPT_RT
CONFIG_SMP

From a quick look it seems that tree doesn't have Thomas' "generalization" of
migrate_disable(), so if this doesn't have PREEMPT_RT we could forget about
migrate_disable() for now.

> [12065.065837][ T1310] INFO: task trinity-c30:91730 blocked for more than 368 seconds.
> [12065.073524][ T1310]       Tainted: G             L    5.10.0-rc3-next-20201112 #2
> [12065.081076][ T1310] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [12065.089648][ T1310] task:trinity-c30     state:D stack:26576 pid:91730 ppid: 82688 flags:0x00000000
> [12065.098818][ T1310] Call trace:
> [12065.101987][ T1310]  __switch_to+0xf0/0x1a8
> [12065.106227][ T1310]  __schedule+0x6ec/0x1708
> [12065.110505][ T1310]  schedule+0x1bc/0x3b0
> [12065.114562][ T1310]  schedule_timeout+0x3c4/0x4c0
> [12065.119275][ T1310]  wait_for_completion+0x13c/0x248
> [12065.124257][ T1310]  affine_move_task+0x410/0x688
> (inlined by) affine_move_task at kernel/sched/core.c:2261
> [12065.129013][ T1310]  __set_cpus_allowed_ptr+0x1b4/0x370
> [12065.134248][ T1310]  sched_setaffinity+0x4f0/0x7e8
> [12065.139088][ T1310]  __arm64_sys_sched_setaffinity+0x1f4/0x2a0
> [12065.144972][ T1310]  do_el0_svc+0x124/0x228
> [12065.149165][ T1310]  el0_sync_handler+0x208/0x384
> [12065.153876][ T1310]  el0_sync+0x140/0x180
> [12065.157971][ T1310]

So that's a task changing the affinity of some task (either itself or
another; I can't say without a decoded stacktrace), and then blocking in
wait_for_completion() on a completion that apparently never arrives.

I don't see stop_one_cpu() in the trace, so I assume it's the !task_running
case, for which the completion should be completed before getting to the
wait (unless we *do* have migrate_disable()).

Could you please run scripts/decode_stacktrace.sh on the above?

> [12065.157971][ T1310] Showing all locks held in the system:
> [12065.166401][ T1310] 1 lock held by khungtaskd/1310:
> [12065.171288][ T1310]  #0: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire.constprop.56+0x0/0x38
> [12065.182210][ T1310] 4 locks held by trinity-main/82688:
> [12065.187515][ T1310] 2 locks held by kworker/u513:3/82813:
> [12065.192922][ T1310]  #0: ffff000000419d38 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x69c/0x18c8
> [12065.203890][ T1310]  #1: ffff0000122bfd40 ((work_completion)(&buf->work)){+.+.}-{0:0}, at: __update_idle_core+0xa8/0x460
> [12065.214916][ T1310] 1 lock held by trinity-c35/137168:
> [12065.220061][ T1310]  #0: ffff0087ce767898 (&tty->ldisc_sem){++++}-{0:0}, at: ldsem_down_read+0x3c/0x48
> [12065.229483][ T1310] 3 locks held by trinity-c61/137611:
> [12065.234757][ T1310] 1 lock held by trinity-c7/137630:
> [12065.239828][ T1310] 1 lock held by trinity-c57/137714:
> [12065.242612][T137611] futex_wake_op: trinity-c61 tries to shift op by 1008; fix this program
> [12065.245012][ T1310] 1 lock held by trinity-c52/137771:
> [12065.258538][ T1310] 2 locks held by trinity-c42/137835:
> [12065.263783][ T1310] 4 locks held by trinity-c22/137868:
> [12065.269051][ T1310]  #0: ffff000e78503798 (&rq->lock){-.-.}-{2:2}, at: newidle_balance+0x92c/0xd78
> [12065.278155][ T1310]  #1: ffff0087ce767930 (&tty->atomic_write_lock){+.+.}-{3:3}, at: tty_write_lock+0x30/0x58
> [12065.288317][ T1310]  #2: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __mutex_lock+0x24c/0x1310
> [12065.297592][ T1310]  #3: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: lock_page_memcg+0x98/0x240
> [12065.307026][ T1310] 2 locks held by trinity-c34/137896:
> [12065.312266][ T1310]  #0: ffff000e78463798 (&rq->lock){-.-.}-{2:2}, at: __schedule+0x22c/0x1708
> [12065.321023][ T1310]  #1: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __update_idle_core+0xa8/0x460
> [12065.330663][ T1310] 2 locks held by trinity-c43/137909:
> [12065.335996][ T1310] 1 lock held by trinity-c24/137910:
> [12065.341164][ T1310] 1 lock held by trinity-c1/137954:
> [12065.346272][ T1310] 1 lock held by trinity-c49/138020:
> [12065.351425][ T1310] 1 lock held by trinity-c10/138021:
> [12065.356649][ T1310] 1 lock held by trinity-c32/138039:
> [12065.361813][ T1310] 4 locks held by trinity-c36/138042:
> [12065.367129][ T1310] 2 locks held by trinity-c14/138061:
> [12065.372378][ T1310] 2 locks held by trinity-c38/138070:
> [12065.377688][ T1310] 1 lock held by trinity-c50/138074:
> [12065.382885][ T1310] 1 lock held by trinity-c12/138085:
> [12065.388186][ T1310] 1 lock held by trinity-c4/138087:
> [12065.393272][ T1310] 3 locks held by trinity-c6/138091:
> [12065.398492][ T1310] 2 locks held by trinity-c48/138095:
> [12065.403757][ T1310] 2 locks held by trinity-c62/138097:
> [12065.409045][ T1310] 2 locks held by trinity-main/138107:
> [12065.414441][ T1310] 1 lock held by modprobe/138108:
> [12065.419351][ T1310]
> [12065.421560][ T1310] =============================================
> [12065.421560][ T1310]

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 17:26     ` Valentin Schneider
@ 2020-11-12 18:01       ` Qian Cai
  2020-11-12 19:31         ` Valentin Schneider
  2020-11-12 18:35       ` Qian Cai
  1 sibling, 1 reply; 81+ messages in thread
From: Qian Cai @ 2020-11-12 18:01 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, 2020-11-12 at 17:26 +0000, Valentin Schneider wrote:
> On 12/11/20 16:38, Qian Cai wrote:
> > Some syscall fuzzing from an unprivileged user starts to trigger this below
> > since this commit first appeared in the linux-next today. Does it ring any
> > bells?
> > 
> 
> What's the .config? I'm interested in
> CONFIG_PREEMPT
> CONFIG_PREEMPT_RT
> CONFIG_SMP

https://cailca.coding.net/public/linux/mm/git/files/master/arm64.config

# CONFIG_PREEMPT is not set
CONFIG_SMP=y

Also, I have been able to reproduce this on powerpc as well just now.

> 
> From a quick look it seems that tree doesn't have Thomas' "generalization" of
> migrate_disable(), so if this doesn't have PREEMPT_RT we could forget about
> migrate_disable() for now.
> 
> > [12065.065837][ T1310] INFO: task trinity-c30:91730 blocked for more than
> > 368 seconds.
> > [12065.073524][ T1310]       Tainted: G             L    5.10.0-rc3-next-
> > 20201112 #2
> > [12065.081076][ T1310] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
> > disables this message.
> > [12065.089648][ T1310] task:trinity-c30     state:D stack:26576 pid:91730
> > ppid: 82688 flags:0x00000000
> > [12065.098818][ T1310] Call trace:
> > [12065.101987][ T1310]  __switch_to+0xf0/0x1a8
> > [12065.106227][ T1310]  __schedule+0x6ec/0x1708
> > [12065.110505][ T1310]  schedule+0x1bc/0x3b0
> > [12065.114562][ T1310]  schedule_timeout+0x3c4/0x4c0
> > [12065.119275][ T1310]  wait_for_completion+0x13c/0x248
> > [12065.124257][ T1310]  affine_move_task+0x410/0x688
> > (inlined by) affine_move_task at kernel/sched/core.c:2261
> > [12065.129013][ T1310]  __set_cpus_allowed_ptr+0x1b4/0x370
> > [12065.134248][ T1310]  sched_setaffinity+0x4f0/0x7e8
> > [12065.139088][ T1310]  __arm64_sys_sched_setaffinity+0x1f4/0x2a0
> > [12065.144972][ T1310]  do_el0_svc+0x124/0x228
> > [12065.149165][ T1310]  el0_sync_handler+0x208/0x384
> > [12065.153876][ T1310]  el0_sync+0x140/0x180
> > [12065.157971][ T1310]
> 
> So that's a task changing the affinity of some task (either itself or
> another; I can't say without a decoded stacktrace), and then blocking on a
> wait_for_completion() that apparently never happens.
> 
> I don't see stop_one_cpu() in the trace, so I assume it's the !task_running
> case, for which the completion should be completed before getting to the
> wait (unless we *do* have migrate_disable()).
> 
> Could you please run scripts/decode_stacktrace.sh on the above?

[12065.101987][ T1310] __switch_to (arch/arm64/kernel/process.c:580) 
[12065.106227][ T1310] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019) 
[12065.110505][ T1310] schedule (./arch/arm64/include/asm/current.h:19 (discriminator 1) ./arch/arm64/include/asm/preempt.h:53 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1)) 
[12065.114562][ T1310] schedule_timeout (kernel/time/timer.c:1848) 
[12065.119275][ T1310] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
[12065.124257][ T1310] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263) 
[12065.129013][ T1310] __set_cpus_allowed_ptr (kernel/sched/core.c:2353) 
[12065.134248][ T1310] sched_setaffinity (kernel/sched/core.c:6460) 
[12065.139088][ T1310] __arm64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500) 
[12065.144972][ T1310] do_el0_svc (arch/arm64/kernel/syscall.c:36 arch/arm64/kernel/syscall.c:48 arch/arm64/kernel/syscall.c:159 arch/arm64/kernel/syscall.c:205) 
[12065.149165][ T1310] el0_sync_handler (arch/arm64/kernel/entry-common.c:236 arch/arm64/kernel/entry-common.c:254) 
[12065.153876][ T1310] el0_sync (arch/arm64/kernel/entry.S:741)

== powerpc ==
[18060.020301][ T676] [c000200014227670] [c000000000a6d1e8] __func__.5350+0x1220e0/0x181338 unreliable 
[18060.020333][ T676] [c000200014227850] [c00000000001a278] __switch_to (arch/powerpc/kernel/process.c:1273) 
[18060.020351][ T676] [c0002000142278c0] [c0000000008f3e94] __schedule (kernel/sched/core.c:4269 kernel/sched/core.c:5019) 
[18060.020377][ T676] [c000200014227990] [c0000000008f4638] schedule (./include/asm-generic/preempt.h:59 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1)) 
[18060.020394][ T676] [c0002000142279c0] [c0000000008fbd34] schedule_timeout (kernel/time/timer.c:1847) 
[18060.020420][ T676] [c000200014227ac0] [c0000000008f6398] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
[18060.020455][ T676] [c000200014227b30] [c000000000100fd4] affine_move_task (kernel/sched/core.c:2261) 
[18060.020481][ T676] [c000200014227c90] [c000000000101444] __set_cpus_allowed_ptr (kernel/sched/core.c:2353) 
[18060.020507][ T676] [c000200014227d00] [c000000000106eac] sched_setaffinity (kernel/sched/core.c:6460) 
[18060.020533][ T676] [c000200014227d70] [c000000000107134] sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500) 
[18060.020559][ T676] [c000200014227dc0] [c00000000002a6d8] system_call_exception (arch/powerpc/kernel/syscall_64.c:111) 
[18060.020585][ T676] [c000200014227e20] [c00000000000d0a8] system_call_common (arch/powerpc/kernel/entry_64.S:302)



^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 17:26     ` Valentin Schneider
  2020-11-12 18:01       ` Qian Cai
@ 2020-11-12 18:35       ` Qian Cai
  1 sibling, 0 replies; 81+ messages in thread
From: Qian Cai @ 2020-11-12 18:35 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, 2020-11-12 at 17:26 +0000, Valentin Schneider wrote:
> On 12/11/20 16:38, Qian Cai wrote:
> > Some syscall fuzzing from an unprivileged user starts to trigger this below
> > since this commit first appeared in the linux-next today. Does it ring any
> > bells?

X86 in a KVM guest as well.

guest .config: 
https://cailca.coding.net/public/linux/mm/git/files/master/x86.config

To reproduce:

# /usr/libexec/qemu-kvm -name kata -cpu host -smp 48 -m 48g -hda rhel-8.3-
x86_64-kvm.img.qcow2 -cdrom kata.iso -nic user,hostfwd=tcp::2222-:22 -nographic

== inside the guest ===
# git clone https://e.coding.net/cailca/linux/mm
# cd mm; make
# ./random -x 0-100 -f

[17213.432777][ T348] INFO: task trinity-c7:216885 can't die for more than 122 seconds.
[17213.434895][ T348] task:trinity-c7      state:D stack:27088 pid:216885 ppid:103237 flags:0x00004004
[17213.437297][ T348] Call Trace:
[17213.438142][ T348] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019) 
[17213.439256][ T348] ? __sched_text_start (kernel/sched/core.c:4901) 
[17213.440477][ T348] schedule (./arch/x86/include/asm/current.h:15 (discriminator 1) ./include/linux/sched.h:1892 (discriminator 1) kernel/sched/core.c:5100 (discriminator 1)) 
[17213.441501][ T348] schedule_timeout (kernel/time/timer.c:1848) 
[17213.442834][ T348] ? usleep_range (kernel/time/timer.c:1833) 
[17213.444070][ T348] ? wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
[17213.445457][ T348] ? lock_downgrade (kernel/locking/lockdep.c:5443) 
[17213.446695][ T348] ? rcu_read_unlock (./include/linux/rcupdate.h:692 (discriminator 5)) 
[17213.447911][ T348] ? do_raw_spin_lock (./arch/x86/include/asm/atomic.h:202 ./include/asm-generic/atomic-instrumented.h:707 ./include/asm-generic/qspinlock.h:82 kernel/locking/spinlock_debug.c:113) 
[17213.449190][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048) 
[17213.450714][ T348] ? _raw_spin_unlock_irq (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 ./include/linux/spinlock_api_smp.h:168 kernel/locking/spinlock.c:199) 
[17213.452042][ T348] wait_for_completion (kernel/sched/completion.c:86 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
[17213.453468][ T348] ? wait_for_completion_interruptible (kernel/sched/completion.c:137) 
[17213.455152][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048) 
[17213.456651][ T348] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191) 
[17213.458115][ T348] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263) 
[17213.459313][ T348] ? move_queued_task (kernel/sched/core.c:2151) 
[17213.460553][ T348] ? update_curr (kernel/sched/sched.h:1176 kernel/sched/fair.c:845) 
[17213.461684][ T348] ? enqueue_entity (kernel/sched/fair.c:4247) 
[17213.463001][ T348] ? set_next_task_fair (./arch/x86/include/asm/jump_label.h:25 (discriminator 2) ./include/linux/jump_label.h:200 (discriminator 2) kernel/sched/fair.c:4567 (discriminator 2) kernel/sched/fair.c:4683 (discriminator 2) kernel/sched/fair.c:10953 (discriminator 2)) 
[17213.464294][ T348] __set_cpus_allowed_ptr (kernel/sched/core.c:2353) 
[17213.465668][ T348] ? affine_move_task (kernel/sched/core.c:2287) 
[17213.466952][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048) 
[17213.468452][ T348] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191) 
[17213.469908][ T348] sched_setaffinity (kernel/sched/core.c:6460) 
[17213.471127][ T348] ? __ia32_sys_sched_getattr (kernel/sched/core.c:6393) 
[17213.472644][ T348] ? _copy_from_user (./arch/x86/include/asm/uaccess_64.h:46 ./arch/x86/include/asm/uaccess_64.h:52 lib/usercopy.c:16) 
[17213.473850][ T348] __x64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500) 
[17213.475307][ T348] ? sched_setaffinity (kernel/sched/core.c:6500) 
[17213.476542][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048) 
[17213.477991][ T348] ? syscall_enter_from_user_mode (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 kernel/entry/common.c:98) 
[17213.479428][ T348] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:50 (discriminator 22)) 
[17213.480642][ T348] do_syscall_64 (arch/x86/entry/common.c:46) 
[17213.481706][ T348] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:127) 
[17213.483236][ T348] RIP: 0033:0x7f2f00ebe78d


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 18:01       ` Qian Cai
@ 2020-11-12 19:31         ` Valentin Schneider
  2020-11-12 19:41           ` Qian Cai
                             ` (2 more replies)
  0 siblings, 3 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-11-12 19:31 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210


On 12/11/20 18:01, Qian Cai wrote:
> On Thu, 2020-11-12 at 17:26 +0000, Valentin Schneider wrote:
>> On 12/11/20 16:38, Qian Cai wrote:
>> > Some syscall fuzzing from an unprivileged user starts to trigger this below
>> > since this commit first appeared in the linux-next today. Does it ring any
>> > bells?
>> >
>>
>> What's the .config? I'm interested in
>> CONFIG_PREEMPT
>> CONFIG_PREEMPT_RT
>> CONFIG_SMP
>
> https://cailca.coding.net/public/linux/mm/git/files/master/arm64.config
>
> # CONFIG_PREEMPT is not set
> CONFIG_SMP=y
>

So that's CONFIG_PREEMPT_NONE=y

> Also, I have been able to reproduce this on powerpc as well just now.
>
[...]
>
> [12065.101987][ T1310] __switch_to (arch/arm64/kernel/process.c:580)
> [12065.106227][ T1310] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019)
> [12065.110505][ T1310] schedule (./arch/arm64/include/asm/current.h:19 (discriminator 1) ./arch/arm64/include/asm/preempt.h:53 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1))
> [12065.114562][ T1310] schedule_timeout (kernel/time/timer.c:1848)
> [12065.119275][ T1310] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
> [12065.124257][ T1310] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263)
> [12065.129013][ T1310] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
> [12065.134248][ T1310] sched_setaffinity (kernel/sched/core.c:6460)
> [12065.139088][ T1310] __arm64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500)
> [12065.144972][ T1310] do_el0_svc (arch/arm64/kernel/syscall.c:36 arch/arm64/kernel/syscall.c:48 arch/arm64/kernel/syscall.c:159 arch/arm64/kernel/syscall.c:205)
> [12065.149165][ T1310] el0_sync_handler (arch/arm64/kernel/entry-common.c:236 arch/arm64/kernel/entry-common.c:254)
> [12065.153876][ T1310] el0_sync (arch/arm64/kernel/entry.S:741)
>

Thanks!

One thing I don't get: that trace shows refcount_dec_and_test()
(kernel/sched/core.c:2263) happening before the wait_for_completion(). It's
not the case in the below trace.

> == powerpc ==
> [18060.020301][ T676] [c000200014227670] [c000000000a6d1e8] __func__.5350+0x1220e0/0x181338 unreliable
> [18060.020333][ T676] [c000200014227850] [c00000000001a278] __switch_to (arch/powerpc/kernel/process.c:1273)
> [18060.020351][ T676] [c0002000142278c0] [c0000000008f3e94] __schedule (kernel/sched/core.c:4269 kernel/sched/core.c:5019)
> [18060.020377][ T676] [c000200014227990] [c0000000008f4638] schedule (./include/asm-generic/preempt.h:59 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1))
> [18060.020394][ T676] [c0002000142279c0] [c0000000008fbd34] schedule_timeout (kernel/time/timer.c:1847)
> [18060.020420][ T676] [c000200014227ac0] [c0000000008f6398] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
> [18060.020455][ T676] [c000200014227b30] [c000000000100fd4] affine_move_task (kernel/sched/core.c:2261)
> [18060.020481][ T676] [c000200014227c90] [c000000000101444] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
> [18060.020507][ T676] [c000200014227d00] [c000000000106eac] sched_setaffinity (kernel/sched/core.c:6460)
> [18060.020533][ T676] [c000200014227d70] [c000000000107134] sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500)
> [18060.020559][ T676] [c000200014227dc0] [c00000000002a6d8] system_call_exception (arch/powerpc/kernel/syscall_64.c:111)
> [18060.020585][ T676] [c000200014227e20] [c00000000000d0a8] system_call_common (arch/powerpc/kernel/entry_64.S:302)

I take back what I said in that previous email; we could have gone through
the task_running() stop_one_cpu() and *then* hit the

  wait_for_completion(&pending->done);

and that is actually the only case that makes sense to me here, because the
!task_running() one will do the completion before waiting (that kernel has
no way to make a task Migration Disabled).

I think what is happening here is:

  affine_move_task()
      // task_running() case
      stop_one_cpu()
      wait_for_completion(&pending->done);

and this is !PREEMPT, so the stopper can very well hit:

  migration_cpu_stop()
    // Task moved between unlocks and scheduling the stopper
    task_rq(p) != rq &&
    // task_running() case
    dest_cpu >= 0

    => no complete_all(), ever :(

This is an annoying case because we didn't have to bother about it before;
a rq mismatch meant the task was fine, because we modified
->cpus_allowed_mask prior.

With migrate_disable(), we have to chase after the bloody task because
we have to preempt it to get a stable is_migration_disabled() reading. It
could have been Migration Disabled, got some pending installed, got out of
Migration Disabled, moved around, gone Migration Disabled again and got
some more pending before we get to run the stopper :(

a) Do you also get this on CONFIG_PREEMPT=y?
b) Could you try the below?

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 02076e6d3792..fad0a8e62aca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
 		else
 			p->wake_cpu = dest_cpu;
 
-	} else if (dest_cpu < 0) {
+	} else if (dest_cpu < 0 || pending) {
 		/*
 		 * This happens when we get migrated between migrate_enable()'s
 		 * preempt_enable() and scheduling the stopper task. At that
@@ -1933,6 +1933,17 @@ static int migration_cpu_stop(void *data)
 		 * more likely.
 		 */
 
+		/*
+		 * The task moved before the stopper got to run. We're holding
+		 * ->pi_lock, so the allowed mask is stable - if it got
+		 * somewhere allowed, we're done.
+		 */
+		if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+			p->migration_pending = NULL;
+			complete = true;
+			goto out;
+		}
+
 		/*
 		 * When this was migrate_enable() but we no longer have an
 		 * @pending, a concurrent SCA 'fixed' things and we should be

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 19:31         ` Valentin Schneider
@ 2020-11-12 19:41           ` Qian Cai
  2020-11-12 20:37           ` Qian Cai
  2020-11-13 10:27           ` Peter Zijlstra
  2 siblings, 0 replies; 81+ messages in thread
From: Qian Cai @ 2020-11-12 19:41 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, 2020-11-12 at 19:31 +0000, Valentin Schneider wrote:
> One thing I don't get: that trace shows refcount_dec_and_test()
> (kernel/sched/core.c:2263) happening before the wait_for_completion(). It's
> not the case in the below trace.

Yes, that is normal. Sometimes the decoding is a bit off; I'm not sure why, but
it is probably because some debugging options like KASAN obscure it.

> a) Do you also get this on CONFIG_PREEMPT=y?

I don't know. None of the systems here has that, but I could probably try.

> b) Could you try the below?

Let me run it and report.

> 
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 02076e6d3792..fad0a8e62aca 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
>  		else
>  			p->wake_cpu = dest_cpu;
>  
> -	} else if (dest_cpu < 0) {
> +	} else if (dest_cpu < 0 || pending) {
>  		/*
>  		 * This happens when we get migrated between migrate_enable()'s
>  		 * preempt_enable() and scheduling the stopper task. At that
> @@ -1933,6 +1933,17 @@ static int migration_cpu_stop(void *data)
>  		 * more likely.
>  		 */
>  
> +		/*
> +		 * The task moved before the stopper got to run. We're holding
> +		 * ->pi_lock, so the allowed mask is stable - if it got
> +		 * somewhere allowed, we're done.
> +		 */
> +		if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
> +			p->migration_pending = NULL;
> +			complete = true;
> +			goto out;
> +		}
> +
>  		/*
>  		 * When this was migrate_enable() but we no longer have an
>  		 * @pending, a concurrent SCA 'fixed' things and we should be
> 


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 19:31         ` Valentin Schneider
  2020-11-12 19:41           ` Qian Cai
@ 2020-11-12 20:37           ` Qian Cai
  2020-11-12 21:26             ` Valentin Schneider
  2020-11-13 10:27           ` Peter Zijlstra
  2 siblings, 1 reply; 81+ messages in thread
From: Qian Cai @ 2020-11-12 20:37 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, 2020-11-12 at 19:31 +0000, Valentin Schneider wrote:
> a) Do you also get this on CONFIG_PREEMPT=y?

This also happens with:

CONFIG_PREEMPT=y
CONFIG_PREEMPTION=y
CONFIG_PREEMPT_RCU=y
CONFIG_PREEMPT_NOTIFIERS=y
CONFIG_DEBUG_PREEMPT=y
CONFIG_PREEMPTIRQ_TRACEPOINTS=y

[ 1235.044945][  T330] INFO: task trinity-c4:60050 blocked for more than 245 seconds.
[ 1235.052540][  T330]       Not tainted 5.10.0-rc3-next-20201112+ #2
[ 1235.058774][  T330] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1235.067392][  T330] task:trinity-c4      state:D stack:26880 pid:60050 ppid:  1722 flags:0x00004000
[ 1235.076505][  T330] Call Trace:
[ 1235.079680][ T330] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019) 
[ 1235.083971][ T330] ? __sched_text_start (kernel/sched/core.c:4901) 
[ 1235.088721][ T330] schedule (kernel/sched/core.c:5099 (discriminator 1)) 
[ 1235.092661][ T330] schedule_timeout (kernel/time/timer.c:1848) 
[ 1235.097399][ T330] ? usleep_range (kernel/time/timer.c:1833) 
[ 1235.101945][ T330] ? wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
[ 1235.107156][ T330] ? lock_downgrade (kernel/locking/lockdep.c:5443) 
[ 1235.111883][ T330] ? rcu_read_unlock (./include/linux/rcupdate.h:692 (discriminator 5)) 
[ 1235.116561][ T330] ? do_raw_spin_lock (./arch/x86/include/asm/atomic.h:202 ./include/asm-generic/atomic-instrumented.h:707 ./include/asm-generic/qspinlock.h:82 kernel/locking/spinlock_debug.c:113) 
[ 1235.121459][ T330] ? _raw_spin_unlock_irq (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 ./include/linux/spinlock_api_smp.h:168 kernel/locking/spinlock.c:199) 
[ 1235.126601][ T330] wait_for_completion (kernel/sched/completion.c:86 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
[ 1235.131591][ T330] ? wait_for_completion_interruptible (kernel/sched/completion.c:137) 
[ 1235.138013][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191) 
[ 1235.143698][ T330] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263) 
[ 1235.148451][ T330] ? move_queued_task (kernel/sched/core.c:2151) 
[ 1235.153351][ T330] ? update_curr (kernel/sched/sched.h:1176 kernel/sched/fair.c:845) 
[ 1235.157848][ T330] ? enqueue_entity (kernel/sched/fair.c:4247) 
[ 1235.162658][ T330] ? set_next_task_fair (./arch/x86/include/asm/jump_label.h:25 (discriminator 2) ./include/linux/jump_label.h:200 (discriminator 2) kernel/sched/fair.c:4567 (discriminator 2) kernel/sched/fair.c:4683 (discriminator 2) kernel/sched/fair.c:10953 (discriminator 2)) 
[ 1235.167667][ T330] __set_cpus_allowed_ptr (kernel/sched/core.c:2353) 
[ 1235.172905][ T330] ? affine_move_task (kernel/sched/core.c:2287) 
[ 1235.177826][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191) 
[ 1235.183501][ T330] sched_setaffinity (kernel/sched/core.c:6460) 
[ 1235.188345][ T330] ? __ia32_sys_sched_getattr (kernel/sched/core.c:6393) 
[ 1235.193937][ T330] ? _copy_from_user (./arch/x86/include/asm/uaccess_64.h:46 ./arch/x86/include/asm/uaccess_64.h:52 lib/usercopy.c:16) 
[ 1235.198605][ T330] __x64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500) 
[ 1235.204291][ T330] ? sched_setaffinity (kernel/sched/core.c:6500) 
[ 1235.209324][ T330] ? syscall_enter_from_user_mode (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 kernel/entry/common.c:98) 
[ 1235.215133][ T330] do_syscall_64 (arch/x86/entry/common.c:46) 
[ 1235.219431][ T330] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:127) 
[ 1235.225251][  T330] RIP: 0033:0x7fb102b1178d

> b) Could you try the below?

It is running well so far on multiple systems. I'll keep it running and report
back if it happens again.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 20:37           ` Qian Cai
@ 2020-11-12 21:26             ` Valentin Schneider
  0 siblings, 0 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-11-12 21:26 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210


On 12/11/20 20:37, Qian Cai wrote:
> On Thu, 2020-11-12 at 19:31 +0000, Valentin Schneider wrote:
>> a) Do you also get this on CONFIG_PREEMPT=y?
>
> This also happens with:
>
> CONFIG_PREEMPT=y
> CONFIG_PREEMPTION=y
> CONFIG_PREEMPT_RCU=y
> CONFIG_PREEMPT_NOTIFIERS=y
> CONFIG_DEBUG_PREEMPT=y
> CONFIG_PREEMPTIRQ_TRACEPOINTS=y
>

Hmph, it should be much less likely to happen with PREEMPT=y, but isn't per
se impossible. Thanks for giving it a shot.

> [ 1235.044945][  T330] INFO: task trinity-c4:60050 blocked for more than 245 seconds.
> [ 1235.052540][  T330]       Not tainted 5.10.0-rc3-next-20201112+ #2
> [ 1235.058774][  T330] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 1235.067392][  T330] task:trinity-c4      state:D stack:26880 pid:60050 ppid:  1722 flags:0x00004000
> [ 1235.076505][  T330] Call Trace:
> [ 1235.079680][ T330] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019) 
> [ 1235.083971][ T330] ? __sched_text_start (kernel/sched/core.c:4901) 
> [ 1235.088721][ T330] schedule (kernel/sched/core.c:5099 (discriminator 1)) 
> [ 1235.092661][ T330] schedule_timeout (kernel/time/timer.c:1848) 
> [ 1235.097399][ T330] ? usleep_range (kernel/time/timer.c:1833) 
> [ 1235.101945][ T330] ? wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
> [ 1235.107156][ T330] ? lock_downgrade (kernel/locking/lockdep.c:5443) 
> [ 1235.111883][ T330] ? rcu_read_unlock (./include/linux/rcupdate.h:692 (discriminator 5)) 
> [ 1235.116561][ T330] ? do_raw_spin_lock (./arch/x86/include/asm/atomic.h:202 ./include/asm-generic/atomic-instrumented.h:707 ./include/asm-generic/qspinlock.h:82 kernel/locking/spinlock_debug.c:113) 
> [ 1235.121459][ T330] ? _raw_spin_unlock_irq (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 ./include/linux/spinlock_api_smp.h:168 kernel/locking/spinlock.c:199) 
> [ 1235.126601][ T330] wait_for_completion (kernel/sched/completion.c:86 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138) 
> [ 1235.131591][ T330] ? wait_for_completion_interruptible (kernel/sched/completion.c:137) 
> [ 1235.138013][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191) 
> [ 1235.143698][ T330] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263) 
> [ 1235.148451][ T330] ? move_queued_task (kernel/sched/core.c:2151) 
> [ 1235.153351][ T330] ? update_curr (kernel/sched/sched.h:1176 kernel/sched/fair.c:845) 
> [ 1235.157848][ T330] ? enqueue_entity (kernel/sched/fair.c:4247) 
> [ 1235.162658][ T330] ? set_next_task_fair (./arch/x86/include/asm/jump_label.h:25 (discriminator 2) ./include/linux/jump_label.h:200 (discriminator 2) kernel/sched/fair.c:4567 (discriminator 2) kernel/sched/fair.c:4683 (discriminator 2) kernel/sched/fair.c:10953 (discriminator 2)) 
> [ 1235.167667][ T330] __set_cpus_allowed_ptr (kernel/sched/core.c:2353) 
> [ 1235.172905][ T330] ? affine_move_task (kernel/sched/core.c:2287) 
> [ 1235.177826][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191) 
> [ 1235.183501][ T330] sched_setaffinity (kernel/sched/core.c:6460) 
> [ 1235.188345][ T330] ? __ia32_sys_sched_getattr (kernel/sched/core.c:6393) 
> [ 1235.193937][ T330] ? _copy_from_user (./arch/x86/include/asm/uaccess_64.h:46 ./arch/x86/include/asm/uaccess_64.h:52 lib/usercopy.c:16) 
> [ 1235.198605][ T330] __x64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500) 
> [ 1235.204291][ T330] ? sched_setaffinity (kernel/sched/core.c:6500) 
> [ 1235.209324][ T330] ? syscall_enter_from_user_mode (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 kernel/entry/common.c:98) 
> [ 1235.215133][ T330] do_syscall_64 (arch/x86/entry/common.c:46) 
> [ 1235.219431][ T330] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:127) 
> [ 1235.225251][  T330] RIP: 0033:0x7fb102b1178d
>
>> b) Could you try the below?
>
> It is running good so far on multiple systems. I'll keep it running and report
> back if it happens again.

Thanks! All of this is somewhat fragile, so I'll want to have another look
with a fresher mind; if the diff makes a difference at least it'll mean
I wasn't completely off.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-11-12 19:31         ` Valentin Schneider
  2020-11-12 19:41           ` Qian Cai
  2020-11-12 20:37           ` Qian Cai
@ 2020-11-13 10:27           ` Peter Zijlstra
  2 siblings, 0 replies; 81+ messages in thread
From: Peter Zijlstra @ 2020-11-13 10:27 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Qian Cai, tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Thu, Nov 12, 2020 at 07:31:12PM +0000, Valentin Schneider wrote:

> I think what is happening here is:
> 
>   affine_move_task()
>       // task_running() case
>       stop_one_cpu()
>       wait_for_completion(&pending->done);
> 
> and this is !PREEMPT, so the stopper can very well hit:
> 
>   migration_cpu_stop()
>     // Task moved between unlocks and scheduling the stopper
>     task_rq(p) != rq &&
>     // task_running() case
>     dest_cpu >= 0
> 
>     => no complete_all(), ever :(

Damn...

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 02076e6d3792..fad0a8e62aca 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
>  		else
>  			p->wake_cpu = dest_cpu;
>  
> -	} else if (dest_cpu < 0) {
> +	} else if (dest_cpu < 0 || pending) {
>  		/*
>  		 * This happens when we get migrated between migrate_enable()'s
>  		 * preempt_enable() and scheduling the stopper task. At that
> @@ -1933,6 +1933,17 @@ static int migration_cpu_stop(void *data)
>  		 * more likely.
>  		 */
>  
> +		/*
> +		 * The task moved before the stopper got to run. We're holding
> +		 * ->pi_lock, so the allowed mask is stable - if it got
> +		 * somewhere allowed, we're done.
> +		 */
> +		if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
> +			p->migration_pending = NULL;
> +			complete = true;
> +			goto out;
> +		}
> +
>  		/*
>  		 * When this was migrate_enable() but we no longer have an
>  		 * @pending, a concurrent SCA 'fixed' things and we should be

Agreed, this is very clearly a missing case and the proposed solution
seems straightforward enough; but I'm struggling to convince my
sleep-deprived brain we're actually complete now.

I'll continue staring at it a little more. Could you make it into a
proper patch?

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-10-23 10:12 ` [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
  2020-10-29 16:27   ` Valentin Schneider
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Thomas Gleixner
@ 2020-11-13 15:06   ` Qian Cai
  2020-11-17 19:28     ` Valentin Schneider
  2 siblings, 1 reply; 81+ messages in thread
From: Qian Cai @ 2020-11-13 15:06 UTC (permalink / raw)
  To: Peter Zijlstra, tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Fri, 2020-10-23 at 12:12 +0200, Peter Zijlstra wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> On CPU unplug, tasks which are in a migrate-disabled region cannot be pushed
> to a different CPU until they have returned to a migratable state.
> 
> Account the number of tasks on a runqueue which are in a migrate disabled
> section and make the hotplug wait mechanism respect that.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c  |   36 ++++++++++++++++++++++++++++++------
>  kernel/sched/sched.h |    4 ++++
>  2 files changed, 34 insertions(+), 6 deletions(-)
> 
[] 
> @@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
>  	sched_tick_stop(cpu);
>  
>  	rq_lock_irqsave(rq, &rf);
> -	BUG_ON(rq->nr_running != 1);
> +	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));

CPU hotplug is now triggering this. This is with Valentin's affine_move_task()
fix on top:

https://lore.kernel.org/lkml/20201113112414.2569-1-valentin.schneider@arm.com/

[  809.412232][  T428] kernel BUG at kernel/sched/core.c:7547!
[  809.417841][  T428] invalid opcode: 0000 [#1] SMP KASAN PTI
[  809.423445][  T428] CPU: 72 PID: 428 Comm: migration/72 Tainted: G          I       5.10.0-rc3-next-20201113+ #1
[  809.433678][  T428] Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 11/13/2019
[  809.442951][  T428] Stopper: multi_cpu_stop+0x0/0x350 <- 0x0
[  809.448643][  T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[  809.454071][  T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
[  809.473650][  T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
[  809.479606][  T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
[  809.487482][  T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
[  809.495355][  T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
[  809.503229][  T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
[  809.511103][  T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
[  809.518979][  T428] FS:  0000000000000000(0000) GS:ffff8887dfc80000(0000) knlGS:0000000000000000
[  809.527815][  T428] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  809.534291][  T428] CR2: 00007fea4cdf899c CR3: 00000018c7414002 CR4: 00000000007706e0
[  809.542165][  T428] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  809.550040][  T428] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  809.557913][  T428] PKRU: 55555554
[  809.561332][  T428] Call Trace:
[  809.564489][  T428]  ? x86_pmu_starting_cpu+0x20/0x20
[  809.569570][  T428]  ? sched_cpu_wait_empty+0x220/0x220
[  809.574826][  T428]  cpuhp_invoke_callback+0x1d8/0x1520
[  809.580082][  T428]  ? x2apic_send_IPI_mask+0x10/0x10
[  809.585161][  T428]  ? clear_local_APIC+0x788/0xc10
[  809.590068][  T428]  ? cpuhp_invoke_callback+0x1520/0x1520
[  809.595584][  T428]  take_cpu_down+0x10f/0x1a0
[  809.600053][  T428]  multi_cpu_stop+0x149/0x350
[  809.604607][  T428]  ? stop_machine_yield+0x10/0x10
[  809.609511][  T428]  cpu_stopper_thread+0x200/0x400
[  809.614416][  T428]  ? cpu_stop_create+0x70/0x70
[  809.619059][  T428]  smpboot_thread_fn+0x30a/0x770
[  809.623878][  T428]  ? smpboot_register_percpu_thread+0x370/0x370
[  809.630005][  T428]  ? trace_hardirqs_on+0x1c/0x150
[  809.634910][  T428]  ? __kthread_parkme+0xcc/0x1a0
[  809.639729][  T428]  ? smpboot_register_percpu_thread+0x370/0x370
[  809.645855][  T428]  kthread+0x352/0x420
[  809.649798][  T428]  ? kthread_create_on_node+0xc0/0xc0
[  809.655052][  T428]  ret_from_fork+0x22/0x30
[  809.659345][  T428] Modules linked in: nls_ascii nls_cp437 vfat fat kvm_intel kvm ses enclosure irqbypass efivarfs ip_tables x_tables sd_mod nvme tg3 firmware_class smartpqi nvme_core scsi_transport_sas libphy dm_mirror dm_region_hash dm_log dm_mod
[  809.681502][  T428] ---[ end trace 416318a3e677bf17 ]---
[  809.686844][  T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[  809.692273][  T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
[  809.711853][  T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
[  809.717807][  T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
[  809.725681][  T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
[  809.733556][  T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
[  809.741432][  T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
[  809.749307][  T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
[  809.757182][  T428] FS:  0000000000000000(0000) GS:ffff8887dfc80000(0000) knlGS:0000000000000000
[  809.766018][  T428] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  809.772495][  T428] CR2: 00007fea4cdf899c CR3: 00000018c7414002 CR4: 00000000007706e0
[  809.780369][  T428] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  809.788242][  T428] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  809.796118][  T428] PKRU: 55555554
[  809.799538][  T428] Kernel panic - not syncing: Fatal exception
[  809.805543][  T428] 
[  809.805544][  T428] unchecked MSR access error: WRMSR to 0x83f (tried to write 0x00000000000000f6) at rIP: 0xffffffff8d4c8a43 (native_apic_msr_write+0x23/0x40)
[  809.805545][  T428] Call Trace:
[  809.805546][  T428]  arch_irq_work_raise+0x9b/0x110
[  809.805547][  T428]  irq_work_queue+0x24/0x40
[  809.805548][  T428]  printk_safe_log_store+0x185/0x1b0
[  809.805549][  T428]  ? kmsg_dump_rewind_nolock+0x80/0x80
[  809.805549][  T428]  ? ret_from_fork+0x22/0x30
[  809.805550][  T428]  printk+0x9a/0xc0
[  809.805551][  T428]  ? record_print_text.cold.38+0x11/0x11
[  809.805552][  T428]  ? stack_trace_consume_entry+0x160/0x160
[  809.805553][  T428]  ? save_trace+0x3d/0xb40
[  809.805554][  T428]  print_circular_bug_header.cold.69+0x10/0xd7
[  809.805556][  T428]  print_circular_bug.isra.42+0x1ac/0x300
[  809.805557][  T428]  check_noncircular+0x27b/0x320
[  809.805558][  T428]  ? print_circular_bug.isra.42+0x300/0x300
[  809.805559][  T428]  ? data_alloc.isra.8+0x15f/0x470
[  809.805560][  T428]  ? mark_lock.part.47+0x109/0x1a30
[  809.805560][  T428]  ? data_push_tail.part.6+0x310/0x310
[  809.805561][  T428]  ? print_usage_bug+0x1a0/0x1a0
[  809.805562][  T428]  check_prevs_add+0x38e/0x26f0
[  809.805563][  T428]  ? lock_is_held_type+0x19/0xe0
[  809.805563][  T428]  ? check_irq_usage+0xbb0/0xbb0
[  809.805564][  T428]  ? rcu_read_lock_held+0xb0/0xb0
[  809.805565][  T428]  __lock_acquire+0x2b9e/0x3920
[  809.805566][  T428]  ? __add_preferred_console.constprop.25+0x1d0/0x1d0
[  809.805567][  T428]  ? lockdep_hardirqs_on_prepare+0x3d0/0x3d0
[  809.805567][  T428]  lock_acquire+0x1c8/0x820
[  809.805568][  T428]  ? down_trylock+0xe/0x70
[  809.805569][  T428]  ? rcu_read_unlock+0x40/0x40
[  809.805570][  T428]  ? vprintk_emit+0x89/0x2c0
[  809.805571][  T428]  ? lock_downgrade+0x700/0x700
[  809.805572][  T428]  ? rcu_read_unlock+0x40/0x40
[  809.805573][  T428]  ? vprintk_emit+0x107/0x2c0
[  809.805575][  T428]  _raw_spin_lock_irqsave+0x30/0x50
[  809.805576][  T428]  ? down_trylock+0xe/0x70
[  809.805577][  T428]  down_trylock+0xe/0x70
[  809.805578][  T428]  __down_trylock_console_sem+0x23/0x90
[  809.805578][  T428]  console_trylock+0xe/0x60
[  809.805579][  T428]  vprintk_emit+0x107/0x2c0
[  809.805580][  T428]  ? sched_cpu_dying+0x10f/0x130
[  809.805581][  T428]  printk+0x9a/0xc0
[  809.805582][  T428]  ? record_print_text.cold.38+0x11/0x11
[  809.805583][  T428]  report_bug.cold.2+0x30/0x52
[  809.805584][  T428]  handle_bug+0x44/0x80
[  809.805585][  T428]  exc_invalid_op+0x13/0x40
[  809.805585][  T428]  asm_exc_invalid_op+0x12/0x20
[  809.805586][  T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[  809.805588][  T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
[  809.805589][  T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
[  809.805591][  T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
[  809.805591][  T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
[  809.805593][  T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
[  809.805594][  T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
[  809.805594][  T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
[  809.805595][  T428]  ? cpuhp_invoke_callback+0x1520/0x1520
[  809.805596][  T428]  ? prandom_u32+0x18d/0x390
[  809.805597][  T428]  ? x86_pmu_starting_cpu+0x20/0x20
[  809.805598][  T428]  ? sched_cpu_wait_empty+0x220/0x220
[  809.805599][  T428]  cpuhp_invoke_callback+0x1d8/0x1520
[  809.805600][  T428]  ? x2apic_send_IPI_mask+0x10/0x10
[  809.805600][  T428]  ? clear_local_APIC+0x788/0xc10
[  809.805601][  T428]  ? cpuhp_invoke_callback+0x1520/0x1520
[  809.805602][  T428]  take_cpu_down+0x10f/0x1a0
[  809.805603][  T428]  multi_cpu_stop+0x149/0x350
[  809.805603][  T428]  ? stop_machine_yield+0x10/0x10
[  809.805604][  T428]  cpu_stopper_thread+0x200/0x400
[  809.805605][  T428]  ? cpu_stop_create+0x70/0x70
[  809.805606][  T428]  smpboot_thread_fn+0x30a/0x770
[  809.805607][  T428]  ? smpboot_register_percpu_thread+0x370/0x370
[  809.805607][  T428]  ? trace_hardirqs_on+0x1c/0x150
[  809.805608][  T428]  ? __kthread_parkme+0xcc/0x1a0
[  809.805609][  T428]  ? smpboot_register_percpu_thread+0x370/0x370
[  809.805610][  T428]  kthread+0x352/0x420
[  809.805611][  T428]  ? kthread_create_on_node+0xc0/0xc0
[  809.805612][  T428]  ret_from_fork+0x22/0x30
[  809.805613][  T428] ======================================================
[  809.805614][  T428] WARNING: possible circular locking dependency detected
[  809.805616][  T428] 5.10.0-rc3-next-20201113+ #1 Tainted: G          I      
[  809.805617][  T428] ------------------------------------------------------
[  809.805618][  T428] migration/72/428 is trying to acquire lock:
[  809.805619][  T428] ffffffff962c7d58 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xe/0x70
[  809.805623][  T428] 
[  809.805623][  T428] but task is already holding lock:
[  809.805624][  T428] ffff8887dfcb23d8 (&rq->lock){-.-.}-{2:2}, at: sched_cpu_dying+0x4e/0x130
[  809.805628][  T428] 
[  809.805628][  T428] which lock already depends on the new lock.
[  809.805629][  T428] 
[  809.805630][  T428] 
[  809.805631][  T428] the existing dependency chain (in reverse order) is:
[  809.805631][  T428] 
[  809.805632][  T428] -> #2 (&rq->lock){-.-.}-{2:2}:
[  809.805635][  T428]        lock_acquire+0x1c8/0x820
[  809.805636][  T428]        _raw_spin_lock+0x27/0x40
[  809.805637][  T428]        task_fork_fair+0x32/0x4c0
[  809.805637][  T428]        sched_fork+0x3aa/0x8c0
[  809.805638][  T428]        copy_process+0x1c87/0x6440
[  809.805639][  T428]        kernel_clone+0xbd/0xc90
[  809.805640][  T428]        kernel_thread+0x95/0xd0
[  809.805640][  T428]        rest_init+0x21/0x28a
[  809.805641][  T428]        start_kernel+0x381/0x39f
[  809.805642][  T428]        secondary_startup_64_no_verify+0xc2/0xcb
[  809.805642][  T428] 
[  809.805643][  T428] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[  809.805646][  T428]        lock_acquire+0x1c8/0x820
[  809.805647][  T428]        _raw_spin_lock_irqsave+0x30/0x50
[  809.805648][  T428]        try_to_wake_up+0x9b/0xc40
[  809.805648][  T428]        up+0x8d/0xd0
[  809.805649][  T428]        __up_console_sem+0x29/0x60
[  809.805650][  T428]      0x89/0x2c0
[  809.805726][  T428]  ? lock_downgrade+0x700/0x700
[  809.805727][  T428]  ? rcu_read_unlock+0x40/0x40
[  809.805727][  T428]  ? vprintk_emit+0x107/0x2c0
[  809.805728][  T428]  _raw_spin_lock_irqsave+0x30/0x50
[  809.805729][  T428]  ? down_trylock+0xe/0x70
[  809.805729][  T428]  down_trylock+0xe/0x70
[  809.805730][  T428]  __down_trylock_console_sem+0x23/0x90
[  809.805731][  T428]  console_trylock+0xe/0x60
[  809.805732][  T428]  vprintk_emit+0x107/0x2c0
[  809.805733][  T428]  ? sched_cpu_dying+0x10f/0x130
[  809.805733][  T428]  printk+0x9a/0xc0
[  809.805734][  T428]  ? record_print_text.cold.38+0x11/0x11
[  809.805735][  T428]  report_bug.cold.2+0x30/0x52
[  809.805735][  T428]  handle_bug+0x44/0x80
[  809.805736][  T428]  exc_invalid_op+0x13/0x40
[  809.805737][  T428]  asm_exc_invalid_op+0x12/0x20
[  809.805738][  T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[  809.805739][  T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48

>  	rq_unlock_irqrestore(rq, &rf);
>  
>  	calc_load_migrate(rq);



^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-11-13 15:06   ` [PATCH v4 11/19] " Qian Cai
@ 2020-11-17 19:28     ` Valentin Schneider
  2020-11-18 14:44       ` Qian Cai
                         ` (2 more replies)
  0 siblings, 3 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-11-17 19:28 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210


On 13/11/20 15:06, Qian Cai wrote:
> On Fri, 2020-10-23 at 12:12 +0200, Peter Zijlstra wrote:
[...]
>> @@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
>>  	sched_tick_stop(cpu);
>>  
>>  	rq_lock_irqsave(rq, &rf);
>> -	BUG_ON(rq->nr_running != 1);
>> +	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
>
> CPU hotplug is now triggering this. This is with Valentin's affine_move_task()
> fix on top:
>
> https://lore.kernel.org/lkml/20201113112414.2569-1-valentin.schneider@arm.com/
>
> [  809.412232][  T428] kernel BUG at kernel/sched/core.c:7547!
> [  809.417841][  T428] invalid opcode: 0000 [#1] SMP KASAN PTI
> [  809.423445][  T428] CPU: 72 PID: 428 Comm: migration/72 Tainted: G          I       5.10.0-rc3-next-20201113+ #1
> [  809.433678][  T428] Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 11/13/2019
> [  809.442951][  T428] Stopper: multi_cpu_stop+0x0/0x350 <- 0x0
> [  809.448643][  T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
> [  809.454071][  T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
> [  809.473650][  T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
> [  809.479606][  T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
> [  809.487482][  T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
> [  809.495355][  T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
> [  809.503229][  T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
> [  809.511103][  T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
> [  809.518979][  T428] FS:  0000000000000000(0000) GS:ffff8887dfc80000(0000) knlGS:0000000000000000
> [  809.527815][  T428] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  809.534291][  T428] CR2: 00007fea4cdf899c CR3: 00000018c7414002 CR4: 00000000007706e0
> [  809.542165][  T428] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  809.550040][  T428] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  809.557913][  T428] PKRU: 55555554
> [  809.561332][  T428] Call Trace:
> [  809.564489][  T428]  ? x86_pmu_starting_cpu+0x20/0x20
> [  809.569570][  T428]  ? sched_cpu_wait_empty+0x220/0x220
> [  809.574826][  T428]  cpuhp_invoke_callback+0x1d8/0x1520
> [  809.580082][  T428]  ? x2apic_send_IPI_mask+0x10/0x10
> [  809.585161][  T428]  ? clear_local_APIC+0x788/0xc10
> [  809.590068][  T428]  ? cpuhp_invoke_callback+0x1520/0x1520
> [  809.595584][  T428]  take_cpu_down+0x10f/0x1a0
> [  809.600053][  T428]  multi_cpu_stop+0x149/0x350
> [  809.604607][  T428]  ? stop_machine_yield+0x10/0x10
> [  809.609511][  T428]  cpu_stopper_thread+0x200/0x400
> [  809.614416][  T428]  ? cpu_stop_create+0x70/0x70
> [  809.619059][  T428]  smpboot_thread_fn+0x30a/0x770
> [  809.623878][  T428]  ? smpboot_register_percpu_thread+0x370/0x370
> [  809.630005][  T428]  ? trace_hardirqs_on+0x1c/0x150
> [  809.634910][  T428]  ? __kthread_parkme+0xcc/0x1a0
> [  809.639729][  T428]  ? smpboot_register_percpu_thread+0x370/0x370
> [  809.645855][  T428]  kthread+0x352/0x420
> [  809.649798][  T428]  ? kthread_create_on_node+0xc0/0xc0
> [  809.655052][  T428]  ret_from_fork+0x22/0x30
> [  809.659345][  T428] Modules linked in: nls_ascii nls_cp437 vfat fat kvm_intel kvm ses enclosure irqbypass efivarfs ip_tables x_tables sd_mod nvme tg3 firmware_class smartpqi nvme_core scsi_transport_sas libphy dm_mirror dm_region_hash dm_log dm_mod


We did have some breakage in that area, but all the holes I was aware of
have been plugged. What would help here is to see which tasks are still
queued on that outgoing CPU, and their recent activity.

Something like
- ftrace_dump_on_oops on your kernel cmdline
- trace-cmd start -e 'sched:*'
 <start the test here>

ought to do it. Then you can paste the (tail of the) ftrace dump.

I also had this lying around, which may or may not be of some help:
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6aaf9fb3400..c4a4cb8b47a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7534,7 +7534,25 @@ int sched_cpu_dying(unsigned int cpu)
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
+
+	if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+		struct task_struct *g, *p;
+
+		pr_crit("CPU%d nr_running=%d\n", cpu, rq->nr_running);
+		rcu_read_lock();
+		for_each_process_thread(g, p) {
+			if (task_cpu(p) != cpu)
+				continue;
+
+			if (!task_on_rq_queued(p))
+				continue;
+
+			pr_crit("\tp=%s\n", p->comm);
+		}
+		rcu_read_unlock();
+		BUG();
+	}
+
 	rq_unlock_irqrestore(rq, &rf);
 
 	calc_load_migrate(rq);

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-11-17 19:28     ` Valentin Schneider
@ 2020-11-18 14:44       ` Qian Cai
  2020-11-23 18:13         ` Sebastian Andrzej Siewior
  2020-12-04  0:23       ` Qian Cai
  2020-12-04 21:19       ` Qian Cai
  2 siblings, 1 reply; 81+ messages in thread
From: Qian Cai @ 2020-11-18 14:44 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> We did have some breakage in that area, but all the holes I was aware of
> have been plugged. What would help here is to see which tasks are still
> queued on that outgoing CPU, and their recent activity.
> 
> Something like
> - ftrace_dump_on_oops on your kernel cmdline
> - trace-cmd start -e 'sched:*'
>  <start the test here>
> 
> ought to do it. Then you can paste the (tail of the) ftrace dump.
> 
> I also had this lying around, which may or may not be of some help:

Once I have found a reliable reproducer, I'll report back.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* [tip: sched/core] sched/core: Add missing completion for affine_move_task() waiters
  2020-11-12 16:38   ` [PATCH v4 10/19] " Qian Cai
  2020-11-12 17:26     ` Valentin Schneider
@ 2020-11-20 12:34     ` tip-bot2 for Valentin Schneider
  1 sibling, 0 replies; 81+ messages in thread
From: tip-bot2 for Valentin Schneider @ 2020-11-20 12:34 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Qian Cai, Valentin Schneider, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     d707faa64d03d26b529cc4aea59dab1b016d4d33
Gitweb:        https://git.kernel.org/tip/d707faa64d03d26b529cc4aea59dab1b016d4d33
Author:        Valentin Schneider <valentin.schneider@arm.com>
AuthorDate:    Fri, 13 Nov 2020 11:24:14 
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 19 Nov 2020 11:25:45 +01:00

sched/core: Add missing completion for affine_move_task() waiters

Qian reported that some fuzzer issuing sched_setaffinity() ends up stuck on
a wait_for_completion(). The problematic pattern seems to be:

  affine_move_task()
      // task_running() case
      stop_one_cpu();
      wait_for_completion(&pending->done);

Combined with, on the stopper side:

  migration_cpu_stop()
    // Task moved between unlocks and scheduling the stopper
    task_rq(p) != rq &&
    // task_running() case
    dest_cpu >= 0

    => no complete_all()

This can happen with both PREEMPT and !PREEMPT, although !PREEMPT should
be more likely to see this given the targeted task has a much bigger window
to block and be woken up elsewhere before the stopper runs.

Make migration_cpu_stop() always look at pending affinity requests; signal
their completion if the stopper hits a rq mismatch but the task is
still within its allowed mask. When Migrate-Disable isn't involved, this
matches the previous set_cpus_allowed_ptr() vs migration_cpu_stop()
behaviour.

Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
Reported-by: Qian Cai <cai@redhat.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/lkml/8b62fd1ad1b18def27f18e2ee2df3ff5b36d0762.camel@redhat.com
---
 kernel/sched/core.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6aaf9f..4d1fd4b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
 		else
 			p->wake_cpu = dest_cpu;
 
-	} else if (dest_cpu < 0) {
+	} else if (dest_cpu < 0 || pending) {
 		/*
 		 * This happens when we get migrated between migrate_enable()'s
 		 * preempt_enable() and scheduling the stopper task. At that
@@ -1934,6 +1934,17 @@ static int migration_cpu_stop(void *data)
 		 */
 
 		/*
+		 * The task moved before the stopper got to run. We're holding
+		 * ->pi_lock, so the allowed mask is stable - if it got
+		 * somewhere allowed, we're done.
+		 */
+		if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+			p->migration_pending = NULL;
+			complete = true;
+			goto out;
+		}
+
+		/*
 		 * When this was migrate_enable() but we no longer have an
 		 * @pending, a concurrent SCA 'fixed' things and we should be
 		 * valid again. Nothing to do.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-11-18 14:44       ` Qian Cai
@ 2020-11-23 18:13         ` Sebastian Andrzej Siewior
  2020-12-02 21:59           ` Qian Cai
  2020-12-03 12:31           ` Qian Cai
  0 siblings, 2 replies; 81+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-11-23 18:13 UTC (permalink / raw)
  To: Qian Cai
  Cc: Valentin Schneider, Peter Zijlstra, tglx, mingo, linux-kernel,
	qais.yousef, swood, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

On 2020-11-18 09:44:34 [-0500], Qian Cai wrote:
> On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> > We did have some breakage in that area, but all the holes I was aware of
> > have been plugged. What would help here is to see which tasks are still
> > queued on that outgoing CPU, and their recent activity.
> > 
> > Something like
> > - ftrace_dump_on_oops on your kernel cmdline
> > - trace-cmd start -e 'sched:*'
> >  <start the test here>
> > 
> > ought to do it. Then you can paste the (tail of the) ftrace dump.
> > 
> > I also had this lying around, which may or may not be of some help:
> 
> Once I have found a reliable reproducer, I'll report back.

any update?

Sebastian

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-11-23 18:13         ` Sebastian Andrzej Siewior
@ 2020-12-02 21:59           ` Qian Cai
  2020-12-03 12:31           ` Qian Cai
  1 sibling, 0 replies; 81+ messages in thread
From: Qian Cai @ 2020-12-02 21:59 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Valentin Schneider, Peter Zijlstra, tglx, mingo, linux-kernel,
	qais.yousef, swood, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

On Mon, 2020-11-23 at 19:13 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-18 09:44:34 [-0500], Qian Cai wrote:
> > On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> > > We did have some breakage in that area, but all the holes I was aware of
> > > have been plugged. What would help here is to see which tasks are still
> > > queued on that outgoing CPU, and their recent activity.
> > > 
> > > Something like
> > > - ftrace_dump_on_oops on your kernel cmdline
> > > - trace-cmd start -e 'sched:*'
> > >  <start the test here>
> > > 
> > > ought to do it. Then you can paste the (tail of the) ftrace dump.
> > > 
> > > I also had this lying around, which may or may not be of some help:
> > 
> > Once I have found a reliable reproducer, I'll report back.
> 
> any update?

Just back from a vacation. I have been running the same workload on today's
linux-next for a few hours and it has been good so far. I'll surely report back
if it happens again in our daily runs.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-11-23 18:13         ` Sebastian Andrzej Siewior
  2020-12-02 21:59           ` Qian Cai
@ 2020-12-03 12:31           ` Qian Cai
  1 sibling, 0 replies; 81+ messages in thread
From: Qian Cai @ 2020-12-03 12:31 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Valentin Schneider, Peter Zijlstra, tglx, mingo, linux-kernel,
	qais.yousef, swood, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort, tj, ouwen210

On Mon, 2020-11-23 at 19:13 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-18 09:44:34 [-0500], Qian Cai wrote:
> > On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> > > We did have some breakage in that area, but all the holes I was aware of
> > > have been plugged. What would help here is to see which tasks are still
> > > queued on that outgoing CPU, and their recent activity.
> > > 
> > > Something like
> > > - ftrace_dump_on_oops on your kernel cmdline
> > > - trace-cmd start -e 'sched:*'
> > >  <start the test here>
> > > 
> > > ought to do it. Then you can paste the (tail of the) ftrace dump.
> > > 
> > > I also had this laying around, which may or may not be of some help:
> > 
> > Once I have found a reliable reproducer, I'll report back.
> 
> any update?

Hmm, the bug is still there after running a bit longer. Let me apply Valentin's
patch and set up ftrace to try to catch it again.

[ 6152.085915][   T61] kernel BUG at kernel/sched/core.c:7594!
[ 6152.091523][   T61] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 6152.097126][   T61] CPU: 10 PID: 61 Comm: migration/10 Tainted: G          IO      5.10.0-rc6-next-20201201+ #1
[ 6152.107272][   T61] Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 11/13/2019
[ 6152.116545][   T61] Stopper: multi_cpu_stop+0x0/0x350 <- 0x0
[ 6152.122237][   T61] RIP: 0010:sched_cpu_dying+0x14f/0x180
[ 6152.127667][   T61] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 5f 15 a1 e8 1b e5 4d 00 f0 4c 01 25 63 c8 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 f6 e0 4d 00 48 8b 34 24 e9 1e ff ff ff 48 89
[ 6152.147248][   T61] RSP: 0018:ffffc90006fbfca0 EFLAGS: 00010002
[ 6152.153202][   T61] RAX: 000000000000723d RBX: ffff8887dfab2400 RCX: 1ffff110fbf56488
[ 6152.161076][   T61] RDX: 0000000000000000 RSI: 000000000000723d RDI: ffff8887dfab2440
[ 6152.168950][   T61] RBP: ffffc90006fbfcc0 R08: fffffbfff417923d R09: fffffbfff417923d
[ 6152.176824][   T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.184698][   T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.192574][   T61] FS:  0000000000000000(0000) GS:ffff8887dfa80000(0000) knlGS:0000000000000000
[ 6152.201409][   T61] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6152.207886][   T61] CR2: 000055fed2192f58 CR3: 0000000cb7e14006 CR4: 00000000007706e0
[ 6152.215761][   T61] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6152.223636][   T61] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6152.231509][   T61] PKRU: 55555554
[ 6152.234928][   T61] Call Trace:
[ 6152.238086][   T61]  ? x86_pmu_starting_cpu+0x20/0x20
[ 6152.243166][   T61]  ? sched_cpu_wait_empty+0x290/0x290
[ 6152.248422][   T61]  cpuhp_invoke_callback+0x1d8/0x1520
[ 6152.253677][   T61]  ? x2apic_send_IPI_mask+0x10/0x10
[ 6152.258758][   T61]  ? clear_local_APIC+0x788/0xc10
[ 6152.263663][   T61]  ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.269178][   T61]  take_cpu_down+0x10f/0x1a0
[ 6152.273646][   T61]  multi_cpu_stop+0x149/0x350
[ 6152.278201][   T61]  ? stop_machine_yield+0x10/0x10
[ 6152.283106][   T61]  cpu_stopper_thread+0x200/0x400
[ 6152.288012][   T61]  ? cpu_stop_create+0x70/0x70
[ 6152.292655][   T61]  smpboot_thread_fn+0x30a/0x770
[ 6152.297472][   T61]  ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.303600][   T61]  ? trace_hardirqs_on+0x1c/0x150
[ 6152.308504][   T61]  ? __kthread_parkme+0xcc/0x1a0
[ 6152.313321][   T61]  ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.319447][   T61]  kthread+0x354/0x420
[ 6152.323390][   T61]  ? kthread_create_on_node+0xc0/0xc0
[ 6152.328645][   T61]  ret_from_fork+0x22/0x30
[ 6152.332938][   T61] Modules linked in: isofs cdrom fuse loop nls_ascii nls_cp437 vfat fat kvm_intel kvm irqbypass ses enclosure efivarfs ip_tables x_tables sd_mod tg3 nvme firmware_class smartpqi nvme_core libphy scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod [last unloaded: dummy_del_mod]
[ 6152.359732][   T61] ---[ end trace f59b31dec044f746 ]---
[ 6152.365076][   T61] RIP: 0010:sched_cpu_dying+0x14f/0x180
[ 6152.370505][   T61] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 5f 15 a1 e8 1b e5 4d 00 f0 4c 01 25 63 c8 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 f6 e0 4d 00 48 8b 34 24 e9 1e ff ff ff 48 89
[ 6152.390085][   T61] RSP: 0018:ffffc90006fbfca0 EFLAGS: 00010002
[ 6152.396039][   T61] RAX: 000000000000723d RBX: ffff8887dfab2400 RCX: 1ffff110fbf56488
[ 6152.403914][   T61] RDX: 0000000000000000 RSI: 000000000000723d RDI: ffff8887dfab2440
[ 6152.411789][   T61] RBP: ffffc90006fbfcc0 R08: fffffbfff417923d R09: fffffbfff417923d
[ 6152.419662][   T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.427537][   T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.435412][   T61] FS:  0000000000000000(0000) GS:ffff8887dfa80000(0000) knlGS:0000000000000000
[ 6152.444248][   T61] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6152.450725][   T61] CR2: 000055fed2192f58 CR3: 0000000cb7e14006 CR4: 00000000007706e0
[ 6152.458600][   T61] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6152.466476][   T61] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6152.474350][   T61] PKRU: 55555554
[ 6152.477771][   T61] Kernel panic - not syncing: Fatal exception
[ 6152.483729][   T61] 
[ 6152.483731][   T61] unchecked MSR access error: WRMSR to 0x83f (tried to write 0x00000000000000f6) at rIP: 0xffffffff97ac8b93 (native_apic_msr_write+0x23/0x40)
[ 6152.483732][   T61] Call Trace:
[ 6152.483733][   T61]  arch_irq_work_raise+0x9b/0x110
[ 6152.483733][   T61]  irq_work_queue+0x24/0x40
[ 6152.483734][   T61]  printk_safe_log_store+0x185/0x1b0
[ 6152.483735][   T61]  ? kmsg_dump_rewind_nolock+0x80/0x80
[ 6152.483736][   T61]  ? ret_from_fork+0x22/0x30
[ 6152.483736][   T61]  printk+0x9a/0xc0
[ 6152.483737][   T61]  ? record_print_text.cold.37+0x11/0x11
[ 6152.483739][   T61]  ? stack_trace_consume_entry+0x160/0x160
[ 6152.483740][   T61]  ? save_trace+0x3d/0xc40
[ 6152.483741][   T61]  print_circular_bug_header.cold.69+0x10/0xd7
[ 6152.483742][   T61]  print_circular_bug.isra.42+0x230/0x430
[ 6152.483743][   T61]  check_noncircular+0x27b/0x320
[ 6152.483745][   T61]  ? print_circular_bug.isra.42+0x430/0x430
[ 6152.483746][   T61]  ? mark_lock.part.47+0x109/0x1f90
[ 6152.483747][   T61]  ? print_usage_bug+0x2b0/0x2b0
[ 6152.483748][   T61]  ? mark_lock.part.47+0x109/0x1f90
[ 6152.483749][   T61]  check_prevs_add+0x38e/0x2800
[ 6152.483749][   T61]  ? lock_is_held_type+0x19/0xe0
[ 6152.483750][   T61]  ? check_irq_usage+0xcf0/0xcf0
[ 6152.483751][   T61]  ? rcu_read_lock_held+0xb0/0xb0
[ 6152.483752][   T61]  __lock_acquire+0x2c86/0x3e60
[ 6152.483753][   T61]  ? __add_preferred_console.constprop.25+0x220/0x220
[ 6152.483753][   T61]  ? lockdep_hardirqs_on_prepare+0x3d0/0x3d0
[ 6152.483754][   T61]  lock_acquire+0x1c8/0x820
[ 6152.483755][   T61]  ? down_trylock+0xe/0x70
[ 6152.483756][   T61]  ? rcu_read_unlock+0x40/0x40
[ 6152.483757][   T61]  ? vprintk_emit+0x89/0x2c0
[ 6152.483758][   T61]  ? lock_downgrade+0x700/0x700
[ 6152.483759][   T61]  ? rcu_read_unlock+0x40/0x40
[ 6152.483760][   T61]  ? vprintk_emit+0x107/0x2c0
[ 6152.483761][   T61]  _raw_spin_lock_irqsave+0x30/0x50
[ 6152.483761][   T61]  ? down_trylock+0xe/0x70
[ 6152.483762][   T61]  down_trylock+0xe/0x70
[ 6152.483763][   T61]  __down_trylock_console_sem+0x23/0x90
[ 6152.483764][   T61]  console_trylock+0xe/0x60
[ 6152.483764][   T61]  vprintk_emit+0x107/0x2c0
[ 6152.483765][   T61]  ? sched_cpu_dying+0x14f/0x180
[ 6152.483765][   T61]  printk+0x9a/0xc0
[ 6152.483766][   T61]  ? record_print_text.cold.37+0x11/0x11
[ 6152.483767][   T61]  report_bug.cold.2+0x30/0x52
[ 6152.483768][   T61]  handle_bug+0x44/0x80
[ 6152.483769][   T61]  exc_invalid_op+0x13/0x40
[ 6152.483769][   T61]  asm_exc_invalid_op+0x12/0x20
[ 6152.483770][   T61] RIP: 0010:sched_cpu_dying+0x14f/0x180
[ 6152.483772][   T61] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 5f 15 a1 e8 1b e5 4d 00 f0 4c 01 25 63 c8 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 f6 e0 4d 00 48 8b 34 24 e9 1e ff ff ff 48 89
[ 6152.483774][   T61] RSP: 0018:ffffc90006fbfca0 EFLAGS: 00010002
[ 6152.483775][   T61] RAX: 000000000000723d RBX: ffff8887dfab2400 RCX: 1ffff110fbf56488
[ 6152.483777][   T61] RDX: 0000000000000000 RSI: 000000000000723d RDI: ffff8887dfab2440
[ 6152.483778][   T61] RBP: ffffc90006fbfcc0 R08: fffffbfff417923d R09: fffffbfff417923d
[ 6152.483779][   T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.483780][   T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.483781][   T61]  ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483781][   T61]  ? x86_pmu_starting_cpu+0x20/0x20
[ 6152.483782][   T61]  ? sched_cpu_wait_empty+0x290/0x290
[ 6152.483783][   T61]  cpuhp_invoke_callback+0x1d8/0x1520
[ 6152.483784][   T61]  ? x2apic_send_IPI_mask+0x10/0x10
[ 6152.483784][   T61]  ? clear_local_APIC+0x788/0xc10
[ 6152.483785][   T61]  ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483786][   T61]  take_cpu_down+0x10f/0x1a0
[ 6152.483786][   T61]  multi_cpu_stop+0x149/0x350
[ 6152.483787][   T61]  ? stop_machine_yield+0x10/0x10
[ 6152.483788][   T61]  cpu_stopper_thread+0x200/0x400
[ 6152.483789][   T61]  ? cpu_stop_create+0x70/0x70
[ 6152.483789][   T61]  smpboot_thread_fn+0x30a/0x770
[ 6152.483790][   T61]  ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.483791][   T61]  ? trace_hardirqs_on+0x1c/0x150
[ 6152.483792][   T61]  ? __kthread_parkme+0xcc/0x1a0
[ 6152.483792][   T61]  ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.483793][   T61]  kthread+0x354/0x420
[ 6152.483794][   T61]  ? kthread_create_on_node+0xc0/0xc0
[ 6152.483794][   T61]  ret_from_fork+0x22/0x30
[ 6152.483796][   T61] ======================================================
[ 6152.483797][   T61] WARNING: possible circular locking dependency detected
[ 6152.483798][   T61] 5.10.0-rc6-next-20201201+ #1 Tainted: G          IO     
[ 6152.483799][   T61] ------------------------------------------------------
[ 6152.483800][   T61] migration/10/61 is trying to acquire lock:
[ 6152.483801][   T61] ffffffffa08c7e98 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xe/0x70
[ 6152.483805][   T61] 
[ 6152.483805][   T61] but task is already holding lock:
[ 6152.483806][   T61] ffff8887dfab2418 (&rq->lock){-.-.}-{2:2}, at: sched_cpu_dying+0x4e/0x180
[ 6152.483809][   T61] 
[ 6152.483810][   T61] which lock already depends on the new lock.
[ 6152.483810][   T61] 
[ 6152.483811][   T61] 
[ 6152.483812][   T61] the existing dependency chain (in reverse order) is:
[ 6152.483812][   T61] 
[ 6152.483813][   T61] -> #2 (&rq->lock){-.-.}-{2:2}:
[ 6152.483815][   T61]        lock_acquire+0x1c8/0x820
[ 6152.483816][   T61]        _raw_spin_lock+0x27/0x40
[ 6152.483817][   T61]        task_fork_fair+0x32/0x4c0
[ 6152.483818][   T61]        sched_fork+0x3b3/0x8d0
[ 6152.483818][   T61]        copy_process+0x1c89/0x6470
[ 6152.483819][   T61]        kernel_clone+0xbd/0xc90
[ 6152.483820][   T61]        kernel_thread+0x95/0xd0
[ 6152.483820][   T61]        rest_init+0x21/0x28a
[ 6152.483821][   T61]        start_kernel+0x37c/0x39a
[ 6152.483822][   T61]        secondary_startup_64_no_verify+0xc2/0xcb
[ 6152.483822][   T61] 
[ 6152.483823][   T61] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[ 6152.483826][   T61]        lock_acquire+0x1c8/0x820
[ 6152.483827][   T61]        _raw_spin_lock_irqsave+0x30/0x50
[ 6152.483827][   T61]        try_to_wake_up+0x9c/0xf90
[ 6152.483828][   T61]        up+0x8d/0xd0
[ 6152.483829][   T61]        __up_console_sem+0x29/0x60
[ 6152.483830][   T61]        console_unlock+0x581/0xa20
[ 6152.483830][   T61]        vprintk_emit+0x201/0x2c0
[ 6152.483831][   T61]        printk+0x9a/0xc0
[ 6152.483832][   T61]        do_exit.cold.38+0x55/0x1e5
[ 6152.483832][   T61]        do_group_exit+0xeb/0x2d0
[ 6152.483833][   T61]        __x64_sys_exit_group+0x35/0x40
[ 6152.483834][   T61]        do_syscall_64+0x33/0x40
[ 6152.483835][   T61]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 6152.483835][   T61] 
[ 6152.483836][   T61] -> #0 ((console_sem).lock){-.-.}-{2:2}:
[ 6152.483839][   T61]        check_prevs_add+0x38e/0x2800
[ 6152.483839][   T61]        __lock_acquire+0x2c86/0x3e60
[ 6152.483840][   T61]        lock_acquire+0x1c8/0x820
[ 6152.483841][   T61]        _raw_spin_lock_irqsave+0x30/0x50
[ 6152.483842][   T61]        down_trylock+0xe/0x70
[ 6152.483843][   T61]        __down_trylock_console_sem+0x23/0x90
[ 6152.483844][   T61]        console_trylock+0xe/0x60
[ 6152.483845][   T61]        vprintk_emit+0x107/0x2c0
[ 6152.483846][   T61]        printk+0x9a/0xc0
[ 6152.483847][   T61]        report_bug.cold.2+0x30/0x52
[ 6152.483848][   T61]        handle_bug+0x44/0x80
[ 6152.483849][   T61]        exc_invalid_][   T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.483920][   T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.483920][   T61]  ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483921][   T61]  ? x86_pmu_starting_cpu+0x20/0x20
[ 6152.483922][   T61]  ? sched_cpu_wait_empty+0x290/0x290
[ 6152.483923][   T61]  cpuhp_invoke_callback+0x1d8/0x1520
[ 6152.483923][   T61]  ? x2apic_send_IPI_mask+0x10/0x10
[ 6152.483924][   T61]  ? clear_local_APIC+0x788/0xc10
[ 6152.483925][   T61]  ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483925][   T61]  take_cpu_down+0x10f/0x1a0
[ 6152.483926][   T61]  multi_cpu_stop+0x149/0x350
[ 6152.483927][   T61]  ? stop_machine_yield+0x10/0x10
[ 6152.483928][   T61]  cpu_stopper_thread+0x200/0x400
[ 6152.483929][   T61]  ? cpu_stop_create+0x70/0x70
[ 6152.483930][   T61]  smpboot_thread_fn+0x30a/0x770
[ 6152.483931][   T61]  ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.483932][   T61]  ? trace_hardirqs_on+0x1c/0x150


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-11-17 19:28     ` Valentin Schneider
  2020-11-18 14:44       ` Qian Cai
@ 2020-12-04  0:23       ` Qian Cai
  2020-12-04 21:19       ` Qian Cai
  2 siblings, 0 replies; 81+ messages in thread
From: Qian Cai @ 2020-12-04  0:23 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

FYI, it did crash on arm64 (Thunder X2) as well, so I'll re-run to gather more
information too.

.config: https://cailca.coding.net/public/linux/mm/git/files/master/arm64.config

[20370.682747][T77637] psci: CPU123 killed (polled 0 ms) 
[20370.823651][  T635] IRQ 43: no longer affine to CPU124 
[20370.828862][  T635] IRQ 49: no longer affine to CPU124 
[20370.834072][  T635] IRQ 60: no longer affine to CPU124 
[20370.839517][  T635] IRQ 94: no longer affine to CPU124 
[20370.845778][T77637] CPU124: shutdown 
[20370.861891][T77637] psci: CPU124 killed (polled 10 ms) 
[20371.425434][T77637] CPU125: shutdown 
[20371.441464][T77637] psci: CPU125 killed (polled 10 ms) 
[20371.984072][T77637] CPU126: shutdown 
[20372.000057][T77637] psci: CPU126 killed (polled 10 ms) 
[20372.223858][  T650] ------------[ cut here ]------------ 
[20372.229599][  T650] kernel BUG at kernel/sched/core.c:7594! 
[20372.235165][  T650] Internal error: Oops - BUG: 0 [#1] SMP 
[20372.240643][  T650] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_type1
vfio loop processor ip_tables x_tables sd_mod mlx5_core firmware_class ahci
libahci libata dm_mirror dm_region_hash dm_log dm_mod efivarfs 
[20372.259814][  T650] CPU: 127 PID: 650 Comm: migration/127 Tainted: G             L    5.10.0-rc6-next-20201203+ #5 
[20372.270152][  T650] Hardware name: HPE Apollo 70             /C01_APACHE_MB         , BIOS L50_5.13_1.16 07/29/2020 
[20372.280579][  T650] Stopper: multi_cpu_stop+0x0/0x390 <- 0x0 
[20372.286230][  T650] pstate: 20400089 (nzCv daIf +PAN -UAO -TCO BTYPE=--) 
[20372.292923][  T650] pc : sched_cpu_dying+0x198/0x1b8 
[20372.297879][  T650] lr : sched_cpu_dying+0x68/0x1b8 
[20372.302748][  T650] sp : ffff00001076fba0 
[20372.306747][  T650] x29: ffff00001076fba0 x28: 0000000000000000  
[20372.312751][  T650] x27: 0000000000000001 x26: ffff800011db3000  
[20372.318753][  T650] x25: ffff000e7bdd16a8 x24: 000000000000005a  
[20372.324754][  T650] x23: 000000000000007f x22: 0000000000000080  
[20372.330756][  T650] x21: 000000000000fab7 x20: ffff000e7be63818  
[20372.336757][  T650] x19: ffff000e7be63800 x18: 1fffe001cf7cb3ed  
[20372.342758][  T650] x17: 0000000000001308 x16: 0000000000000000  
[20372.348759][  T650] x15: 000000000001053f x14: 000000000001053f  
[20372.354761][  T650] x13: ffff6000020edf65 x12: 1fffe000020edf64  
[20372.360763][  T650] x11: 1fffe000020edf64 x10: ffff6000020edf64  
[20372.366764][  T650] x9 : dfff800000000000 x8 : ffff00001076fb23  
[20372.372766][  T650] x7 : 0000000000000001 x6 : 0000000000000001  
[20372.378767][  T650] x5 : 1fffe000020b9a0a x4 : dfff800000000000  
[20372.384769][  T650] x3 : dfff800000000000 x2 : 0000000000000003  
[20372.390770][  T650] x1 : ffff000e7be63840 x0 : 0000000000000002  
[20372.396771][  T650] Call trace: 
[20372.399905][  T650]  sched_cpu_dying+0x198/0x1b8 
[20372.404514][  T650]  cpuhp_invoke_callback+0x208/0x2bf0 
[20372.409730][  T650]  take_cpu_down+0x11c/0x1f0 
[20372.414165][  T650]  multi_cpu_stop+0x184/0x390 
[20372.418687][  T650]  cpu_stopper_thread+0x1f0/0x430 
[20372.423557][  T650]  smpboot_thread_fn+0x3a8/0x9c8 
[20372.428339][  T650]  kthread+0x3a0/0x448 
[20372.432253][  T650]  ret_from_fork+0x10/0x1c 
[20372.436517][  T650] Code: d65f03c0 911a82a2 140004fb 17ffffd9 (d4210000)  
[20372.443298][  T650] ---[ end trace c51d5b6889ec29a8 ]--- 
[20372.448602][  T650] Kernel panic - not syncing: Oops - BUG: Fatal exception 


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-11-17 19:28     ` Valentin Schneider
  2020-11-18 14:44       ` Qian Cai
  2020-12-04  0:23       ` Qian Cai
@ 2020-12-04 21:19       ` Qian Cai
  2020-12-05 18:37         ` Valentin Schneider
  2020-12-07 19:27         ` Valentin Schneider
  2 siblings, 2 replies; 81+ messages in thread
From: Qian Cai @ 2020-12-04 21:19 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> We did have some breakage in that area, but all the holes I was aware of
> have been plugged. What would help here is to see which tasks are still
> queued on that outgoing CPU, and their recent activity.
> 
> Something like
> - ftrace_dump_on_oops on your kernel cmdline
> - trace-cmd start -e 'sched:*'
>  <start the test here>
> 
> ought to do it. Then you can paste the (tail of the) ftrace dump.
> 
> I also had this laying around, which may or may not be of some help:

Okay, your patch did not help, since it can still be reproduced using this,

https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/hotplug/cpu_hotplug/functional/cpuhotplug04.sh

# while :; do cpuhotplug04.sh -l 1; done

The ftrace dump has too much output on this 256-CPU system, so I did not have
the patience to wait for it to finish after 15 minutes. But here is the log captured
so far (search for "kernel BUG" there).

http://people.redhat.com/qcai/console.log

> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a6aaf9fb3400..c4a4cb8b47a2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7534,7 +7534,25 @@ int sched_cpu_dying(unsigned int cpu)
>  	sched_tick_stop(cpu);
>  
>  	rq_lock_irqsave(rq, &rf);
> -	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
> +
> +	if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
> +		struct task_struct *g, *p;
> +
> +		pr_crit("CPU%d nr_running=%d\n", cpu, rq->nr_running);
> +		rcu_read_lock();
> +		for_each_process_thread(g, p) {
> +			if (task_cpu(p) != cpu)
> +				continue;
> +
> +			if (!task_on_rq_queued(p))
> +				continue;
> +
> +			pr_crit("\tp=%s\n", p->comm);
> +		}
> +		rcu_read_unlock();
> +		BUG();
> +	}
> +
>  	rq_unlock_irqrestore(rq, &rf);
>  
>  	calc_load_migrate(rq);
> 


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-12-04 21:19       ` Qian Cai
@ 2020-12-05 18:37         ` Valentin Schneider
  2020-12-06  1:17           ` Qian Cai
  2020-12-07 19:27         ` Valentin Schneider
  1 sibling, 1 reply; 81+ messages in thread
From: Valentin Schneider @ 2020-12-05 18:37 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210


On 04/12/20 21:19, Qian Cai wrote:
> On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
>> We did have some breakage in that area, but all the holes I was aware of
>> have been plugged. What would help here is to see which tasks are still
>> queued on that outgoing CPU, and their recent activity.
>>
>> Something like
>> - ftrace_dump_on_oops on your kernel cmdline
>> - trace-cmd start -e 'sched:*'
>>  <start the test here>
>>
>> ought to do it. Then you can paste the (tail of the) ftrace dump.
>>
>> I also had this laying around, which may or may not be of some help:
>
> Okay, your patch did not help, since it can still be reproduced using this,
>

It wasn't meant to fix this, only add some more debug prints :)

> https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/hotplug/cpu_hotplug/functional/cpuhotplug04.sh
>
> # while :; do cpuhotplug04.sh -l 1; done
>
> The ftrace dump has too much output on this 256-CPU system, so I did not have
> the patience to wait for it to finish after 15 minutes. But here is the log captured
> so far (search for "kernel BUG" there).
>
> http://people.redhat.com/qcai/console.log
>

From there I see:

[20798.166987][  T650] CPU127 nr_running=2
[20798.171185][  T650]  p=migration/127
[20798.175161][  T650]  p=kworker/127:1

so this might be another workqueue hurdle. This should be prevented by:

  06249738a41a ("workqueue: Manually break affinity on hotplug")

In any case, I'll give this a try on a TX2 next week and see where it gets
me.

Note that much earlier in your log, you have a softlockup on CPU127:

[   74.278367][  C127] watchdog: BUG: soft lockup - CPU#127 stuck for 23s! [swapper/0:1]

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-12-05 18:37         ` Valentin Schneider
@ 2020-12-06  1:17           ` Qian Cai
  0 siblings, 0 replies; 81+ messages in thread
From: Qian Cai @ 2020-12-06  1:17 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Sat, 2020-12-05 at 18:37 +0000, Valentin Schneider wrote:
> From there I see:
> 
> [20798.166987][  T650] CPU127 nr_running=2
> [20798.171185][  T650]  p=migration/127
> [20798.175161][  T650]  p=kworker/127:1
> 
> so this might be another workqueue hurdle. This should be prevented by:
> 
>   06249738a41a ("workqueue: Manually break affinity on hotplug")

Well, it was reproduced on the latest linux-next, which already includes that
commit.

> Note that much earlier in your log, you have a softlockup on CPU127:
> 
> [   74.278367][  C127] watchdog: BUG: soft lockup - CPU#127 stuck for 23s!
> [swapper/0:1]

That's something separate. It was there all the time.

https://lore.kernel.org/linux-acpi/20200929183444.25079-1-cai@redhat.com/


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-12-04 21:19       ` Qian Cai
  2020-12-05 18:37         ` Valentin Schneider
@ 2020-12-07 19:27         ` Valentin Schneider
  2020-12-08 13:46           ` Qian Cai
  1 sibling, 1 reply; 81+ messages in thread
From: Valentin Schneider @ 2020-12-07 19:27 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210


On 04/12/20 21:19, Qian Cai wrote:
> On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
>> We did have some breakage in that area, but all the holes I was aware of
>> have been plugged. What would help here is to see which tasks are still
>> queued on that outgoing CPU, and their recent activity.
>>
>> Something like
>> - ftrace_dump_on_oops on your kernel cmdline
>> - trace-cmd start -e 'sched:*'
>>  <start the test here>
>>
>> ought to do it. Then you can paste the (tail of the) ftrace dump.
>>
>> I also had this laying around, which may or may not be of some help:
>
> Okay, your patch did not help, since it can still be reproduced using this,
>
> https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/hotplug/cpu_hotplug/functional/cpuhotplug04.sh
>
> # while :; do cpuhotplug04.sh -l 1; done
>

Ok, can reproduce this on a TX2 on next-20201207. I didn't use your config,
I oldconfig'd my distro config and only modified it to CONFIG_PREEMPT_NONE.
Interestingly the BUG happens on CPU127 here too...

I'll start digging.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-12-07 19:27         ` Valentin Schneider
@ 2020-12-08 13:46           ` Qian Cai
  2020-12-09 19:16             ` Valentin Schneider
  0 siblings, 1 reply; 81+ messages in thread
From: Qian Cai @ 2020-12-08 13:46 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210

On Mon, 2020-12-07 at 19:27 +0000, Valentin Schneider wrote:
> Ok, can reproduce this on a TX2 on next-20201207. I didn't use your config,
> I oldconfig'd my distro config and only modified it to CONFIG_PREEMPT_NONE.
> Interestingly the BUG happens on CPU127 here too...

I think that number is totally random. For example, on this x86, it could happen
for CPU8 or CPU111.


^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-12-08 13:46           ` Qian Cai
@ 2020-12-09 19:16             ` Valentin Schneider
  0 siblings, 0 replies; 81+ messages in thread
From: Valentin Schneider @ 2020-12-09 19:16 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort, tj, ouwen210


On 08/12/20 13:46, Qian Cai wrote:
> On Mon, 2020-12-07 at 19:27 +0000, Valentin Schneider wrote:
>> Ok, can reproduce this on a TX2 on next-20201207. I didn't use your config,
>> I oldconfig'd my distro config and only modified it to CONFIG_PREEMPT_NONE.
>> Interestingly the BUG happens on CPU127 here too...
>
> I think that number is totally random. For example, on this x86, it could happen
> for CPU8 or CPU111.

Actually on the TX2 it seems to *always* happen on CPU127. Your hotplug
script sequentially offlines CPUs in increasing id values, so when CPU127
gets hotplugged it is the last online CPU of NUMA node 0.

I've been staring at traces collected via

  echo 2 > /proc/sys/kernel/ftrace_dump_on_oops
  trace-cmd start -e 'sched:*' -e 'cpuhp:*' -e 'workqueue:*'
  ./hotplug.sh

but it's still not entirely clear to me WTH is going on. I do see kworkers
getting their affinity reset in workqueue_offline_cpu(), but for some
reason there's a new one that wakes up on CPU127 sometime later. I haven't
been able to figure out where it comes from - it obviously isn't part of
the percpu worker pools, as it isn't handled during
workqueue_offline_cpu(), but it still ends up affined to a single CPU...

It looks something like this; traces are only from CPU127

  cpuhp:sched_cpu_wait_empty() # Resets the affinity of some kworker/127:x<2

  sched_switch(idle)

  sched_wakeup(kworker/127:2) # picks CPU127
  sched_switch(kworker/127:2)
  # maybe_create_worker() -> creates kworker/127:3
  sched_wakeup(kworker/127:3) # picks CPU127

  sched_switch(kworker/127:3)
  # maybe_create_worker() -> creates kworker/127:4
  sched_wakeup(kworker/127:4) # picks CPU127

  sched_switch(kworker/127:4)
  # maybe_create_worker() -> creates kworker/127:5
  sched_wakeup(kworker/127:5) # picks CPU127
  sched_wakeup(migration/127)

  sched_switch(migration/127)
  cpuhp:take_cpu_down()

  BUG

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2020-10-23 10:12 ` [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
  2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
@ 2020-12-26 13:54   ` Qais Yousef
  2021-03-05 14:56     ` Peter Zijlstra
  1 sibling, 1 reply; 81+ messages in thread
From: Qais Yousef @ 2020-12-26 13:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

Hi Peter

Apologies for the late comments on the patch.

On 10/23/20 12:12, Peter Zijlstra wrote:

[...]

> + * When a preempted task becomes elegible to run under the ideal model (IOW it
> + * becomes one of the M highest priority tasks), it might still have to wait
> + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> + * a reduction in bandwidth in the exact duration of the migrate_disable()
> + * section.
> + *
> + * Per this argument, the change from preempt_disable() to migrate_disable()
> + * gets us:
> + *
> + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
> + *   it would have had to wait for the lower priority task.
> + *
> + * - a lower priority tasks; which under preempt_disable() could've instantly
> + *   migrated away when another CPU becomes available, is now constrained
> + *   by the ability to push the higher priority task away, which might itself be
> + *   in a migrate_disable() section, reducing it's available bandwidth.
> + *
> + * IOW it trades latency / moves the interference term, but it stays in the
> + * system, and as long as it remains unbounded, the system is not fully
> + * deterministic.

The idea makes sense but I'm worried about some implementation details.

Specifically:

	* There's no guarantee the target CPU we're pushing to doesn't have
	  a lower priority task in migration_disabled too. So we could end up
	  having to push the task again. Although unlikely in practice, but as
	  I see it the worst case scenario is unbounded here. The planets could
	  align perfectly for the higher priority task to spend the majority of
	  its time migrating between cpus that have low priority RT tasks in
	  migration_disabled regions.

	  We need to track migration disabled at rq level to fix this.
	  It might be necessary to track the priority levels that are in
	  migration_disabled too :-/

	* Since this is a voluntary migration, I think we should ensure it is
	  restricted to cpus_share_cache() to guarantee the price is minimal
	  and acceptable.

	* The push is done via the stopper task; which will steal run time
	  and could contribute to worst case latency. I think it'd be fine in
	  practice, but PREEMPT_RT folks will know better.

I think the combined effect of above could end up throwing off RT system
designers who could find their high-priority-hard-RT task is missing its
deadline to be nice to lower priority tasks who go often to migration_disabled
regions.

I seem to remember Clark saying in last LPC that few us latency is not unheard
of now.

> +int push_cpu_stop(void *arg)
> +{
> +	struct rq *lowest_rq = NULL, *rq = this_rq();
> +	struct task_struct *p = arg;
> +
> +	raw_spin_lock_irq(&p->pi_lock);
> +	raw_spin_lock(&rq->lock);
> +
> +	if (task_rq(p) != rq)
> +		goto out_unlock;
> +
> +	if (is_migration_disabled(p)) {
> +		p->migration_flags |= MDF_PUSH;
> +		goto out_unlock;
> +	}
> +
> +	p->migration_flags &= ~MDF_PUSH;
> +
> +	if (p->sched_class->find_lock_rq)
> +		lowest_rq = p->sched_class->find_lock_rq(p, rq);
> +
> +	if (!lowest_rq)
> +		goto out_unlock;
> +
> +	// XXX validate p is still the highest prio task

The task_rq(p) could have left the migration_disabled region by now too. If we
track that at rq level we could be able to do last minute check to bale out of
this voluntary push.

I think we should check that the lowest_rq is not in migration_disabled region
too otherwise the same task could end up here again.

Need to think more about it, but we might be able to get away with verifying
task_rq(p)->curr and lowest_rq->curr aren't in migration disabled. The only
worry I can think of now is that rq->curr is a similar task to this one. That
is: a higher priority task that has preempted a migration_disabled region.

Verifying that task_cpu(p) and lowest_rq->cpu are in the same llc will help
avoid a costly migration. After all this is a voluntary migration.

Once we do all these bale outs; we might need to rate limit another PULL
triggering this continuously. Need to dig more into that.

> +	if (task_rq(p) == rq) {
> +		deactivate_task(rq, p, 0);
> +		set_task_cpu(p, lowest_rq->cpu);
> +		activate_task(lowest_rq, p, 0);
> +		resched_curr(lowest_rq);
> +	}
> +
> +	double_unlock_balance(rq, lowest_rq);
> +
> +out_unlock:
> +	rq->push_busy = false;
> +	raw_spin_unlock(&rq->lock);
> +	raw_spin_unlock_irq(&p->pi_lock);
> +
> +	put_task_struct(p);
> +	return 0;
> +}

[...]

> +static inline struct task_struct *get_push_task(struct rq *rq)
> +{
> +	struct task_struct *p = rq->curr;

Shouldn't we verify the class of the task here? The RT task in migration
disabled could have been preempted by a dl or stopper task. Similarly, the dl
task could have been preempted by a stopper task.

I don't think an RT task should be allowed to push a dl task under any
circumstances?

Cheers

--
Qais Yousef

> +
> +	lockdep_assert_held(&rq->lock);
> +
> +	if (rq->push_busy)
> +		return NULL;
> +
> +	if (p->nr_cpus_allowed == 1)
> +		return NULL;
> +
> +	rq->push_busy = true;
> +	return get_task_struct(p);
> +}
> +
> +extern int push_cpu_stop(void *arg);
> +
>  #endif
>  
>  #ifdef CONFIG_CPU_IDLE
> 
> 

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2020-12-26 13:54   ` [PATCH v4 15/19] " Qais Yousef
@ 2021-03-05 14:56     ` Peter Zijlstra
  2021-03-05 15:41       ` Valentin Schneider
  2021-03-05 16:48       ` Qais Yousef
  0 siblings, 2 replies; 81+ messages in thread
From: Peter Zijlstra @ 2021-03-05 14:56 UTC (permalink / raw)
  To: Qais Yousef
  Cc: tglx, mingo, linux-kernel, bigeasy, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> Hi Peter
> 
> Apologies for the late comments on the patch.

Ha!, it seems I too need to apologize for never having actually found
your reply ;-)

> On 10/23/20 12:12, Peter Zijlstra wrote:
> 
> [...]
> 
> > + * When a preempted task becomes elegible to run under the ideal model (IOW it
> > + * becomes one of the M highest priority tasks), it might still have to wait
> > + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > + * a reduction in bandwidth in the exact duration of the migrate_disable()
> > + * section.
> > + *
> > + * Per this argument, the change from preempt_disable() to migrate_disable()
> > + * gets us:
> > + *
> > + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
> > + *   it would have had to wait for the lower priority task.
> > + *
> > + * - a lower priority tasks; which under preempt_disable() could've instantly
> > + *   migrated away when another CPU becomes available, is now constrained
> > + *   by the ability to push the higher priority task away, which might itself be
> > + *   in a migrate_disable() section, reducing it's available bandwidth.
> > + *
> > + * IOW it trades latency / moves the interference term, but it stays in the
> > + * system, and as long as it remains unbounded, the system is not fully
> > + * deterministic.
> 
> The idea makes sense but I'm worried about some implementation details.
> 
> Specifically:
> 
> 	* There's no guarantee the target CPU we're pushing to doesn't have
> 	  a lower priority task in migration_disabled too. So we could end up
> 	  having to push the task again. 

I'm not sure I follow, if the CPU we're pushing to has a
migrate_disable() task of lower priority we'll simply preempt it.

IIRC there's conditions for this push:

 1) we just did migrate_enable();
 2) the task below us also has migrate_disable();
 3) the task below us is actually higher priority than
    the lowest priority task currently running.

So at that point we shoot our high prio task away, and we aim it at the
lowest prio task.

In order to then shoot it away again, someone else needs to block to
make the lower prio task we just preempted eligible again.

Still, possible I suppose.

>		Although unlikely in practice, but as
> 	  I see it the worst case scenario is unbounded here. The planets could
> 	  align perfectly for the higher priority task to spend the majority of
> 	  its time migrating between cpus that have low priority RT tasks in
> 	  migration_disabled regions.

I'm thinking it might be limited by the range of priorities. You need to
drop the prio on every round, and you can't keep on dropping priority
levels, at some point we've reached bottom.

> 	  We need to track migration disabled at rq level to fix this.
> 	  It might be necessary to track the priority levels that are in
> 	  migration_disabled too :-/

As a tie breaker, not sure it's worth it.

> 	* Since this is a voluntary migration, I think we should ensure it is
> 	  restricted to cpus_share_cache() to guarantee the price is minimal
> 	  and acceptable.

That might create conflicting goals wrt the SMP invariant (run the N
highest prio tasks).

> 	* The push is done via the stopper task; which will steal run time
> 	  and could contribute to worst case latency. I think it'd fine in
> 	  practice, but PREEMPT_RT folks will know better.
> 
> I think the combined effect of above could end up throwing off RT system
> designers who could find their high-priority-hard-RT task is missing its
> deadline to be nice to lower priority tasks who go often to migration_disabled
> regions.
> 
> I seem to remember Clark saying in last LPC that few us latency is not unheard
> of now.

Those people that care _that_ much typically set hard affinity for their
tasks.

> > +int push_cpu_stop(void *arg)
> > +{
> > +	struct rq *lowest_rq = NULL, *rq = this_rq();
> > +	struct task_struct *p = arg;
> > +
> > +	raw_spin_lock_irq(&p->pi_lock);
> > +	raw_spin_lock(&rq->lock);
> > +
> > +	if (task_rq(p) != rq)
> > +		goto out_unlock;
> > +
> > +	if (is_migration_disabled(p)) {
> > +		p->migration_flags |= MDF_PUSH;
> > +		goto out_unlock;
> > +	}
> > +
> > +	p->migration_flags &= ~MDF_PUSH;
> > +
> > +	if (p->sched_class->find_lock_rq)
> > +		lowest_rq = p->sched_class->find_lock_rq(p, rq);
> > +
> > +	if (!lowest_rq)
> > +		goto out_unlock;
> > +
> > +	// XXX validate p is still the highest prio task
> 
> The task_rq(p) could have left the migration_disabled region by now too. If we
> track that at rq level we could be able to do last minute check to bale out of
> this voluntary push.
> 
> I think we should check that the lowest_rq is not in migration_disabled region
> too otherwise the same task could end up here again.
> 
> Need to think more about it, but we might be able to get away with verifying
> task_rq(p)->curr and lowest_rq->curr aren't in migration disabled. The only
> worry I can think of now is that rq->curr is a similar task to this one. That
> is: a higher priority task that has preempted a migration_disabled region.
> 
> Verifying that task_cpu(p) and lowest_rq->cpu are in the same llc will help
> avoid a costly migration. After all this is a voluntary migration.
> 
> Once we do all these bale outs; we might need to rate limit another PULL
> triggering this continuously. Need to dig more into that.

So we have:

	CPU0		CPU1

	M-preempted	L-running
	H-running

And normally we'd have pushed M, but we can't since it has
migration_disabled(). Moving H over L is the next best thing.

> > +	if (task_rq(p) == rq) {
> > +		deactivate_task(rq, p, 0);
> > +		set_task_cpu(p, lowest_rq->cpu);
> > +		activate_task(lowest_rq, p, 0);
> > +		resched_curr(lowest_rq);
> > +	}
> > +
> > +	double_unlock_balance(rq, lowest_rq);
> > +
> > +out_unlock:
> > +	rq->push_busy = false;
> > +	raw_spin_unlock(&rq->lock);
> > +	raw_spin_unlock_irq(&p->pi_lock);
> > +
> > +	put_task_struct(p);
> > +	return 0;
> > +}
> 
> [...]
> 
> > +static inline struct task_struct *get_push_task(struct rq *rq)
> > +{
> > +	struct task_struct *p = rq->curr;
> 
> Shouldn't we verify the class of the task here? The RT task in migration
> disabled could have been preempted by a dl or stopper task. Similarly, the dl
> task could have been preempted by a stopper task.
> 
> I don't think an RT task should be allowed to push a dl task under any
> circumstances?

Hmm, quite. Fancy doing a patch?

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2021-03-05 14:56     ` Peter Zijlstra
@ 2021-03-05 15:41       ` Valentin Schneider
  2021-03-05 17:11         ` Qais Yousef
  2021-03-10 14:44         ` Qais Yousef
  2021-03-05 16:48       ` Qais Yousef
  1 sibling, 2 replies; 81+ messages in thread
From: Valentin Schneider @ 2021-03-05 15:41 UTC (permalink / raw)
  To: Peter Zijlstra, Qais Yousef
  Cc: tglx, mingo, linux-kernel, bigeasy, swood, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	bristot, vincent.donnefort, tj, ouwen210

On 05/03/21 15:56, Peter Zijlstra wrote:
> On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
>>
>> > +static inline struct task_struct *get_push_task(struct rq *rq)
>> > +{
>> > +	struct task_struct *p = rq->curr;
>>
>> Shouldn't we verify the class of the task here? The RT task in migration
>> disabled could have been preempted by a dl or stopper task. Similarly, the dl
>> task could have been preempted by a stopper task.
>>
>> I don't think an RT task should be allowed to push a dl task under any
>> circumstances?
>
> Hmm, quite. Fancy doing a patch?

Last time we talked about this, I looked into

  push_rt_task() + find_lowest_rq()

IIRC, with how

  find_lowest_rq() + cpupri_find_fitness()

currently work, find_lowest_rq() should return -1 in push_rt_task() if
rq->curr is DL (CPUPRI_INVALID). IOW, Migration-Disabled RT tasks shouldn't
actually interfere with DL tasks (unless a DL task gets scheduled after we
drop the rq lock and kick the stopper, but we have that problem everywhere
including CFS active balance).


Now, for some blabbering. Re SMP invariant; wouldn't we actually want this
to happen? Consider:

  MD := Migration-Disabled.

  rq
           DL
           RT3
           RT2 (MD)   RT1

  current  DL         RT1        idle
           CPU0       CPU1       CPU2

If we were to ignore MD, the best spread for this would be something
like:

  rq
                                 RT1
           DL         RT3        RT2

  current  DL         RT3        RT2
           CPU0       CPU1       CPU2

Now, with Migration-Disabled we can't move RT2 to CPU2 - it has to stay
on CPU0 for as long as it is Migration-Disabled. Thus, a possible spread
would be:

  rq
           RT1
           RT2 (MD)   DL         RT3

  current  RT2        DL         RT3
           CPU0       CPU1       CPU2

If you look closely, this is exactly the same as the previous spread
modulo CPU numbers. IOW, this is (again) a CPU renumbering exercise.

To respect the aforementioned scheduling invariant, we've had to move
that DL task, and while it does add interference, it's similar as to why we
push higher RT priority tasks to make room for lower RT priority, migration
disabled tasks. You get interference caused by a lower-priority entity for
the sake of your SMP scheduling invariant.

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2021-03-05 14:56     ` Peter Zijlstra
  2021-03-05 15:41       ` Valentin Schneider
@ 2021-03-05 16:48       ` Qais Yousef
  1 sibling, 0 replies; 81+ messages in thread
From: Qais Yousef @ 2021-03-05 16:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On 03/05/21 15:56, Peter Zijlstra wrote:
> On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> > Hi Peter
> > 
> > Apologies for the late comments on the patch.
> 
> Ha!, it seems I too need to apologize for never having actually found
> your reply ;-)

No worries, thanks for taking the time to answer! :-)

> 
> > On 10/23/20 12:12, Peter Zijlstra wrote:
> > 
> > [...]
> > 
> > > + * When a preempted task becomes elegible to run under the ideal model (IOW it
> > > + * becomes one of the M highest priority tasks), it might still have to wait
> > > + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > > + * a reduction in bandwidth in the exact duration of the migrate_disable()
> > > + * section.
> > > + *
> > > + * Per this argument, the change from preempt_disable() to migrate_disable()
> > > + * gets us:
> > > + *
> > > + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
> > > + *   it would have had to wait for the lower priority task.
> > > + *
> > > + * - a lower priority tasks; which under preempt_disable() could've instantly
> > > + *   migrated away when another CPU becomes available, is now constrained
> > > + *   by the ability to push the higher priority task away, which might itself be
> > > + *   in a migrate_disable() section, reducing it's available bandwidth.
> > > + *
> > > + * IOW it trades latency / moves the interference term, but it stays in the
> > > + * system, and as long as it remains unbounded, the system is not fully
> > > + * deterministic.
> > 
> > The idea makes sense but I'm worried about some implementation details.
> > 
> > Specifically:
> > 
> > 	* There's no guarantee the target CPU we're pushing to doesn't have
> > 	  a lower priority task in migration_disabled too. So we could end up
> > 	  having to push the task again. 
> 
> I'm not sure I follow, if the CPU we're pushing to has a
> migrate_disable() task of lower priority we'll simply preempt it.
> 
> IIRC there's conditions for this push:
> 
>  1) we just did migrate_enable();
>  2) the task below us also has migrate_disable();
>  3) the task below us is actually higher priority than
>     the lowest priority task currently running.
> 
> So at that point we shoot our high prio task away, and we aim it at the
> lowest prio task.
> 
> In order to then shoot it away again, someone else needs to block to
> make lower prio task we just preempted elegible again.

Okay. I missed that 3rd condition. I understood only 1 and 2 are required.
So we have to have 3 tasks of different priorities on the rq, the middle being
in migrate_disabled.

It is less of a problem in that case.

> 
> Still, possible I suppose.
> 
> >		Although unlikely in practice, but as
> > 	  I see it the worst case scenario is unbounded here. The planets could
> > 	  align perfectly for the higher priority task to spend the majority of
> > 	  its time migrating between cpus that have low priority RT tasks in
> > 	  migration_disabled regions.
> 
> I'm thinking it might be limited by the range of priorities. You need to
> drop the prio on every round, and you can't keep on dropping priority
> levels, at some point we've reached bottom.

With that 3rd condition in mind, there has to be an element of bad design to
end up with 3 tasks of different priorities on 1 rq that continuously. The
system has to be in some sort of overloaded state, which is a bigger problem to
address first.

> > > +static inline struct task_struct *get_push_task(struct rq *rq)
> > > +{
> > > +	struct task_struct *p = rq->curr;
> > 
> > Shouldn't we verify the class of the task here? The RT task in migration
> > disabled could have been preempted by a dl or stopper task. Similarly, the dl
> > task could have been preempted by a stopper task.
> > 
> > I don't think an RT task should be allowed to push a dl task under any
> > circumstances?
> 
> Hmm, quite. Fancy doing a patch?

I had one. Let me revive and post it next week.

Thanks

--
Qais Yousef

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2021-03-05 15:41       ` Valentin Schneider
@ 2021-03-05 17:11         ` Qais Yousef
  2021-03-10 14:44         ` Qais Yousef
  1 sibling, 0 replies; 81+ messages in thread
From: Qais Yousef @ 2021-03-05 17:11 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort, tj, ouwen210

On 03/05/21 15:41, Valentin Schneider wrote:
> On 05/03/21 15:56, Peter Zijlstra wrote:
> > On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> >>
> >> > +static inline struct task_struct *get_push_task(struct rq *rq)
> >> > +{
> >> > +	struct task_struct *p = rq->curr;
> >>
> >> Shouldn't we verify the class of the task here? The RT task in migration
> >> disabled could have been preempted by a dl or stopper task. Similarly, the dl
> >> task could have been preempted by a stopper task.
> >>
> >> I don't think an RT task should be allowed to push a dl task under any
> >> circumstances?
> >
> > Hmm, quite. Fancy doing a patch?
> 
> Last time we talked about this, I looked into
> 
>   push_rt_task() + find_lowest_rq()
> 
> IIRC, with how
> 
>   find_lowest_rq() + cpupri_find_fitness()
> 
> currently work, find_lowest_rq() should return -1 in push_rt_task() if
> rq->curr is DL (CPUPRI_INVALID). IOW, Migration-Disabled RT tasks shouldn't

[...]

> If you look closely, this is exactly the same as the previous spread
> modulo CPU numbers. IOW, this is (again) a CPU renumbering exercise.

I don't see it as a re-numbering exercise. The way I understand it a system
designer doesn't expect their DL task to move because of an RT task. I think we
should try to keep it this way, that's why I asked.

To be fair, I need to look at the code again and understand where I missed that
3rd condition Peter mentioned.

Thanks

--
Qais Yousef

^ permalink raw reply	[flat|nested] 81+ messages in thread

* Re: [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing
  2021-03-05 15:41       ` Valentin Schneider
  2021-03-05 17:11         ` Qais Yousef
@ 2021-03-10 14:44         ` Qais Yousef
  1 sibling, 0 replies; 81+ messages in thread
From: Qais Yousef @ 2021-03-10 14:44 UTC (permalink / raw)
  To: Valentin Schneider, Peter Zijlstra (Intel)
  Cc: tglx, mingo, linux-kernel, bigeasy, swood, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	bristot, vincent.donnefort, tj, ouwen210

On 03/05/21 15:41, Valentin Schneider wrote:
> On 05/03/21 15:56, Peter Zijlstra wrote:
> > On Sat, Dec 26, 2020 at 01:54:45PM +0000, Qais Yousef wrote:
> >>
> >> > +static inline struct task_struct *get_push_task(struct rq *rq)
> >> > +{
> >> > +	struct task_struct *p = rq->curr;
> >>
> >> Shouldn't we verify the class of the task here? The RT task in migration
> >> disabled could have been preempted by a dl or stopper task. Similarly, the dl
> >> task could have been preempted by a stopper task.
> >>
> >> I don't think an RT task should be allowed to push a dl task under any
> >> circumstances?
> >
> > Hmm, quite. Fancy doing a patch?
> 
> Last time we talked about this, I looked into
> 
>   push_rt_task() + find_lowest_rq()
> 
> IIRC, with how
> 
>   find_lowest_rq() + cpupri_find_fitness()
> 
> currently work, find_lowest_rq() should return -1 in push_rt_task() if
> rq->curr is DL (CPUPRI_INVALID). IOW, Migration-Disabled RT tasks shouldn't
> actually interfere with DL tasks (unless a DL task gets scheduled after we
> drop the rq lock and kick the stopper, but we have that problem everywhere
> including CFS active balance).

This makes it less of a problem, true, but AFAICT this can still happen in the
pull path.

Anyways, here's the patch as extra bolts and braces to be considered.

Thanks

--
Qais Yousef

--->8----

From 2df733d381f636cc185944c7eda86c824a9a785e Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Tue, 12 Jan 2021 11:54:16 +0000
Subject: [PATCH] sched: Don't push a higher priority class in get_push_task()

Commit a7c81556ec4d ("sched: Fix migrate_disable() vs rt/dl balancing")
will attempt to push/pull a higher priority task if the candidate task
is in migrate_disable() section. This is an attempt to prevent
starvation of these lower priority tasks that, in theory at least, could
end up in a situation where they're forever in migrate disable section
with no CPU time to run.

One issue with that is get_push_task() assumes rq->curr is of the same
sched_class, which AFAICT is not guaranteed to be true.

This patch adds extra bolts and braces to ensure that this voluntary
push operation is performed on a task of the same scheduling class only.

Otherwise an RT task could end up causing a DL task to be pushed away.
Which breaks the strict priority between sched classes.

We could also end up trying to push the migration task. Which I think is
harmless and is nothing but a wasted effort.

Fixes: a7c81556ec4d ("sched: Fix migrate_disable() vs rt/dl balancing")
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
---
 kernel/sched/deadline.c |  2 +-
 kernel/sched/rt.c       |  4 ++--
 kernel/sched/sched.h    | 17 ++++++++++++++++-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aac3539aa0fe..afadc7e1f968 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2276,7 +2276,7 @@ static void pull_dl_task(struct rq *this_rq)
 				goto skip;
 
 			if (is_migration_disabled(p)) {
-				push_task = get_push_task(src_rq);
+				push_task = get_push_task(src_rq, SCHED_DEADLINE);
 			} else {
 				deactivate_task(src_rq, p, 0);
 				set_task_cpu(p, this_cpu);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8f720b71d13d..c2c5c08e3030 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1892,7 +1892,7 @@ static int push_rt_task(struct rq *rq, bool pull)
 		 * to this other CPU, instead attempt to push the current
 		 * running task on this CPU away.
 		 */
-		push_task = get_push_task(rq);
+		push_task = get_push_task(rq, SCHED_FIFO);
 		if (push_task) {
 			raw_spin_unlock(&rq->lock);
 			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
@@ -2225,7 +2225,7 @@ static void pull_rt_task(struct rq *this_rq)
 				goto skip;
 
 			if (is_migration_disabled(p)) {
-				push_task = get_push_task(src_rq);
+				push_task = get_push_task(src_rq, SCHED_FIFO);
 			} else {
 				deactivate_task(src_rq, p, 0);
 				set_task_cpu(p, this_cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..4e156f008d22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1954,12 +1954,27 @@ extern void trigger_load_balance(struct rq *rq);
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
-static inline struct task_struct *get_push_task(struct rq *rq)
+static inline struct task_struct *get_push_task(struct rq *rq, int policy)
 {
 	struct task_struct *p = rq->curr;
 
 	lockdep_assert_held(&rq->lock);
 
+	switch(policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		if (!rt_task(p))
+			return NULL;
+		break;
+	case SCHED_DEADLINE:
+		if (!dl_task(p))
+			return NULL;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
 	if (rq->push_busy)
 		return NULL;
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 81+ messages in thread

end of thread, other threads:[~2021-03-10 14:45 UTC | newest]

Thread overview: 81+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-23 10:11 [PATCH v4 00/19] sched: Migrate disable support Peter Zijlstra
2020-10-23 10:11 ` [PATCH v4 01/19] stop_machine: Add function and caller debug info Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 02/19] sched: Fix balance_callback() Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-11-11 20:30     ` Paul Bolle
2020-11-11 20:45       ` Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 03/19] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 04/19] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Thomas Gleixner
2020-10-23 10:12 ` [PATCH v4 05/19] workqueue: Manually break affinity " Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 06/19] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Thomas Gleixner
2020-10-23 10:12 ` [PATCH v4 07/19] sched: Fix hotplug vs CPU bandwidth control Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 08/19] sched: Massage set_cpus_allowed() Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 09/19] sched: Add migrate_disable() Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-11-12 16:38   ` [PATCH v4 10/19] " Qian Cai
2020-11-12 17:26     ` Valentin Schneider
2020-11-12 18:01       ` Qian Cai
2020-11-12 19:31         ` Valentin Schneider
2020-11-12 19:41           ` Qian Cai
2020-11-12 20:37           ` Qian Cai
2020-11-12 21:26             ` Valentin Schneider
2020-11-13 10:27           ` Peter Zijlstra
2020-11-12 18:35       ` Qian Cai
2020-11-20 12:34     ` [tip: sched/core] sched/core: Add missing completion for affine_move_task() waiters tip-bot2 for Valentin Schneider
2020-10-23 10:12 ` [PATCH v4 11/19] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
2020-10-29 16:27   ` Valentin Schneider
2020-10-29 17:34     ` Peter Zijlstra
2020-10-29 17:55       ` Valentin Schneider
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Thomas Gleixner
2020-11-13 15:06   ` [PATCH v4 11/19] " Qian Cai
2020-11-17 19:28     ` Valentin Schneider
2020-11-18 14:44       ` Qian Cai
2020-11-23 18:13         ` Sebastian Andrzej Siewior
2020-12-02 21:59           ` Qian Cai
2020-12-03 12:31           ` Qian Cai
2020-12-04  0:23       ` Qian Cai
2020-12-04 21:19       ` Qian Cai
2020-12-05 18:37         ` Valentin Schneider
2020-12-06  1:17           ` Qian Cai
2020-12-07 19:27         ` Valentin Schneider
2020-12-08 13:46           ` Qian Cai
2020-12-09 19:16             ` Valentin Schneider
2020-10-23 10:12 ` [PATCH v4 12/19] sched,rt: Use cpumask_any*_distribute() Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 13/19] sched,rt: Use the full cpumask for balancing Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 14/19] sched, lockdep: Annotate ->pi_lock recursion Peter Zijlstra
2020-10-29 16:27   ` Valentin Schneider
2020-10-29 17:38     ` Peter Zijlstra
2020-10-29 18:09       ` Valentin Schneider
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 15/19] sched: Fix migrate_disable() vs rt/dl balancing Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-12-26 13:54   ` [PATCH v4 15/19] " Qais Yousef
2021-03-05 14:56     ` Peter Zijlstra
2021-03-05 15:41       ` Valentin Schneider
2021-03-05 17:11         ` Qais Yousef
2021-03-10 14:44         ` Qais Yousef
2021-03-05 16:48       ` Qais Yousef
2020-10-23 10:12 ` [PATCH v4 16/19] sched/proc: Print accurate cpumask vs migrate_disable() Peter Zijlstra
2020-11-11  8:23   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 17/19] sched: Add migrate_disable() tracepoints Peter Zijlstra
2020-10-29 16:27   ` Valentin Schneider
2020-10-29 17:43     ` Peter Zijlstra
2020-10-29 17:56       ` Valentin Schneider
2020-10-29 17:59         ` Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 18/19] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() Peter Zijlstra
2020-10-23 10:12 ` [PATCH v4 19/19] sched: Comment affine_move_task() Peter Zijlstra
2020-10-29 16:27   ` Valentin Schneider
2020-10-29 17:44     ` Peter Zijlstra
2020-10-29 19:03 ` [PATCH v4 00/19] sched: Migrate disable support Valentin Schneider
2020-11-09 16:39 ` Daniel Bristot de Oliveira

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).