All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/9] sched: Migrate disable support
@ 2020-09-21 16:35 Peter Zijlstra
  2020-09-21 16:35 ` [PATCH 1/9] stop_machine: Add function and caller debug info Peter Zijlstra
                   ` (10 more replies)
  0 siblings, 11 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:35 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

Hi,

Here's my take on migrate_disable(). It avoids growing a second means of
changing the affinity, and documents how the thing violates locking rules but
still mostly works.

It also avoids blocking completely, so no more futex band-aids required.

Also, no more atomics/locks on the fast path.

I also put in a comment that explains how the whole concept is fundamentally
flawed but a necessary evil to get PREEMPT_RT going -- for now.

Somewhat tested with PREEMPT_RT.

---
 include/linux/cpuhotplug.h    |    1 
 include/linux/preempt.h       |   60 +++
 include/linux/sched.h         |    4 
 include/linux/sched/hotplug.h |    2 
 include/linux/stop_machine.h  |    5 
 kernel/cpu.c                  |    9 
 kernel/sched/core.c           |  673 +++++++++++++++++++++++++++++++-----------
 kernel/sched/deadline.c       |    5 
 kernel/sched/sched.h          |   24 +
 kernel/stop_machine.c         |   23 +
 lib/dump_stack.c              |    2 
 lib/smp_processor_id.c        |    5 
 12 files changed, 635 insertions(+), 178 deletions(-)


^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 1/9] stop_machine: Add function and caller debug info
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
@ 2020-09-21 16:35 ` Peter Zijlstra
  2020-09-21 16:35 ` [PATCH 2/9] sched: Fix balance_callback() Peter Zijlstra
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:35 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

Crashes in stop-machine are hard to connect to the calling code, add a
little something to help with that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/stop_machine.h |    5 +++++
 kernel/stop_machine.c        |   23 ++++++++++++++++++++---
 lib/dump_stack.c             |    2 ++
 3 files changed, 27 insertions(+), 3 deletions(-)

--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
+	unsigned long		caller;
 	void			*arg;
 	struct cpu_stop_done	*done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
 
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else	/* CONFIG_SMP */
 
 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(u
 	return false;
 }
 
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif	/* CONFIG_SMP */
 
 /*
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -42,11 +42,23 @@ struct cpu_stopper {
 	struct list_head	works;		/* list of pending works */
 
 	struct cpu_stop_work	stop_work;	/* for stop_cpus */
+	unsigned long		caller;
+	cpu_stop_fn_t		fn;
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
 static bool stop_machine_initialized = false;
 
+void print_stop_info(const char *log_lvl, struct task_struct *task)
+{
+	struct cpu_stopper *stopper = this_cpu_ptr(&cpu_stopper);
+
+	if (task != stopper->thread)
+		return;
+
+	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
+}
+
 /* static data for stop_cpus */
 static DEFINE_MUTEX(stop_cpus_mutex);
 static bool stop_cpus_in_progress;
@@ -123,7 +135,7 @@ static bool cpu_stop_queue_work(unsigned
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 {
 	struct cpu_stop_done done;
-	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
 
 	cpu_stop_init_done(&done, 1);
 	if (!cpu_stop_queue_work(cpu, &work))
@@ -331,7 +343,8 @@ int stop_two_cpus(unsigned int cpu1, uns
 	work1 = work2 = (struct cpu_stop_work){
 		.fn = multi_cpu_stop,
 		.arg = &msdata,
-		.done = &done
+		.done = &done,
+		.caller = _RET_IP_,
 	};
 
 	cpu_stop_init_done(&done, 2);
@@ -367,7 +380,7 @@ int stop_two_cpus(unsigned int cpu1, uns
 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			struct cpu_stop_work *work_buf)
 {
-	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
 	return cpu_stop_queue_work(cpu, work_buf);
 }
 
@@ -487,6 +500,8 @@ static void cpu_stopper_thread(unsigned
 		int ret;
 
 		/* cpu stop callbacks must not sleep, make in_atomic() == T */
+		stopper->caller = work->caller;
+		stopper->fn = fn;
 		preempt_count_inc();
 		ret = fn(arg);
 		if (done) {
@@ -495,6 +510,8 @@ static void cpu_stopper_thread(unsigned
 			cpu_stop_signal_done(done);
 		}
 		preempt_count_dec();
+		stopper->fn = NULL;
+		stopper->caller = 0;
 		WARN_ONCE(preempt_count(),
 			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
 		goto repeat;
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -12,6 +12,7 @@
 #include <linux/atomic.h>
 #include <linux/kexec.h>
 #include <linux/utsname.h>
+#include <linux/stop_machine.h>
 
 static char dump_stack_arch_desc_str[128];
 
@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *l
 		       log_lvl, dump_stack_arch_desc_str);
 
 	print_worker_info(log_lvl, current);
+	print_stop_info(log_lvl, current);
 }
 
 /**



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 2/9] sched: Fix balance_callback()
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
  2020-09-21 16:35 ` [PATCH 1/9] stop_machine: Add function and caller debug info Peter Zijlstra
@ 2020-09-21 16:35 ` Peter Zijlstra
  2020-09-23 14:08   ` Thomas Gleixner
  2020-09-21 16:36 ` [PATCH 3/9] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:35 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme;
balance_callback() was called after rq->lock was dropped, which means
another CPU can interleave and touch the callback list.

Rework code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.

Reported-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  119 ++++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    2 
 2 files changed, 77 insertions(+), 44 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3489,6 +3489,69 @@ static inline void finish_task(struct ta
 #endif
 }
 
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	void (*func)(struct rq *rq);
+	struct callback_head *next;
+
+	lockdep_assert_held(&rq->lock);
+
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+
+		func(rq);
+	}
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	struct callback_head *head = rq->balance_callback;
+
+	lockdep_assert_held(&rq->lock);
+	if (head)
+		rq->balance_callback = NULL;
+
+	return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+	do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+	unsigned long flags;
+
+	if (unlikely(head)) {
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		do_balance_callbacks(rq, head);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -3514,6 +3577,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+	__balance_callbacks(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -3655,43 +3719,6 @@ static struct rq *finish_task_switch(str
 	return rq;
 }
 
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
-	struct callback_head *head, *next;
-	void (*func)(struct rq *rq);
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	head = rq->balance_callback;
-	rq->balance_callback = NULL;
-	while (head) {
-		func = (void (*)(struct rq *))head->func;
-		next = head->next;
-		head->next = NULL;
-		head = next;
-
-		func(rq);
-	}
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
-	if (unlikely(rq->balance_callback))
-		__balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -3711,7 +3738,6 @@ asmlinkage __visible void schedule_tail(
 	 */
 
 	rq = finish_task_switch(prev);
-	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -4527,10 +4553,11 @@ static void __sched notrace __schedule(b
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-		rq_unlock_irq(rq, &rf);
-	}
 
-	balance_callback(rq);
+		rq_unpin_lock(rq, &rf);
+		__balance_callbacks(rq);
+		raw_spin_unlock_irq(&rq->lock);
+	}
 }
 
 void __noreturn do_task_dead(void)
@@ -4941,9 +4968,11 @@ void rt_mutex_setprio(struct task_struct
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
-	__task_rq_unlock(rq, &rf);
 
-	balance_callback(rq);
+	rq_unpin_lock(rq, &rf);
+	__balance_callbacks(rq);
+	raw_spin_unlock(&rq->lock);
+
 	preempt_enable();
 }
 #else
@@ -5217,6 +5246,7 @@ static int __sched_setscheduler(struct t
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int new_effective_prio, policy = attr->sched_policy;
 	const struct sched_class *prev_class;
+	struct callback_head *head;
 	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5455,6 +5485,7 @@ static int __sched_setscheduler(struct t
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
+	head = splice_balance_callbacks(rq);
 	task_rq_unlock(rq, p, &rf);
 
 	if (pi) {
@@ -5463,7 +5494,7 @@ static int __sched_setscheduler(struct t
 	}
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
-	balance_callback(rq);
+	balance_callbacks(rq, head);
 	preempt_enable();
 
 	return 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1220,6 +1220,8 @@ static inline void rq_pin_lock(struct rq
 #ifdef CONFIG_SCHED_DEBUG
 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 	rf->clock_update_flags = 0;
+
+	SCHED_WARN_ON(rq->balance_callback);
 #endif
 }
 



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 3/9] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
  2020-09-21 16:35 ` [PATCH 1/9] stop_machine: Add function and caller debug info Peter Zijlstra
  2020-09-21 16:35 ` [PATCH 2/9] sched: Fix balance_callback() Peter Zijlstra
@ 2020-09-21 16:36 ` Peter Zijlstra
  2020-09-25 16:38   ` Dietmar Eggemann
  2020-09-21 16:36 ` [PATCH 4/9] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:36 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is run (as one of the very first steps) from the cpu-hotplug
task, which is a per-cpu kthread, and completion of the hotplug
operation only requires such tasks.

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the unlikely(rq->balance_callbacks) test at the tail of
context_switch with an unlikely(rq->balance_work), the fast path is
not affected.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |  114 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    5 ++
 2 files changed, 117 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3513,8 +3513,10 @@ static inline struct callback_head *spli
 	struct callback_head *head = rq->balance_callback;
 
 	lockdep_assert_held(&rq->lock);
-	if (head)
+	if (head) {
 		rq->balance_callback = NULL;
+		rq->balance_flags &= ~BALANCE_WORK;
+	}
 
 	return head;
 }
@@ -3535,6 +3537,22 @@ static inline void balance_callbacks(str
 	}
 }
 
+static bool balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+	if (unlikely(rq->balance_flags)) {
+		/*
+		 * Run the balance_callbacks, except on hotplug
+		 * when we need to push the current task away.
+		 */
+		if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+		    !(rq->balance_flags & BALANCE_PUSH) ||
+		    !balance_push(rq))
+			__balance_callbacks(rq);
+	}
+}
+
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
@@ -3550,6 +3568,10 @@ static inline void balance_callbacks(str
 {
 }
 
+static inline void balance_switch(struct rq *rq)
+{
+}
+
 #endif
 
 static inline void
@@ -3577,7 +3599,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	balance_switch(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6836,6 +6858,89 @@ static void migrate_tasks(struct rq *dea
 
 	rq->stop = stop;
 }
+
+static int __balance_push_cpu_stop(void *arg)
+{
+	struct task_struct *p = arg;
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+	int cpu;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	rq_lock(rq, &rf);
+
+	update_rq_clock(rq);
+
+	if (task_rq(p) == rq && task_on_rq_queued(p)) {
+		cpu = select_fallback_rq(rq->cpu, p);
+		rq = __migrate_task(rq, &rf, p, cpu);
+	}
+
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static bool balance_push(struct rq *rq)
+{
+	struct task_struct *push_task = rq->curr;
+
+	lockdep_assert_held(&rq->lock);
+	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+	/*
+	 * Both the cpu-hotplug and stop task are in this case and are
+	 * required to complete the hotplug process.
+	 */
+	if (is_per_cpu_kthread(push_task))
+		return false;
+
+	get_task_struct(push_task);
+	/*
+	 * Temporarily drop rq->lock such that we can wake-up the stop task.
+	 * Both preemption and IRQs are still disabled.
+	 */
+	raw_spin_unlock(&rq->lock);
+	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+			    this_cpu_ptr(&push_work));
+	/*
+	 * At this point need_resched() is true and we'll take the loop in
+	 * schedule(). The next pick is obviously going to be the stop task
+	 * which is_per_cpu_kthread() and will push this task away.
+	 */
+	raw_spin_lock(&rq->lock);
+
+	return true;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (on)
+		rq->balance_flags |= BALANCE_PUSH;
+	else
+		rq->balance_flags &= ~BALANCE_PUSH;
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline bool balance_push(struct rq *rq)
+{
+	return false;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6921,6 +7026,8 @@ int sched_cpu_activate(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
+	balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -6968,6 +7075,8 @@ int sched_cpu_deactivate(unsigned int cp
 	 */
 	synchronize_rcu();
 
+	balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going down, decrement the number of cores with SMT present.
@@ -6981,6 +7090,7 @@ int sched_cpu_deactivate(unsigned int cp
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
+		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
 	unsigned long		cpu_capacity_orig;
 
 	struct callback_head	*balance_callback;
+	unsigned char		balance_flags;
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
@@ -1384,6 +1385,9 @@ init_numa_balancing(unsigned long clone_
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK	0x01
+#define BALANCE_PUSH	0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct callback_head *head,
@@ -1397,6 +1401,7 @@ queue_balance_callback(struct rq *rq,
 	head->func = (void (*)(struct callback_head *))func;
 	head->next = rq->balance_callback;
 	rq->balance_callback = head;
+	rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 4/9] sched/core: Wait for tasks being pushed away on hotplug
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-09-21 16:36 ` [PATCH 3/9] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-09-21 16:36 ` Peter Zijlstra
  2020-09-21 16:36 ` [PATCH 5/9] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:36 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

From: Thomas Gleixner <tglx@linutronix.de>

RT kernels need to ensure that all tasks which are not per CPU kthreads
have left the outgoing CPU to guarantee that no tasks are force migrated
within a migrate disabled section.

There is also some desire to (ab)use fine grained CPU hotplug control to
clear a CPU from active state to force migrate tasks which are not per CPU
kthreads away for power control purposes.

Add a mechanism which waits until all tasks which should leave the CPU
after the CPU active flag is cleared have moved to a different online CPU.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   38 +++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    4 ++++
 2 files changed, 41 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2588,6 +2588,20 @@ void sched_ttwu_pending(void *arg)
 	rq_unlock_irqrestore(rq, &rf);
 }
 
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+	struct rq *rq = this_rq();
+
+	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+			   TASK_UNINTERRUPTIBLE);
+}
+
 void send_call_function_single_ipi(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -6898,8 +6912,21 @@ static bool balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task))
+	if (is_per_cpu_kthread(push_task)) {
+		/*
+		 * If this is the idle task on the outgoing CPU try to wake
+		 * up the hotplug control thread which might wait for the
+		 * last task to vanish. The rcuwait_active() check is
+		 * accurate here because the waiter is pinned on this CPU
+		 * and can't obviously be running in parallel.
+		 */
+		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+			raw_spin_unlock(&rq->lock);
+			rcuwait_wake_up(&rq->hotplug_wait);
+			raw_spin_lock(&rq->lock);
+		}
 		return false;
+	}
 
 	get_task_struct(push_task);
 	/*
@@ -6939,6 +6966,8 @@ static inline bool balance_push(struct r
 	return false;
 }
 
+static inline void balance_hotplug_wait(void) { }
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -7093,6 +7122,10 @@ int sched_cpu_deactivate(unsigned int cp
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+
+	/* Wait for all non per CPU kernel threads to vanish. */
+	balance_hotplug_wait();
+
 	return 0;
 }
 
@@ -7333,6 +7366,9 @@ void __init sched_init(void)
 
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+		rcuwait_init(&rq->hotplug_wait);
+#endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1004,6 +1004,10 @@ struct rq {
 
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	struct rcuwait		hotplug_wait;
+#endif
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 5/9] sched/hotplug: Consolidate task migration on CPU unplug
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-09-21 16:36 ` [PATCH 4/9] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
@ 2020-09-21 16:36 ` Peter Zijlstra
  2020-10-01 17:12   ` Vincent Donnefort
  2020-09-21 16:36 ` [PATCH 6/9] sched: Massage set_cpus_allowed Peter Zijlstra
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:36 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

From: Thomas Gleixner <tglx@linutronix.de>

With the new mechanism which kicks tasks off the outgoing CPU at the end of
schedule() the situation on an outgoing CPU right before the stopper thread
brings it down completely is:

 - All user tasks and all unbound kernel threads have either been migrated
   away or are not running and the next wakeup will move them to an online CPU.

 - All per CPU kernel threads, except cpu hotplug thread and the stopper
   thread have either been unbound or parked by the responsible CPU hotplug
   callback.

That means that at the last step before the stopper thread is invoked the
cpu hotplug thread is the last legitimate running task on the outgoing
CPU.

Add a final wait step right before the stopper thread is kicked which
ensures that any still running tasks on the way to park or on the way to
kick themselves off the CPU are either sleeping or gone.

This allows to remove the migrate_tasks() crutch in sched_cpu_dying(). If
sched_cpu_dying() detects that there is still another running task aside of
the stopper thread then it will explode with the appropriate fireworks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpuhotplug.h    |    1 
 include/linux/sched/hotplug.h |    2 
 kernel/cpu.c                  |    9 ++
 kernel/sched/core.c           |  150 +++++++++---------------------------------
 4 files changed, 46 insertions(+), 116 deletions(-)

--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,6 +152,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned i
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty	NULL
 # define sched_cpu_dying	NULL
 #endif
 
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states
 		.name			= "ap:online",
 	},
 	/*
-	 * Handled on controll processor until the plugged processor manages
+	 * Handled on control processor until the plugged processor manages
 	 * this itself.
 	 */
 	[CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states
 		.teardown.single	= takedown_cpu,
 		.cant_stop		= true,
 	},
+
+	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
+		.name			= "sched:waitempty",
+		.startup.single		= NULL,
+		.teardown.single	= sched_cpu_wait_empty,
+	},
+
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6759,120 +6759,6 @@ void idle_task_exit(void)
 	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
-	long delta = calc_load_fold_active(rq, 1);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-}
-
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
-	const struct sched_class *class;
-	struct task_struct *next;
-
-	for_each_class(class) {
-		next = class->pick_next_task(rq);
-		if (next) {
-			next->sched_class->put_prev_task(rq, next);
-			return next;
-		}
-	}
-
-	/* The idle class should always have a runnable task */
-	BUG();
-}
-
-/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
-{
-	struct rq *rq = dead_rq;
-	struct task_struct *next, *stop = rq->stop;
-	struct rq_flags orf = *rf;
-	int dest_cpu;
-
-	/*
-	 * Fudge the rq selection such that the below task selection loop
-	 * doesn't get stuck on the currently eligible stop task.
-	 *
-	 * We're currently inside stop_machine() and the rq is either stuck
-	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
-	 * either way we should never end up calling schedule() until we're
-	 * done here.
-	 */
-	rq->stop = NULL;
-
-	/*
-	 * put_prev_task() and pick_next_task() sched
-	 * class method both need to have an up-to-date
-	 * value of rq->clock[_task]
-	 */
-	update_rq_clock(rq);
-
-	for (;;) {
-		/*
-		 * There's this thread running, bail when that's the only
-		 * remaining thread:
-		 */
-		if (rq->nr_running == 1)
-			break;
-
-		next = __pick_migrate_task(rq);
-
-		/*
-		 * Rules for changing task_struct::cpus_mask are holding
-		 * both pi_lock and rq->lock, such that holding either
-		 * stabilizes the mask.
-		 *
-		 * Drop rq->lock is not quite as disastrous as it usually is
-		 * because !cpu_active at this point, which means load-balance
-		 * will not interfere. Also, stop-machine.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&next->pi_lock);
-		rq_relock(rq, rf);
-
-		/*
-		 * Since we're inside stop-machine, _nothing_ should have
-		 * changed the task, WARN if weird stuff happened, because in
-		 * that case the above rq->lock drop is a fail too.
-		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
-			raw_spin_unlock(&next->pi_lock);
-			continue;
-		}
-
-		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-		rq = __migrate_task(rq, rf, next, dest_cpu);
-		if (rq != dead_rq) {
-			rq_unlock(rq, rf);
-			rq = dead_rq;
-			*rf = orf;
-			rq_relock(rq, rf);
-		}
-		raw_spin_unlock(&next->pi_lock);
-	}
-
-	rq->stop = stop;
-}
-
 static int __balance_push_cpu_stop(void *arg)
 {
 	struct task_struct *p = arg;
@@ -7145,6 +7031,41 @@ int sched_cpu_starting(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+	balance_hotplug_wait();
+	return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+	long delta = calc_load_fold_active(rq, 1);
+
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7158,7 +7079,6 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 6/9] sched: Massage set_cpus_allowed
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (4 preceding siblings ...)
  2020-09-21 16:36 ` [PATCH 5/9] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
@ 2020-09-21 16:36 ` Peter Zijlstra
  2020-09-23 14:07   ` Thomas Gleixner
  2020-09-21 16:36 ` [PATCH 7/9] sched: Add migrate_disable() Peter Zijlstra
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:36 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

Thread a u32 flags word through the *set_cpus_allowed*() callchain.
This will allow adding behavioural tweaks for future users.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   28 ++++++++++++++++++----------
 kernel/sched/deadline.c |    5 +++--
 kernel/sched/sched.h    |    7 +++++--
 3 files changed, 26 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1828,13 +1828,14 @@ static int migration_cpu_stop(void *data
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
  */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
 	struct rq *rq = task_rq(p);
 	bool queued, running;
@@ -1855,7 +1856,7 @@ void do_set_cpus_allowed(struct task_str
 	if (running)
 		put_prev_task(rq, p);
 
-	p->sched_class->set_cpus_allowed(p, new_mask);
+	p->sched_class->set_cpus_allowed(p, new_mask, flags);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1863,6 +1864,11 @@ void do_set_cpus_allowed(struct task_str
 		set_next_task(rq, p);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask, 0);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1873,7 +1879,8 @@ void do_set_cpus_allowed(struct task_str
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask, bool check)
+				  const struct cpumask *new_mask,
+				  u32 flags)
 {
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	unsigned int dest_cpu;
@@ -1895,7 +1902,7 @@ static int __set_cpus_allowed_ptr(struct
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
 	 */
-	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1914,7 +1921,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	do_set_cpus_allowed(p, new_mask);
+	__do_set_cpus_allowed(p, new_mask, flags);
 
 	if (p->flags & PF_KTHREAD) {
 		/*
@@ -1951,7 +1958,7 @@ static int __set_cpus_allowed_ptr(struct
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	return __set_cpus_allowed_ptr(p, new_mask, false);
+	return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
@@ -2410,7 +2417,8 @@ void sched_set_stop_task(int cpu, struct
 #else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-					 const struct cpumask *new_mask, bool check)
+					 const struct cpumask *new_mask,
+					 u32 flags)
 {
 	return set_cpus_allowed_ptr(p, new_mask);
 }
@@ -6025,7 +6033,7 @@ long sched_setaffinity(pid_t pid, const
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -6608,7 +6616,7 @@ void init_idle(struct task_struct *idle,
 	 *
 	 * And since this is boot we can forgo the serialization.
 	 */
-	set_cpus_allowed_common(idle, cpumask_of(cpu));
+	set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
 #endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2280,7 +2280,8 @@ static void task_woken_dl(struct rq *rq,
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
-				const struct cpumask *new_mask)
+				const struct cpumask *new_mask,
+				u32 flags)
 {
 	struct root_domain *src_rd;
 	struct rq *rq;
@@ -2309,7 +2310,7 @@ static void set_cpus_allowed_dl(struct t
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
-	set_cpus_allowed_common(p, new_mask);
+	set_cpus_allowed_common(p, new_mask, flags);
 }
 
 /* Assumes rq->lock is held */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1806,7 +1806,8 @@ struct sched_class {
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
-				 const struct cpumask *newmask);
+				 const struct cpumask *newmask,
+				 u32 flags);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
@@ -1899,7 +1900,9 @@ extern void update_group_capacity(struct
 
 extern void trigger_load_balance(struct rq *rq);
 
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+#define SCA_CHECK		0x01
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
 #endif
 



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 7/9] sched: Add migrate_disable()
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (5 preceding siblings ...)
  2020-09-21 16:36 ` [PATCH 6/9] sched: Massage set_cpus_allowed Peter Zijlstra
@ 2020-09-21 16:36 ` Peter Zijlstra
  2020-09-21 19:16   ` Thomas Gleixner
                     ` (2 more replies)
  2020-09-21 16:36 ` [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
                   ` (3 subsequent siblings)
  10 siblings, 3 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:36 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

Add the base migrate_disable() support (under protest).

While migrate_disable() is (currently) required for PREEMPT_RT, it is
also one of the biggest flaws in the system.

Notably this is just the base implementation, it is broken vs
sched_setaffinity() and hotplug, both solved in additional patches for
ease of review.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/preempt.h |   60 ++++++++++++++++++++++++
 include/linux/sched.h   |    3 +
 kernel/sched/core.c     |  116 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |    2 
 lib/smp_processor_id.c  |    5 ++
 5 files changed, 179 insertions(+), 7 deletions(-)

--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,6 +322,64 @@ static inline void preempt_notifier_init
 
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+/*
+ * Migrate-Disable and why it is (strongly) undesired.
+ *
+ * The premise of the Real-Time schedulers we have on Linux
+ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+ * concurrently, provided there are sufficient runnable tasks, also known as
+ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+ *
+ * The correctness of various scheduling models depends on this, but it is
+ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+ * preempt_disable() implies an immediate priority ceiling, preemptible
+ * migrate_disable() allows nesting.
+ *
+ * The worst case is that all tasks preempt one another in a migrate_disable()
+ * region and stack on a single CPU. This then reduces the available bandwidth
+ * to a single CPU. And since Real-Time schedulability theory considers the
+ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+ * (instantly solving the SMP analysis problem).
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migrate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on arbitrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depend on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
+ *
+ */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else /* !(CONFIG_SMP && CONFIG_PREEMPT_RT) */
+
 /**
  * migrate_disable - Prevent migration of the current task
  *
@@ -352,4 +410,6 @@ static __always_inline void migrate_enab
 	preempt_enable();
 }
 
+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
+
 #endif /* __LINUX_PREEMPT_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,9 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	int				migration_disabled;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1700,6 +1700,68 @@ void check_preempt_curr(struct rq *rq, s
 
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_PREEMPT_RT
+
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask,
+				  u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+	if (likely(!p->migration_disabled))
+		return;
+
+	if (p->cpus_ptr != &p->cpus_mask)
+		return;
+
+	/*
+	 * Violates locking rules! see comment in __do_set_cpus_allowed().
+	 */
+	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+	if (current->migration_disabled++)
+		return;
+
+	barrier();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+	if (--current->migration_disabled)
+		return;
+
+	barrier();
+
+	if (p->cpus_ptr == &p->cpus_mask)
+		return;
+
+	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return p->migration_disabled;
+}
+
+#else
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+	return false;
+}
+
+#endif
+
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1709,7 +1771,7 @@ static inline bool is_cpu_allowed(struct
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1830,8 +1892,19 @@ static int migration_cpu_stop(void *data
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
-	cpumask_copy(&p->cpus_mask, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
+	if (flags & SCA_MIGRATE_DISABLE) {
+		p->cpus_ptr = new_mask;
+		p->nr_cpus_allowed = 1;
+		return;
+	}
+
+	if (flags & SCA_MIGRATE_ENABLE)
+		p->cpus_ptr = &p->cpus_mask;
+	else
+		cpumask_copy(&p->cpus_mask, new_mask);
+
+	if (p->cpus_ptr == &p->cpus_mask)
+		p->nr_cpus_allowed = cpumask_weight(p->cpus_ptr);
 }
 
 static void
@@ -1840,7 +1913,22 @@ __do_set_cpus_allowed(struct task_struct
 	struct rq *rq = task_rq(p);
 	bool queued, running;
 
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * This here violates the locking rules for affinity, since we're only
+	 * supposed to change these variables while holding both rq->lock and
+	 * p->pi_lock.
+	 *
+	 * HOWEVER, it magically works, because ttwu() is the only code that
+	 * accesses these variables under p->pi_lock and only does so after
+	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+	 * before finish_task().
+	 *
+	 * XXX do further audits, this smells like something putrid.
+	 */
+	if (flags & SCA_MIGRATE_DISABLE)
+		SCHED_WARN_ON(!p->on_cpu);
+	else
+		lockdep_assert_held(&p->pi_lock);
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
@@ -1891,9 +1979,14 @@ static int __set_cpus_allowed_ptr(struct
 	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD) {
+	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs
+		 * Kernel threads are allowed on online && !active CPUs.
+		 *
+		 * Specifically, migration_disabled() tasks must not fail the
+		 * cpumask_any_and_distribute() pick below, esp. so on
+		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
+		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
 		 */
 		cpu_valid_mask = cpu_online_mask;
 	}
@@ -1907,7 +2000,7 @@ static int __set_cpus_allowed_ptr(struct
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_mask, new_mask))
+	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
 	/*
@@ -1999,6 +2092,8 @@ void set_task_cpu(struct task_struct *p,
 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
 	 */
 	WARN_ON_ONCE(!cpu_online(new_cpu));
+
+	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2329,6 +2424,12 @@ static int select_fallback_rq(int cpu, s
 			}
 			fallthrough;
 		case possible:
+			/*
+			 * XXX When called from select_task_rq() we only
+			 * hold p->pi_lock and again violate locking order.
+			 *
+			 * More yuck to audit.
+			 */
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
 			break;
@@ -4589,6 +4690,7 @@ static void __sched notrace __schedule(b
 		 */
 		++*switch_count;
 
+		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1901,6 +1901,8 @@ extern void update_group_capacity(struct
 extern void trigger_load_balance(struct rq *rq);
 
 #define SCA_CHECK		0x01
+#define SCA_MIGRATE_DISABLE	0x02
+#define SCA_MIGRATE_ENABLE	0x04
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(c
 	if (current->nr_cpus_allowed == 1)
 		goto out;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+	if (current->migration_disabled)
+		goto out;
+#endif
+
 	/*
 	 * It is valid to assume CPU-locality during early bootup:
 	 */



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (6 preceding siblings ...)
  2020-09-21 16:36 ` [PATCH 7/9] sched: Add migrate_disable() Peter Zijlstra
@ 2020-09-21 16:36 ` Peter Zijlstra
  2020-09-24 19:59   ` Valentin Schneider
  2020-09-21 16:36 ` [PATCH 9/9] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:36 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

Concurrent migrate_disable() and set_cpus_allowed_ptr() have
interesting features. We rely on set_cpus_allowed_ptr() to not return
until the task runs inside the provided mask. This expectation is
exported to userspace.

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

	preempt_disable();
	migrate_disable();
	...
	migrate_enable();
	preempt_enable();

And:

	raw_spin_lock(&B);
	spin_unlock(&A);

this means that when migrate_enable() must restore the affinity
mask, it cannot wait for completion thereof. Luck will have it that
that is exactly the case where there is a pending
set_cpus_allowed_ptr(), so let that provide storage for the async stop
machine.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |    1 
 kernel/sched/core.c   |  142 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 123 insertions(+), 20 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,7 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 	int				migration_disabled;
 #endif
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1734,15 +1734,28 @@ EXPORT_SYMBOL_GPL(migrate_disable);
 
 void migrate_enable(void)
 {
-	if (--current->migration_disabled)
-		return;
-
-	barrier();
+	struct task_struct *p = current;
 
-	if (p->cpus_ptr == &p->cpus_mask)
+	if (p->migration_disabled > 1) {
+		p->migration_disabled--;
 		return;
+	}
 
-	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Ensure stop_task runs either before or after this, and that
+	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+	 */
+	preempt_disable();
+	if (p->cpus_ptr != &p->cpus_mask)
+		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
+	 * regular cpus_mask, otherwise things that race (eg.
+	 * select_fallback_rq) get confused.
+	 */
+	barrier();
+	p->migration_disabled = 0;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
@@ -1818,6 +1831,7 @@ static struct rq *move_queued_task(struc
 struct migration_arg {
 	struct task_struct *task;
 	int dest_cpu;
+	struct completion *done;
 };
 
 /*
@@ -1852,6 +1866,7 @@ static int migration_cpu_stop(void *data
 	struct migration_arg *arg = data;
 	struct task_struct *p = arg->task;
 	struct rq *rq = this_rq();
+	bool complete = false;
 	struct rq_flags rf;
 
 	/*
@@ -1874,15 +1889,27 @@ static int migration_cpu_stop(void *data
 	 * we're holding p->pi_lock.
 	 */
 	if (task_rq(p) == rq) {
+		if (is_migration_disabled(p))
+			goto out;
+
 		if (task_on_rq_queued(p))
 			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
 		else
 			p->wake_cpu = arg->dest_cpu;
+
+		if (arg->done) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
 	}
+out:
 	rq_unlock(rq, &rf);
 	raw_spin_unlock(&p->pi_lock);
-
 	local_irq_enable();
+
+	if (complete)
+		complete_all(arg->done);
+
 	return 0;
 }
 
@@ -1957,6 +1984,92 @@ void do_set_cpus_allowed(struct task_str
 	__do_set_cpus_allowed(p, new_mask, 0);
 }
 
+struct set_affinity_pending {
+	refcount_t		refs;
+	struct completion	done;
+	struct cpu_stop_work	stop_work;
+	struct migration_arg	arg;
+};
+
+static int move_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p,
+		     int dest_cpu, unsigned int flags)
+{
+	struct set_affinity_pending my_pending, *pending = NULL;
+	struct migration_arg arg = {
+		.task = p,
+		.dest_cpu = dest_cpu,
+	};
+
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		/* serialized by p->pi_lock */
+		if (!p->migration_pending) {
+			refcount_set(&my_pending.refs, 1);
+			init_completion(&my_pending.done);
+			p->migration_pending = &my_pending;
+		} else {
+			pending = p->migration_pending;
+			refcount_inc(&pending->refs);
+		}
+	}
+	pending = p->migration_pending;
+	/*
+	 * - !MIGRATE_ENABLE:
+	 *   we'll have installed a pending if there wasn't one already.
+	 *
+	 * - MIGRATE_ENABLE:
+	 *   we're here because the current CPU isn't matching anymore,
+	 *   the only way that can happen is because of a concurrent
+	 *   set_cpus_allowed_ptr() call, which should then still be
+	 *   pending completion.
+	 *
+	 * Either way, we really should have a @pending here.
+	 */
+	if (WARN_ON_ONCE(!pending))
+		return -EINVAL;
+
+	arg.done = &pending->done;
+
+	if (flags & SCA_MIGRATE_ENABLE) {
+
+		task_rq_unlock(rq, p, rf);
+		pending->arg = arg;
+		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+
+		return 0;
+	}
+
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+		task_rq_unlock(rq, p, rf);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+	} else {
+		bool complete = false;
+
+		if (!is_migration_disabled(p)) {
+			if (task_on_rq_queued(p))
+				rq = move_queued_task(rq, rf, p, dest_cpu);
+
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+		if (complete)
+			complete_all(&pending->done);
+	}
+
+	wait_for_completion(&pending->done);
+
+	if (refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
+
+	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+	return 0;
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -2025,19 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &rf);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		return 0;
-	} else if (task_on_rq_queued(p)) {
-		/*
-		 * OK, since we're going to drop the lock immediately
-		 * afterwards anyway.
-		 */
-		rq = move_queued_task(rq, &rf, p, dest_cpu);
-	}
+	return move_task(rq, &rf, p, dest_cpu, flags);
+
 out:
 	task_rq_unlock(rq, p, &rf);
 



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 9/9] sched/core: Make migrate disable and CPU hotplug cooperative
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (7 preceding siblings ...)
  2020-09-21 16:36 ` [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
@ 2020-09-21 16:36 ` Peter Zijlstra
  2020-09-25  9:12 ` [PATCH 0/9] sched: Migrate disable support Dietmar Eggemann
  2020-09-25 18:17 ` Sebastian Andrzej Siewior
  10 siblings, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-21 16:36 UTC (permalink / raw)
  To: tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

From: Thomas Gleixner <tglx@linutronix.de>

On CPU unplug tasks which are in a migrate disabled region cannot be pushed
to a different CPU until they returned to migrateable state.

Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   36 ++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |    4 ++++
 2 files changed, 34 insertions(+), 6 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1725,10 +1725,17 @@ static void migrate_disable_switch(struc
 
 void migrate_disable(void)
 {
-	if (current->migration_disabled++)
+	struct task_struct *p = current;
+
+	if (p->migration_disabled) {
+		p->migration_disabled++;
 		return;
+	}
 
-	barrier();
+	preempt_disable();
+	this_rq()->nr_pinned++;
+	p->migration_disabled = 1;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
@@ -1755,6 +1762,7 @@ void migrate_enable(void)
 	 */
 	barrier();
 	p->migration_disabled = 0;
+	this_rq()->nr_pinned--;
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1764,6 +1772,11 @@ static inline bool is_migration_disabled
 	return p->migration_disabled;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return rq->nr_pinned;
+}
+
 #else
 
 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
@@ -1773,6 +1786,11 @@ static inline bool is_migration_disabled
 	return false;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return false;
+}
+
 #endif
 
 /*
@@ -2809,7 +2827,8 @@ static void balance_hotplug_wait(void)
 {
 	struct rq *rq = this_rq();
 
-	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+	rcuwait_wait_event(&rq->hotplug_wait,
+			   rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
 			   TASK_UNINTERRUPTIBLE);
 }
 
@@ -7009,15 +7028,20 @@ static bool balance_push(struct rq *rq)
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
 	 */
-	if (is_per_cpu_kthread(push_task)) {
+	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
 		/*
 		 * If this is the idle task on the outgoing CPU try to wake
 		 * up the hotplug control thread which might wait for the
 		 * last task to vanish. The rcuwait_active() check is
 		 * accurate here because the waiter is pinned on this CPU
 		 * and can't obviously be running in parallel.
+		 *
+		 * On RT kernels this also has to check whether there are
+		 * pinned and scheduled out tasks on the runqueue. They
+		 * need to leave the migrate disabled section first.
 		 */
-		if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+		if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+		    rcuwait_active(&rq->hotplug_wait)) {
 			raw_spin_unlock(&rq->lock);
 			rcuwait_wake_up(&rq->hotplug_wait);
 			raw_spin_lock(&rq->lock);
@@ -7290,7 +7314,7 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	BUG_ON(rq->nr_running != 1);
+	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
 	rq_unlock_irqrestore(rq, &rf);
 
 	calc_load_migrate(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
 #endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	unsigned int		nr_pinned;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED



^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-21 16:36 ` [PATCH 7/9] sched: Add migrate_disable() Peter Zijlstra
@ 2020-09-21 19:16   ` Thomas Gleixner
  2020-09-21 20:42     ` Daniel Bristot de Oliveira
  2020-09-23  7:48     ` peterz
  2020-09-24 11:53   ` Valentin Schneider
  2020-09-25 16:50   ` Sebastian Andrzej Siewior
  2 siblings, 2 replies; 45+ messages in thread
From: Thomas Gleixner @ 2020-09-21 19:16 UTC (permalink / raw)
  To: Peter Zijlstra, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On Mon, Sep 21 2020 at 18:36, Peter Zijlstra wrote:
> Add the base migrate_disable() support (under protest).

:)

> +/*
> + * Migrate-Disable and why it is (strongly) undesired.
> + *
> + * The premise of the Real-Time schedulers we have on Linux
> + * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
> + * concurrently, provided there are sufficient runnable tasks, also known as
> + * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
> + * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
> + *
> + * The correctness of various scheduling models depends on this, but is it
> + * broken by migrate_disable() that doesn't imply preempt_disable(). Where
> + * preempt_disable() implies an immediate priority ceiling, preemptible
> + * migrate_disable() allows nesting.
> + *
> + * The worst case is that all tasks preempt one another in a migrate_disable()
> + * region and stack on a single CPU. This then reduces the available bandwidth
> + * to a single CPU. And since Real-Time schedulability theory considers the
> + * Worst-Case only, all Real-Time analysis shall revert to single-CPU
> + * (instantly solving the SMP analysis problem).

I'm telling you for years that SMP is the source of all evils and
NR_CPUS=0 is the ultimate solution of all problems. Paul surely
disagrees as he thinks that NR_CPUS<0 is the right thing to do.

But seriously, I completely understand your concern vs. schedulability
theories, but those theories can neither deal well with preemption
disable simply because you can create other trainwrecks when enough low
priority tasks run long enough in preempt disabled regions in
parallel. The scheduler simply does not know ahead how long these
sections will take and how many of them will run in parallel.

The theories make some assumptions about preempt disable and consider it
as temporary priority ceiling, but that's all assumptions as the bounds
of these operations simply unknown.

> + * The reason we have it anyway.
> + *
> + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> + * number of primitives into becoming preemptible, they would also allow
> + * migration. This turns out to break a bunch of per-cpu usage. To this end,
> + * all these primitives employ migirate_disable() to restore this implicit
> + * assumption.
> + *
> + * This is a 'temporary' work-around at best. The correct solution is getting
> + * rid of the above assumptions and reworking the code to employ explicit
> + * per-cpu locking or short preempt-disable regions.

What timeframe are you envisioning for 'temporary'? I assume something
which is closer to your retirement than to mine :)

> + * The end goal must be to get rid of migrate_disable(), alternatively we need
> + * a schedulability theory that does not depend on abritrary migration.

Finally something new the academics can twist their brain around :)

But as the kmap discussion has shown, the current situation of enforcing
preempt disable even on a !RT kernel is not pretty either. I looked at
quite some of the kmap_atomic() usage sites and the resulting
workarounds for non-preemptability are pretty horrible especially if
they do copy_from/to_user() or such in those regions. There is tons of
other code which really only requires migrate disable.

Thanks,

        tglx


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-21 19:16   ` Thomas Gleixner
@ 2020-09-21 20:42     ` Daniel Bristot de Oliveira
  2020-09-23  8:31       ` Thomas Gleixner
  2020-09-23  7:48     ` peterz
  1 sibling, 1 reply; 45+ messages in thread
From: Daniel Bristot de Oliveira @ 2020-09-21 20:42 UTC (permalink / raw)
  To: Thomas Gleixner, Peter Zijlstra, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, vincent.donnefort

On 9/21/20 9:16 PM, Thomas Gleixner wrote:
> On Mon, Sep 21 2020 at 18:36, Peter Zijlstra wrote:
>> Add the base migrate_disable() support (under protest).
> 
> :)
> 
>> +/*
>> + * Migrate-Disable and why it is (strongly) undesired.
>> + *
>> + * The premise of the Real-Time schedulers we have on Linux
>> + * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
>> + * concurrently, provided there are sufficient runnable tasks, also known as
>> + * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
>> + * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
>> + *
>> + * The correctness of various scheduling models depends on this, but it is
>> + * broken by migrate_disable() that doesn't imply preempt_disable(). Where
>> + * preempt_disable() implies an immediate priority ceiling, preemptible
>> + * migrate_disable() allows nesting.
>> + *
>> + * The worst case is that all tasks preempt one another in a migrate_disable()
>> + * region and stack on a single CPU. This then reduces the available bandwidth
>> + * to a single CPU. And since Real-Time schedulability theory considers the
>> + * Worst-Case only, all Real-Time analysis shall revert to single-CPU
>> + * (instantly solving the SMP analysis problem).
> 
> I'm telling you for years that SMP is the source of all evils and
> NR_CPUS=0 is the ultimate solution of all problems. Paul surely
> disagrees as he thinks that NR_CPUS<0 is the right thing to do.

And I would not need to extend the model!

> But seriously, I completely understand your concern vs. schedulability
> theories, but those theories can neither deal well with preemption
> disable simply because you can create other trainwrecks when enough low
> priority tasks run long enough in preempt disabled regions in
> parallel. The scheduler simply does not know ahead how long these
> sections will take and how many of them will run in parallel.
> 
> The theories make some assumptions about preempt disable and consider it
> as temporary priority ceiling, but that's all assumptions as the bounds
> of these operations simply unknown.

Limited preemption is something that is more explored/well known than
limited/arbitrary affinity - I even know a dude that convinced academics about
the effects/properties of preempt disable on the PREEMPT_RT!

But I think that the message here is that: ok, migrate disable is better for the
"scheduling latency" than preempt disable (preempt rt goal). But the
indiscriminate usage of migrate disable has some undesired effects for "response
time" of real-time threads (scheduler goal), so we should use it with caution -
as much as we have with preempt disable. In the end, both are critical for
real-time workloads, and we need more work and analysis on them both.

>> + * The reason we have it anyway.
>> + *
>> + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
>> + * number of primitives into becoming preemptible, they would also allow
>> + * migration. This turns out to break a bunch of per-cpu usage. To this end,
>> + * all these primitives employ migrate_disable() to restore this implicit
>> + * assumption.
>> + *
>> + * This is a 'temporary' work-around at best. The correct solution is getting
>> + * rid of the above assumptions and reworking the code to employ explicit
>> + * per-cpu locking or short preempt-disable regions.
> 
> What timeframe are you envisioning for 'temporary'? I assume something
> which is closer to your retirement than to mine :)

/me counts how many years he still needs to wait for the retirement.

> 
>> + * The end goal must be to get rid of migrate_disable(), alternatively we need
>> + * a schedulability theory that does not depend on arbitrary migration.
> 
> Finally something new the academics can twist their brain around :)

Like if there was not enough already :-)

> But as the kmap discussion has shown, the current situation of enforcing
> preempt disable even on a !RT kernel is not pretty either. I looked at
> quite some of the kmap_atomic() usage sites and the resulting
> workarounds for non-preemptability are pretty horrible especially if
> they do copy_from/to_user() or such in those regions. There is tons of
> other code which really only requires migrate disable.
(not having an explicit declaration of the reason to disable preemption makes
these all hard to rework... and we will have the same with migrate disable.
Anyways, I agree that disabling only migration helps -rt now [and I like
that]... but I also fear/care for scheduler metrics on the long term... well,
there is still a long way until retirement.)

Thanks!
-- Daniel

> Thanks,
> 
>         tglx
> 


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-21 19:16   ` Thomas Gleixner
  2020-09-21 20:42     ` Daniel Bristot de Oliveira
@ 2020-09-23  7:48     ` peterz
  1 sibling, 0 replies; 45+ messages in thread
From: peterz @ 2020-09-23  7:48 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On Mon, Sep 21, 2020 at 09:16:54PM +0200, Thomas Gleixner wrote:
> On Mon, Sep 21 2020 at 18:36, Peter Zijlstra wrote:

> > +/*
> > + * Migrate-Disable and why it is (strongly) undesired.
> > + *
> > + * The premise of the Real-Time schedulers we have on Linux
> > + * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
> > + * concurrently, provided there are sufficient runnable tasks, also known as
> > + * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
> > + * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
> > + *
> > + * The correctness of various scheduling models depends on this, but it is
> > + * broken by migrate_disable() that doesn't imply preempt_disable(). Where
> > + * preempt_disable() implies an immediate priority ceiling, preemptible
> > + * migrate_disable() allows nesting.
> > + *
> > + * The worst case is that all tasks preempt one another in a migrate_disable()
> > + * region and stack on a single CPU. This then reduces the available bandwidth
> > + * to a single CPU. And since Real-Time schedulability theory considers the
> > + * Worst-Case only, all Real-Time analysis shall revert to single-CPU
> > + * (instantly solving the SMP analysis problem).
> 
> I'm telling you for years that SMP is the source of all evils and
> NR_CPUS=0 is the ultimate solution of all problems. Paul surely
> disagrees as he thinks that NR_CPUS<0 is the right thing to do.

Surely imaginary numbers are even better :-)

> But seriously, I completely understand your concern vs. schedulability
> theories, but those theories can neither deal well with preemption
> disable simply because you can create other trainwrecks when enough low
> priority tasks run long enough in preempt disabled regions in
> parallel.

Ah, no, those theories can deal with preemption disable perfectly fine.
The result is an increase in latency. It so happens we don't like that,
but that's our problem :-)

> The scheduler simply does not know ahead how long these
> sections will take and how many of them will run in parallel.

Ah, but the thing is, preempt_disable() does not limit concurrency.
Assuming idle CPUs, the waking task can always go elsewhere.

The thing with migrate_disable() OTOH is that even though there are idle
CPUs, we're actively prohibited from using them.

> The theories make some assumptions about preempt disable and consider it
> as temporary priority ceiling, but that's all assumptions as the bounds
> of these operations simply unknown.

Sure, that directly translates into unbounded (or rather of
non-deterministic duration) latencies, which are bad for determinism.
But the theory is fairly clear on this.

> > + * The reason we have it anyway.
> > + *
> > + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> > + * number of primitives into becoming preemptible, they would also allow
> > + * migration. This turns out to break a bunch of per-cpu usage. To this end,
> > + * all these primitives employ migrate_disable() to restore this implicit
> > + * assumption.
> > + *
> > + * This is a 'temporary' work-around at best. The correct solution is getting
> > + * rid of the above assumptions and reworking the code to employ explicit
> > + * per-cpu locking or short preempt-disable regions.
> 
> What timeframe are you envisioning for 'temporary'? I assume something
> which is closer to your retirement than to mine :)

I figured we'd put a WARN on per-cpu usage with only migrate_disable(),
under a Kconfig knob, much like how RCU-lockdep started, once all of
PREEMPT_RT has landed. Gotta keep busy, right :-)

> > + * The end goal must be to get rid of migrate_disable(), alternatively we need
> > + * a schedulability theory that does not depend on arbitrary migration.
> 
> Finally something new the academics can twist their brain around :)

I'm sure they've been waiting for more work ;-)

> But as the kmap discussion has shown, the current situation of enforcing
> preempt disable even on a !RT kernel is not pretty either. I looked at
> quite some of the kmap_atomic() usage sites and the resulting
> workarounds for non-preemptability are pretty horrible especially if
> they do copy_from/to_user() or such in those regions. There is tons of
> other code which really only requires migrate disable.

Yes, I'm on that thread, I'll reply there as well, I really hate going
down that path without having a decent understanding of the
ramifications.

The more we spread this muck around, the deeper the hole we dig for
ourselves to climb out of.

The thing is, afaik the only theory that 'works' with migrate_disable()
is fully partitioned, but we break that by having cross CPU blocking
chains.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-21 20:42     ` Daniel Bristot de Oliveira
@ 2020-09-23  8:31       ` Thomas Gleixner
  2020-09-23 10:51         ` Daniel Bristot de Oliveira
  2020-09-23 17:08         ` peterz
  0 siblings, 2 replies; 45+ messages in thread
From: Thomas Gleixner @ 2020-09-23  8:31 UTC (permalink / raw)
  To: Daniel Bristot de Oliveira, Peter Zijlstra, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, vincent.donnefort

On Mon, Sep 21 2020 at 22:42, Daniel Bristot de Oliveira wrote:
> On 9/21/20 9:16 PM, Thomas Gleixner wrote:
>> On Mon, Sep 21 2020 at 18:36, Peter Zijlstra wrote:
>> But seriously, I completely understand your concern vs. schedulability
>> theories, but those theories can neither deal well with preemption
>> disable simply because you can create other trainwrecks when enough low
>> priority tasks run long enough in preempt disabled regions in
>> parallel. The scheduler simply does not know ahead how long these
>> sections will take and how many of them will run in parallel.
>> 
>> The theories make some assumptions about preempt disable and consider it
>> as temporary priority ceiling, but that's all assumptions as the bounds
>> of these operations simply unknown.
>
> Limited preemption is something that is more explored/well known than
> limited/arbitrary affinity - I even know a dude that convinced academics about
> the effects/properties of preempt disable on the PREEMPT_RT!

I'm sure I never met that guy.

> But I think that the message here is that: ok, migrate disable is better for the
> "scheduling latency" than preempt disable (preempt rt goal). But the
> indiscriminate usage of migrate disable has some undesired effects for "response
> time" of real-time threads (scheduler goal), so we should use it with caution -
> as much as we have with preempt disable. In the end, both are critical for
> real-time workloads, and we need more work and analysis on them both.
...
>> But as the kmap discussion has shown, the current situation of enforcing
>> preempt disable even on a !RT kernel is not pretty either. I looked at
>> quite some of the kmap_atomic() usage sites and the resulting
>> workarounds for non-preemptability are pretty horrible especially if
>> they do copy_from/to_user() or such in those regions. There is tons of
>> other code which really only requires migrate disable
>
>> (not having an explicit declaration of the reason to disable preemption makes
> these all hard to rework... and we will have the same with migrate disable.
> Anyways, I agree that disabling only migration helps -rt now [and I like
> that]... but I also fear/care for scheduler metrics on the long term... well,
> there is still a long way until retirement.)

Lets have a look at theory and practice once more:

1) Preempt disable

   Theories take that into account by adding a SHC ('Sh*t Happens
   Coefficient') into their formulas, but the practical effects cannot
   ever be reflected in theories accurately.

   In practice, preempt disable can cause unbound latencies and while we
   all agree that long preempt/interrupt disabled sections are bad, it's
   not really trivial to break these up without rewriting stuff from
   scratch. The recent discussion about unbound latencies in the page
   allocator is a prime example for that.

   The ever growing usage of per CPU storage is not making anything
   better and right now preempt disable is the only tool we have at the
   moment in mainline to deal with that.

   That forces people to come up with code constructs which are more
   than suboptimal both in terms of code quality and in terms of
   schedulability/latency. We've seen mutexes converted to spinlocks
   just because of that, conditionals depending on execution context
   which turns out to be broken and inconsistent, massive error handling
   trainwrecks, etc.

2) Migrate disable

   Theories do not know anything about it, but in the very end it's
   going to be yet another variant of SHC to be defined.

   In practice migrate disable could be taken into account on placement
   decisions, but yes we don't have anything like that at the moment.

   The theoretical worst case which forces all and everything on a
   single CPU is an understandable concern, but the practical relevance
   is questionable. I surely stared at a lot of traces on heavily loaded
   RT systems, but too many preempted migrate disabled tasks were truly
   never a practical problem. I'm sure you can create a workload
   scenario which triggers that, but then you always can create
   workloads which are running into the corner cases of any given
   system.

   The charm of migrate disable even on !RT is that it allows for
   simpler code and breaking up preempt disabled sections, which is IMO
   a clear win given that per CPU ness is not going away -unless the
   chip industry comes to senses and goes back to the good old UP
   systems which have natural per CPU ness :)

That said, let me paraphrase that dude you mentioned above:

 Theories are great and useful, but pragmatism has proven to produce
 working solutions even if they cannot work according to theory.

Thanks,

        tglx

        

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-23  8:31       ` Thomas Gleixner
@ 2020-09-23 10:51         ` Daniel Bristot de Oliveira
  2020-09-23 17:08         ` peterz
  1 sibling, 0 replies; 45+ messages in thread
From: Daniel Bristot de Oliveira @ 2020-09-23 10:51 UTC (permalink / raw)
  To: Thomas Gleixner, Peter Zijlstra, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, vincent.donnefort

On 9/23/20 10:31 AM, Thomas Gleixner wrote:
> On Mon, Sep 21 2020 at 22:42, Daniel Bristot de Oliveira wrote:
>> On 9/21/20 9:16 PM, Thomas Gleixner wrote:
>>> On Mon, Sep 21 2020 at 18:36, Peter Zijlstra wrote:
>>> But seriously, I completely understand your concern vs. schedulability
>>> theories, but those theories can neither deal well with preemption
>>> disable simply because you can create other trainwrecks when enough low
>>> priority tasks run long enough in preempt disabled regions in
>>> parallel. The scheduler simply does not know ahead how long these
>>> sections will take and how many of them will run in parallel.
>>>
>>> The theories make some assumptions about preempt disable and consider it
>>> as temporary priority ceiling, but that's all assumptions as the bounds
>>> of these operations simply unknown.
>>
>> Limited preemption is something that is more explored/well known than
>> limited/arbitrary affinity - I even know a dude that convinced academics about
>> the effects/properties of preempt disable on the PREEMPT_RT!
> 
> I'm sure I never met that guy.

It is a funny Italian/Brazilian dude....

>> But I think that the message here is that: ok, migrate disable is better for the
>> "scheduling latency" than preempt disable (preempt rt goal). But the
>> indiscriminate usage of migrate disable has some undesired effects for "response
>> time" of real-time threads (scheduler goal), so we should use it with caution -
>> as much as we have with preempt disable. In the end, both are critical for
>> real-time workloads, and we need more work and analysis on them both.
> ...
>>> But as the kmap discussion has shown, the current situation of enforcing
>>> preempt disable even on a !RT kernel is not pretty either. I looked at
>>> quite some of the kmap_atomic() usage sites and the resulting
>>> workarounds for non-preemptability are pretty horrible especially if
>>> they do copy_from/to_user() or such in those regions. There is tons of
>>> other code which really only requires migrate disable
>>
>> (not having an explicit declaration of the reason to disable preemption makes
>> these all hard to rework... and we will have the same with migrate disable.
>> Anyways, I agree that disabling only migration helps -rt now [and I like
>> that]... but I also fear/care for scheduler metrics on the long term... well,
>> there is still a long way until retirement.)
> 
> Lets have a look at theory and practice once more:
> 
> 1) Preempt disable
> 
>    Theories take that into account by adding a SHC ('Sh*t Happens
>    Coefficient') into their formulas, but the practical effects cannot
>    ever be reflected in theories accurately.

It depends, an adequate theory will have the correct balance between SHC and
precision. The idea is to precisely define the behavior as much as possible,
trying to reduce the SHC.

>    In practice, preempt disable can cause unbound latencies and while we
>    all agree that long preempt/interrupt disabled sections are bad, it's
>    not really trivial to break these up without rewriting stuff from
>    scratch. The recent discussion about unbound latencies in the page
>    allocator is a prime example for that.

Here we need to separate two contexts: the synchronization and the code contexts.

At the synchronization level the preempt_disable() is bounded [1]:

The PREEMPT_RT can have only *1* preempt_disable()/enable() not to schedule
section (the worst (SHC)) before starting the process of calling the scheduler.
So the SHC factor is then reduced to the code context [2].

The SHC is in the code context, and it is mitigated by the constant monitoring
of the code sections via tests.

>    The ever growing usage of per CPU storage is not making anything
>    better and right now preempt disable is the only tool we have at the
>    moment in mainline to deal with that.
> 
>    That forces people to come up with code constructs which are more
>    than suboptimal both in terms of code quality and in terms of
>    schedulability/latency. We've seen mutexes converted to spinlocks
>    just because of that, conditionals depending on execution context
>    which turns out to be broken and inconsistent, massive error handling
>    trainwrecks, etc.

I see and agree with you on this point.

> 2) Migrate disable
> 
>    Theories do not know anything about it, but in the very end it's
>    going to be yet another variant of SHC to be defined.

I agree. There are very few things known at this point. However, we can take as
exercise the example that Peter mentioned:

CPU 0					CPU 1
thread on migrate_disable():		high prio thread
 -> preempted!
    -> migrate_disable()
       -> preempted!			
		...
          migrate_disable()		leaves the CPU
	  SH happens			IDLE
		...			IDLE
	unfold all on this CPU.		IDLE (decades of ...)


So, at synchronization level... migrate_disable() is not bounded by a
constant as preempt_disable() does. That is the difference that worries Peter,
and it is valid from both points of view (theoretical and practical).

>    In practice migrate disable could be taken into account on placement
>    decisions, but yes we don't have anything like that at the moment.

Agreed... we can mitigate that! that is a nice challenge!

>    The theoretical worst case which forces all and everything on a
>    single CPU is an understandable concern, but the practical relevance
>    is questionable. I surely stared at a lot of traces on heavily loaded
>    RT systems, but too many preempted migrate disabled tasks were truly
>    never a practical problem. I'm sure you can create a workload
>    scenario which triggers that, but then you always can create
>    workloads which are running into the corner cases of any given
>    system.

I see your point! and I agree it is a corner case. But... that is what we have
to deal with in RT scheduling, and such discussions are good for RT Linux, at
least to raise the attention to a point that might need to be constantly
monitored and not abused (like preempt_disable for RT).

>    The charm of migrate disable even on !RT is that it allows for
>    simpler code and breaking up preempt disabled sections, which is IMO
>    a clear win given that per CPU ness is not going away -unless the
>    chip industry comes to senses and goes back to the good old UP
>    systems which have natural per CPU ness :)
> 
> That said, let me paraphrase that dude you mentioned above:
> 
>  Theories are great and useful, but pragmatism has proven to produce
>  working solutions even if they cannot work according to theory.

I agree! The way that Linux advances as RTOS is shown to be effective. So
effective that people want more and more. And we can always show later that the
path that Linux took is right in theory! Like that dude did....

But at the same time, we can take advantage of scenarios we can predict being
hard for the future and try to mitigate/isolate them to avoid them. I can't
argue against that either.

At the end, it seems that the decision here is about which hard problem we will
have to deal in the future. [ As I said yesterday, I will have to deal with
migrate_disable anyway because of RT, so I do not have a way out anyway.]

Thanks!
-- Daniel

> Thanks,
> 
>         tglx
> 
>         
> 

For the wider audience
[1] https://bit.ly/33PsV0N
[2] there are other scenarios too, demonstrated in the paper, but here we are
discussing only about preempt_disable() postponing the __scheduler().


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 6/9] sched: Massage set_cpus_allowed
  2020-09-21 16:36 ` [PATCH 6/9] sched: Massage set_cpus_allowed Peter Zijlstra
@ 2020-09-23 14:07   ` Thomas Gleixner
  0 siblings, 0 replies; 45+ messages in thread
From: Thomas Gleixner @ 2020-09-23 14:07 UTC (permalink / raw)
  To: Peter Zijlstra, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On Mon, Sep 21 2020 at 18:36, Peter Zijlstra wrote:
> @@ -1899,7 +1900,9 @@ extern void update_group_capacity(struct
>  
>  extern void trigger_load_balance(struct rq *rq);
>  
> -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
> +#define SCA_CHECK		0x01
> +

needs to move out of the CONFIG_SMP ifdef as it's used even on UP.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/9] sched: Fix balance_callback()
  2020-09-21 16:35 ` [PATCH 2/9] sched: Fix balance_callback() Peter Zijlstra
@ 2020-09-23 14:08   ` Thomas Gleixner
  0 siblings, 0 replies; 45+ messages in thread
From: Thomas Gleixner @ 2020-09-23 14:08 UTC (permalink / raw)
  To: Peter Zijlstra, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, peterz,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On Mon, Sep 21 2020 at 18:35, Peter Zijlstra wrote:
>  	return 0;
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1220,6 +1220,8 @@ static inline void rq_pin_lock(struct rq
>  #ifdef CONFIG_SCHED_DEBUG
>  	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
>  	rf->clock_update_flags = 0;
> +
> +	SCHED_WARN_ON(rq->balance_callback);

fails to compile on !SMP

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-23  8:31       ` Thomas Gleixner
  2020-09-23 10:51         ` Daniel Bristot de Oliveira
@ 2020-09-23 17:08         ` peterz
  2020-09-23 17:54           ` Daniel Bristot de Oliveira
  1 sibling, 1 reply; 45+ messages in thread
From: peterz @ 2020-09-23 17:08 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Daniel Bristot de Oliveira, mingo, linux-kernel, bigeasy,
	qais.yousef, swood, valentin.schneider, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	vincent.donnefort

On Wed, Sep 23, 2020 at 10:31:10AM +0200, Thomas Gleixner wrote:
>    In practice migrate disable could be taken into account on placement
>    decisions, but yes we don't have anything like that at the moment.

I think at the very least we should do some of that.

The premise is wanting to run the M highest priority tasks, when a CPU
drops priority, it tries to PULL a higher priority task towards itself.
If this PULL selects a migrate_disable() tasks, it means the task is in
the M highest prio tasks.

Since obviously that task cannot get pulled, we should then pull the
current running task of that CPU, this would then allow the
migrate_disable() task to resume execution.

I'll go try and cook up something along those lines...



^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-23 17:08         ` peterz
@ 2020-09-23 17:54           ` Daniel Bristot de Oliveira
  0 siblings, 0 replies; 45+ messages in thread
From: Daniel Bristot de Oliveira @ 2020-09-23 17:54 UTC (permalink / raw)
  To: peterz, Thomas Gleixner
  Cc: mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, vincent.donnefort

On 9/23/20 7:08 PM, peterz@infradead.org wrote:
> On Wed, Sep 23, 2020 at 10:31:10AM +0200, Thomas Gleixner wrote:
>>    In practice migrate disable could be taken into account on placement
>>    decisions, but yes we don't have anything like that at the moment.
> I think at the very least we should do some of that.
> 
> The premise is wanting to run the M highest priority tasks, when a CPU
> drops priority, it tries to PULL a higher priority task towards itself.
> If this PULL selects a migrate_disable() tasks, it means the task is in
> the M highest prio tasks.
> 
> Since obviously that task cannot get pulled, we should then pull the
> current running task of that CPU, this would then allow the
> migrate_disable() task to resume execution.
> 
> I'll go try and cook up something along those lines...
> 

sounds promising...

-- Daniel


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-21 16:36 ` [PATCH 7/9] sched: Add migrate_disable() Peter Zijlstra
  2020-09-21 19:16   ` Thomas Gleixner
@ 2020-09-24 11:53   ` Valentin Schneider
  2020-09-24 12:29     ` Peter Zijlstra
  2020-09-24 12:35     ` Peter Zijlstra
  2020-09-25 16:50   ` Sebastian Andrzej Siewior
  2 siblings, 2 replies; 45+ messages in thread
From: Valentin Schneider @ 2020-09-24 11:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort


Still trying to digest 8/9, but have some comments before I next get
preempted :)

On 21/09/20 17:36, Peter Zijlstra wrote:

[...]

> +void migrate_enable(void)
> +{
> +	if (--current->migration_disabled)
> +		return;
> +
> +	barrier();
> +
> +	if (p->cpus_ptr == &p->cpus_mask)
> +		return;

If we get to here this means we're the migrate_enable() invocation that
marks the end of the migration_disabled region. How can cpus_ptr already
point back to current's cpus_mask?

Also, this is fixed in 8/9 but this here wants an s/p/current/

> +
> +	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
> +}

[...]

> @@ -1830,8 +1892,19 @@ static int migration_cpu_stop(void *data
>   */
>  void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
>  {
> -	cpumask_copy(&p->cpus_mask, new_mask);
> -	p->nr_cpus_allowed = cpumask_weight(new_mask);
> +	if (flags & SCA_MIGRATE_DISABLE) {
> +		p->cpus_ptr = new_mask;
> +		p->nr_cpus_allowed = 1;
> +		return;
> +	}
> +
> +	if (flags & SCA_MIGRATE_ENABLE)
> +		p->cpus_ptr = &p->cpus_mask;

There's that check in __set_cpus_allowed_ptr() that new_mask *must* be
p->cpus_mask when SCA_MIGRATE_ENABLE; that means we could mayhaps shuffle
that to:

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7cb13df48366..e0e4e42c5e32 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1892,18 +1892,14 @@ static int migration_cpu_stop(void *data)
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
-	if (flags & SCA_MIGRATE_DISABLE) {
+	if (flags & (SCA_MIGRATE_DISABLE | SCA_MIGRATE_ENABLE))
                p->cpus_ptr = new_mask;
-		p->nr_cpus_allowed = 1;
-		return;
-	}
-
-	if (flags & SCA_MIGRATE_ENABLE)
-		p->cpus_ptr = &p->cpus_mask;
        else
                cpumask_copy(&p->cpus_mask, new_mask);

-	if (p->cpus_ptr == &p->cpus_mask)
+	if (flags & SCA_MIGRATE_DISABLE)
+		p->nr_cpus_allowed = 1;
+	else if (p->cpus_ptr == &p->cpus_mask)
                p->nr_cpus_allowed = cpumask_weight(p->cpus_ptr);
 }

---

> +	else
> +		cpumask_copy(&p->cpus_mask, new_mask);
> +
> +	if (p->cpus_ptr == &p->cpus_mask)
> +		p->nr_cpus_allowed = cpumask_weight(p->cpus_ptr);
>  }
>
>  static void

[...]

> @@ -1891,9 +1979,14 @@ static int __set_cpus_allowed_ptr(struct
>       rq = task_rq_lock(p, &rf);
>       update_rq_clock(rq);
>
> -	if (p->flags & PF_KTHREAD) {
> +	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
>               /*
> -		 * Kernel threads are allowed on online && !active CPUs
> +		 * Kernel threads are allowed on online && !active CPUs.
> +		 *
> +		 * Specifically, migration_disabled() tasks must not fail the
> +		 * cpumask_and_and_distribute() pick below, esp. so on

s/and_and/and/

> +		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
> +		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
>                */
>               cpu_valid_mask = cpu_online_mask;
>       }

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-24 11:53   ` Valentin Schneider
@ 2020-09-24 12:29     ` Peter Zijlstra
  2020-09-24 12:33       ` Valentin Schneider
  2020-09-24 12:35     ` Peter Zijlstra
  1 sibling, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-24 12:29 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort

On Thu, Sep 24, 2020 at 12:53:25PM +0100, Valentin Schneider wrote:
> On 21/09/20 17:36, Peter Zijlstra wrote:
> 
> [...]
> 
> > +void migrate_enable(void)
> > +{
> > +	if (--current->migration_disabled)
> > +		return;
> > +
> > +	barrier();
> > +
> > +	if (p->cpus_ptr == &p->cpus_mask)
> > +		return;
> 
> If we get to here this means we're the migrate_enable() invocation that
> marks the end of the migration_disabled region. How can cpus_ptr already
> point back to current's cpus_mask?

It might never have been changed away.


	migrate_disable()
	  ->migration_disabled++;
	|	|
	|	|
	|	v
	|	migrate_disable_switch()
	|	  if (->cpus_ptr == &->cpus_mask)
	|	    __do_set_cpus_allowed(.new_mask = cpumask_of())
	|	|
	|	|
	v	v
	migrate_enable()
	  ->migration_disabled--;


Only if we've passed through a context switch between migrate_disable()
and migrate_enable() will the mask have been changed.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-24 12:29     ` Peter Zijlstra
@ 2020-09-24 12:33       ` Valentin Schneider
  0 siblings, 0 replies; 45+ messages in thread
From: Valentin Schneider @ 2020-09-24 12:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort


On 24/09/20 13:29, Peter Zijlstra wrote:
> On Thu, Sep 24, 2020 at 12:53:25PM +0100, Valentin Schneider wrote:
>> On 21/09/20 17:36, Peter Zijlstra wrote:
>>
>> [...]
>>
>> > +void migrate_enable(void)
>> > +{
>> > +	if (--current->migration_disabled)
>> > +		return;
>> > +
>> > +	barrier();
>> > +
>> > +	if (p->cpus_ptr == &p->cpus_mask)
>> > +		return;
>>
>> If we get to here this means we're the migrate_enable() invocation that
>> marks the end of the migration_disabled region. How can cpus_ptr already
>> point back to current's cpus_mask?
>
> It might never have been changed away.
>
>
>       migrate_disable()
>         ->migration_disabled++;
>       |	|
>       |	|
>       |	v
>       |	migrate_disable_switch()
>       |	  if (->cpus_ptr == &->cpus_mask)
>       |	    __do_set_cpus_allowed(.new_mask = cpumask_of())
>       |	|
>       |	|
>       v	v
>       migrate_enable()
>         ->migration_disabled--;
>
>
> Only if we've passed through a context switch between migrate_disable()
> and migrate_enable() will the mask have been changed.

Doh, yes indeed. Thanks.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-24 11:53   ` Valentin Schneider
  2020-09-24 12:29     ` Peter Zijlstra
@ 2020-09-24 12:35     ` Peter Zijlstra
  1 sibling, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-24 12:35 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort

On Thu, Sep 24, 2020 at 12:53:25PM +0100, Valentin Schneider wrote:
> > @@ -1830,8 +1892,19 @@ static int migration_cpu_stop(void *data
> >   */
> >  void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
> >  {
> > -	cpumask_copy(&p->cpus_mask, new_mask);
> > -	p->nr_cpus_allowed = cpumask_weight(new_mask);
> > +	if (flags & SCA_MIGRATE_DISABLE) {
> > +		p->cpus_ptr = new_mask;
> > +		p->nr_cpus_allowed = 1;
> > +		return;
> > +	}
> > +
> > +	if (flags & SCA_MIGRATE_ENABLE)
> > +		p->cpus_ptr = &p->cpus_mask;
> 
> There's that check in __set_cpus_allowed_ptr() that new_mask *must* be
> p->cpus_mask when SCA_MIGRATE_ENABLE; that means we could mayhaps shuffle
> that to:
> 
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7cb13df48366..e0e4e42c5e32 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1892,18 +1892,14 @@ static int migration_cpu_stop(void *data)
>   */
>  void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
>  {
> +	if (flags & (SCA_MIGRATE_DISABLE | SCA_MIGRATE_ENABLE))
>                 p->cpus_ptr = new_mask;
>         else
>                 cpumask_copy(&p->cpus_mask, new_mask);
> 
> +	if (flags & SCA_MIGRATE_DISABLE)
> +		p->nr_cpus_allowed = 1;
> +	else if (p->cpus_ptr == &p->cpus_mask)
>                 p->nr_cpus_allowed = cpumask_weight(p->cpus_ptr);
>  }

I'm thinking we should let nr_cpus_allowed reflect ->cpus_mask
unconditionally. See these emails:

  https://lkml.kernel.org/r/20200923170809.GY1362448@hirez.programming.kicks-ass.net
  https://lkml.kernel.org/r/20200924082717.GA1362448@hirez.programming.kicks-ass.net

Basically we want to keep migrate_disable() tasks eligible for
migration/PULL.

But yes, I think that function can be simplified.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-09-21 16:36 ` [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
@ 2020-09-24 19:59   ` Valentin Schneider
  2020-09-25  8:43     ` Peter Zijlstra
  2020-09-25  9:05     ` Peter Zijlstra
  0 siblings, 2 replies; 45+ messages in thread
From: Valentin Schneider @ 2020-09-24 19:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort


On 21/09/20 17:36, Peter Zijlstra wrote:
> +struct set_affinity_pending {
> +	refcount_t		refs;
> +	struct completion	done;
> +	struct cpu_stop_work	stop_work;
> +	struct migration_arg	arg;
> +};
> +
> +static int move_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p,
> +		     int dest_cpu, unsigned int flags)
> +{
> +	struct set_affinity_pending my_pending, *pending = NULL;
> +	struct migration_arg arg = {
> +		.task = p,
> +		.dest_cpu = dest_cpu,
> +	};
> +
> +	if (!(flags & SCA_MIGRATE_ENABLE)) {
> +		/* serialized by p->pi_lock */
> +		if (!p->migration_pending) {
> +			refcount_set(&my_pending.refs, 1);
> +			init_completion(&my_pending.done);
> +			p->migration_pending = &my_pending;
> +		} else {
> +			pending = p->migration_pending;
> +			refcount_inc(&pending->refs);
> +		}
> +	}
> +	pending = p->migration_pending;
> +	/*
> +	 * - !MIGRATE_ENABLE:
> +	 *   we'll have installed a pending if there wasn't one already.
> +	 *
> +	 * - MIGRATE_ENABLE:
> +	 *   we're here because the current CPU isn't matching anymore,
> +	 *   the only way that can happen is because of a concurrent
> +	 *   set_cpus_allowed_ptr() call, which should then still be
> +	 *   pending completion.
> +	 *
> +	 * Either way, we really should have a @pending here.
> +	 */
> +	if (WARN_ON_ONCE(!pending))
> +		return -EINVAL;
> +
> +	arg.done = &pending->done;
> +
> +	if (flags & SCA_MIGRATE_ENABLE) {
> +
> +		task_rq_unlock(rq, p, rf);
> +		pending->arg = arg;
> +		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
> +				    &pending->arg, &pending->stop_work);
> +
> +		return 0;
> +	}
> +
> +	if (task_running(rq, p) || p->state == TASK_WAKING) {
> +
> +		task_rq_unlock(rq, p, rf);
> +		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
> +

Shouldn't we check for is_migrate_disabled(p) before doing any of that?
migration_cpu_stop() does check for it, is there something that prevents us
from acting on it earlier than that?

> +	} else {
> +		bool complete = false;
> +
> +		if (!is_migration_disabled(p)) {
> +			if (task_on_rq_queued(p))
> +				rq = move_queued_task(rq, rf, p, dest_cpu);
> +
> +			p->migration_pending = NULL;
> +			complete = true;
> +		}
> +		task_rq_unlock(rq, p, rf);
> +
> +		if (complete)
> +			complete_all(&pending->done);
> +	}
> +
> +	wait_for_completion(&pending->done);
> +
> +	if (refcount_dec_and_test(&pending->refs))
> +		wake_up_var(&pending->refs);
> +
> +	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
> +
> +	return 0;
> +}
> +
>  /*
>   * Change a given task's CPU affinity. Migrate the thread to a
>   * proper CPU and schedule it away if the CPU it's executing on
> @@ -2025,19 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
>       if (cpumask_test_cpu(task_cpu(p), new_mask))
>               goto out;

I think this needs a cancellation of any potential pending migration
requests. Consider a task P0 running on CPU0:

   P0                     P1                               P2

   migrate_disable();
   <preempt>
                          set_cpus_allowed_ptr(P0, CPU1);
                          // waits for completion
                                                           set_cpus_allowed_ptr(P0, CPU0);
                                                           // Already good, no waiting for completion
   <resumes>
   migrate_enable();
   // task_cpu(p) allowed, no move_task()

AIUI in this scenario P1 would stay forever waiting. I *think* this can be
cured by making this function slightly more hideous:

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 01113e6f941f..829334f00f7b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2102,6 +2102,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                                  u32 flags)
 {
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
+	struct set_affinity_pending *pending;
+	bool cancel_pending = false;
        unsigned int dest_cpu;
        struct rq_flags rf;
        struct rq *rq;
@@ -2158,14 +2160,20 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        }

        /* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
+	if (cpumask_test_cpu(task_cpu(p), new_mask)) {
+		cancel_pending = true;
                goto out;
+	}

        return move_task(rq, &rf, p, dest_cpu, flags);

 out:
+	pending = p->migration_pending;
        task_rq_unlock(rq, p, &rf);

+	if (cancel_pending && pending)
+		complete_all(&pending->done);
+
        return ret;
 }

---

>
> -	if (task_running(rq, p) || p->state == TASK_WAKING) {
> -		struct migration_arg arg = { p, dest_cpu };
> -		/* Need help from migration thread: drop lock and wait. */
> -		task_rq_unlock(rq, p, &rf);
> -		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
> -		return 0;
> -	} else if (task_on_rq_queued(p)) {
> -		/*
> -		 * OK, since we're going to drop the lock immediately
> -		 * afterwards anyway.
> -		 */
> -		rq = move_queued_task(rq, &rf, p, dest_cpu);
> -	}
> +	return move_task(rq, &rf, p, dest_cpu, flags);
> +
>  out:
>       task_rq_unlock(rq, p, &rf);
>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-09-24 19:59   ` Valentin Schneider
@ 2020-09-25  8:43     ` Peter Zijlstra
  2020-09-25 10:07       ` Valentin Schneider
  2020-09-25  9:05     ` Peter Zijlstra
  1 sibling, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-25  8:43 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort

On Thu, Sep 24, 2020 at 08:59:33PM +0100, Valentin Schneider wrote:

> > +	if (task_running(rq, p) || p->state == TASK_WAKING) {
> > +
> > +		task_rq_unlock(rq, p, rf);
> > +		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
> > +
> 
> Shouldn't we check for is_migrate_disabled(p) before doing any of that?
> migration_cpu_stop() does check for it, is there something that prevents us
> from acting on it earlier than that?

Since migrate_disable() / ->migration_disabled is only touched from the
current task, you can only reliably read it from the same CPU.

Hence I only look at it when the stop task has pinned the task, because
at that point I know it's stable.

Doing it earlier gives races, races give me head-aches. This is a slow
path, I don't care about performance.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-09-24 19:59   ` Valentin Schneider
  2020-09-25  8:43     ` Peter Zijlstra
@ 2020-09-25  9:05     ` Peter Zijlstra
  2020-09-25  9:56       ` Peter Zijlstra
  1 sibling, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-25  9:05 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort

On Thu, Sep 24, 2020 at 08:59:33PM +0100, Valentin Schneider wrote:
> > @@ -2025,19 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
> >       if (cpumask_test_cpu(task_cpu(p), new_mask))
> >               goto out;
> 
> I think this needs a cancellation of any potential pending migration
> requests. Consider a task P0 running on CPU0:
> 
>    P0                     P1                               P2
> 
>    migrate_disable();
>    <preempt>
>                           set_cpus_allowed_ptr(P0, CPU1);
>                           // waits for completion
>                                                            set_cpus_allowed_ptr(P0, CPU0);
>                                                            // Already good, no waiting for completion
>    <resumes>
>    migrate_enable();
>    // task_cpu(p) allowed, no move_task()
> 
> AIUI in this scenario P1 would stay forever waiting.

Hurmph, looking at it, I think you're right. But I'm fairly sure I did
test that, maybe I just didn't run it long enough to hit the window ...

> I *think* this can be
> cured by making this function slightly more hideous:

It's a real beauty isn't it :-/

> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 01113e6f941f..829334f00f7b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2102,6 +2102,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
>                                   u32 flags)
>  {
>         const struct cpumask *cpu_valid_mask = cpu_active_mask;
> +	struct set_affinity_pending *pending;
> +	bool cancel_pending = false;
>         unsigned int dest_cpu;
>         struct rq_flags rf;
>         struct rq *rq;
> @@ -2158,14 +2160,20 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
>         }
> 
>         /* Can the task run on the task's current CPU? If so, we're done */
> -	if (cpumask_test_cpu(task_cpu(p), new_mask))
> +	if (cpumask_test_cpu(task_cpu(p), new_mask)) {
> +		cancel_pending = true;
>                 goto out;
> +	}
> 
>         return move_task(rq, &rf, p, dest_cpu, flags);
> 
>  out:
> +	pending = p->migration_pending;
>         task_rq_unlock(rq, p, &rf);
> 
> +	if (cancel_pending && pending)
> +		complete_all(&pending->done);
> +
>         return ret;
>  }

He who completes pending should also clear ->migration_pending,
otherwise the next caller will be able to observe a dangling pointer.

The other approach is trying to handle that last condition in
move_task(), but I'm quite sure that's going to be awful too :/



^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (8 preceding siblings ...)
  2020-09-21 16:36 ` [PATCH 9/9] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
@ 2020-09-25  9:12 ` Dietmar Eggemann
  2020-09-25 10:10   ` Peter Zijlstra
  2020-09-25 18:17 ` Sebastian Andrzej Siewior
  10 siblings, 1 reply; 45+ messages in thread
From: Dietmar Eggemann @ 2020-09-25  9:12 UTC (permalink / raw)
  To: Peter Zijlstra, tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On 21/09/2020 18:35, Peter Zijlstra wrote:
> Hi,
> 
> Here's my take on migrate_disable(). It avoids growing a second means of
> changing the affinity, documents how the things violates locking rules but
> still mostly works.
> 
> It also avoids blocking completely, so no more futex band-aids required.
> 
> Also, no more atomics/locks on the fast path.
> 
> I also put in a comment that explains how the whole concept is fundamentally
> flawed but a necessary evil to get PREEMPT_RT going -- for now.
> 
> Somewhat tested with PREEMPT_RT.
> 
> ---
>  include/linux/cpuhotplug.h    |    1 
>  include/linux/preempt.h       |   60 +++
>  include/linux/sched.h         |    4 
>  include/linux/sched/hotplug.h |    2 
>  include/linux/stop_machine.h  |    5 
>  kernel/cpu.c                  |    9 
>  kernel/sched/core.c           |  673 +++++++++++++++++++++++++++++++-----------
>  kernel/sched/deadline.c       |    5 
>  kernel/sched/sched.h          |   24 +
>  kernel/stop_machine.c         |   23 +
>  lib/dump_stack.c              |    2 
>  lib/smp_processor_id.c        |    5 
>  12 files changed, 635 insertions(+), 178 deletions(-)
> 

I get this when running 6 (periodic) RT50 tasks with CPU hp stress on my
6 CPU JUNO board (!CONFIG_PREEMPT_RT).

[   55.490263] ------------[ cut here ]------------
[   55.505261] Modules linked in:
[   55.508322] CPU: 3 PID: 24 Comm: migration/3 Not tainted
5.9.0-rc1-00132-gc096e6406c50-dirty #90
[   55.517119] Hardware name: ARM Juno development board (r0) (DT)
[   55.523058] Stopper: multi_cpu_stop+0x0/0x170 <- 0x0
[   55.528029] pstate: 20000085 (nzCv daIf -PAN -UAO BTYPE=--)
[   55.533612] pc : sched_cpu_dying+0x124/0x130
[   55.537887] lr : sched_cpu_dying+0xd8/0x130
[   55.542071] sp : ffff800011f0bca0
[   55.545385] x29: ffff800011f0bca0 x28: 0000000000000002
[   55.550703] x27: 0000000000000000 x26: 0000000000000060
[   55.556022] x25: 0000000000000000 x24: 0000000000000001
[   55.561340] x23: 0000000000000000 x22: 0000000000000003
[   55.566659] x21: 0000000000000080 x20: 0000000000000003
[   55.571977] x19: ffff00097ef9e1c0 x18: 0000000000000010
[   55.577295] x17: 0000000000000000 x16: 0000000000000000
[   55.582613] x15: 0000000000000000 x14: 000000000000015c
[   55.587932] x13: 0000000000000000 x12: 00000000000006f1
[   55.593250] x11: 0000000000000080 x10: 0000000000000000
[   55.598567] x9 : 0000000000000003 x8 : ffff0009743f5900
[   55.603886] x7 : 0000000000000003 x6 : 0000000000000000
[   55.609204] x5 : 0000000000000001 x4 : 0000000000000002
[   55.614521] x3 : 0000000000000000 x2 : 0000000000000013
[   55.619839] x1 : 0000000000000008 x0 : 0000000000000003
[   55.625158] Call trace:
[   55.627607]  sched_cpu_dying+0x124/0x130
[   55.631535]  cpuhp_invoke_callback+0x88/0x210
[   55.635897]  take_cpu_down+0x7c/0xd8
[   55.639475]  multi_cpu_stop+0xac/0x170
[   55.643227]  cpu_stopper_thread+0x98/0x130
[   55.647327]  smpboot_thread_fn+0x1c4/0x280
[   55.651427]  kthread+0x140/0x160
[   55.654658]  ret_from_fork+0x10/0x34
[   55.658239] Code: f000e1c1 913fc021 1400034a 17ffffde (d4210000)
[   55.664342] ---[ end trace c5b8988b7b701e56 ]---
[   55.668963] note: migration/3[24] exited with preempt_count 3

7309 int sched_cpu_dying(unsigned int cpu)
    ...
    BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
    ...

rq->nr_running is always 2 here in this cases.

balance_hotplug_wait and sched_cpu_wait_empty run in cpuhp/X (CFS)
whereas sched_cpu_dying in migration/X ?

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-09-25  9:05     ` Peter Zijlstra
@ 2020-09-25  9:56       ` Peter Zijlstra
  2020-09-25 10:09         ` Valentin Schneider
  0 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-25  9:56 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort

On Fri, Sep 25, 2020 at 11:05:28AM +0200, Peter Zijlstra wrote:
> On Thu, Sep 24, 2020 at 08:59:33PM +0100, Valentin Schneider wrote:
> > > @@ -2025,19 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
> > >       if (cpumask_test_cpu(task_cpu(p), new_mask))
> > >               goto out;
> > 
> > I think this needs a cancellation of any potential pending migration
> > requests. Consider a task P0 running on CPU0:
> > 
> >    P0                     P1                               P2
> > 
> >    migrate_disable();
> >    <preempt>
> >                           set_cpus_allowed_ptr(P0, CPU1);
> >                           // waits for completion
> >                                                            set_cpus_allowed_ptr(P0, CPU0);
> >                                                            // Already good, no waiting for completion
> >    <resumes>
> >    migrate_enable();
> >    // task_cpu(p) allowed, no move_task()
> > 
> > AIUI in this scenario P1 would stay forever waiting.
> 

> The other approach is trying to handle that last condition in
> move_task(), but I'm quite sure that's going to be awful too :/

Something like so perhaps?

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2039,6 +2039,10 @@ static int move_task(struct rq *rq, stru
 	if (WARN_ON_ONCE(!pending))
 		return -EINVAL;
 
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
+		goto easy;
+
 	arg.done = &pending->done;
 
 	if (flags & SCA_MIGRATE_ENABLE) {
@@ -2063,6 +2067,7 @@ static int move_task(struct rq *rq, stru
 			if (task_on_rq_queued(p))
 				rq = move_queued_task(rq, rf, p, dest_cpu);
 
+easy:
 			p->migration_pending = NULL;
 			complete = true;
 		}
@@ -2151,10 +2156,6 @@ static int __set_cpus_allowed_ptr(struct
 			p->nr_cpus_allowed != 1);
 	}
 
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
-
 	return move_task(rq, &rf, p, dest_cpu, flags);
 
 out:

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-09-25  8:43     ` Peter Zijlstra
@ 2020-09-25 10:07       ` Valentin Schneider
  0 siblings, 0 replies; 45+ messages in thread
From: Valentin Schneider @ 2020-09-25 10:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort


On 25/09/20 09:43, Peter Zijlstra wrote:
> On Thu, Sep 24, 2020 at 08:59:33PM +0100, Valentin Schneider wrote:
>
>> > +	if (task_running(rq, p) || p->state == TASK_WAKING) {
>> > +
>> > +		task_rq_unlock(rq, p, rf);
>> > +		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
>> > +
>>
>> Shouldn't we check for is_migrate_disabled(p) before doing any of that?
>> migration_cpu_stop() does check for it, is there something that prevents us
>> from acting on it earlier than that?
>
> Since migrate_disable() / ->migration_disabled is only touched from the
> current task, you can only reliably read it from the same CPU.
>
> Hence I only look at it when the stop task has pinned the task, because
> at that point I know it's stable.
>
> Doing it earlier gives races, races give me head-aches. This is a slow
> path, I don't care about performance.

Makes sense, thanks.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
  2020-09-25  9:56       ` Peter Zijlstra
@ 2020-09-25 10:09         ` Valentin Schneider
  0 siblings, 0 replies; 45+ messages in thread
From: Valentin Schneider @ 2020-09-25 10:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort


On 25/09/20 10:56, Peter Zijlstra wrote:
> On Fri, Sep 25, 2020 at 11:05:28AM +0200, Peter Zijlstra wrote:
>> On Thu, Sep 24, 2020 at 08:59:33PM +0100, Valentin Schneider wrote:
>> > > @@ -2025,19 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
>> > >       if (cpumask_test_cpu(task_cpu(p), new_mask))
>> > >               goto out;
>> >
>> > I think this needs a cancellation of any potential pending migration
>> > requests. Consider a task P0 running on CPU0:
>> >
>> >    P0                     P1                               P2
>> >
>> >    migrate_disable();
>> >    <preempt>
>> >                           set_cpus_allowed_ptr(P0, CPU1);
>> >                           // waits for completion
>> >                                                            set_cpus_allowed_ptr(P0, CPU0);
>> >                                                            // Already good, no waiting for completion
>> >    <resumes>
>> >    migrate_enable();
>> >    // task_cpu(p) allowed, no move_task()
>> >
>> > AIUI in this scenario P1 would stay forever waiting.
>>
>
>> The other approach is trying to handle that last condition in
>> move_task(), but I'm quite sure that's going to be aweful too :/
>
> Something like so perhaps?
>

That looks somewhat sane (not pretty, but we're past that :-)). I was
trying something similar in __set_cpus_allowed_ptr() itself, but this
condenses the completion / refcount logic in one place, so fewer
headaches.

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2039,6 +2039,10 @@ static int move_task(struct rq *rq, stru
>       if (WARN_ON_ONCE(!pending))
>               return -EINVAL;
>
> +	/* Can the task run on the task's current CPU? If so, we're done */
> +	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
> +		goto easy;
> +
>       arg.done = &pending->done;
>
>       if (flags & SCA_MIGRATE_ENABLE) {
> @@ -2063,6 +2067,7 @@ static int move_task(struct rq *rq, stru
>                       if (task_on_rq_queued(p))
>                               rq = move_queued_task(rq, rf, p, dest_cpu);
>
> +easy:
>                       p->migration_pending = NULL;
>                       complete = true;
>               }
> @@ -2151,10 +2156,6 @@ static int __set_cpus_allowed_ptr(struct
>                       p->nr_cpus_allowed != 1);
>       }
>
> -	/* Can the task run on the task's current CPU? If so, we're done */
> -	if (cpumask_test_cpu(task_cpu(p), new_mask))
> -		goto out;
> -
>       return move_task(rq, &rf, p, dest_cpu, flags);
>
>  out:

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-25  9:12 ` [PATCH 0/9] sched: Migrate disable support Dietmar Eggemann
@ 2020-09-25 10:10   ` Peter Zijlstra
  2020-09-25 11:58     ` Dietmar Eggemann
  0 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-09-25 10:10 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort

On Fri, Sep 25, 2020 at 11:12:09AM +0200, Dietmar Eggemann wrote:

> I get this when running 6 (periodic) RT50 tasks with CPU hp stress on my
> 6 CPU JUNO board (!CONFIG_PREEMPT_RT).
> 
> [   55.490263] ------------[ cut here ]------------
> [   55.505261] Modules linked in:
> [   55.508322] CPU: 3 PID: 24 Comm: migration/3 Not tainted
> 5.9.0-rc1-00132-gc096e6406c50-dirty #90
> [   55.517119] Hardware name: ARM Juno development board (r0) (DT)
> [   55.523058] Stopper: multi_cpu_stop+0x0/0x170 <- 0x0
> [   55.528029] pstate: 20000085 (nzCv daIf -PAN -UAO BTYPE=--)
> [   55.533612] pc : sched_cpu_dying+0x124/0x130
> [   55.537887] lr : sched_cpu_dying+0xd8/0x130
> [   55.542071] sp : ffff800011f0bca0
> [   55.545385] x29: ffff800011f0bca0 x28: 0000000000000002
> [   55.550703] x27: 0000000000000000 x26: 0000000000000060
> [   55.556022] x25: 0000000000000000 x24: 0000000000000001
> [   55.561340] x23: 0000000000000000 x22: 0000000000000003
> [   55.566659] x21: 0000000000000080 x20: 0000000000000003
> [   55.571977] x19: ffff00097ef9e1c0 x18: 0000000000000010
> [   55.577295] x17: 0000000000000000 x16: 0000000000000000
> [   55.582613] x15: 0000000000000000 x14: 000000000000015c
> [   55.587932] x13: 0000000000000000 x12: 00000000000006f1
> [   55.593250] x11: 0000000000000080 x10: 0000000000000000
> [   55.598567] x9 : 0000000000000003 x8 : ffff0009743f5900
> [   55.603886] x7 : 0000000000000003 x6 : 0000000000000000
> [   55.609204] x5 : 0000000000000001 x4 : 0000000000000002
> [   55.614521] x3 : 0000000000000000 x2 : 0000000000000013
> [   55.619839] x1 : 0000000000000008 x0 : 0000000000000003
> [   55.625158] Call trace:
> [   55.627607]  sched_cpu_dying+0x124/0x130
> [   55.631535]  cpuhp_invoke_callback+0x88/0x210
> [   55.635897]  take_cpu_down+0x7c/0xd8
> [   55.639475]  multi_cpu_stop+0xac/0x170
> [   55.643227]  cpu_stopper_thread+0x98/0x130
> [   55.647327]  smpboot_thread_fn+0x1c4/0x280
> [   55.651427]  kthread+0x140/0x160
> [   55.654658]  ret_from_fork+0x10/0x34
> [   55.658239] Code: f000e1c1 913fc021 1400034a 17ffffde (d4210000)
> [   55.664342] ---[ end trace c5b8988b7b701e56 ]---
> [   55.668963] note: migration/3[24] exited with preempt_count 3
> 
> 7309 int sched_cpu_dying(unsigned int cpu)
>     ...
>     BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
>     ...
> 
> rq->nr_running is always 2 here in this cases.
> 
> balance_hotplug_wait and sched_cpu_wait_empty run in cpuhp/X (CFS)
> whereas sched_cpu_dying in migration/X ?

takedown_cpu() has:

  kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);

before calling:

  err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));

So when we get to sched_cpu_dying(), the only task that _should_ still
be there is migration/X.

Do you have any idea what thread, other than migration/X, is still
active on that CPU? per sched_cpu_wait_empty() we should've pushed out
all userspace tasks, and the cpu hotplug machinery should've put all the
per-cpu kthreads to sleep at this point.


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-25 10:10   ` Peter Zijlstra
@ 2020-09-25 11:58     ` Dietmar Eggemann
  2020-09-25 12:19       ` Valentin Schneider
  0 siblings, 1 reply; 45+ messages in thread
From: Dietmar Eggemann @ 2020-09-25 11:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort

On 25/09/2020 12:10, Peter Zijlstra wrote:
> On Fri, Sep 25, 2020 at 11:12:09AM +0200, Dietmar Eggemann wrote:
> 
>> I get this when running 6 (periodic) RT50 tasks with CPU hp stress on my
>> 6 CPU JUNO board (!CONFIG_PREEMPT_RT).
>>
>> [   55.490263] ------------[ cut here ]------------
>> [   55.505261] Modules linked in:
>> [   55.508322] CPU: 3 PID: 24 Comm: migration/3 Not tainted
>> 5.9.0-rc1-00132-gc096e6406c50-dirty #90
>> [   55.517119] Hardware name: ARM Juno development board (r0) (DT)
>> [   55.523058] Stopper: multi_cpu_stop+0x0/0x170 <- 0x0
>> [   55.528029] pstate: 20000085 (nzCv daIf -PAN -UAO BTYPE=--)
>> [   55.533612] pc : sched_cpu_dying+0x124/0x130
>> [   55.537887] lr : sched_cpu_dying+0xd8/0x130
>> [   55.542071] sp : ffff800011f0bca0
>> [   55.545385] x29: ffff800011f0bca0 x28: 0000000000000002
>> [   55.550703] x27: 0000000000000000 x26: 0000000000000060
>> [   55.556022] x25: 0000000000000000 x24: 0000000000000001
>> [   55.561340] x23: 0000000000000000 x22: 0000000000000003
>> [   55.566659] x21: 0000000000000080 x20: 0000000000000003
>> [   55.571977] x19: ffff00097ef9e1c0 x18: 0000000000000010
>> [   55.577295] x17: 0000000000000000 x16: 0000000000000000
>> [   55.582613] x15: 0000000000000000 x14: 000000000000015c
>> [   55.587932] x13: 0000000000000000 x12: 00000000000006f1
>> [   55.593250] x11: 0000000000000080 x10: 0000000000000000
>> [   55.598567] x9 : 0000000000000003 x8 : ffff0009743f5900
>> [   55.603886] x7 : 0000000000000003 x6 : 0000000000000000
>> [   55.609204] x5 : 0000000000000001 x4 : 0000000000000002
>> [   55.614521] x3 : 0000000000000000 x2 : 0000000000000013
>> [   55.619839] x1 : 0000000000000008 x0 : 0000000000000003
>> [   55.625158] Call trace:
>> [   55.627607]  sched_cpu_dying+0x124/0x130
>> [   55.631535]  cpuhp_invoke_callback+0x88/0x210
>> [   55.635897]  take_cpu_down+0x7c/0xd8
>> [   55.639475]  multi_cpu_stop+0xac/0x170
>> [   55.643227]  cpu_stopper_thread+0x98/0x130
>> [   55.647327]  smpboot_thread_fn+0x1c4/0x280
>> [   55.651427]  kthread+0x140/0x160
>> [   55.654658]  ret_from_fork+0x10/0x34
>> [   55.658239] Code: f000e1c1 913fc021 1400034a 17ffffde (d4210000)
>> [   55.664342] ---[ end trace c5b8988b7b701e56 ]---
>> [   55.668963] note: migration/3[24] exited with preempt_count 3
>>
>> 7309 int sched_cpu_dying(unsigned int cpu)
>>     ...
>>     BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
>>     ...
>>
>> rq->nr_running is always 2 here in this cases.
>>
>> balance_hotplug_wait and sched_cpu_wait_empty run in cpuhp/X (CFS)
>> whereas sched_cpu_dying in migration/X ?
> 
> takedown_cpu() has:
> 
>   kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
> 
> before calling:
> 
>   err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
> 
> So when we get to sched_cpu_dying(), the only task that _should_ still
> be there is migration/X.
> 
> Do you have any idea what thread, other than migration/X, is still
> active on that CPU? per sched_cpu_wait_empty() we should've pushed out
> all userspace tasks, and the cpu hotplug machinery should've put all the
> per-cpu kthreads to sleep at this point.

With Valentin's print_rq() inspired test snippet I always see one of the
RT user tasks as the second guy? BTW, it has to be RT tasks, never
triggered with CFS tasks.

[   57.849268] CPU2 nr_running=2
[   57.852241]  p=migration/2
[   57.854967]  p=task0-0

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-25 11:58     ` Dietmar Eggemann
@ 2020-09-25 12:19       ` Valentin Schneider
  2020-09-25 17:49         ` Valentin Schneider
  0 siblings, 1 reply; 45+ messages in thread
From: Valentin Schneider @ 2020-09-25 12:19 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, rostedt, bsegall, mgorman,
	bristot, vincent.donnefort


On 25/09/20 12:58, Dietmar Eggemann wrote:
> On 25/09/2020 12:10, Peter Zijlstra wrote:
>> On Fri, Sep 25, 2020 at 11:12:09AM +0200, Dietmar Eggemann wrote:
>>
>>> I get this when running 6 (periodic) RT50 tasks with CPU hp stress on my
>>> 6 CPU JUNO board (!CONFIG_PREEMPT_RT).
>>>
>>> [   55.490263] ------------[ cut here ]------------
>>> [   55.505261] Modules linked in:
>>> [   55.508322] CPU: 3 PID: 24 Comm: migration/3 Not tainted
>>> 5.9.0-rc1-00132-gc096e6406c50-dirty #90
>>> [   55.517119] Hardware name: ARM Juno development board (r0) (DT)
>>> [   55.523058] Stopper: multi_cpu_stop+0x0/0x170 <- 0x0
>>> [   55.528029] pstate: 20000085 (nzCv daIf -PAN -UAO BTYPE=--)
>>> [   55.533612] pc : sched_cpu_dying+0x124/0x130
>>> [   55.537887] lr : sched_cpu_dying+0xd8/0x130
>>> [   55.542071] sp : ffff800011f0bca0
>>> [   55.545385] x29: ffff800011f0bca0 x28: 0000000000000002
>>> [   55.550703] x27: 0000000000000000 x26: 0000000000000060
>>> [   55.556022] x25: 0000000000000000 x24: 0000000000000001
>>> [   55.561340] x23: 0000000000000000 x22: 0000000000000003
>>> [   55.566659] x21: 0000000000000080 x20: 0000000000000003
>>> [   55.571977] x19: ffff00097ef9e1c0 x18: 0000000000000010
>>> [   55.577295] x17: 0000000000000000 x16: 0000000000000000
>>> [   55.582613] x15: 0000000000000000 x14: 000000000000015c
>>> [   55.587932] x13: 0000000000000000 x12: 00000000000006f1
>>> [   55.593250] x11: 0000000000000080 x10: 0000000000000000
>>> [   55.598567] x9 : 0000000000000003 x8 : ffff0009743f5900
>>> [   55.603886] x7 : 0000000000000003 x6 : 0000000000000000
>>> [   55.609204] x5 : 0000000000000001 x4 : 0000000000000002
>>> [   55.614521] x3 : 0000000000000000 x2 : 0000000000000013
>>> [   55.619839] x1 : 0000000000000008 x0 : 0000000000000003
>>> [   55.625158] Call trace:
>>> [   55.627607]  sched_cpu_dying+0x124/0x130
>>> [   55.631535]  cpuhp_invoke_callback+0x88/0x210
>>> [   55.635897]  take_cpu_down+0x7c/0xd8
>>> [   55.639475]  multi_cpu_stop+0xac/0x170
>>> [   55.643227]  cpu_stopper_thread+0x98/0x130
>>> [   55.647327]  smpboot_thread_fn+0x1c4/0x280
>>> [   55.651427]  kthread+0x140/0x160
>>> [   55.654658]  ret_from_fork+0x10/0x34
>>> [   55.658239] Code: f000e1c1 913fc021 1400034a 17ffffde (d4210000)
>>> [   55.664342] ---[ end trace c5b8988b7b701e56 ]---
>>> [   55.668963] note: migration/3[24] exited with preempt_count 3
>>>
>>> 7309 int sched_cpu_dying(unsigned int cpu)
>>>     ...
>>>     BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
>>>     ...
>>>
>>> rq->nr_running is always 2 here in this cases.
>>>
>>> balance_hotplug_wait and sched_cpu_wait_empty run in cpuhp/X (CFS)
>>> whereas sched_cpu_dying in migration/X ?
>>
>> takedown_cpu() has:
>>
>>   kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
>>
>> before calling:
>>
>>   err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
>>
>> So when we get to sched_cpu_dying(), the only task that _should_ still
>> be there is migration/X.
>>
>> Do you have any idea what thread, other than migration/X, is still
>> active on that CPU? per sched_cpu_wait_empty() we should've pushed out
>> all userspace tasks, and the cpu hotplug machinery should've put all the
>> per-cpu kthreads to sleep at this point.
>
> With Valentin's print_rq() inspired test snippet I always see one of the
> RT user tasks as the second guy? BTW, it has to be RT tasks, never
> triggered with CFS tasks.
>
> [   57.849268] CPU2 nr_running=2
> [   57.852241]  p=migration/2
> [   57.854967]  p=task0-0

I can also trigger the BUG_ON() using the built-in locktorture module
(+enabling hotplug torture), and it happens very early on. I can't trigger
it under qemu sadly :/ Also, in my case it's always a kworker:

[    0.830462] CPU3 nr_running=2
[    0.833443]  p=migration/3
[    0.836150]  p=kworker/3:0

I'm looking into what workqueue.c is doing about hotplug...

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/9] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-21 16:36 ` [PATCH 3/9] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
@ 2020-09-25 16:38   ` Dietmar Eggemann
  2020-10-02 14:20     ` Peter Zijlstra
  0 siblings, 1 reply; 45+ messages in thread
From: Dietmar Eggemann @ 2020-09-25 16:38 UTC (permalink / raw)
  To: Peter Zijlstra, tglx, mingo
  Cc: linux-kernel, bigeasy, qais.yousef, swood, valentin.schneider,
	juri.lelli, vincent.guittot, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On 21/09/2020 18:36, Peter Zijlstra wrote:

[...]

> This replaces the unlikely(rq->balance_callbacks) test at the tail of
> context_switch with an unlikely(rq->balance_work), the fast path is

While looking for why BALANCE_WORK is needed:

Shouldn't this be unlikely(rq->balance_callback) and
unlikely(rq->balance_flags)?

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-21 16:36 ` [PATCH 7/9] sched: Add migrate_disable() Peter Zijlstra
  2020-09-21 19:16   ` Thomas Gleixner
  2020-09-24 11:53   ` Valentin Schneider
@ 2020-09-25 16:50   ` Sebastian Andrzej Siewior
  2020-10-02 14:21     ` Peter Zijlstra
  2 siblings, 1 reply; 45+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-09-25 16:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On 2020-09-21 18:36:04 [+0200], Peter Zijlstra wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1700,6 +1700,68 @@ void check_preempt_curr(struct rq *rq, s
>  
>  #ifdef CONFIG_SMP
>  
> +#ifdef CONFIG_PREEMPT_RT
> +static inline bool is_migration_disabled(struct task_struct *p)
> +{
> +	return p->migration_disabled;
> +}

Just a thought: having this with int as return type and defined in a
header file then it could be used in check_preemption_disabled() and in
the tracing output.

> +#else
> +
> +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
> +
> +static inline bool is_migration_disabled(struct task_struct *p)
> +{
> +	return false;
> +}
> +
> +#endif

Sebastian

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-25 12:19       ` Valentin Schneider
@ 2020-09-25 17:49         ` Valentin Schneider
  2020-09-29  9:15           ` Dietmar Eggemann
  0 siblings, 1 reply; 45+ messages in thread
From: Valentin Schneider @ 2020-09-25 17:49 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, rostedt, bsegall, mgorman,
	bristot, vincent.donnefort


On 25/09/20 13:19, Valentin Schneider wrote:
> On 25/09/20 12:58, Dietmar Eggemann wrote:
>> With Valentin's print_rq() inspired test snippet I always see one of the
>> RT user tasks as the second guy? BTW, it has to be RT tasks, never
>> triggered with CFS tasks.
>>
>> [   57.849268] CPU2 nr_running=2
>> [   57.852241]  p=migration/2
>> [   57.854967]  p=task0-0
>
> I can also trigger the BUG_ON() using the built-in locktorture module
> (+enabling hotplug torture), and it happens very early on. I can't trigger
> it under qemu sadly :/ Also, in my case it's always a kworker:
>
> [    0.830462] CPU3 nr_running=2
> [    0.833443]  p=migration/3
> [    0.836150]  p=kworker/3:0
>
> I'm looking into what workqueue.c is doing about hotplug...

So with
- The pending migration fixup (20200925095615.GA2651@hirez.programming.kicks-ass.net)
- The workqueue set_cpus_allowed_ptr() change (from IRC)
- The set_rq_offline() move + DL/RT pull && rq->online (also from IRC)

my Juno survives rtmutex + hotplug locktorture, where it would previously
explode < 1s after boot (mostly due to the workqueue thing).

I stared a bit more at the rq_offline() + DL/RT bits and they look fine to
me.

The one thing I'm not entirely sure about is while you plugged the
class->balance() hole, AIUI we might still get RT (DL?) pull callbacks
enqueued - say if we just unthrottled an RT RQ and something changes the
priority of one of the freshly-released tasks (user or rtmutex
interaction), I don't see any stopgap preventing a pull from happening.

I slapped the following on top of my kernel and it didn't die, although I'm
not sure I'm correctly stressing this path. Perhaps we could limit that to
the pull paths, since technically we're okay with pushing out of an !online
RQ.

---
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 50aac5b6db26..00d1a7b85e97 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1403,7 +1403,7 @@ queue_balance_callback(struct rq *rq,
 {
        lockdep_assert_held(&rq->lock);

-	if (unlikely(head->next))
+	if (unlikely(head->next || !rq->online))
                return;

        head->func = (void (*)(struct callback_head *))func;
---

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
                   ` (9 preceding siblings ...)
  2020-09-25  9:12 ` [PATCH 0/9] sched: Migrate disable support Dietmar Eggemann
@ 2020-09-25 18:17 ` Sebastian Andrzej Siewior
  2020-09-25 19:32   ` Valentin Schneider
  10 siblings, 1 reply; 45+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-09-25 18:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On 2020-09-21 18:35:57 [+0200], Peter Zijlstra wrote:
> Hi,
Hi,

> Here's my take on migrate_disable(). It avoids growing a second means of

I have here:

|005: numa_remove_cpu cpu 5 node 0: mask now 0,3-4,6-7
|007: smpboot: CPU 5 is now offline
|006: ------------[ cut here ]------------
|006: rq->balance_callback
|006: WARNING: CPU: 6 PID: 8392 at kernel/sched/sched.h:1234 try_to_wake_up+0x696/0x860
|006: Modules linked in:
|006:
|006: CPU: 6 PID: 8392 Comm: hackbench Not tainted 5.9.0-rc6-rt9+ #60
|006: Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
|006: RIP: 0010:try_to_wake_up+0x696/0x860
|006: Code: c0 01 00 00 01 e9 d9 fb ff ff 80 3d 90 ef 6d 01 00 0f 85 6c fb ff ff 48 c7 c7 d4 4a 2c 82 c6 05 7c ef 6d 01 01 e8 dd 21 fc ff <0f> 0b e9 52 fb ff ff 0f 0b e9 b2
|006: RSP: 0018:ffffc90005b978f8 EFLAGS: 00010082
|006:
|006: RAX: 0000000000000000 RBX: ffff8882755cca40 RCX: 0000000000000000
|006: RDX: ffffffff8247aab8 RSI: 00000000ffffffff RDI: 00000000ffffffff
|006: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffffff8247a9a0
|006: R10: ffffc90005b97838 R11: 332e39313320205b R12: ffff888276da8600
|006: R13: 0000000000000093 R14: ffff8882755cd7a0 R15: ffff888276da8618
|006: FS:  00007f6fa7805740(0000) GS:ffff888276d80000(0000) knlGS:0000000000000000
|006: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
|006: CR2: 00007f6fa796af90 CR3: 0000000262588000 CR4: 00000000003506e0
|006: Call Trace:
|006:  ? cpu_stop_queue_work+0x8e/0x150
|006:  __wake_up_q+0x96/0xc0
|006:  cpu_stop_queue_work+0x9a/0x150
|006:  finish_task_switch.isra.0+0x2f1/0x460
|006:  __schedule+0x3bd/0xb20
|006:  schedule+0x4a/0x100
|006:  schedule_hrtimeout_range_clock+0x14f/0x160
|006:  ? rt_spin_unlock+0x39/0x90
|006:  ? rt_mutex_futex_unlock+0xcb/0xe0
|006:  poll_schedule_timeout.constprop.0+0x4d/0x90
|006:  do_sys_poll+0x314/0x430
|006:  ? __lock_acquire+0x39b/0x2010
|006:  ? poll_schedule_timeout.constprop.0+0x90/0x90
|006:  ? mark_held_locks+0x49/0x70
|006:  ? find_held_lock+0x2b/0x80
|006:  ? rt_spin_unlock+0x39/0x90
|006:  ? rt_mutex_futex_unlock+0xcb/0xe0
|006:  ? rt_spin_unlock+0x51/0x90
|006:  ? handle_mm_fault+0xfbd/0x1510
|006:  ? find_held_lock+0x2b/0x80
|006:  ? do_user_addr_fault+0x214/0x420
|006:  __x64_sys_poll+0x37/0x130
|006:  do_syscall_64+0x33/0x40
|006:  entry_SYSCALL_64_after_hwframe+0x44/0xa9
|006: RIP: 0033:0x7f6fa78fb483

Is this somewhere among the fixes Valentin received?
This SCHED_WARN_ON(rq->balance_callback); in rq_pin_lock().

Sebastian

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-25 18:17 ` Sebastian Andrzej Siewior
@ 2020-09-25 19:32   ` Valentin Schneider
  2020-10-02 14:30     ` Peter Zijlstra
  0 siblings, 1 reply; 45+ messages in thread
From: Valentin Schneider @ 2020-09-25 19:32 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, qais.yousef, swood,
	juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vincent.donnefort


On 25/09/20 19:17, Sebastian Andrzej Siewior wrote:
> On 2020-09-21 18:35:57 [+0200], Peter Zijlstra wrote:
>> Hi,
> Hi,
>
>> Here's my take on migrate_disable(). It avoids growing a second means of
>
> I have here:
>
> |005: numa_remove_cpu cpu 5 node 0: mask now 0,3-4,6-7
> |007: smpboot: CPU 5 is now offline
> |006: ------------[ cut here ]------------
> |006: rq->balance_callback
> |006: WARNING: CPU: 6 PID: 8392 at kernel/sched/sched.h:1234 try_to_wake_up+0x696/0x860
> |006: Modules linked in:
> |006:
> |006: CPU: 6 PID: 8392 Comm: hackbench Not tainted 5.9.0-rc6-rt9+ #60
> |006: Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
> |006: RIP: 0010:try_to_wake_up+0x696/0x860
> |006: Code: c0 01 00 00 01 e9 d9 fb ff ff 80 3d 90 ef 6d 01 00 0f 85 6c fb ff ff 48 c7 c7 d4 4a 2c 82 c6 05 7c ef 6d 01 01 e8 dd 21 fc ff <0f> 0b e9 52 fb ff ff 0f 0b e9 b2
> |006: RSP: 0018:ffffc90005b978f8 EFLAGS: 00010082
> |006:
> |006: RAX: 0000000000000000 RBX: ffff8882755cca40 RCX: 0000000000000000
> |006: RDX: ffffffff8247aab8 RSI: 00000000ffffffff RDI: 00000000ffffffff
> |006: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffffff8247a9a0
> |006: R10: ffffc90005b97838 R11: 332e39313320205b R12: ffff888276da8600
> |006: R13: 0000000000000093 R14: ffff8882755cd7a0 R15: ffff888276da8618
> |006: FS:  00007f6fa7805740(0000) GS:ffff888276d80000(0000) knlGS:0000000000000000
> |006: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> |006: CR2: 00007f6fa796af90 CR3: 0000000262588000 CR4: 00000000003506e0
> |006: Call Trace:
> |006:  ? cpu_stop_queue_work+0x8e/0x150
> |006:  __wake_up_q+0x96/0xc0
> |006:  cpu_stop_queue_work+0x9a/0x150
> |006:  finish_task_switch.isra.0+0x2f1/0x460
> |006:  __schedule+0x3bd/0xb20
> |006:  schedule+0x4a/0x100
> |006:  schedule_hrtimeout_range_clock+0x14f/0x160
> |006:  ? rt_spin_unlock+0x39/0x90
> |006:  ? rt_mutex_futex_unlock+0xcb/0xe0
> |006:  poll_schedule_timeout.constprop.0+0x4d/0x90
> |006:  do_sys_poll+0x314/0x430
> |006:  ? __lock_acquire+0x39b/0x2010
> |006:  ? poll_schedule_timeout.constprop.0+0x90/0x90
> |006:  ? mark_held_locks+0x49/0x70
> |006:  ? find_held_lock+0x2b/0x80
> |006:  ? rt_spin_unlock+0x39/0x90
> |006:  ? rt_mutex_futex_unlock+0xcb/0xe0
> |006:  ? rt_spin_unlock+0x51/0x90
> |006:  ? handle_mm_fault+0xfbd/0x1510
> |006:  ? find_held_lock+0x2b/0x80
> |006:  ? do_user_addr_fault+0x214/0x420
> |006:  __x64_sys_poll+0x37/0x130
> |006:  do_syscall_64+0x33/0x40
> |006:  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> |006: RIP: 0033:0x7f6fa78fb483
>
> Is this somewhere among the fixes Valentin received?
> This SCHED_WARN_ON(rq->balance_callback); in rq_pin_lock().
>

The IRC handout so far is:
https://paste.debian.net/1164646/
https://paste.debian.net/1164656/

As for your splat, I think this is what I was worrying about wrt
suppressing callbacks in the switch but not preventing them from being
queued. Perhaps the below is "better" than what I previously sent.

Technically should be doable with a cpu_active() check instead given this
all gets flipped in sched_cpu_deactivate(), but at least this makes it
obvious that PUSH suppresses any other callback.

---
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 50aac5b6db26..40d78a20fbcb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1403,7 +1403,7 @@ queue_balance_callback(struct rq *rq,
 {
        lockdep_assert_held(&rq->lock);

-	if (unlikely(head->next))
+	if (unlikely(head->next) || (rq->balance_flags & BALANCE_PUSH))
                return;

        head->func = (void (*)(struct callback_head *))func;
---

> Sebastian

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-25 17:49         ` Valentin Schneider
@ 2020-09-29  9:15           ` Dietmar Eggemann
  0 siblings, 0 replies; 45+ messages in thread
From: Dietmar Eggemann @ 2020-09-29  9:15 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Peter Zijlstra, tglx, mingo, linux-kernel, bigeasy, qais.yousef,
	swood, juri.lelli, vincent.guittot, rostedt, bsegall, mgorman,
	bristot, vincent.donnefort

On 25/09/2020 19:49, Valentin Schneider wrote:
> 
> On 25/09/20 13:19, Valentin Schneider wrote:
>> On 25/09/20 12:58, Dietmar Eggemann wrote:
>>> With Valentin's print_rq() inspired test snippet I always see one of the
>>> RT user tasks as the second guy? BTW, it has to be RT tasks, never
>>> triggered with CFS tasks.
>>>
>>> [   57.849268] CPU2 nr_running=2
>>> [   57.852241]  p=migration/2
>>> [   57.854967]  p=task0-0
>>
>> I can also trigger the BUG_ON() using the built-in locktorture module
>> (+enabling hotplug torture), and it happens very early on. I can't trigger
>> it under qemu sadly :/ Also, in my case it's always a kworker:
>>
>> [    0.830462] CPU3 nr_running=2
>> [    0.833443]  p=migration/3
>> [    0.836150]  p=kworker/3:0
>>
>> I'm looking into what workqueue.c is doing about hotplug...
> 
> So with
> - The pending migration fixup (20200925095615.GA2651@hirez.programming.kicks-ass.net)
> - The workqueue set_cpus_allowed_ptr() change (from IRC)
> - The set_rq_offline() move + DL/RT pull && rq->online (also from IRC)
> 
> my Juno survives rtmutex + hotplug locktorture, where it would previously
> explode < 1s after boot (mostly due to the workqueue thing).
> 
> I stared a bit more at the rq_offline() + DL/RT bits and they look fine to
> me.
> 
> The one thing I'm not entirely sure about is while you plugged the
> class->balance() hole, AIUI we might still get RT (DL?) pull callbacks
> enqueued - say if we just unthrottled an RT RQ and something changes the
> priority of one of the freshly-released tasks (user or rtmutex
> interaction), I don't see any stopgap preventing a pull from happening.
> 
> I slapped the following on top of my kernel and it didn't die, although I'm
> not sure I'm correctly stressing this path. Perhaps we could limit that to
> the pull paths, since technically we're okay with pushing out of an !online
> RQ.
> 
> ---
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 50aac5b6db26..00d1a7b85e97 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1403,7 +1403,7 @@ queue_balance_callback(struct rq *rq,
>  {
>         lockdep_assert_held(&rq->lock);
> 
> -	if (unlikely(head->next))
> +	if (unlikely(head->next || !rq->online))
>                 return;
> 
>         head->func = (void (*)(struct callback_head *))func;
> ---

When I use the original patch-set (i.e. without the pending migration
fixup and the two changes from IRC) it looks like the rt task is
already on the rq before the rq_offline_rt() -> __disable_runtime() call.

pr_crit("CPU%d X: %d %d %lu %lu %d %d %d %llu %llu\n",
         cpu_of(rq), rq->nr_running,
         rt_rq->rt_nr_running, rt_rq->rt_nr_migratory,
         rt_rq->rt_nr_total, rt_rq->overloaded,
         rt_rq->rt_queued, rt_rq->rt_throttled,
         rt_rq->rt_time, rt_rq->rt_runtime);

X = 1 : in rq_offline_rt() before __disable_runtime()
    2 :       "            after        "
    3 : in rq_online_rt()  before __enable_runtime()
    4 :       "            after        "
    5 : in sched_cpu_dying() if (rq->nr_running > 1)

*[   70.369719] CPU0 1: 1 1 1 1 0 1 0 36093160 950000000
*[   70.374689] CPU0 2: 1 1 1 1 0 1 0 36093160 18446744073709551615
*[   70.380615] CPU0 3: 1 1 1 1 0 1 0 36093160 18446744073709551615
*[   70.386540] CPU0 4: 1 1 1 1 0 1 0 0 950000000
 [   70.395637] CPU1 1: 1 0 0 0 0 0 0 31033300 950000000
 [   70.400606] CPU1 2: 1 0 0 0 0 0 0 31033300 18446744073709551615
 [   70.406532] CPU1 3: 1 0 0 0 0 0 0 31033300 18446744073709551615
 [   70.412457] CPU1 4: 1 0 0 0 0 0 0 0 950000000
 [   70.421609] CPU4 1: 0 0 0 0 0 0 0 19397300 950000000
 [   70.426577] CPU4 2: 0 0 0 0 0 0 0 19397300 18446744073709551615
 [   70.432503] CPU4 3: 0 0 0 0 0 0 0 19397300 18446744073709551615
 [   70.438428] CPU4 4: 0 0 0 0 0 0 0 0 950000000
 [   70.484133] CPU3 3: 2 0 0 0 0 0 0 3907020 18446744073709551615
 [   70.489984] CPU3 4: 2 0 0 0 0 0 0 0 950000000
 [   70.540112] CPU2 3: 1 0 0 0 0 0 0 3605180 18446744073709551615
 [   70.545953] CPU2 4: 1 0 0 0 0 0 0 0 950000000
*[   70.647548] CPU0 1: 2 1 1 1 0 1 0 5150760 950000000
*[   70.652441] CPU0 2: 2 1 1 1 0 1 0 5150760 18446744073709551615
*[   70.658281] CPU0 nr_running=2
 [   70.661255]  p=migration/0
 [   70.664022]  p=task0-4
*[   70.666384] CPU0 5: 2 1 1 1 0 1 0 5150760 18446744073709551615
 [   70.672230] ------------[ cut here ]------------
 [   70.676850] kernel BUG at kernel/sched/core.c:7346!
 [   70.681733] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
 [   70.687223] Modules linked in:
 [   70.690284] CPU: 0 PID: 11 Comm: migration/0 Not tainted 5.9.0-
rc1-00134-g7104613975b6-dirty #173
 [   70.699168] Hardware name: ARM Juno development board (r0) (DT)
 [   70.705107] Stopper: multi_cpu_stop+0x0/0x170 <- 0x0
 [   70.710078] pstate: 20000085 (nzCv daIf -PAN -UAO BTYPE=--)
 [   70.715661] pc : sched_cpu_dying+0x210/0x250
 [   70.719936] lr : sched_cpu_dying+0x204/0x250
 [   70.724207] sp : ffff800011e7bc60
 [   70.727521] x29: ffff800011e7bc80 x28: 0000000000000002
 [   70.732840] x27: 0000000000000000 x26: ffff800011ab30c0
 [   70.738159] x25: ffff8000112d37e0 x24: ffff800011ab30c0
 [   70.743477] x23: ffff800011ab3440 x22: ffff000975e40790
 [   70.748796] x21: 0000000000000080 x20: 0000000000000000
 [   70.754115] x19: ffff00097ef591c0 x18: 0000000000000010
 [   70.759433] x17: 0000000000000000 x16: 0000000000000000
 [   70.764752] x15: ffff000975cf2108 x14: ffffffffffffffff
 [   70.770070] x13: ffff800091e7b9e7 x12: ffff800011e7b9ef
 [   70.775388] x11: ffff800011ac2000 x10: ffff800011ce86d0
 [   70.780707] x9 : 0000000000000001 x8 : ffff800011ce9000
 [   70.786026] x7 : ffff8000106edad8 x6 : 000000000000131c
 [   70.791344] x5 : ffff00097ef4f230 x4 : 0000000000000000
 [   70.796662] x3 : 0000000000000027 x2 : 414431aad459c700
 [   70.801981] x1 : 0000000000000000 x0 : 0000000000000002
 [   70.807299] Call trace:
 [   70.809747]  sched_cpu_dying+0x210/0x250
 [   70.813676]  cpuhp_invoke_callback+0x88/0x210
 [   70.818038]  take_cpu_down+0x7c/0xd8
 [   70.821617]  multi_cpu_stop+0xac/0x170
 [   70.825369]  cpu_stopper_thread+0x98/0x130
 [   70.829469]  smpboot_thread_fn+0x1c4/0x280
 [   70.833570]  kthread+0x140/0x160
 [   70.836801]  ret_from_fork+0x10/0x34
 [   70.840384] Code: 94011fd8 b9400660 7100041f 54000040 (d4210000)
 [   70.846487] ---[ end trace 7eb0e0efe803dcfe ]---
 [   70.851109] note: migration/0[11] exited with preempt_count 3

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 5/9] sched/hotplug: Consolidate task migration on CPU unplug
  2020-09-21 16:36 ` [PATCH 5/9] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
@ 2020-10-01 17:12   ` Vincent Donnefort
  2020-10-02 14:17     ` Peter Zijlstra
  0 siblings, 1 reply; 45+ messages in thread
From: Vincent Donnefort @ 2020-10-01 17:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot

On Mon, Sep 21, 2020 at 06:36:02PM +0200, Peter Zijlstra wrote:

[...]

> +
> +     [CPUHP_AP_SCHED_WAIT_EMPTY] = {
> +             .name                   = "sched:waitempty",
> +             .startup.single         = NULL,
> +             .teardown.single        = sched_cpu_wait_empty,
> +     },
> +
>       /* Handle smpboot threads park/unpark */

Unless I missed something, now that the wait has its own HP step, this
patch can probably also get rid of the balance_hotplug_wait() in
sched_cpu_deactivate() introduced by: 

  [PATCH 4/9] sched/core: Wait for tasks being pushed away on hotplug

-- 
Vincent

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 5/9] sched/hotplug: Consolidate task migration on CPU unplug
  2020-10-01 17:12   ` Vincent Donnefort
@ 2020-10-02 14:17     ` Peter Zijlstra
  0 siblings, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-10-02 14:17 UTC (permalink / raw)
  To: Vincent Donnefort
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot

On Thu, Oct 01, 2020 at 06:12:30PM +0100, Vincent Donnefort wrote:
> On Mon, Sep 21, 2020 at 06:36:02PM +0200, Peter Zijlstra wrote:
> 
> [...]
> 
> > +
> > +     [CPUHP_AP_SCHED_WAIT_EMPTY] = {
> > +             .name                   = "sched:waitempty",
> > +             .startup.single         = NULL,
> > +             .teardown.single        = sched_cpu_wait_empty,
> > +     },
> > +
> >       /* Handle smpboot threads park/unpark */
> 
> Unless I missed something, now that the wait has its own HP step, this
> patch can probably also get rid of the balance_hotplug_wait() in
> sched_cpu_deactivate() introduced by: 
> 
>   [PATCH 4/9] sched/core: Wait for tasks being pushed away on hotplug

I'd think so. Consider it gone.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/9] sched/hotplug: Ensure only per-cpu kthreads run during hotplug
  2020-09-25 16:38   ` Dietmar Eggemann
@ 2020-10-02 14:20     ` Peter Zijlstra
  0 siblings, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-10-02 14:20 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: tglx, mingo, linux-kernel, bigeasy, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot, rostedt,
	bsegall, mgorman, bristot, vincent.donnefort

On Fri, Sep 25, 2020 at 06:38:50PM +0200, Dietmar Eggemann wrote:
> On 21/09/2020 18:36, Peter Zijlstra wrote:
> 
> [...]
> 
> > This replaces the unlikely(rq->balance_callbacks) test at the tail of
> > context_switch with an unlikely(rq->balance_work), the fast path is
> 
> While looking for why BALANCE_WORK is needed:
> 
> Shouldn't this be unlikely(rq->balance_callback) and
> unlikely(rq->balance_flags)?

But then we have two loads in the fastpath..

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-09-25 16:50   ` Sebastian Andrzej Siewior
@ 2020-10-02 14:21     ` Peter Zijlstra
  2020-10-02 14:36       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 45+ messages in thread
From: Peter Zijlstra @ 2020-10-02 14:21 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: tglx, mingo, linux-kernel, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On Fri, Sep 25, 2020 at 06:50:47PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-09-21 18:36:04 [+0200], Peter Zijlstra wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1700,6 +1700,68 @@ void check_preempt_curr(struct rq *rq, s
> >  
> >  #ifdef CONFIG_SMP
> >  
> > +#ifdef CONFIG_PREEMPT_RT
> …
> > +static inline bool is_migration_disabled(struct task_struct *p)
> > +{
> > +	return p->migration_disabled;
> > +}
> 
> Just a thought: having this with int as return type and defined in a
> header file then it could be used in check_preemption_disabled() and in
> the tracing output.

Yeah, I didn't want it in a header where world can access it and attempt
creative use :/

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 0/9] sched: Migrate disable support
  2020-09-25 19:32   ` Valentin Schneider
@ 2020-10-02 14:30     ` Peter Zijlstra
  0 siblings, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2020-10-02 14:30 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Sebastian Andrzej Siewior, tglx, mingo, linux-kernel,
	qais.yousef, swood, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On Fri, Sep 25, 2020 at 08:32:24PM +0100, Valentin Schneider wrote:

> The IRC handout so far is:

I meant to go post a new version today; but of course cleanup took
longer than expected.

In any case, there's a slightly updated version here:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/wip2

it has push/pull balancer changes that sorta work. There's still
something funny with the NO_RT_PUSH_IPI case (and consequently also the
deadline code, which is an exact mirror of that code).

(it looks like it ends up migrating things in circles due to the for
loop in pull_rt_task)

> As for your splat, I think this is what I was worrying about wrt
> suppressing callbacks in the switch but not preventing them from being
> queued. Perhaps the below is "better" than what I previously sent.
> 
> Technically should be doable with a cpu_active() check instead given this
> all gets flipped in sched_cpu_deactivate(), but at least this makes it
> obvious that PUSH suppresses any other callback.
> 
> ---
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 50aac5b6db26..40d78a20fbcb 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1403,7 +1403,7 @@ queue_balance_callback(struct rq *rq,
>  {
>         lockdep_assert_held(&rq->lock);
> 
> -	if (unlikely(head->next))
> +	if (unlikely(head->next) || (rq->balance_flags & BALANCE_PUSH))
>                 return;

Yeah, I like that. Let me go add it.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 7/9] sched: Add migrate_disable()
  2020-10-02 14:21     ` Peter Zijlstra
@ 2020-10-02 14:36       ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 45+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-02 14:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, linux-kernel, qais.yousef, swood,
	valentin.schneider, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	vincent.donnefort

On 2020-10-02 16:21:49 [+0200], Peter Zijlstra wrote:
> Yeah, I didn't want it in a header where world can access it and attempt
> creative use :/

tell me about it. Currently cleaning up other creative abuse of things…

Sebastian

^ permalink raw reply	[flat|nested] 45+ messages in thread

end of thread, other threads:[~2020-10-02 14:36 UTC | newest]

Thread overview: 45+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-21 16:35 [PATCH 0/9] sched: Migrate disable support Peter Zijlstra
2020-09-21 16:35 ` [PATCH 1/9] stop_machine: Add function and caller debug info Peter Zijlstra
2020-09-21 16:35 ` [PATCH 2/9] sched: Fix balance_callback() Peter Zijlstra
2020-09-23 14:08   ` Thomas Gleixner
2020-09-21 16:36 ` [PATCH 3/9] sched/hotplug: Ensure only per-cpu kthreads run during hotplug Peter Zijlstra
2020-09-25 16:38   ` Dietmar Eggemann
2020-10-02 14:20     ` Peter Zijlstra
2020-09-21 16:36 ` [PATCH 4/9] sched/core: Wait for tasks being pushed away on hotplug Peter Zijlstra
2020-09-21 16:36 ` [PATCH 5/9] sched/hotplug: Consolidate task migration on CPU unplug Peter Zijlstra
2020-10-01 17:12   ` Vincent Donnefort
2020-10-02 14:17     ` Peter Zijlstra
2020-09-21 16:36 ` [PATCH 6/9] sched: Massage set_cpus_allowed Peter Zijlstra
2020-09-23 14:07   ` Thomas Gleixner
2020-09-21 16:36 ` [PATCH 7/9] sched: Add migrate_disable() Peter Zijlstra
2020-09-21 19:16   ` Thomas Gleixner
2020-09-21 20:42     ` Daniel Bristot de Oliveira
2020-09-23  8:31       ` Thomas Gleixner
2020-09-23 10:51         ` Daniel Bristot de Oliveira
2020-09-23 17:08         ` peterz
2020-09-23 17:54           ` Daniel Bristot de Oliveira
2020-09-23  7:48     ` peterz
2020-09-24 11:53   ` Valentin Schneider
2020-09-24 12:29     ` Peter Zijlstra
2020-09-24 12:33       ` Valentin Schneider
2020-09-24 12:35     ` Peter Zijlstra
2020-09-25 16:50   ` Sebastian Andrzej Siewior
2020-10-02 14:21     ` Peter Zijlstra
2020-10-02 14:36       ` Sebastian Andrzej Siewior
2020-09-21 16:36 ` [PATCH 8/9] sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Peter Zijlstra
2020-09-24 19:59   ` Valentin Schneider
2020-09-25  8:43     ` Peter Zijlstra
2020-09-25 10:07       ` Valentin Schneider
2020-09-25  9:05     ` Peter Zijlstra
2020-09-25  9:56       ` Peter Zijlstra
2020-09-25 10:09         ` Valentin Schneider
2020-09-21 16:36 ` [PATCH 9/9] sched/core: Make migrate disable and CPU hotplug cooperative Peter Zijlstra
2020-09-25  9:12 ` [PATCH 0/9] sched: Migrate disable support Dietmar Eggemann
2020-09-25 10:10   ` Peter Zijlstra
2020-09-25 11:58     ` Dietmar Eggemann
2020-09-25 12:19       ` Valentin Schneider
2020-09-25 17:49         ` Valentin Schneider
2020-09-29  9:15           ` Dietmar Eggemann
2020-09-25 18:17 ` Sebastian Andrzej Siewior
2020-09-25 19:32   ` Valentin Schneider
2020-10-02 14:30     ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.