From: "tip-bot2 for Peter Zijlstra" <tip-bot2@linutronix.de>
To: linux-tip-commits@vger.kernel.org
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>,
Ingo Molnar <mingo@kernel.org>, x86 <x86@kernel.org>,
LKML <linux-kernel@vger.kernel.org>
Subject: [tip: sched/core] sched: Replace rq::wake_list
Date: Mon, 01 Jun 2020 09:52:18 -0000 [thread overview]
Message-ID: <159100513859.17951.5366888281495604529.tip-bot2@tip-bot2> (raw)
In-Reply-To: <20200526161908.129371594@infradead.org>
The following commit has been merged into the sched/core branch of tip:
Commit-ID: a148866489fbe243c936fe43e4525d8dbfa0318f
Gitweb: https://git.kernel.org/tip/a148866489fbe243c936fe43e4525d8dbfa0318f
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 26 May 2020 18:11:04 +02:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Thu, 28 May 2020 10:54:16 +02:00
sched: Replace rq::wake_list
The recent commit 90b5363acd47 ("sched: Clean up scheduler_ipi()")
got the usage of smp_call_function_single_async() subtly wrong. Even
though that function will return -EBUSY when trying to re-use a csd,
that check is not atomic and still requires external serialization.
The change in ttwu_queue_remote() got this wrong.
While on first reading ttwu_queue_remote() has an atomic test-and-set
that appears to serialize the use, the matching 'release' is not in
the right place to actually guarantee this serialization.
The actual race is against the sched_ttwu_pending() call in the idle
loop, which can run the wakeup-list without consuming the CSD.
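Instead of trying to chain the lists, merge them: remove rq::wake_list
and queue remote wakeups directly on the call_single_queue as
CSD_TYPE_TTWU entries.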
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200526161908.129371594@infradead.org
---
include/linux/sched.h | 1 +-
include/linux/smp.h | 1 +-
kernel/sched/core.c | 25 ++++++----------------
kernel/sched/idle.c | 1 +-
kernel/sched/sched.h | 8 +-------
kernel/smp.c | 47 +++++++++++++++++++++++++++++++++++-------
6 files changed, 49 insertions(+), 34 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ebc6870..e0f5f41 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -654,6 +654,7 @@ struct task_struct {
#ifdef CONFIG_SMP
struct llist_node wake_entry;
+ unsigned int wake_entry_type;
int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 45ad6e3..84f90e2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -25,6 +25,7 @@ enum {
CSD_TYPE_ASYNC = 0x00,
CSD_TYPE_SYNC = 0x10,
CSD_TYPE_IRQ_WORK = 0x20,
+ CSD_TYPE_TTWU = 0x30,
CSD_FLAG_TYPE_MASK = 0xF0,
};
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b71ed5e..b3c64c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1538,7 +1538,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- sched_ttwu_pending();
+ flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -2272,14 +2272,13 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
{
+ struct llist_node *llist = arg;
struct rq *rq = this_rq();
- struct llist_node *llist;
struct task_struct *p, *t;
struct rq_flags rf;
- llist = llist_del_all(&rq->wake_list);
if (!llist)
return;
@@ -2299,11 +2298,6 @@ void sched_ttwu_pending(void)
rq_unlock_irqrestore(rq, &rf);
}
-static void wake_csd_func(void *info)
-{
- sched_ttwu_pending();
-}
-
void send_call_function_single_ipi(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -2327,12 +2321,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
- if (llist_add(&p->wake_entry, &rq->wake_list)) {
- if (!set_nr_if_polling(rq->idle))
- smp_call_function_single_async(cpu, &rq->wake_csd);
- else
- trace_sched_wake_idle_without_ipi(cpu);
- }
+ __smp_call_single_queue(cpu, &p->wake_entry);
}
void wake_up_if_idle(int cpu)
@@ -2772,6 +2761,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+ p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -6564,7 +6556,6 @@ int sched_cpu_dying(unsigned int cpu)
struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
- sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
@@ -6763,8 +6754,6 @@ void __init sched_init(void)
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
- rq_csd_init(rq, &rq->wake_csd, wake_csd_func);
-
INIT_LIST_HEAD(&rq->cfs_tasks);
rq_attach_root(rq, &def_root_domain);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 387fd75..05deb81 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -294,7 +294,6 @@ static void do_idle(void)
* critical section.
*/
flush_smp_call_function_from_idle();
- sched_ttwu_pending();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c86fc94..1d4e94c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1023,11 +1023,6 @@ struct rq {
unsigned int ttwu_local;
#endif
-#ifdef CONFIG_SMP
- call_single_data_t wake_csd;
- struct llist_head wake_list;
-#endif
-
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
@@ -1371,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-extern void sched_ttwu_pending(void);
-
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -1512,7 +1505,6 @@ extern void flush_smp_call_function_from_idle(void);
#else /* !CONFIG_SMP: */
static inline void flush_smp_call_function_from_idle(void) { }
-static inline void sched_ttwu_pending(void) { }
#endif
#include "stats.h"
diff --git a/kernel/smp.c b/kernel/smp.c
index 856562b..0d61dc0 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -196,6 +196,7 @@ void generic_smp_call_function_single_interrupt(void)
flush_smp_call_function_queue(true);
}
+extern void sched_ttwu_pending(void *);
extern void irq_work_single(void *);
/**
@@ -244,6 +245,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
csd->func);
break;
+ case CSD_TYPE_TTWU:
+ pr_warn("IPI task-wakeup sent to offline CPU\n");
+ break;
+
default:
pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
CSD_TYPE(csd));
@@ -275,22 +280,43 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
}
}
+ if (!entry)
+ return;
+
/*
* Second; run all !SYNC callbacks.
*/
+ prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
int type = CSD_TYPE(csd);
- if (type == CSD_TYPE_ASYNC) {
- smp_call_func_t func = csd->func;
- void *info = csd->info;
+ if (type != CSD_TYPE_TTWU) {
+ if (prev) {
+ prev->next = &csd_next->llist;
+ } else {
+ entry = &csd_next->llist;
+ }
- csd_unlock(csd);
- func(info);
- } else if (type == CSD_TYPE_IRQ_WORK) {
- irq_work_single(csd);
+ if (type == CSD_TYPE_ASYNC) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ csd_unlock(csd);
+ func(info);
+ } else if (type == CSD_TYPE_IRQ_WORK) {
+ irq_work_single(csd);
+ }
+
+ } else {
+ prev = &csd->llist;
}
}
+
+ /*
+ * Third; only CSD_TYPE_TTWU is left, issue those.
+ */
+ if (entry)
+ sched_ttwu_pending(entry);
}
void flush_smp_call_function_from_idle(void)
@@ -659,6 +685,13 @@ void __init smp_init(void)
BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
offsetof(struct __call_single_data, flags));
+ /*
+ * Assert the CSD_TYPE_TTWU layout is similar enough
+ * for task_struct to be on the @call_single_queue.
+ */
+ BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+ offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
+
idle_threads_init();
cpuhp_threads_init();
next prev parent reply other threads:[~2020-06-01 9:53 UTC|newest]
Thread overview: 62+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-05-26 16:10 [RFC][PATCH 0/7] Fix the scheduler-IPI mess Peter Zijlstra
2020-05-26 16:10 ` [RFC][PATCH 1/7] sched: Fix smp_call_function_single_async() usage for ILB Peter Zijlstra
2020-05-26 23:56 ` Frederic Weisbecker
2020-05-27 10:23 ` Vincent Guittot
2020-05-27 11:28 ` Frederic Weisbecker
2020-05-27 12:07 ` Vincent Guittot
2020-05-29 15:26 ` Valentin Schneider
2020-06-01 9:52 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-06-01 11:40 ` Frederic Weisbecker
2020-05-26 16:10 ` [RFC][PATCH 2/7] smp: Optimize flush_smp_call_function_queue() Peter Zijlstra
2020-05-28 12:28 ` Frederic Weisbecker
2020-06-01 9:52 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-05-26 16:11 ` [RFC][PATCH 3/7] smp: Move irq_work_run() out of flush_smp_call_function_queue() Peter Zijlstra
2020-05-29 13:04 ` Frederic Weisbecker
2020-06-01 9:52 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-05-26 16:11 ` [RFC][PATCH 4/7] smp: Optimize send_call_function_single_ipi() Peter Zijlstra
2020-05-27 9:56 ` Peter Zijlstra
2020-05-27 10:15 ` Peter Zijlstra
2020-05-27 15:56 ` Paul E. McKenney
2020-05-27 16:35 ` Peter Zijlstra
2020-05-27 17:12 ` Peter Zijlstra
2020-05-27 19:39 ` Paul E. McKenney
2020-05-28 1:35 ` Joel Fernandes
2020-05-28 8:59 ` [tip: core/rcu] rcu: Allow for smp_call_function() running callbacks from idle tip-bot2 for Peter Zijlstra
2021-01-21 16:56 ` [RFC][PATCH 4/7] smp: Optimize send_call_function_single_ipi() Peter Zijlstra
2021-01-22 0:20 ` Paul E. McKenney
2021-01-22 8:31 ` Peter Zijlstra
2021-01-22 15:35 ` Paul E. McKenney
2020-05-29 13:01 ` Frederic Weisbecker
2020-06-01 9:52 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-05-26 16:11 ` [RFC][PATCH 5/7] irq_work, smp: Allow irq_work on call_single_queue Peter Zijlstra
2020-05-28 23:40 ` Frederic Weisbecker
2020-05-29 13:36 ` Peter Zijlstra
2020-06-05 9:37 ` Peter Zijlstra
2020-06-05 15:02 ` Frederic Weisbecker
2020-06-05 16:17 ` Peter Zijlstra
2020-06-05 15:24 ` Kees Cook
2020-06-10 13:24 ` Frederic Weisbecker
2020-06-01 9:52 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-05-26 16:11 ` [RFC][PATCH 6/7] sched: Add rq::ttwu_pending Peter Zijlstra
2020-06-01 9:52 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2020-05-26 16:11 ` [RFC][PATCH 7/7] sched: Replace rq::wake_list Peter Zijlstra
2020-05-29 15:10 ` Valdis Klētnieks
2020-06-01 9:52 ` tip-bot2 for Peter Zijlstra [this message]
2020-06-02 15:16 ` [tip: sched/core] " Frederic Weisbecker
2020-06-04 14:18 ` [RFC][PATCH 7/7] " Guenter Roeck
2020-06-05 0:24 ` Eric Biggers
2020-06-05 7:41 ` Peter Zijlstra
2020-06-05 16:15 ` Eric Biggers
2020-06-06 23:13 ` Guenter Roeck
2020-06-09 20:21 ` Eric Biggers
2020-06-09 21:25 ` Guenter Roeck
2020-06-09 21:38 ` Eric Biggers
2020-06-09 22:06 ` Peter Zijlstra
2020-06-09 23:03 ` Guenter Roeck
2020-06-10 9:09 ` Peter Zijlstra
2020-06-18 17:57 ` Steven Rostedt
2020-06-18 19:06 ` Guenter Roeck
2020-06-09 22:07 ` Peter Zijlstra
2020-06-05 8:10 ` Peter Zijlstra
2020-06-05 13:33 ` Guenter Roeck
2020-06-05 14:09 ` Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=159100513859.17951.5366888281495604529.tip-bot2@tip-bot2 \
--to=tip-bot2@linutronix.de \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-tip-commits@vger.kernel.org \
--cc=mingo@kernel.org \
--cc=peterz@infradead.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).