From: Frederic Weisbecker <frederic@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>,
	Levin Alexander <alexander.levin@verizon.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Mauro Carvalho Chehab <mchehab@s-opensource.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Hannes Frederic Sowa <hannes@stressinduktion.org>,
	"Paul E . McKenney" <paulmck@linux.vnet.ibm.com>,
	Wanpeng Li <wanpeng.li@hotmail.com>,
	Dmitry Safonov <dima@arista.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Paolo Abeni <pabeni@redhat.com>, Radu Rendec <rrendec@arista.com>,
	Ingo Molnar <mingo@kernel.org>,
	Stanislaw Gruszka <sgruszka@redhat.com>,
	Rik van Riel <riel@redhat.com>,
	Eric Dumazet <edumazet@google.com>,
	David Miller <davem@davemloft.net>
Subject: [RFC PATCH 4/4] softirq: Replace ksoftirqd with workqueues entirely
Date: Fri, 19 Jan 2018 16:46:14 +0100	[thread overview]
Message-ID: <1516376774-24076-5-git-send-email-frederic@kernel.org> (raw)
In-Reply-To: <1516376774-24076-1-git-send-email-frederic@kernel.org>

Ksoftirqd now only remains to implement threaded IRQs. Convert that last
user to the existing per-vector workqueues, so that the ksoftirqd threads
can be removed entirely and the code duplication avoided.
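
For reference, the deferral path this relies on is do_softirq_workqueue()
and the per-cpu softirq_cpu.pending_work_mask, both introduced by the
"softirq: Per vector deferment to workqueue" patch earlier in this series.
The snippet below is only a minimal sketch of that path for readers looking
at this patch in isolation; the work[] array and the use of system_highpri_wq
are assumptions made here for illustration, not the actual implementation:

/* Sketch only: queue each pending, not-yet-deferred vector as a work item */
static void do_softirq_workqueue(u32 pending)
{
	struct softirq *softirq = this_cpu_ptr(&softirq_cpu);
	unsigned long work;
	int vec_nr;

	/* Skip vectors already deferred to the workqueue */
	work = pending & ~softirq->pending_work_mask;

	for_each_set_bit(vec_nr, &work, NR_SOFTIRQS) {
		softirq->pending_work_mask |= BIT(vec_nr);
		queue_work_on(smp_processor_id(), system_highpri_wq,
			      &softirq->work[vec_nr]);
	}
}

With that in place, ksoftirqd has no remaining user: invoke_softirq() and
raise_softirq_irqoff() now defer to the workqueue instead of waking
ksoftirqd, and do_softirq() skips vectors that are already queued in
pending_work_mask.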

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Dmitry Safonov <dima@arista.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Levin Alexander <alexander.levin@verizon.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Radu Rendec <rrendec@arista.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/RCU/stallwarn.txt |  4 +-
 include/linux/interrupt.h       |  7 ----
 kernel/sched/cputime.c          | 13 +++---
 kernel/sched/sched.h            |  4 +-
 kernel/softirq.c                | 87 +++++++++--------------------------------
 net/ipv4/tcp_output.c           |  4 +-
 6 files changed, 31 insertions(+), 88 deletions(-)

diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index a08f928..ea3a8de 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -17,8 +17,8 @@ o	A CPU looping in an RCU read-side critical section.
 o	A CPU looping with interrupts disabled.
 
 o	A CPU looping with preemption disabled.  This condition can
-	result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
-	stalls.
+	result in RCU-sched stalls and, if the softirq workqueue is in use,
+	RCU-bh stalls.
 
 o	A CPU looping with bottom halves disabled.  This condition can
 	result in RCU-sched and RCU-bh stalls.
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 92d044d..680f620 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -507,13 +507,6 @@ extern void __raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
-DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
-
-static inline struct task_struct *this_cpu_ksoftirqd(void)
-{
-	return this_cpu_read(ksoftirqd);
-}
-
 extern int softirq_serving_workqueue(void);
 
 /* Tasklets --- multithreaded analogue of BHs.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 30f70e5..c5b8dbd 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -64,15 +64,14 @@ void irqtime_account_irq(struct task_struct *curr)
 	irqtime->irq_start_time += delta;
 
 	/*
-	 * We do not account for softirq time from ksoftirqd here.
-	 * We want to continue accounting softirq time to ksoftirqd thread
+	 * We do not account for softirq time from the workqueue here.
+	 * We want to continue accounting softirq time to the workqueue thread
 	 * in that case, so as not to confuse scheduler with a special task
 	 * that do not consume any time, but still wants to run.
 	 */
 	if (hardirq_count())
 		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
-	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd() &&
-		 !softirq_serving_workqueue())
+	else if (in_serving_softirq() && !softirq_serving_workqueue())
 		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
@@ -376,11 +375,11 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 
 	cputime -= other;
 
-	if (this_cpu_ksoftirqd() == p || softirq_serving_workqueue()) {
+	if (softirq_serving_workqueue()) {
 		/*
-		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * Softirq workqueue time does not get accounted in cpu_softirq_time.
 		 * So, we have to handle it separately here.
-		 * Also, p->stime needs to be updated for ksoftirqd.
+		 * Also, p->stime needs to be updated for the softirq workqueue.
 		 */
 		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
 	} else if (user_tick) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..5d481f1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2061,8 +2061,8 @@ struct irqtime {
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
 /*
- * Returns the irqtime minus the softirq time computed by ksoftirqd.
- * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * Returns the irqtime minus the softirq time computed by the workqueue.
+ * Otherwise the workqueue's sum_exec_runtime is subtracted its own runtime
  * and never move forward.
  */
 static inline u64 irq_time_read(int cpu)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bb0cffa..cf43a8d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -55,8 +55,6 @@ EXPORT_SYMBOL(irq_stat);
 
 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
 
-DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
-
 const char * const softirq_to_name[NR_SOFTIRQS] = {
 	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
 	"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -76,32 +74,6 @@ struct softirq {
 static DEFINE_PER_CPU(struct softirq, softirq_cpu);
 
 /*
- * we cannot loop indefinitely here to avoid userspace starvation,
- * but we also don't want to introduce a worst case 1/HZ latency
- * to the pending events, so lets the scheduler to balance
- * the softirq load for us.
- */
-static void wakeup_softirqd(void)
-{
-	/* Interrupts are disabled: no need to stop preemption */
-	struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
-	if (tsk && tsk->state != TASK_RUNNING)
-		wake_up_process(tsk);
-}
-
-/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness.
- */
-static bool ksoftirqd_running(void)
-{
-	struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
-	return tsk && (tsk->state == TASK_RUNNING);
-}
-
-/*
  * preempt_count and SOFTIRQ_OFFSET usage:
  * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
  *   softirq processing.
@@ -388,7 +360,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
 
 asmlinkage __visible void do_softirq(void)
 {
-	__u32 pending;
+	__u32 pending, pending_work;
 	unsigned long flags;
 
 	if (in_interrupt())
@@ -397,8 +369,9 @@ asmlinkage __visible void do_softirq(void)
 	local_irq_save(flags);
 
 	pending = local_softirq_pending();
+	pending_work = __this_cpu_read(softirq_cpu.pending_work_mask);
 
-	if (pending && !ksoftirqd_running())
+	if (pending & ~pending_work)
 		do_softirq_own_stack();
 
 	local_irq_restore(flags);
@@ -412,7 +385,7 @@ void irq_enter(void)
 	rcu_irq_enter();
 	if (is_idle_task(current) && !in_interrupt()) {
 		/*
-		 * Prevent raise_softirq from needlessly waking up ksoftirqd
+		 * Prevent raise_softirq from needlessly waking up the workqueue
 		 * here, as softirq will be serviced on return from interrupt.
 		 */
 		local_bh_disable();
@@ -425,7 +398,15 @@ void irq_enter(void)
 
 static inline void invoke_softirq(void)
 {
-	if (ksoftirqd_running())
+	unsigned int pending_work, pending = local_softirq_pending();
+
+	if (!pending)
+		return;
+
+	pending_work = __this_cpu_read(softirq_cpu.pending_work_mask);
+	pending &= ~pending_work;
+
+	if (!pending)
 		return;
 
 	if (!force_irqthreads) {
@@ -445,7 +426,7 @@ static inline void invoke_softirq(void)
 		do_softirq_own_stack();
 #endif
 	} else {
-		wakeup_softirqd();
+		do_softirq_workqueue(pending);
 	}
 }
 
@@ -474,7 +455,7 @@ void irq_exit(void)
 #endif
 	account_irq_exit_time(current);
 	preempt_count_sub(HARDIRQ_OFFSET);
-	if (!in_interrupt() && local_softirq_pending())
+	if (!in_interrupt())
 		invoke_softirq();
 
 	tick_irq_exit();
@@ -495,11 +476,11 @@ inline void raise_softirq_irqoff(unsigned int nr)
 	 * actually run the softirq once we return from
 	 * the irq or softirq.
 	 *
-	 * Otherwise we wake up ksoftirqd to make sure we
+	 * Otherwise we wake up the workqueue to make sure we
 	 * schedule the softirq soon.
 	 */
 	if (!in_interrupt())
-		wakeup_softirqd();
+		do_softirq_workqueue(BIT(nr));
 }
 
 void raise_softirq(unsigned int nr)
@@ -736,27 +717,6 @@ void __init softirq_init(void)
 	open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
 
-static int ksoftirqd_should_run(unsigned int cpu)
-{
-	return local_softirq_pending();
-}
-
-static void run_ksoftirqd(unsigned int cpu)
-{
-	local_irq_disable();
-	if (local_softirq_pending()) {
-		/*
-		 * We can safely run softirq on inline stack, as we are not deep
-		 * in the task stack here.
-		 */
-		__do_softirq();
-		local_irq_enable();
-		cond_resched_rcu_qs();
-		return;
-	}
-	local_irq_enable();
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * tasklet_kill_immediate is called to remove a tasklet which can already be
@@ -819,22 +779,13 @@ static int takeover_tasklets(unsigned int cpu)
 #define takeover_tasklets	NULL
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static struct smp_hotplug_thread softirq_threads = {
-	.store			= &ksoftirqd,
-	.thread_should_run	= ksoftirqd_should_run,
-	.thread_fn		= run_ksoftirqd,
-	.thread_comm		= "ksoftirqd/%u",
-};
-
-static __init int spawn_ksoftirqd(void)
+static __init int tasklet_set_takeover(void)
 {
 	cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
 				  takeover_tasklets);
-	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
 	return 0;
 }
-early_initcall(spawn_ksoftirqd);
+early_initcall(tasklet_set_takeover);
 
 /*
  * [ These __weak aliases are kept in a separate compilation unit, so that
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4e4160..3b4811e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -912,7 +912,7 @@ void tcp_wfree(struct sk_buff *skb)
 	 */
 	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
 
-	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
+	/* If this softirq is serviced by the workqueue, we are likely under stress.
 	 * Wait until our queues (qdisc + devices) are drained.
 	 * This gives :
 	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
@@ -920,7 +920,7 @@ void tcp_wfree(struct sk_buff *skb)
 	 *   to migrate this flow (skb->ooo_okay will be eventually set)
 	 */
 	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) &&
-	    (this_cpu_ksoftirqd() == current || softirq_serving_workqueue()))
+	    softirq_serving_workqueue())
 		goto out;
 
 	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
-- 
2.7.4

Thread overview: 39+ messages
2018-01-19 15:46 [RFC PATCH 0/4] softirq: Per vector threading v3 Frederic Weisbecker
2018-01-19 15:46 ` [RFC PATCH 1/4] softirq: Limit vector to a single iteration on IRQ tail Frederic Weisbecker
2018-01-19 16:16   ` David Miller
2018-01-19 18:25     ` Linus Torvalds
2018-01-19 18:47       ` David Miller
2018-01-21 16:30         ` Frederic Weisbecker
2018-01-21 16:57           ` David Miller
2018-01-19 15:46 ` [RFC PATCH 2/4] softirq: Per vector deferment to workqueue Frederic Weisbecker
2018-01-20  8:41   ` Pavan Kondeti
2018-01-21 16:11     ` Frederic Weisbecker
2018-01-21 17:50       ` Pavan Kondeti
2018-01-21 20:48         ` Frederic Weisbecker
2018-02-08 17:44   ` Sebastian Andrzej Siewior
2018-02-08 18:45     ` David Miller
2018-02-08 20:14       ` Dmitry Safonov
2018-02-08 20:22         ` David Miller
2018-02-08 20:30           ` Dmitry Safonov
2018-02-09  4:11             ` Mike Galbraith
2018-02-09 12:35               ` Sebastian Andrzej Siewior
2018-02-15 16:13     ` Frederic Weisbecker
2018-02-15 16:58       ` Sebastian Andrzej Siewior
2018-01-19 15:46 ` [RFC PATCH 3/4] softirq: Defer to workqueue when rescheduling is needed Frederic Weisbecker
2018-01-19 15:46 ` Frederic Weisbecker [this message]
2018-01-22 19:58 ` [RFC PATCH 0/4] softirq: Per vector threading v3 Mauro Carvalho Chehab
2018-01-23 10:13 ` Paolo Abeni
2018-01-23 12:32   ` Dmitry Safonov
2018-01-24  2:12     ` Frederic Weisbecker
2018-01-23 16:22   ` David Miller
2018-01-23 16:57     ` Paolo Abeni
2018-01-23 17:42       ` Linus Torvalds
2018-01-23 18:01         ` Mike Galbraith
2018-01-23 18:24         ` David Miller
2018-01-24  1:57           ` Frederic Weisbecker
2018-01-24  2:01             ` Frederic Weisbecker
2018-01-24 14:54         ` Paolo Abeni
2018-01-24 15:05           ` David Miller
2018-01-24 16:11             ` Paolo Abeni
2018-02-07 14:18 ` Mauro Carvalho Chehab
2018-03-01 15:21   ` Frederic Weisbecker
