From: Tejun Heo <tj@kernel.org>
To: Boqun Feng <boqun.feng@gmail.com>
Cc: torvalds@linux-foundation.org, mpatocka@redhat.com,
	linux-kernel@vger.kernel.org, dm-devel@lists.linux.dev,
	msnitzer@redhat.com, ignat@cloudflare.com, damien.lemoal@wdc.com,
	bob.liu@oracle.com, houtao1@huawei.com, peterz@infradead.org,
	mingo@kernel.org, netdev@vger.kernel.org, allen.lkml@gmail.com,
	kernel-team@meta.com, Thomas Gleixner <tglx@linutronix.de>
Subject: [PATCH for-6.9] workqueue: Drain BH work items on hot-unplugged CPUs
Date: Mon, 26 Feb 2024 15:38:55 -1000
Message-ID: <Zd09L9DgerYjezGT@slm.duckdns.org>
In-Reply-To: <Zdvw0HdSXcU3JZ4g@boqun-archlinux>

Boqun pointed out that workqueues aren't handling BH work items on offlined
CPUs. Unlike tasklets, which transfer out their pending tasks from
CPUHP_SOFTIRQ_DEAD, BH workqueues would just leave them pending, which is
problematic. Note that this behavior is specific to BH workqueues as the
non-BH per-CPU workers simply become unbound when the CPU goes offline.
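
For reference, a BH work item is a regular work item queued on one of the
BH system workqueues and executes in softirq context on the queueing CPU.
An illustrative sketch (my_bh_workfn/my_bh_work are made-up names):

	#include <linux/workqueue.h>

	/* runs in BH (softirq) context on the CPU that queued it */
	static void my_bh_workfn(struct work_struct *work)
	{
		/* bottom half processing goes here */
	}
	static DECLARE_WORK(my_bh_work, my_bh_workfn);

	/* queues on the current CPU's BH worker pool */
	queue_work(system_bh_wq, &my_bh_work);

If that CPU is then hot-unplugged while the item is still pending, nothing
would ever execute it without this patch.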

This patch fixes the issue by draining the pending BH work items of an
offlined CPU from CPUHP_SOFTIRQ_DEAD. Because work items carry more context,
it's not as easy to transfer the pending work items from one pool to
another. Instead, queue BH work items on an online CPU which execute the
offlined pools' pending work items.
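
Concretely, the mechanism added below is roughly the following (condensed
sketch; locking, the highpri case and the completion setup are elided, see
the diff for the real thing):

	/* from CPUHP_SOFTIRQ_DEAD, for each BH pool of the dead CPU
	 * that still has pending work items: */
	INIT_WORK(&dead_work.work, drain_dead_softirq_workfn);
	queue_work(system_bh_wq, &dead_work.work);	/* on this CPU */
	wait_for_completion(&dead_work.done);

	/* drain_dead_softirq_workfn(), running on the online CPU */
	pool->flags |= POOL_BH_DRAINING;	/* suppress kick + lockdep */
	bh_worker(list_first_entry(&pool->workers, struct worker, node));
	pool->flags &= ~POOL_BH_DRAINING;
	if (need_more_worker(pool))		/* bh_worker() bailed early */
		queue_work(system_bh_wq, work);	/* reschedule self */
	else
		complete(&dead_work->done);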

Note that this assumes that no further BH work items will be queued on the
offlined CPUs. This assumption is shared with tasklets and should be fine
for conversions. However, the same issue also exists for per-CPU
workqueues, which will just keep executing, on unbound workers, work items
queued after the CPU went offline. Workqueue should reject per-CPU and BH
work items queued on offline CPUs; this will be addressed separately later.
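
A strawman of such a rejection might look like the following (hypothetical
sketch, not part of this patch; the eventual fix may well take a different
form):

	/* hypothetical check, e.g. early in __queue_work() */
	if (WARN_ON_ONCE((wq->flags & WQ_BH) && !cpu_online(cpu)))
		return;	/* drop BH work queued on an offline CPU */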

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Boqun Feng <boqun.feng@gmail.com>
Link: http://lkml.kernel.org/r/Zdvw0HdSXcU3JZ4g@boqun-archlinux
---
 include/linux/workqueue.h |    1 
 kernel/softirq.c          |    2 +
 kernel/workqueue.c        |   91 ++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 91 insertions(+), 3 deletions(-)

--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -458,6 +458,7 @@ extern struct workqueue_struct *system_b
 extern struct workqueue_struct *system_bh_highpri_wq;
 
 void workqueue_softirq_action(bool highpri);
+void workqueue_softirq_dead(unsigned int cpu);
 
 /**
  * alloc_workqueue - allocate a workqueue
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -932,6 +932,8 @@ static void run_ksoftirqd(unsigned int c
 #ifdef CONFIG_HOTPLUG_CPU
 static int takeover_tasklets(unsigned int cpu)
 {
+	workqueue_softirq_dead(cpu);
+
 	/* CPU is dead, so no lock needed. */
 	local_irq_disable();
 
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -81,6 +81,7 @@ enum worker_pool_flags {
 	POOL_BH			= 1 << 0,	/* is a BH pool */
 	POOL_MANAGER_ACTIVE	= 1 << 1,	/* being managed */
 	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
+	POOL_BH_DRAINING	= 1 << 3,	/* draining after CPU offline */
 };
 
 enum worker_flags {
@@ -1218,7 +1219,9 @@ static struct irq_work *bh_pool_irq_work
 static void kick_bh_pool(struct worker_pool *pool)
 {
 #ifdef CONFIG_SMP
-	if (unlikely(pool->cpu != smp_processor_id())) {
+	/* see drain_dead_softirq_workfn() for BH_DRAINING */
+	if (unlikely(pool->cpu != smp_processor_id() &&
+		     !(pool->flags & POOL_BH_DRAINING))) {
 		irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
 		return;
 	}
@@ -3155,6 +3158,7 @@ __acquires(&pool->lock)
 	struct worker_pool *pool = worker->pool;
 	unsigned long work_data;
 	int lockdep_start_depth, rcu_start_depth;
+	bool bh_draining = pool->flags & POOL_BH_DRAINING;
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * It is permissible to free the struct work_struct from
@@ -3220,7 +3224,9 @@ __acquires(&pool->lock)
 
 	rcu_start_depth = rcu_preempt_depth();
 	lockdep_start_depth = lockdep_depth(current);
-	lock_map_acquire(&pwq->wq->lockdep_map);
+	/* see drain_dead_softirq_workfn() */
+	if (!bh_draining)
+		lock_map_acquire(&pwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
 	/*
 	 * Strictly speaking we should mark the invariant state without holding
@@ -3253,7 +3259,8 @@ __acquires(&pool->lock)
 	trace_workqueue_execute_end(work, worker->current_func);
 	pwq->stats[PWQ_STAT_COMPLETED]++;
 	lock_map_release(&lockdep_map);
-	lock_map_release(&pwq->wq->lockdep_map);
+	if (!bh_draining)
+		lock_map_release(&pwq->wq->lockdep_map);
 
 	if (unlikely((worker->task && in_atomic()) ||
 		     lockdep_depth(current) != lockdep_start_depth ||
@@ -3615,6 +3622,84 @@ void workqueue_softirq_action(bool highp
 		bh_worker(list_first_entry(&pool->workers, struct worker, node));
 }
 
+struct wq_drain_dead_softirq_work {
+	struct work_struct	work;
+	struct worker_pool	*pool;
+	struct completion	done;
+};
+
+static void drain_dead_softirq_workfn(struct work_struct *work)
+{
+	struct wq_drain_dead_softirq_work *dead_work =
+		container_of(work, struct wq_drain_dead_softirq_work, work);
+	struct worker_pool *pool = dead_work->pool;
+	bool repeat;
+
+	/*
+	 * @pool's CPU is dead and we want to execute its still pending work
+	 * items from this BH work item which is running on a different CPU. As
+	 * its CPU is dead, @pool can't be kicked and, as work execution path
+	 * will be nested, a lockdep annotation needs to be suppressed. Mark
+	 * @pool with %POOL_BH_DRAINING for the special treatments.
+	 */
+	raw_spin_lock_irq(&pool->lock);
+	pool->flags |= POOL_BH_DRAINING;
+	raw_spin_unlock_irq(&pool->lock);
+
+	bh_worker(list_first_entry(&pool->workers, struct worker, node));
+
+	raw_spin_lock_irq(&pool->lock);
+	pool->flags &= ~POOL_BH_DRAINING;
+	repeat = need_more_worker(pool);
+	raw_spin_unlock_irq(&pool->lock);
+
+	/*
+	 * bh_worker() might hit consecutive execution limit and bail. If there
+	 * still are pending work items, reschedule self and return so that we
+	 * don't hog this CPU's BH.
+	 */
+	if (repeat) {
+		if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+			queue_work(system_bh_highpri_wq, work);
+		else
+			queue_work(system_bh_wq, work);
+	} else {
+		complete(&dead_work->done);
+	}
+}
+
+/*
+ * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
+ * possible to allocate dead_work per CPU and avoid flushing. However, then we
+ * have to worry about draining overlapping with CPU coming back online or
+ * nesting (one CPU's dead_work queued on another CPU which is also dead and so
+ * on). Let's keep it simple and drain them synchronously. These are BH work
+ * items which shouldn't be requeued on the same pool. Shouldn't take long.
+ */
+void workqueue_softirq_dead(unsigned int cpu)
+{
+	int i;
+
+	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+		struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i];
+		struct wq_drain_dead_softirq_work dead_work;
+
+		if (!need_more_worker(pool))
+			continue;
+
+		INIT_WORK(&dead_work.work, drain_dead_softirq_workfn);
+		dead_work.pool = pool;
+		init_completion(&dead_work.done);
+
+		if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+			queue_work(system_bh_highpri_wq, &dead_work.work);
+		else
+			queue_work(system_bh_wq, &dead_work.work);
+
+		wait_for_completion(&dead_work.done);
+	}
+}
+
 /**
  * check_flush_dependency - check for flush dependency sanity
  * @target_wq: workqueue being flushed
