All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Tejun Heo <tj@kernel.org>,
	Sasha Levin <sashal@kernel.org>,
	Imran Khan <imran.f.khan@oracle.com>,
	Xuewen Yan <xuewen.yan@unisoc.com>
Subject: [PATCH 5.10 34/79] cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock
Date: Tue, 13 Sep 2022 16:04:39 +0200	[thread overview]
Message-ID: <20220913140351.945201247@linuxfoundation.org> (raw)
In-Reply-To: <20220913140350.291927556@linuxfoundation.org>

From: Tejun Heo <tj@kernel.org>

[ Upstream commit 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 ]

Bringing up a CPU may involve creating and destroying tasks which requires
read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
cpus_read_lock(). However, cpuset's ->attach(), which may be called with
thredagroup_rwsem write-locked, also wants to disable CPU hotplug and
acquires cpus_read_lock(), leading to a deadlock.

Fix it by guaranteeing that ->attach() is always called with CPU hotplug
disabled and removing cpus_read_lock() call from cpuset_attach().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
Reported-and-tested-by: Xuewen Yan <xuewen.yan@unisoc.com>
Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug")
Cc: stable@vger.kernel.org # v5.17+
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++-------------
 kernel/cgroup/cpuset.c |  3 +-
 2 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1072843b25709..684c16849eff3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2304,6 +2304,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
+/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(bool lock_threadgroup)
+{
+	cpus_read_lock();
+	if (lock_threadgroup)
+		percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(bool lock_threadgroup)
+{
+	if (lock_threadgroup)
+		percpu_up_write(&cgroup_threadgroup_rwsem);
+	cpus_read_unlock();
+}
+
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
@@ -2780,8 +2821,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 }
 
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-					     bool *locked)
-	__acquires(&cgroup_threadgroup_rwsem)
+					     bool *threadgroup_locked)
 {
 	struct task_struct *tsk;
 	pid_t pid;
@@ -2798,12 +2838,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	 * Therefore, we can skip the global lock.
 	 */
 	lockdep_assert_held(&cgroup_mutex);
-	if (pid || threadgroup) {
-		percpu_down_write(&cgroup_threadgroup_rwsem);
-		*locked = true;
-	} else {
-		*locked = false;
-	}
+	*threadgroup_locked = pid || threadgroup;
+	cgroup_attach_lock(*threadgroup_locked);
 
 	rcu_read_lock();
 	if (pid) {
@@ -2834,17 +2870,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	goto out_unlock_rcu;
 
 out_unlock_threadgroup:
-	if (*locked) {
-		percpu_up_write(&cgroup_threadgroup_rwsem);
-		*locked = false;
-	}
+	cgroup_attach_unlock(*threadgroup_locked);
+	*threadgroup_locked = false;
 out_unlock_rcu:
 	rcu_read_unlock();
 	return tsk;
 }
 
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
-	__releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
 {
 	struct cgroup_subsys *ss;
 	int ssid;
@@ -2852,8 +2885,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
 	/* release reference from cgroup_procs_write_start() */
 	put_task_struct(task);
 
-	if (locked)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock(threadgroup_locked);
+
 	for_each_subsys(ss, ssid)
 		if (ss->post_attach)
 			ss->post_attach();
@@ -2930,8 +2963,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	 * write-locking can be skipped safely.
 	 */
 	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
-	if (has_tasks)
-		percpu_down_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_lock(has_tasks);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -2952,8 +2984,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	if (has_tasks)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock(has_tasks);
 	return ret;
 }
 
@@ -4809,13 +4840,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
 	struct task_struct *task;
 	const struct cred *saved_cred;
 	ssize_t ret;
-	bool locked;
+	bool threadgroup_locked;
 
 	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
 	if (!dst_cgrp)
 		return -ENODEV;
 
-	task = cgroup_procs_write_start(buf, true, &locked);
+	task = cgroup_procs_write_start(buf, true, &threadgroup_locked);
 	ret = PTR_ERR_OR_ZERO(task);
 	if (ret)
 		goto out_unlock;
@@ -4841,7 +4872,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
 	ret = cgroup_attach_task(dst_cgrp, task, true);
 
 out_finish:
-	cgroup_procs_write_finish(task, locked);
+	cgroup_procs_write_finish(task, threadgroup_locked);
 out_unlock:
 	cgroup_kn_unlock(of->kn);
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c51863b63f93a..b7830f1f1f3a5 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2212,7 +2212,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
-	cpus_read_lock();
+	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
 	percpu_down_write(&cpuset_rwsem);
 
 	/* prepare for attach */
@@ -2268,7 +2268,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	percpu_up_write(&cpuset_rwsem);
-	cpus_read_unlock();
 }
 
 /* The various types of files and directories in a cpuset file system */
-- 
2.35.1




  parent reply	other threads:[~2022-09-13 14:51 UTC|newest]

Thread overview: 90+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-13 14:04 [PATCH 5.10 00/79] 5.10.143-rc1 review Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 01/79] NFSD: Fix verifier returned in stable WRITEs Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 02/79] xen-blkfront: Cache feature_persistent value before advertisement Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 03/79] tty: n_gsm: initialize more members at gsm_alloc_mux() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 04/79] tty: n_gsm: avoid call of sleeping functions from atomic context Greg Kroah-Hartman
2022-09-14 12:38   ` Pavel Machek
2022-09-13 14:04 ` [PATCH 5.10 05/79] efi: libstub: Disable struct randomization Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 06/79] efi: capsule-loader: Fix use-after-free in efi_capsule_write Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 07/79] wifi: iwlegacy: 4965: corrected fix for potential off-by-one overflow in il4965_rs_fill_link_cmd() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 08/79] net: mvpp2: debugfs: fix memory leak when using debugfs_lookup() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 09/79] fs: only do a memory barrier for the first set_buffer_uptodate() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 10/79] Revert "mm: kmemleak: take a full lowmem check in kmemleak_*_phys()" Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 11/79] scsi: qla2xxx: Disable ATIO interrupt coalesce for quad port ISP27XX Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 12/79] scsi: megaraid_sas: Fix double kfree() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 13/79] drm/gem: Fix GEM handle release errors Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 14/79] drm/amdgpu: Move psp_xgmi_terminate call from amdgpu_xgmi_remove_device to psp_hw_fini Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 15/79] drm/amdgpu: Check num_gfx_rings for gfx v9_0 rb setup Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 16/79] drm/radeon: add a force flush to delay work when radeon Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 17/79] parisc: ccio-dma: Handle kmalloc failure in ccio_init_resources() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 18/79] parisc: Add runtime check to prevent PA2.0 kernels on PA1.x machines Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 19/79] arm64: cacheinfo: Fix incorrect assignment of signed error value to unsigned fw_level Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 20/79] arm64/signal: Raise limit on stack frames Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 21/79] net/core/skbuff: Check the return value of skb_copy_bits() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 22/79] fbdev: chipsfb: Add missing pci_disable_device() in chipsfb_pci_init() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 23/79] drm/amdgpu: mmVM_L2_CNTL3 register not initialized correctly Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 24/79] ALSA: emu10k1: Fix out of bounds access in snd_emu10k1_pcm_channel_alloc() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 25/79] ALSA: aloop: Fix random zeros in capture data when using jiffies timer Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 26/79] ALSA: usb-audio: Fix an out-of-bounds bug in __snd_usb_parse_audio_interface() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 27/79] kprobes: Prohibit probes in gate area Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 28/79] debugfs: add debugfs_lookup_and_remove() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 29/79] nvmet: fix a use-after-free Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 30/79] drm/i915: Implement WaEdpLinkRateDataReload Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 31/79] scsi: mpt3sas: Fix use-after-free warning Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 32/79] scsi: lpfc: Add missing destroy_workqueue() in error path Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 33/79] cgroup: Elide write-locking threadgroup_rwsem when updating csses on an empty subtree Greg Kroah-Hartman
2022-09-13 14:04 ` Greg Kroah-Hartman [this message]
2022-09-13 14:04 ` [PATCH 5.10 35/79] cifs: remove useless parameter is_fsctl from SMB2_ioctl() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 36/79] smb3: missing inode locks in punch hole Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 37/79] ARM: dts: imx6qdl-kontron-samx6i: remove duplicated node Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 38/79] regulator: core: Clean up on enable failure Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 39/79] tee: fix compiler warning in tee_shm_register() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 40/79] RDMA/cma: Fix arguments order in net device validation Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 41/79] soc: brcmstb: pm-arm: Fix refcount leak and __iomem leak bugs Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 42/79] RDMA/hns: Fix supported page size Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 43/79] RDMA/hns: Fix wrong fixed value of qp->rq.wqe_shift Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 44/79] ARM: dts: at91: sama5d27_wlsom1: specify proper regulator output ranges Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 45/79] ARM: dts: at91: sama5d2_icp: " Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 46/79] ARM: dts: at91: sama5d27_wlsom1: dont keep ldo2 enabled all the time Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 47/79] ARM: dts: at91: sama5d2_icp: dont keep vdd_other " Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 48/79] netfilter: br_netfilter: Drop dst references before setting Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 49/79] netfilter: nf_tables: clean up hook list when offload flags check fails Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 50/79] netfilter: nf_conntrack_irc: Fix forged IP logic Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 51/79] ALSA: usb-audio: Inform the delayed registration more properly Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 52/79] ALSA: usb-audio: Register card again for iface over delayed_register option Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 53/79] rxrpc: Fix an insufficiently large sglist in rxkad_verify_packet_2() Greg Kroah-Hartman
2022-09-13 14:04 ` [PATCH 5.10 54/79] afs: Use the operation issue time instead of the reply time for callbacks Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 55/79] sch_sfb: Dont assume the skb is still around after enqueueing to child Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 56/79] tipc: fix shift wrapping bug in map_get() Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 57/79] ice: use bitmap_free instead of devm_kfree Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 58/79] i40e: Fix kernel crash during module removal Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 59/79] net: fec: Use a spinlock to guard `fep->ptp_clk_on` Greg Kroah-Hartman
2022-09-13 15:57   ` Marc Kleine-Budde
2022-09-13 14:05 ` [PATCH 5.10 60/79] xen-netback: only remove hotplug-status when the vif is actually destroyed Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 61/79] RDMA/siw: Pass a pointer to virt_to_page() Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 62/79] ipv6: sr: fix out-of-bounds read when setting HMAC data Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 63/79] IB/core: Fix a nested dead lock as part of ODP flow Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 64/79] RDMA/mlx5: Set local port to one when accessing counters Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 65/79] nvme-tcp: fix UAF when detecting digest errors Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 66/79] nvme-tcp: fix regression that causes sporadic requests to time out Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 67/79] tcp: fix early ETIMEDOUT after spurious non-SACK RTO Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 68/79] sch_sfb: Also store skb len before calling child enqueue Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 69/79] ASoC: mchp-spdiftx: remove references to mchp_i2s_caps Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 70/79] ASoC: mchp-spdiftx: Fix clang -Wbitfield-constant-conversion Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 71/79] MIPS: loongson32: ls1c: Fix hang during startup Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 72/79] swiotlb: avoid potential left shift overflow Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 73/79] iommu/amd: use full 64-bit value in build_completion_wait() Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 74/79] hwmon: (mr75203) fix VM sensor allocation when "intel,vm-map" not defined Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 75/79] hwmon: (mr75203) update pvt->v_num and vm_num to the actual number of used sensors Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 76/79] hwmon: (mr75203) fix voltage equation for negative source input Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 77/79] hwmon: (mr75203) fix multi-channel voltage reading Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 78/79] hwmon: (mr75203) enable polling for all VM channels Greg Kroah-Hartman
2022-09-13 14:05 ` [PATCH 5.10 79/79] arm64: errata: add detection for AMEVCNTR01 incrementing incorrectly Greg Kroah-Hartman
2022-09-14  9:38 ` [PATCH 5.10 00/79] 5.10.143-rc1 review Sudip Mukherjee
2022-09-14  9:51 ` Pavel Machek
2022-09-14 11:04 ` Naresh Kamboju
2022-09-14 15:27 ` Jon Hunter
2022-09-14 20:58 ` Florian Fainelli
2022-09-15  0:13 ` Guenter Roeck
2022-09-15  7:28 ` Rudi Heitbaum
2022-09-17  3:18 ` zhouzhixiu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220913140351.945201247@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=imran.f.khan@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sashal@kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=xuewen.yan@unisoc.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.