yet another approach to fix the loop lock order inversions v3

All of lore.kernel.org
 help / color / mirror / Atom feed

* yet another approach to fix the loop lock order inversions v3
@ 2022-03-16  8:45 Christoph Hellwig
  2022-03-16  8:45 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
                   ` (8 more replies)
  0 siblings, 9 replies; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

Hi Jens, hi Tetsuo,

this series uses the approach from Tetsuo to delay the destroy_workueue
call, extended by a delayed teardown of the workers to fix a potential
race window then the workqueue can be still round after finishing the
commands. 

Changes since v2:
 - rebased to the lastest block for-next tree, which has the async
   clear reverted and ->free_disk
 - impkement ->free_disk for loop to handle open vs delete races
   more gracefully
 - get rid of lo_refcnt entirely

Changes since v1:
 - add comments to document the lo_refcnt synchronization
 - fix comment typos

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 1/8] loop: de-duplicate the idle worker freeing code
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16  8:45 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

Use a common helper for both timer based and uncoditional freeing of idle
workers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 73 +++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 38 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 08f7772bb5261..64227e659efc4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -809,7 +809,6 @@ struct loop_worker {
 
 static void loop_workfn(struct work_struct *work);
 static void loop_rootcg_workfn(struct work_struct *work);
-static void loop_free_idle_workers(struct timer_list *timer);
 
 #ifdef CONFIG_BLK_CGROUP
 static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
@@ -893,6 +892,39 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	spin_unlock_irq(&lo->lo_work_lock);
 }
 
+static void loop_set_timer(struct loop_device *lo)
+{
+	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_free_idle_workers(struct loop_device *lo, bool delete_all)
+{
+	struct loop_worker *pos, *worker;
+
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		if (!delete_all &&
+		    time_is_after_jiffies(worker->last_ran_at +
+					  LOOP_IDLE_WORKER_TIMEOUT))
+			break;
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->blkcg_css);
+		kfree(worker);
+	}
+	if (!list_empty(&lo->idle_worker_list))
+		loop_set_timer(lo);
+	spin_unlock_irq(&lo->lo_work_lock);
+}
+
+static void loop_free_idle_workers_timer(struct timer_list *timer)
+{
+	struct loop_device *lo = container_of(timer, struct loop_device, timer);
+
+	return loop_free_idle_workers(lo, false);
+}
+
 static void loop_update_rotational(struct loop_device *lo)
 {
 	struct file *file = lo->lo_backing_file;
@@ -1027,7 +1059,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	INIT_LIST_HEAD(&lo->idle_worker_list);
 	lo->worker_tree = RB_ROOT;
-	timer_setup(&lo->timer, loop_free_idle_workers,
+	timer_setup(&lo->timer, loop_free_idle_workers_timer,
 		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
@@ -1091,7 +1123,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
-	struct loop_worker *pos, *worker;
 
 	/*
 	 * Flush loop_configure() and loop_change_fd(). It is acceptable for
@@ -1121,15 +1152,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
-	spin_lock_irq(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	spin_unlock_irq(&lo->lo_work_lock);
+	loop_free_idle_workers(lo, true);
 	del_timer_sync(&lo->timer);
 
 	spin_lock_irq(&lo->lo_lock);
@@ -1887,11 +1910,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	}
 }
 
-static void loop_set_timer(struct loop_device *lo)
-{
-	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
-}
-
 static void loop_process_work(struct loop_worker *worker,
 			struct list_head *cmd_list, struct loop_device *lo)
 {
@@ -1940,27 +1958,6 @@ static void loop_rootcg_workfn(struct work_struct *work)
 	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
 }
 
-static void loop_free_idle_workers(struct timer_list *timer)
-{
-	struct loop_device *lo = container_of(timer, struct loop_device, timer);
-	struct loop_worker *pos, *worker;
-
-	spin_lock_irq(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		if (time_is_after_jiffies(worker->last_ran_at +
-						LOOP_IDLE_WORKER_TIMEOUT))
-			break;
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	if (!list_empty(&lo->idle_worker_list))
-		loop_set_timer(lo);
-	spin_unlock_irq(&lo->lo_work_lock);
-}
-
 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
 	.complete	= lo_complete_rq,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/8] loop: initialize the worker tracking fields once
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
  2022-03-16  8:45 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16  9:26   ` Chaitanya Kulkarni
  2022-03-16  8:45 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

There is no need to reinitialize idle_worker_list, worker_tree and timer
every time a loop device is configured.  Just initialize them once at
allocation time.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 64227e659efc4..2d344fefda6b8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1057,10 +1057,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 
 	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
-	INIT_LIST_HEAD(&lo->idle_worker_list);
-	lo->worker_tree = RB_ROOT;
-	timer_setup(&lo->timer, loop_free_idle_workers_timer,
-		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
 	lo->lo_backing_file = file;
@@ -1973,6 +1969,9 @@ static int loop_add(int i)
 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
 	if (!lo)
 		goto out;
+	lo->worker_tree = RB_ROOT;
+	INIT_LIST_HEAD(&lo->idle_worker_list);
+	timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
 	lo->lo_state = Lo_unbound;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
  2022-03-16  8:45 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
  2022-03-16  8:45 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16 10:24   ` Jan Kara
  2022-03-16  8:45 ` [PATCH 4/8] loop: don't freeze the queue in lo_release Christoph Hellwig
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

Nothing prevents a file system or userspace opener of the block device
from redirtying the page right afte sync_blockdev returned.  Fortunately
data in the page cache during a block device change is mostly harmless
anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 2d344fefda6b8..e3361c6b22150 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1276,15 +1276,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
-	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
-		/* If any pages were dirtied after invalidate_bdev(), try again */
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	prev_lo_flags = lo->lo_flags;
 
 	err = loop_set_status_from_info(lo, info);
@@ -1495,21 +1486,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	invalidate_bdev(lo->lo_device);
 
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* invalidate_bdev should have truncated all the pages */
-	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	return err;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 4/8] loop: don't freeze the queue in lo_release
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2022-03-16  8:45 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16  8:45 ` [PATCH 5/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

By the time the final ->release is called there can't be outstanding I/O.
For non-final ->release there is no need for driver action at all.

Thus remove the useless queue freeze.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index e3361c6b22150..87d77464c2800 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1753,13 +1753,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 */
 		__loop_clr_fd(lo, true);
 		return;
-	} else if (lo->lo_state == Lo_bound) {
-		/*
-		 * Otherwise keep thread (if running) and config,
-		 * but flush possible ongoing bios in thread.
-		 */
-		blk_mq_freeze_queue(lo->lo_queue);
-		blk_mq_unfreeze_queue(lo->lo_queue);
 	}
 
 out_unlock:
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 5/8] loop: only freeze the queue in __loop_clr_fd when needed
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2022-03-16  8:45 ` [PATCH 4/8] loop: don't freeze the queue in lo_release Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16  8:45 ` [PATCH 6/8] loop: implement ->free_disk Christoph Hellwig
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

->release is only called after all outstanding I/O has completed, so only
freeze the queue when clearing the backing file of a live loop device.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 87d77464c2800..0813f63d5bbd2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1144,8 +1144,13 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);
 
-	/* freeze request queue during the transition */
-	blk_mq_freeze_queue(lo->lo_queue);
+	/*
+	 * Freeze the request queue when unbinding on a live file descriptor and
+	 * thus an open device.  When called from ->release we are guaranteed
+	 * that there is no I/O in progress already.
+	 */
+	if (!release)
+		blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
 	loop_free_idle_workers(lo, true);
@@ -1170,7 +1175,8 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	mapping_set_gfp_mask(filp->f_mapping, gfp);
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
-	blk_mq_unfreeze_queue(lo->lo_queue);
+	if (!release)
+		blk_mq_unfreeze_queue(lo->lo_queue);
 
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 6/8] loop: implement ->free_disk
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2022-03-16  8:45 ` [PATCH 5/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16 10:30   ` Jan Kara
  2022-03-16  8:45 ` [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release Christoph Hellwig
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

Ensure that the lo_device which is stored in the gendisk private
data is valid until the gendisk is freed.  Currently the loop driver
uses a lot of effort to make sure a device is not freed when it is
still in use, but to to fix a potential deadlock this will be relaxed
a bit soon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0813f63d5bbd2..cbaa18bcad1fe 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1765,6 +1765,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	mutex_unlock(&lo->lo_mutex);
 }
 
+static void lo_free_disk(struct gendisk *disk)
+{
+	struct loop_device *lo = disk->private_data;
+
+	mutex_destroy(&lo->lo_mutex);
+	kfree(lo);
+}
+
 static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
 	.open =		lo_open,
@@ -1773,6 +1781,7 @@ static const struct block_device_operations lo_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	lo_compat_ioctl,
 #endif
+	.free_disk =	lo_free_disk,
 };
 
 /*
@@ -2064,14 +2073,13 @@ static void loop_remove(struct loop_device *lo)
 {
 	/* Make this loop device unreachable from pathname. */
 	del_gendisk(lo->lo_disk);
-	blk_cleanup_disk(lo->lo_disk);
+	blk_cleanup_queue(lo->lo_disk->queue);
 	blk_mq_free_tag_set(&lo->tag_set);
 	mutex_lock(&loop_ctl_mutex);
 	idr_remove(&loop_index_idr, lo->lo_number);
 	mutex_unlock(&loop_ctl_mutex);
-	/* There is no route which can find this loop device. */
-	mutex_destroy(&lo->lo_mutex);
-	kfree(lo);
+
+	put_disk(lo->lo_disk);
 }
 
 static void loop_probe(dev_t dev)
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2022-03-16  8:45 ` [PATCH 6/8] loop: implement ->free_disk Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16 11:22   ` Jan Kara
  2022-03-16  8:45 ` [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd Christoph Hellwig
  2022-03-16 10:59 ` yet another approach to fix the loop lock order inversions v3 Tetsuo Handa
  8 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

lo_refcount counts how many openers a loop device has, but that count
is already provided by the block layer in the bd_openers field of the
whole-disk block_device.  Remove lo_refcount and allow opens to
succeed even on devices beeing deleted - now that ->free_disk is
implemented we can handle that race gracefull and all I/O on it will
just fail. Similarly there is a small race window now where
loop_control_remove does not synchronize the delete vs the remove
due do bd_openers not being under lo_mutex protection, but we can
handle that just as gracefully.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 36 +++++++-----------------------------
 drivers/block/loop.h |  1 -
 2 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cbaa18bcad1fe..c270f3715d829 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1244,7 +1244,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
 	 * command to fail with EBUSY.
 	 */
-	if (atomic_read(&lo->lo_refcnt) > 1) {
+	if (lo->lo_disk->part0->bd_openers > 1) {
 		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
 		mutex_unlock(&lo->lo_mutex);
 		return 0;
@@ -1724,33 +1724,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 }
 #endif
 
-static int lo_open(struct block_device *bdev, fmode_t mode)
-{
-	struct loop_device *lo = bdev->bd_disk->private_data;
-	int err;
-
-	err = mutex_lock_killable(&lo->lo_mutex);
-	if (err)
-		return err;
-	if (lo->lo_state == Lo_deleting)
-		err = -ENXIO;
-	else
-		atomic_inc(&lo->lo_refcnt);
-	mutex_unlock(&lo->lo_mutex);
-	return err;
-}
-
 static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo = disk->private_data;
 
-	mutex_lock(&lo->lo_mutex);
-	if (atomic_dec_return(&lo->lo_refcnt))
-		goto out_unlock;
+	if (disk->part0->bd_openers > 0)
+		return;
 
-	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
-		if (lo->lo_state != Lo_bound)
-			goto out_unlock;
+	mutex_lock(&lo->lo_mutex);
+	if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
 		lo->lo_state = Lo_rundown;
 		mutex_unlock(&lo->lo_mutex);
 		/*
@@ -1760,8 +1742,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		__loop_clr_fd(lo, true);
 		return;
 	}
-
-out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 }
 
@@ -1775,7 +1755,6 @@ static void lo_free_disk(struct gendisk *disk)
 
 static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
-	.open =		lo_open,
 	.release =	lo_release,
 	.ioctl =	lo_ioctl,
 #ifdef CONFIG_COMPAT
@@ -2029,7 +2008,6 @@ static int loop_add(int i)
 	 */
 	if (!part_shift)
 		disk->flags |= GENHD_FL_NO_PART;
-	atomic_set(&lo->lo_refcnt, 0);
 	mutex_init(&lo->lo_mutex);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
@@ -2119,12 +2097,12 @@ static int loop_control_remove(int idx)
 	if (ret)
 		goto mark_visible;
 	if (lo->lo_state != Lo_unbound ||
-	    atomic_read(&lo->lo_refcnt) > 0) {
+	    lo->lo_disk->part0->bd_openers > 0) {
 		mutex_unlock(&lo->lo_mutex);
 		ret = -EBUSY;
 		goto mark_visible;
 	}
-	/* Mark this loop device no longer open()-able. */
+	/* Mark this loop device as no more bound, but not quite unbound yet */
 	lo->lo_state = Lo_deleting;
 	mutex_unlock(&lo->lo_mutex);
 
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 082d4b6bfc6a6..449d562738c52 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -28,7 +28,6 @@ struct loop_func_table;
 
 struct loop_device {
 	int		lo_number;
-	atomic_t	lo_refcnt;
 	loff_t		lo_offset;
 	loff_t		lo_sizelimit;
 	int		lo_flags;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2022-03-16  8:45 ` [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release Christoph Hellwig
@ 2022-03-16  8:45 ` Christoph Hellwig
  2022-03-16 11:25   ` Jan Kara
  2022-03-16 10:59 ` yet another approach to fix the loop lock order inversions v3 Tetsuo Handa
  8 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-16  8:45 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei,
	syzbot+6479585dfd4dedd3f7e1

There is no need to destroy the workqueue when clearing unbinding
a loop device from a backing file.  Not doing so on the other hand
avoid creating a complex lock dependency chain involving the global
system_transition_mutex.

Based on a patch from Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>.

Reported-by: syzbot+6479585dfd4dedd3f7e1@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c270f3715d829..ffc9f00c433cd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -808,7 +808,6 @@ struct loop_worker {
 };
 
 static void loop_workfn(struct work_struct *work);
-static void loop_rootcg_workfn(struct work_struct *work);
 
 #ifdef CONFIG_BLK_CGROUP
 static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
@@ -1043,20 +1042,19 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	    !file->f_op->write_iter)
 		lo->lo_flags |= LO_FLAGS_READ_ONLY;
 
-	lo->workqueue = alloc_workqueue("loop%d",
-					WQ_UNBOUND | WQ_FREEZABLE,
-					0,
-					lo->lo_number);
 	if (!lo->workqueue) {
-		error = -ENOMEM;
-		goto out_unlock;
+		lo->workqueue = alloc_workqueue("loop%d",
+						WQ_UNBOUND | WQ_FREEZABLE,
+						0, lo->lo_number);
+		if (!lo->workqueue) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
 	}
 
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
 	set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
-	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
-	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
 	lo->lo_backing_file = file;
@@ -1152,10 +1150,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (!release)
 		blk_mq_freeze_queue(lo->lo_queue);
 
-	destroy_workqueue(lo->workqueue);
-	loop_free_idle_workers(lo, true);
-	del_timer_sync(&lo->timer);
-
 	spin_lock_irq(&lo->lo_lock);
 	filp = lo->lo_backing_file;
 	lo->lo_backing_file = NULL;
@@ -1749,6 +1743,10 @@ static void lo_free_disk(struct gendisk *disk)
 {
 	struct loop_device *lo = disk->private_data;
 
+	if (lo->workqueue)
+		destroy_workqueue(lo->workqueue);
+	loop_free_idle_workers(lo, true);
+	del_timer_sync(&lo->timer);
 	mutex_destroy(&lo->lo_mutex);
 	kfree(lo);
 }
@@ -2012,6 +2010,8 @@ static int loop_add(int i)
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
 	spin_lock_init(&lo->lo_work_lock);
+	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	disk->major		= LOOP_MAJOR;
 	disk->first_minor	= i << part_shift;
 	disk->minors		= 1 << part_shift;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/8] loop: initialize the worker tracking fields once
  2022-03-16  8:45 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
@ 2022-03-16  9:26   ` Chaitanya Kulkarni
  0 siblings, 0 replies; 25+ messages in thread
From: Chaitanya Kulkarni @ 2022-03-16  9:26 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, Darrick J . Wong, linux-block, Ming Lei

On 3/16/22 01:45, Christoph Hellwig wrote:
> There is no need to reinitialize idle_worker_list, worker_tree and timer
> every time a loop device is configured.  Just initialize them once at
> allocation time.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Jan Kara <jack@suse.cz>
> Tested-by: Darrick J. Wong <djwong@kernel.org>
> ---

Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-03-16  8:45 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
@ 2022-03-16 10:24   ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2022-03-16 10:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, Darrick J . Wong,
	linux-block, Ming Lei

On Wed 16-03-22 09:45:14, Christoph Hellwig wrote:
> Nothing prevents a file system or userspace opener of the block device
> from redirtying the page right afte sync_blockdev returned.  Fortunately
> data in the page cache during a block device change is mostly harmless
> anyway.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Darrick J. Wong <djwong@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 20 --------------------
>  1 file changed, 20 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 2d344fefda6b8..e3361c6b22150 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1276,15 +1276,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	/* I/O need to be drained during transfer transition */
>  	blk_mq_freeze_queue(lo->lo_queue);
>  
> -	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		/* If any pages were dirtied after invalidate_bdev(), try again */
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	prev_lo_flags = lo->lo_flags;
>  
>  	err = loop_set_status_from_info(lo, info);
> @@ -1495,21 +1486,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  	invalidate_bdev(lo->lo_device);
>  
>  	blk_mq_freeze_queue(lo->lo_queue);
> -
> -	/* invalidate_bdev should have truncated all the pages */
> -	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	blk_queue_logical_block_size(lo->lo_queue, arg);
>  	blk_queue_physical_block_size(lo->lo_queue, arg);
>  	blk_queue_io_min(lo->lo_queue, arg);
>  	loop_update_dio(lo);
> -out_unfreeze:
>  	blk_mq_unfreeze_queue(lo->lo_queue);
>  
>  	return err;
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 6/8] loop: implement ->free_disk
  2022-03-16  8:45 ` [PATCH 6/8] loop: implement ->free_disk Christoph Hellwig
@ 2022-03-16 10:30   ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2022-03-16 10:30 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, Darrick J . Wong,
	linux-block, Ming Lei

On Wed 16-03-22 09:45:17, Christoph Hellwig wrote:
> Ensure that the lo_device which is stored in the gendisk private
> data is valid until the gendisk is freed.  Currently the loop driver
> uses a lot of effort to make sure a device is not freed when it is
> still in use, but to to fix a potential deadlock this will be relaxed
> a bit soon.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 16 ++++++++++++----
>  1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 0813f63d5bbd2..cbaa18bcad1fe 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1765,6 +1765,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
>  	mutex_unlock(&lo->lo_mutex);
>  }
>  
> +static void lo_free_disk(struct gendisk *disk)
> +{
> +	struct loop_device *lo = disk->private_data;
> +
> +	mutex_destroy(&lo->lo_mutex);
> +	kfree(lo);
> +}
> +
>  static const struct block_device_operations lo_fops = {
>  	.owner =	THIS_MODULE,
>  	.open =		lo_open,
> @@ -1773,6 +1781,7 @@ static const struct block_device_operations lo_fops = {
>  #ifdef CONFIG_COMPAT
>  	.compat_ioctl =	lo_compat_ioctl,
>  #endif
> +	.free_disk =	lo_free_disk,
>  };
>  
>  /*
> @@ -2064,14 +2073,13 @@ static void loop_remove(struct loop_device *lo)
>  {
>  	/* Make this loop device unreachable from pathname. */
>  	del_gendisk(lo->lo_disk);
> -	blk_cleanup_disk(lo->lo_disk);
> +	blk_cleanup_queue(lo->lo_disk->queue);
>  	blk_mq_free_tag_set(&lo->tag_set);
>  	mutex_lock(&loop_ctl_mutex);
>  	idr_remove(&loop_index_idr, lo->lo_number);
>  	mutex_unlock(&loop_ctl_mutex);
> -	/* There is no route which can find this loop device. */
> -	mutex_destroy(&lo->lo_mutex);
> -	kfree(lo);
> +
> +	put_disk(lo->lo_disk);
>  }
>  
>  static void loop_probe(dev_t dev)
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: yet another approach to fix the loop lock order inversions v3
  2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2022-03-16  8:45 ` [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd Christoph Hellwig
@ 2022-03-16 10:59 ` Tetsuo Handa
  2022-03-22 11:12   ` Christoph Hellwig
  8 siblings, 1 reply; 25+ messages in thread
From: Tetsuo Handa @ 2022-03-16 10:59 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe
  Cc: Jan Kara, Darrick J . Wong, linux-block, Ming Lei

I tested this series using next-20220315.
I can't test locking dependency because lockdep is turned off upon boot.

[   30.251160] ======================================================
[   30.251162] WARNING: possible circular locking dependency detected
[   30.251164] 5.17.0-rc8-next-20220315+ #22 Not tainted
[   30.251166] ------------------------------------------------------
[   30.251168] mount/407 is trying to acquire lock:
[   30.251170] ffff8881003c8898 (&p->pi_lock){-.-.}-{2:2}, at: try_to_wake_up+0x4f/0x4d0
[   30.251185] 
[   30.251185] but task is already holding lock:
[   30.251186] ffffffff830f7798 ((console_sem).lock){-.-.}-{2:2}, at: up+0xd/0x50
[   30.251195] 
[   30.251195] which lock already depends on the new lock.
(...snipped...)
[   30.251433] Chain exists of:
[   30.251433]   &p->pi_lock --> &rq->__lock --> (console_sem).lock
[   30.251433] 
[   30.251440]  Possible unsafe locking scenario:
[   30.251440] 
[   30.251441]        CPU0                    CPU1
[   30.251443]        ----                    ----
[   30.251444]   lock((console_sem).lock);
[   30.251446]                                lock(&rq->__lock);
[   30.251458]                                lock((console_sem).lock);
[   30.251461]   lock(&p->pi_lock);
[   30.251464] 
[   30.251464]  *** DEADLOCK ***
[   30.251464] 
[   30.251465] 2 locks held by mount/407:
[   30.251467]  #0: ffffffff830c7878 (low_water_lock){+.+.}-{2:2}, at: do_exit+0x7ff/0xeb0
[   30.251486]  #1: ffffffff830f7798 ((console_sem).lock){-.-.}-{2:2}, at: up+0xd/0x50

But I can see that due to no longer waiting for lo->lo_mutex from lo_open(),
there are occasional I/O errors. What is your plan to avoid this?

----------------------------------------
[  148.444639] loop56: detected capacity change from 0 to 2048
[  148.448298] I/O error, dev loop56, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  148.456250] loop57: detected capacity change from 0 to 2048
--
[  149.264210] loop70: detected capacity change from 0 to 2048
[  149.267449] I/O error, dev loop70, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  149.314558] loop71: detected capacity change from 0 to 2048
--
[  154.708948] loop172: detected capacity change from 0 to 2048
[  154.712403] I/O error, dev loop172, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  154.760841] loop173: detected capacity change from 0 to 2048
[  154.763728] I/O error, dev loop173, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  154.789921] loop174: detected capacity change from 0 to 2048
--
[  155.469135] loop185: detected capacity change from 0 to 2048
[  155.470800] I/O error, dev loop185, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  155.483457] loop186: detected capacity change from 0 to 2048
--
[  155.568911] loop190: detected capacity change from 0 to 2048
[  155.570783] I/O error, dev loop190, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  155.576789] loop191: detected capacity change from 0 to 2048
--
[  159.671039] loop259: detected capacity change from 0 to 2048
[  159.674879] I/O error, dev loop259, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  159.706059] loop260: detected capacity change from 0 to 2048
--
[  162.845545] loop309: detected capacity change from 0 to 2048
[  162.848151] I/O error, dev loop309, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  162.873731] loop310: detected capacity change from 0 to 2048
--
[  162.940326] loop313: detected capacity change from 0 to 2048
[  162.943770] I/O error, dev loop313, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  163.012664] loop314: detected capacity change from 0 to 2048
--
[  164.725370] loop338: detected capacity change from 0 to 2048
[  164.728747] I/O error, dev loop338, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  164.734669] loop339: detected capacity change from 0 to 2048
--
[  166.463447] loop370: detected capacity change from 0 to 2048
[  166.468262] I/O error, dev loop370, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  166.476621] loop371: detected capacity change from 0 to 2048
--
[  169.133011] loop417: detected capacity change from 0 to 2048
[  169.136747] I/O error, dev loop417, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  169.140203] I/O error, dev loop417, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
[  169.152730] Buffer I/O error on dev loop417, logical block 0, async page read
[  169.299206] loop418: detected capacity change from 0 to 2048
----------------------------------------



By the way, if CONFIG_BLOCK_LEGACY_AUTOLOAD=n,

# mount -o loop,ro isofs.iso isofs/

unconditionally fails with

mount: isofs/: failed to setup loop device for isofs.iso.

message. Commit 451f0b6f4c44d7b6 ("block: default BLOCK_LEGACY_AUTOLOAD to y")
says "if the device node already exists because old scripts created it manually".
But it is not always manual creation of loop devices; I think it is
ioctl(LOOP_CTL_GET_FREE) in my case.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release
  2022-03-16  8:45 ` [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release Christoph Hellwig
@ 2022-03-16 11:22   ` Jan Kara
  2022-03-16 13:14     ` Tetsuo Handa
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Kara @ 2022-03-16 11:22 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, Darrick J . Wong,
	linux-block, Ming Lei

On Wed 16-03-22 09:45:18, Christoph Hellwig wrote:
> lo_refcount counts how many openers a loop device has, but that count
> is already provided by the block layer in the bd_openers field of the
> whole-disk block_device.  Remove lo_refcount and allow opens to
> succeed even on devices beeing deleted - now that ->free_disk is
> implemented we can handle that race gracefull and all I/O on it will
> just fail. Similarly there is a small race window now where
> loop_control_remove does not synchronize the delete vs the remove
> due do bd_openers not being under lo_mutex protection, but we can
> handle that just as gracefully.

Honestly, I'm a bit uneasy about these races and ability to have open
removed loop device (given use of loop devices in container environments
potential problems could have security implications). But I guess it is no
different to having open hot-unplugged scsi disk. There may be just an
expectation from userspace that your open either blocks loop device removal
or open fails. But I guess we can deal with that if some real breakage
happens - it does not seem that hard to solve - we just need
loop_control_remove() to grab disk->open_mutex to make transition to
Lo_deleting state safe and keep Lo_deleting check in lo_open(). Plus we'd
need to use READ_ONCE / WRITE_ONCE for lo_state accesses.

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/block/loop.c | 36 +++++++-----------------------------
>  drivers/block/loop.h |  1 -
>  2 files changed, 7 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index cbaa18bcad1fe..c270f3715d829 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1244,7 +1244,7 @@ static int loop_clr_fd(struct loop_device *lo)
>  	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
>  	 * command to fail with EBUSY.
>  	 */
> -	if (atomic_read(&lo->lo_refcnt) > 1) {
> +	if (lo->lo_disk->part0->bd_openers > 1) {

But bd_openers can be read safely only under disk->open_mutex. So for this
to be safe against compiler playing nasty tricks with optimizations, we
need to either make bd_openers atomic_t or use READ_ONCE / WRITE_ONCE when
accessing it.

> @@ -1724,33 +1724,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
>  }
>  #endif
>  
> -static int lo_open(struct block_device *bdev, fmode_t mode)
> -{
> -	struct loop_device *lo = bdev->bd_disk->private_data;
> -	int err;
> -
> -	err = mutex_lock_killable(&lo->lo_mutex);
> -	if (err)
> -		return err;
> -	if (lo->lo_state == Lo_deleting)
> -		err = -ENXIO;
> -	else
> -		atomic_inc(&lo->lo_refcnt);
> -	mutex_unlock(&lo->lo_mutex);
> -	return err;
> -}
> -

Tetsuo has observed [1] that not grabbing lo_mutex when opening loop device
tends to break systemd-udevd because in loop_configure() we send
DISK_EVENT_MEDIA_CHANGE event before the device is fully configured (but
the configuration gets finished before we release the lo_mutex) and so
systemd-udev gets spurious IO errors when probing new loop devices and is
unhappy. So I think this is the right way to go but we need to reshuffle
loop_configure() a bit first.

[1] https://lore.kernel.org/all/a72c59c6-298b-e4ba-b1f5-2275afab49a1@I-love.SAKURA.ne.jp/T/#u

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd
  2022-03-16  8:45 ` [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd Christoph Hellwig
@ 2022-03-16 11:25   ` Jan Kara
  2022-03-16 13:24     ` Tetsuo Handa
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Kara @ 2022-03-16 11:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, Darrick J . Wong,
	linux-block, Ming Lei, syzbot+6479585dfd4dedd3f7e1

On Wed 16-03-22 09:45:19, Christoph Hellwig wrote:
> There is no need to destroy the workqueue when clearing unbinding
> a loop device from a backing file.  Not doing so on the other hand
> avoid creating a complex lock dependency chain involving the global
> system_transition_mutex.
> 
> Based on a patch from Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>.
> 
> Reported-by: syzbot+6479585dfd4dedd3f7e1@syzkaller.appspotmail.com
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 26 +++++++++++++-------------
>  1 file changed, 13 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index c270f3715d829..ffc9f00c433cd 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -808,7 +808,6 @@ struct loop_worker {
>  };
>  
>  static void loop_workfn(struct work_struct *work);
> -static void loop_rootcg_workfn(struct work_struct *work);
>  
>  #ifdef CONFIG_BLK_CGROUP
>  static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
> @@ -1043,20 +1042,19 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
>  	    !file->f_op->write_iter)
>  		lo->lo_flags |= LO_FLAGS_READ_ONLY;
>  
> -	lo->workqueue = alloc_workqueue("loop%d",
> -					WQ_UNBOUND | WQ_FREEZABLE,
> -					0,
> -					lo->lo_number);
>  	if (!lo->workqueue) {
> -		error = -ENOMEM;
> -		goto out_unlock;
> +		lo->workqueue = alloc_workqueue("loop%d",
> +						WQ_UNBOUND | WQ_FREEZABLE,
> +						0, lo->lo_number);
> +		if (!lo->workqueue) {
> +			error = -ENOMEM;
> +			goto out_unlock;
> +		}
>  	}
>  
>  	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
>  	set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
>  
> -	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
> -	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
>  	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
>  	lo->lo_device = bdev;
>  	lo->lo_backing_file = file;
> @@ -1152,10 +1150,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
>  	if (!release)
>  		blk_mq_freeze_queue(lo->lo_queue);
>  
> -	destroy_workqueue(lo->workqueue);
> -	loop_free_idle_workers(lo, true);
> -	del_timer_sync(&lo->timer);
> -
>  	spin_lock_irq(&lo->lo_lock);
>  	filp = lo->lo_backing_file;
>  	lo->lo_backing_file = NULL;
> @@ -1749,6 +1743,10 @@ static void lo_free_disk(struct gendisk *disk)
>  {
>  	struct loop_device *lo = disk->private_data;
>  
> +	if (lo->workqueue)
> +		destroy_workqueue(lo->workqueue);
> +	loop_free_idle_workers(lo, true);
> +	del_timer_sync(&lo->timer);
>  	mutex_destroy(&lo->lo_mutex);
>  	kfree(lo);
>  }
> @@ -2012,6 +2010,8 @@ static int loop_add(int i)
>  	lo->lo_number		= i;
>  	spin_lock_init(&lo->lo_lock);
>  	spin_lock_init(&lo->lo_work_lock);
> +	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
> +	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
>  	disk->major		= LOOP_MAJOR;
>  	disk->first_minor	= i << part_shift;
>  	disk->minors		= 1 << part_shift;
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release
  2022-03-16 11:22   ` Jan Kara
@ 2022-03-16 13:14     ` Tetsuo Handa
  2022-03-16 14:38       ` Jan Kara
  0 siblings, 1 reply; 25+ messages in thread
From: Tetsuo Handa @ 2022-03-16 13:14 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig
  Cc: Jens Axboe, Darrick J . Wong, linux-block, Ming Lei

On 2022/03/16 20:22, Jan Kara wrote:
>> @@ -1244,7 +1244,7 @@ static int loop_clr_fd(struct loop_device *lo)
>>  	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
>>  	 * command to fail with EBUSY.
>>  	 */
>> -	if (atomic_read(&lo->lo_refcnt) > 1) {
>> +	if (lo->lo_disk->part0->bd_openers > 1) {
> 
> But bd_openers can be read safely only under disk->open_mutex. So for this
> to be safe against compiler playing nasty tricks with optimizations, we
> need to either make bd_openers atomic_t or use READ_ONCE / WRITE_ONCE when
> accessing it.

Use of READ_ONCE() / WRITE_ONCE() are not for avoiding races but for making
sure that memory access happens only once. It is data_race() which is needed
for tolerating and annotating races. For example, data_race(lo->lo_state) is
needed when accessing lo->lo_state without lo->lo_mutex held.

Use of atomic_t for lo->lo_disk->part0->bd_openers does not help, for currently
lo->lo_mutex is held in order to avoid races. That is, it is disk->open_mutex
which loop_clr_fd() needs to hold when accessing lo->lo_disk->part0->bd_openers.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd
  2022-03-16 11:25   ` Jan Kara
@ 2022-03-16 13:24     ` Tetsuo Handa
  2022-03-22 11:10       ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Tetsuo Handa @ 2022-03-16 13:24 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig
  Cc: Jens Axboe, Darrick J . Wong, linux-block, Ming Lei,
	syzbot+6479585dfd4dedd3f7e1

On 2022/03/16 20:25, Jan Kara wrote:
> On Wed 16-03-22 09:45:19, Christoph Hellwig wrote:
>> There is no need to destroy the workqueue when clearing unbinding
>> a loop device from a backing file.  Not doing so on the other hand
>> avoid creating a complex lock dependency chain involving the global
>> system_transition_mutex.
>>
>> Based on a patch from Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>.
>>
>> Reported-by: syzbot+6479585dfd4dedd3f7e1@syzkaller.appspotmail.com
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Looks good. Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>

I don't recommend this change.

Since this WQ is invoked when flushing data to disk, this WQ had better use
WQ_MEM_RECLAIM flag when creating. A WQ created with WQ_MEM_RECLAIM flag has
at least one "struct task_struct" in order to guarantee forward progress, but
results in consuming more kernel resources. Therefore, it is preferable to
destroy the WQ when clearing unbinding a loop device from a backing file.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release
  2022-03-16 13:14     ` Tetsuo Handa
@ 2022-03-16 14:38       ` Jan Kara
  2022-03-22 11:09         ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Kara @ 2022-03-16 14:38 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Darrick J . Wong,
	linux-block, Ming Lei

On Wed 16-03-22 22:14:50, Tetsuo Handa wrote:
> On 2022/03/16 20:22, Jan Kara wrote:
> >> @@ -1244,7 +1244,7 @@ static int loop_clr_fd(struct loop_device *lo)
> >>  	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
> >>  	 * command to fail with EBUSY.
> >>  	 */
> >> -	if (atomic_read(&lo->lo_refcnt) > 1) {
> >> +	if (lo->lo_disk->part0->bd_openers > 1) {
> > 
> > But bd_openers can be read safely only under disk->open_mutex. So for this
> > to be safe against compiler playing nasty tricks with optimizations, we
> > need to either make bd_openers atomic_t or use READ_ONCE / WRITE_ONCE when
> > accessing it.
> 
> Use of READ_ONCE() / WRITE_ONCE() are not for avoiding races but for making
> sure that memory access happens only once. It is data_race() which is needed
> for tolerating and annotating races. For example, data_race(lo->lo_state) is
> needed when accessing lo->lo_state without lo->lo_mutex held.

Well, but another effect of READ_ONCE() / WRITE_ONCE() is that it
effectively forces the compiler to not store any intermediate value in
bd_openers. If you have code like bdev->bd_openers++, and bd_openers has
value say 1, the compiler is fully within its rights if unlocked reader
sees values, 1, 0, 3, 2. It would have to be a vicious compiler but the C
standard allows that and some of the optimizations compilers end up doing
result in code which is not far from this (read more about KCSAN and the
motivation behind it for details). So data_race() annotation is *not*
enough for unlocked bd_openers usage.

> Use of atomic_t for lo->lo_disk->part0->bd_openers does not help, for
> currently lo->lo_mutex is held in order to avoid races. That is, it is
> disk->open_mutex which loop_clr_fd() needs to hold when accessing
> lo->lo_disk->part0->bd_openers.

It does help because with atomic_t, seeing any intermediate values is not
possible even for unlocked readers.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release
  2022-03-16 14:38       ` Jan Kara
@ 2022-03-22 11:09         ` Christoph Hellwig
  2022-03-23 12:18           ` Jan Kara
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-22 11:09 UTC (permalink / raw)
  To: Jan Kara
  Cc: Tetsuo Handa, Christoph Hellwig, Jens Axboe, Darrick J . Wong,
	linux-block, Ming Lei

On Wed, Mar 16, 2022 at 03:38:55PM +0100, Jan Kara wrote:
> Well, but another effect of READ_ONCE() / WRITE_ONCE() is that it
> effectively forces the compiler to not store any intermediate value in
> bd_openers. If you have code like bdev->bd_openers++, and bd_openers has
> value say 1, the compiler is fully within its rights if unlocked reader
> sees values, 1, 0, 3, 2. It would have to be a vicious compiler but the C
> standard allows that and some of the optimizations compilers end up doing
> result in code which is not far from this (read more about KCSAN and the
> motivation behind it for details). So data_race() annotation is *not*
> enough for unlocked bd_openers usage.
> 
> > Use of atomic_t for lo->lo_disk->part0->bd_openers does not help, for
> > currently lo->lo_mutex is held in order to avoid races. That is, it is
> > disk->open_mutex which loop_clr_fd() needs to hold when accessing
> > lo->lo_disk->part0->bd_openers.
> 
> It does help because with atomic_t, seeing any intermediate values is not
> possible even for unlocked readers.

The Linux memory model guarantees atomic reads from 32-bit integers.
But if it makes everyone happier I could do a READ_ONCE here.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd
  2022-03-16 13:24     ` Tetsuo Handa
@ 2022-03-22 11:10       ` Christoph Hellwig
  2022-03-22 12:42         ` Tetsuo Handa
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-22 11:10 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Darrick J . Wong,
	linux-block, Ming Lei, syzbot+6479585dfd4dedd3f7e1

On Wed, Mar 16, 2022 at 10:24:20PM +0900, Tetsuo Handa wrote:
> Since this WQ is invoked when flushing data to disk, this WQ had better use
> WQ_MEM_RECLAIM flag when creating. A WQ created with WQ_MEM_RECLAIM flag has
> at least one "struct task_struct" in order to guarantee forward progress, but
> results in consuming more kernel resources. Therefore, it is preferable to
> destroy the WQ when clearing unbinding a loop device from a backing file.

Whіch then gets us into lock dependency problems.  A previously used
but lingering around device will use some resources, so what?  If you
care about the least used resources the only way to get there is to
destroy the device.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: yet another approach to fix the loop lock order inversions v3
  2022-03-16 10:59 ` yet another approach to fix the loop lock order inversions v3 Tetsuo Handa
@ 2022-03-22 11:12   ` Christoph Hellwig
  0 siblings, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2022-03-22 11:12 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Christoph Hellwig, Jens Axboe, Jan Kara, Darrick J . Wong,
	linux-block, Ming Lei

On Wed, Mar 16, 2022 at 07:59:15PM +0900, Tetsuo Handa wrote:
> But I can see that due to no longer waiting for lo->lo_mutex from lo_open(),
> there are occasional I/O errors. What is your plan to avoid this?

There is no plan to avoid that.  We'll get this areas due to have the
remove handling works.  We could play tricks with RQF_QUIET, but I'm
not sure that is worth it.

> By the way, if CONFIG_BLOCK_LEGACY_AUTOLOAD=n,
> 
> # mount -o loop,ro isofs.iso isofs/
> 
> unconditionally fails with
> 
> mount: isofs/: failed to setup loop device for isofs.iso.
> 
> message. Commit 451f0b6f4c44d7b6 ("block: default BLOCK_LEGACY_AUTOLOAD to y")
> says "if the device node already exists because old scripts created it manually".
> But it is not always manual creation of loop devices; I think it is
> ioctl(LOOP_CTL_GET_FREE) in my case.

I'll take a look, thanks!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd
  2022-03-22 11:10       ` Christoph Hellwig
@ 2022-03-22 12:42         ` Tetsuo Handa
  2022-03-23  0:06           ` Tetsuo Handa
  0 siblings, 1 reply; 25+ messages in thread
From: Tetsuo Handa @ 2022-03-22 12:42 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Darrick J . Wong, linux-block, Ming Lei,
	syzbot+6479585dfd4dedd3f7e1

On 2022/03/22 20:10, Christoph Hellwig wrote:
> On Wed, Mar 16, 2022 at 10:24:20PM +0900, Tetsuo Handa wrote:
>> Since this WQ is invoked when flushing data to disk, this WQ had better use
>> WQ_MEM_RECLAIM flag when creating. A WQ created with WQ_MEM_RECLAIM flag has
>> at least one "struct task_struct" in order to guarantee forward progress, but
>> results in consuming more kernel resources. Therefore, it is preferable to
>> destroy the WQ when clearing unbinding a loop device from a backing file.
> 
> Whіch then gets us into lock dependency problems.  A previously used
> but lingering around device will use some resources, so what?  If you
> care about the least used resources the only way to get there is to
> destroy the device.

There is no dependency problems and there is no resource wasting if we choose
task_work approach, for that approach runs in lockless context and allows
destroying "struct task_struct" as soon as device is unbound.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd
  2022-03-22 12:42         ` Tetsuo Handa
@ 2022-03-23  0:06           ` Tetsuo Handa
  0 siblings, 0 replies; 25+ messages in thread
From: Tetsuo Handa @ 2022-03-23  0:06 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Darrick J . Wong, linux-block, Ming Lei,
	syzbot+6479585dfd4dedd3f7e1

On 2022/03/22 21:42, Tetsuo Handa wrote:
> On 2022/03/22 20:10, Christoph Hellwig wrote:
>> On Wed, Mar 16, 2022 at 10:24:20PM +0900, Tetsuo Handa wrote:
>>> Since this WQ is invoked when flushing data to disk, this WQ had better use
>>> WQ_MEM_RECLAIM flag when creating. A WQ created with WQ_MEM_RECLAIM flag has
>>> at least one "struct task_struct" in order to guarantee forward progress, but
>>> results in consuming more kernel resources. Therefore, it is preferable to
>>> destroy the WQ when clearing unbinding a loop device from a backing file.
>>
>> Whіch then gets us into lock dependency problems.  A previously used
>> but lingering around device will use some resources, so what?  If you
>> care about the least used resources the only way to get there is to
>> destroy the device.
> 
> There is no dependency problems and there is no resource wasting if we choose
> task_work approach, for that approach runs in lockless context and allows
> destroying "struct task_struct" as soon as device is unbound.

This lockdep problem is reaching to whether the loop module should continue using WQ.
There must be a reason to convert kernel threads into WQ again, but use of WQ_MEM_RECLAIM flag
is causing troubles. https://lkml.kernel.org/r/20220322235032.GS1544202@dread.disaster.area

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release
  2022-03-22 11:09         ` Christoph Hellwig
@ 2022-03-23 12:18           ` Jan Kara
  2022-03-23 13:09             ` Tetsuo Handa
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Kara @ 2022-03-23 12:18 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Tetsuo Handa, Jens Axboe, Darrick J . Wong,
	linux-block, Ming Lei

On Tue 22-03-22 12:09:08, Christoph Hellwig wrote:
> On Wed, Mar 16, 2022 at 03:38:55PM +0100, Jan Kara wrote:
> > Well, but another effect of READ_ONCE() / WRITE_ONCE() is that it
> > effectively forces the compiler to not store any intermediate value in
> > bd_openers. If you have code like bdev->bd_openers++, and bd_openers has
> > value say 1, the compiler is fully within its rights if unlocked reader
> > sees values, 1, 0, 3, 2. It would have to be a vicious compiler but the C
> > standard allows that and some of the optimizations compilers end up doing
> > result in code which is not far from this (read more about KCSAN and the
> > motivation behind it for details). So data_race() annotation is *not*
> > enough for unlocked bd_openers usage.
> > 
> > > Use of atomic_t for lo->lo_disk->part0->bd_openers does not help, for
> > > currently lo->lo_mutex is held in order to avoid races. That is, it is
> > > disk->open_mutex which loop_clr_fd() needs to hold when accessing
> > > lo->lo_disk->part0->bd_openers.
> > 
> > It does help because with atomic_t, seeing any intermediate values is not
> > possible even for unlocked readers.
> 
> The Linux memory model guarantees atomic reads from 32-bit integers.
> But if it makes everyone happier I could do a READ_ONCE here.

Sure, the read is atomic wrt other CPU instructions, but it is not atomic
wrt how the compiler decides to implement bdi->bd_openers++. So we need to
make these bd_openers *updates* atomic so that the unlocked reads are
really safe. That being said I consider the concerns mostly theoretical so
I don't insist but some checker will surely complain sooner rather than
later.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release
  2022-03-23 12:18           ` Jan Kara
@ 2022-03-23 13:09             ` Tetsuo Handa
  0 siblings, 0 replies; 25+ messages in thread
From: Tetsuo Handa @ 2022-03-23 13:09 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig
  Cc: Jens Axboe, Darrick J . Wong, linux-block, Ming Lei

On 2022/03/23 21:18, Jan Kara wrote:
>>>> Use of atomic_t for lo->lo_disk->part0->bd_openers does not help, for
>>>> currently lo->lo_mutex is held in order to avoid races. That is, it is
>>>> disk->open_mutex which loop_clr_fd() needs to hold when accessing
>>>> lo->lo_disk->part0->bd_openers.
>>>
>>> It does help because with atomic_t, seeing any intermediate values is not
>>> possible even for unlocked readers.
>>
>> The Linux memory model guarantees atomic reads from 32-bit integers.
>> But if it makes everyone happier I could do a READ_ONCE here.
> 
> Sure, the read is atomic wrt other CPU instructions, but it is not atomic
> wrt how the compiler decides to implement bdi->bd_openers++. So we need to
> make these bd_openers *updates* atomic so that the unlocked reads are
> really safe. That being said I consider the concerns mostly theoretical so
> I don't insist but some checker will surely complain sooner rather than
> later.

KCSAN will complain access without data_race(). But what I'm saying here is
that loop_clr_fd() needs to hold disk->open_mutex, for blkdev_get_whole() is
protected using disk->open_mutex. Then, KCSAN will not complain this access.

static int loop_clr_fd(struct loop_device *lo)
{
        int err;

        if (err)
                return err;
        if (lo->lo_state != Lo_bound) {
                mutex_unlock(&lo->lo_mutex);
                return -ENXIO;
        }
        if (lo->lo_disk->part0->bd_openers > 1) { // Sees lo->lo_disk->part0->bd_openers == 1.
                lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
                mutex_unlock(&lo->lo_mutex);
                return 0;
        }
	// Preemption starts.
							static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
							{
							        struct gendisk *disk = bdev->bd_disk;
							        int ret;
							
							        if (disk->fops->open) { // disk->fops->open == NULL because lo_open() is removed.
							                ret = disk->fops->open(bdev, mode);
							                if (ret) {
							                        /* avoid ghost partitions on a removed medium */
							                        if (ret == -ENOMEDIUM &&
							                             test_bit(GD_NEED_PART_SCAN, &disk->state))
							                                bdev_disk_changed(disk, true);
							                        return ret;
							                }
							        }
							
							        if (!bdev->bd_openers)
							                set_init_blocksize(bdev);
							        if (test_bit(GD_NEED_PART_SCAN, &disk->state))
							                bdev_disk_changed(disk, false);
							        bdev->bd_openers++;
							        return 0;
							}
	// Preemption ends.
        lo->lo_state = Lo_rundown;
        mutex_unlock(&lo->lo_mutex);

        __loop_clr_fd(lo, false); // Despite lo->lo_disk->part0->bd_openers > 1.
        return 0;
}


^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2022-03-23 13:09 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-16  8:45 yet another approach to fix the loop lock order inversions v3 Christoph Hellwig
2022-03-16  8:45 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
2022-03-16  8:45 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
2022-03-16  9:26   ` Chaitanya Kulkarni
2022-03-16  8:45 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
2022-03-16 10:24   ` Jan Kara
2022-03-16  8:45 ` [PATCH 4/8] loop: don't freeze the queue in lo_release Christoph Hellwig
2022-03-16  8:45 ` [PATCH 5/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
2022-03-16  8:45 ` [PATCH 6/8] loop: implement ->free_disk Christoph Hellwig
2022-03-16 10:30   ` Jan Kara
2022-03-16  8:45 ` [PATCH 7/8] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release Christoph Hellwig
2022-03-16 11:22   ` Jan Kara
2022-03-16 13:14     ` Tetsuo Handa
2022-03-16 14:38       ` Jan Kara
2022-03-22 11:09         ` Christoph Hellwig
2022-03-23 12:18           ` Jan Kara
2022-03-23 13:09             ` Tetsuo Handa
2022-03-16  8:45 ` [PATCH 8/8] loop: don't destroy lo->workqueue in __loop_clr_fd Christoph Hellwig
2022-03-16 11:25   ` Jan Kara
2022-03-16 13:24     ` Tetsuo Handa
2022-03-22 11:10       ` Christoph Hellwig
2022-03-22 12:42         ` Tetsuo Handa
2022-03-23  0:06           ` Tetsuo Handa
2022-03-16 10:59 ` yet another approach to fix the loop lock order inversions v3 Tetsuo Handa
2022-03-22 11:12   ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.