All of lore.kernel.org
 help / color / mirror / Atom feed
* yet another approach to fix loop autoclear for xfstets xfs/049
@ 2022-01-26 15:50 Christoph Hellwig
  2022-01-26 15:50 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
                   ` (9 more replies)
  0 siblings, 10 replies; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

Hi Jens, hi Tetsuo,

this series uses the approach from Tetsuo to delay the destroy_workqueue
call, extended by a delayed teardown of the workers to fix a potential
race window where the workqueue can still be running after finishing the
commands.  It then also removes the queue freezing in release that is
not needed to fix the dependency chain for that (which can't be
reported by lockdep) as well.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/8] loop: de-duplicate the idle worker freeing code
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27  9:36   ` Jan Kara
  2022-01-26 15:50 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

Use a common helper for both timer based and unconditional freeing of idle
workers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 73 +++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 38 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 01cbbfc4e9e24..b268bca6e4fb7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -804,7 +804,6 @@ struct loop_worker {
 
 static void loop_workfn(struct work_struct *work);
 static void loop_rootcg_workfn(struct work_struct *work);
-static void loop_free_idle_workers(struct timer_list *timer);
 
 #ifdef CONFIG_BLK_CGROUP
 static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
@@ -888,6 +887,39 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	spin_unlock_irq(&lo->lo_work_lock);
 }
 
+static void loop_set_timer(struct loop_device *lo)
+{
+	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_free_idle_workers(struct loop_device *lo, bool delete_all)
+{
+	struct loop_worker *pos, *worker;
+
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		if (!delete_all &&
+		    time_is_after_jiffies(worker->last_ran_at +
+					  LOOP_IDLE_WORKER_TIMEOUT))
+			break;
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->blkcg_css);
+		kfree(worker);
+	}
+	if (!list_empty(&lo->idle_worker_list))
+		loop_set_timer(lo);
+	spin_unlock_irq(&lo->lo_work_lock);
+}
+
+static void loop_free_idle_workers_timer(struct timer_list *timer)
+{
+	struct loop_device *lo = container_of(timer, struct loop_device, timer);
+
+	return loop_free_idle_workers(lo, false);
+}
+
 static void loop_update_rotational(struct loop_device *lo)
 {
 	struct file *file = lo->lo_backing_file;
@@ -1022,7 +1054,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	INIT_LIST_HEAD(&lo->idle_worker_list);
 	lo->worker_tree = RB_ROOT;
-	timer_setup(&lo->timer, loop_free_idle_workers,
+	timer_setup(&lo->timer, loop_free_idle_workers_timer,
 		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
@@ -1086,7 +1118,6 @@ static void __loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
-	struct loop_worker *pos, *worker;
 
 	/*
 	 * Flush loop_configure() and loop_change_fd(). It is acceptable for
@@ -1116,15 +1147,7 @@ static void __loop_clr_fd(struct loop_device *lo)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
-	spin_lock_irq(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	spin_unlock_irq(&lo->lo_work_lock);
+	loop_free_idle_workers(lo, true);
 	del_timer_sync(&lo->timer);
 
 	spin_lock_irq(&lo->lo_lock);
@@ -1871,11 +1894,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	}
 }
 
-static void loop_set_timer(struct loop_device *lo)
-{
-	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
-}
-
 static void loop_process_work(struct loop_worker *worker,
 			struct list_head *cmd_list, struct loop_device *lo)
 {
@@ -1924,27 +1942,6 @@ static void loop_rootcg_workfn(struct work_struct *work)
 	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
 }
 
-static void loop_free_idle_workers(struct timer_list *timer)
-{
-	struct loop_device *lo = container_of(timer, struct loop_device, timer);
-	struct loop_worker *pos, *worker;
-
-	spin_lock_irq(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		if (time_is_after_jiffies(worker->last_ran_at +
-						LOOP_IDLE_WORKER_TIMEOUT))
-			break;
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	if (!list_empty(&lo->idle_worker_list))
-		loop_set_timer(lo);
-	spin_unlock_irq(&lo->lo_work_lock);
-}
-
 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
 	.complete	= lo_complete_rq,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 2/8] loop: initialize the worker tracking fields once
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
  2022-01-26 15:50 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27  9:45   ` Jan Kara
  2022-01-26 15:50 ` [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

There is no need to reinitialize idle_worker_list, worker_tree and timer
every time a loop device is configured.  Just initialize them once at
allocation time.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b268bca6e4fb7..6ec55a5d9dfc4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1052,10 +1052,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 
 	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
-	INIT_LIST_HEAD(&lo->idle_worker_list);
-	lo->worker_tree = RB_ROOT;
-	timer_setup(&lo->timer, loop_free_idle_workers_timer,
-		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
 	lo->lo_backing_file = file;
@@ -1957,6 +1953,9 @@ static int loop_add(int i)
 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
 	if (!lo)
 		goto out;
+	lo->worker_tree = RB_ROOT;
+	INIT_LIST_HEAD(&lo->idle_worker_list);
+	timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
 	lo->lo_state = Lo_unbound;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
  2022-01-26 15:50 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
  2022-01-26 15:50 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27  9:47   ` Jan Kara
  2022-01-26 15:50 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

Nothing prevents a file system or userspace opener of the block device
from redirtying the page right after sync_blockdev returned.  Fortunately
data in the page cache during a block device change is mostly harmless
anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6ec55a5d9dfc4..d3a7f281ce1b6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1278,15 +1278,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
-	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
-		/* If any pages were dirtied after invalidate_bdev(), try again */
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	prev_lo_flags = lo->lo_flags;
 
 	err = loop_set_status_from_info(lo, info);
@@ -1497,21 +1488,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	invalidate_bdev(lo->lo_device);
 
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* invalidate_bdev should have truncated all the pages */
-	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	return err;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2022-01-26 15:50 ` [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27  9:48   ` Jan Kara
  2022-01-26 15:50 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

lo_refcnt is only incremented in lo_open and decremented in lo_release,
and thus protected by open_mutex.  Only take lo_mutex when lo_release
actually takes action for the final release.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d3a7f281ce1b6..43980ec69dfdd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1740,10 +1740,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo = disk->private_data;
 
-	mutex_lock(&lo->lo_mutex);
-	if (atomic_dec_return(&lo->lo_refcnt))
-		goto out_unlock;
+	if (!atomic_dec_and_test(&lo->lo_refcnt))
+		return;
 
+	mutex_lock(&lo->lo_mutex);
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
 		if (lo->lo_state != Lo_bound)
 			goto out_unlock;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2022-01-26 15:50 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27 10:28   ` Jan Kara
  2022-01-26 15:50 ` [PATCH 6/8] loop: don't freeze the queue in lo_release Christoph Hellwig
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

lo_refcnt is only incremented in lo_open and decremented in lo_release,
and thus protected by open_mutex.  Only take lo_mutex when lo_open is
called the first time, as only for the first open there is any effect
on the driver state (incremental opens on partitions don't end up in
lo_open at all already).

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 43980ec69dfdd..4b0058a67c48e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1725,13 +1725,16 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
 	struct loop_device *lo = bdev->bd_disk->private_data;
 	int err;
 
+	if (atomic_inc_return(&lo->lo_refcnt) > 1)
+		return 0;
+
 	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
-	if (lo->lo_state == Lo_deleting)
+	if (lo->lo_state == Lo_deleting) {
+		atomic_dec(&lo->lo_refcnt);
 		err = -ENXIO;
-	else
-		atomic_inc(&lo->lo_refcnt);
+	}
 	mutex_unlock(&lo->lo_mutex);
 	return err;
 }
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 6/8] loop: don't freeze the queue in lo_release
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2022-01-26 15:50 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27 10:42   ` Jan Kara
  2022-01-26 15:50 ` [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

By the time the final ->release is called there can't be outstanding I/O.
For non-final ->release there is no need for driver action at all.

Thus remove the useless queue freeze.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4b0058a67c48e..d9a0e2205762f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1758,13 +1758,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 */
 		loop_schedule_rundown(lo);
 		return;
-	} else if (lo->lo_state == Lo_bound) {
-		/*
-		 * Otherwise keep thread (if running) and config,
-		 * but flush possible ongoing bios in thread.
-		 */
-		blk_mq_freeze_queue(lo->lo_queue);
-		blk_mq_unfreeze_queue(lo->lo_queue);
 	}
 
 out_unlock:
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2022-01-26 15:50 ` [PATCH 6/8] loop: don't freeze the queue in lo_release Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27 11:01   ` Jan Kara
  2022-01-26 15:50 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

->release is only called after all outstanding I/O has completed, so only
freeze the queue when clearing the backing file of a live loop device.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d9a0e2205762f..fc0cdd1612b1d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1110,7 +1110,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	return error;
 }
 
-static void __loop_clr_fd(struct loop_device *lo)
+static void __loop_clr_fd(struct loop_device *lo, bool release)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
@@ -1139,8 +1139,13 @@ static void __loop_clr_fd(struct loop_device *lo)
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);
 
-	/* freeze request queue during the transition */
-	blk_mq_freeze_queue(lo->lo_queue);
+	/*
+	 * Freeze the request queue when unbinding on a live file descriptor and
+	 * thus an open device.  When called from ->release we are guaranteed
+	 * that there is no I/O in progress already.
+	 */
+	if (!release)
+		blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
 	loop_free_idle_workers(lo, true);
@@ -1163,7 +1168,8 @@ static void __loop_clr_fd(struct loop_device *lo)
 	/* let user-space know about this change */
 	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
 	mapping_set_gfp_mask(filp->f_mapping, gfp);
-	blk_mq_unfreeze_queue(lo->lo_queue);
+	if (!release)
+		blk_mq_unfreeze_queue(lo->lo_queue);
 
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
 
@@ -1201,7 +1207,7 @@ static void loop_rundown_workfn(struct work_struct *work)
 	struct block_device *bdev = lo->lo_device;
 	struct gendisk *disk = lo->lo_disk;
 
-	__loop_clr_fd(lo);
+	__loop_clr_fd(lo, true);
 	kobject_put(&bdev->bd_device.kobj);
 	module_put(disk->fops->owner);
 	loop_rundown_completed(lo);
@@ -1247,7 +1253,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_rundown;
 	mutex_unlock(&lo->lo_mutex);
 
-	__loop_clr_fd(lo);
+	__loop_clr_fd(lo, false);
 	loop_rundown_completed(lo);
 	return 0;
 }
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 8/8] loop: make autoclear operation synchronous again
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2022-01-26 15:50 ` [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27 11:04   ` Jan Kara
  2022-01-26 19:38 ` yet another approach to fix loop autoclear for xfstets xfs/049 Darrick J. Wong
  2022-01-27  1:05 ` Tetsuo Handa
  9 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Tetsuo Handa,
	kernel test robot, syzbot

From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

The kernel test robot is reporting that xfstest can fail at

  umount ext2 on xfs
  umount xfs

sequence, for commit 322c4293ecc58110 ("loop: make autoclear operation
asynchronous") broke what commit ("loop: Make explicit loop device
destruction lazy") wanted to achieve.

Although we cannot guarantee that nobody is holding a reference when
"umount xfs" is called, we should try to close a race window opened
by asynchronous autoclear operation.

It turned out that there is no need to call flush_workqueue() from
__loop_clr_fd(), for blk_mq_freeze_queue() blocks until all pending
"struct work_struct" are processed by loop_process_work().

Revert commit 322c4293ecc58110 ("loop: make autoclear operation
asynchronous"), and simply defer calling destroy_workqueue() till
loop_remove() after ensuring that the worker rbtree and reaping
timer are kept alive while the loop device exists.

Since a loop device is likely reused after once created thanks to
ioctl(LOOP_CTL_GET_FREE), being unable to destroy corresponding WQ
when ioctl(LOOP_CLR_FD) is called should be an acceptable waste.

Reported-by: kernel test robot <oliver.sang@intel.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Tested-by: syzbot <syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[hch: rebased, keep the work rbtree and timer around longer]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 71 ++++++++++++++------------------------------
 drivers/block/loop.h |  1 -
 2 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fc0cdd1612b1d..cf7830a68113a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1038,10 +1038,10 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	    !file->f_op->write_iter)
 		lo->lo_flags |= LO_FLAGS_READ_ONLY;
 
-	lo->workqueue = alloc_workqueue("loop%d",
-					WQ_UNBOUND | WQ_FREEZABLE,
-					0,
-					lo->lo_number);
+	if (!lo->workqueue)
+		lo->workqueue = alloc_workqueue("loop%d",
+						WQ_UNBOUND | WQ_FREEZABLE,
+						0, lo->lo_number);
 	if (!lo->workqueue) {
 		error = -ENOMEM;
 		goto out_unlock;
@@ -1147,10 +1147,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (!release)
 		blk_mq_freeze_queue(lo->lo_queue);
 
-	destroy_workqueue(lo->workqueue);
-	loop_free_idle_workers(lo, true);
-	del_timer_sync(&lo->timer);
-
 	spin_lock_irq(&lo->lo_lock);
 	filp = lo->lo_backing_file;
 	lo->lo_backing_file = NULL;
@@ -1176,9 +1172,11 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
 		int err;
 
-		mutex_lock(&lo->lo_disk->open_mutex);
+		if (!release)
+			mutex_lock(&lo->lo_disk->open_mutex);
 		err = bdev_disk_changed(lo->lo_disk, false);
-		mutex_unlock(&lo->lo_disk->open_mutex);
+		if (!release)
+			mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo->lo_number, err);
@@ -1189,39 +1187,17 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
 
-	fput(filp);
-}
-
-static void loop_rundown_completed(struct loop_device *lo)
-{
 	mutex_lock(&lo->lo_mutex);
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
 	module_put(THIS_MODULE);
-}
-
-static void loop_rundown_workfn(struct work_struct *work)
-{
-	struct loop_device *lo = container_of(work, struct loop_device,
-					      rundown_work);
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
 
-	__loop_clr_fd(lo, true);
-	kobject_put(&bdev->bd_device.kobj);
-	module_put(disk->fops->owner);
-	loop_rundown_completed(lo);
-}
-
-static void loop_schedule_rundown(struct loop_device *lo)
-{
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
-
-	__module_get(disk->fops->owner);
-	kobject_get(&bdev->bd_device.kobj);
-	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
-	queue_work(system_long_wq, &lo->rundown_work);
+	/*
+	 * Need not hold lo_mutex to fput backing file. Calling fput holding
+	 * lo_mutex triggers a circular lock dependency possibility warning as
+	 * fput can take open_mutex which is usually taken before lo_mutex.
+	 */
+	fput(filp);
 }
 
 static int loop_clr_fd(struct loop_device *lo)
@@ -1254,7 +1230,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	mutex_unlock(&lo->lo_mutex);
 
 	__loop_clr_fd(lo, false);
-	loop_rundown_completed(lo);
 	return 0;
 }
 
@@ -1753,20 +1728,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		return;
 
 	mutex_lock(&lo->lo_mutex);
-	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
-		if (lo->lo_state != Lo_bound)
-			goto out_unlock;
+	if (lo->lo_state == Lo_bound &&
+	    (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
 		lo->lo_state = Lo_rundown;
 		mutex_unlock(&lo->lo_mutex);
-		/*
-		 * In autoclear mode, stop the loop thread
-		 * and remove configuration after last close.
-		 */
-		loop_schedule_rundown(lo);
+		__loop_clr_fd(lo, true);
 		return;
 	}
 
-out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 }
 
@@ -2056,6 +2025,12 @@ static void loop_remove(struct loop_device *lo)
 	mutex_lock(&loop_ctl_mutex);
 	idr_remove(&loop_index_idr, lo->lo_number);
 	mutex_unlock(&loop_ctl_mutex);
+
+	if (lo->workqueue)
+		destroy_workqueue(lo->workqueue);
+	loop_free_idle_workers(lo, true);
+	del_timer_sync(&lo->timer);
+
 	/* There is no route which can find this loop device. */
 	mutex_destroy(&lo->lo_mutex);
 	kfree(lo);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 918a7a2dc0259..082d4b6bfc6a6 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,7 +56,6 @@ struct loop_device {
 	struct gendisk		*lo_disk;
 	struct mutex		lo_mutex;
 	bool			idr_visible;
-	struct work_struct      rundown_work;
 };
 
 struct loop_cmd {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: yet another approach to fix loop autoclear for xfstets xfs/049
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2022-01-26 15:50 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig
@ 2022-01-26 19:38 ` Darrick J. Wong
  2022-01-27 16:50   ` Darrick J. Wong
  2022-01-27  1:05 ` Tetsuo Handa
  9 siblings, 1 reply; 32+ messages in thread
From: Darrick J. Wong @ 2022-01-26 19:38 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei, xfs

On Wed, Jan 26, 2022 at 04:50:32PM +0100, Christoph Hellwig wrote:
> Hi Jens, hi Tetsuo,
> 
> this series uses the approach from Tetsuo to delay the destroy_workueue
> cll, extended by a delayed teardown of the workers to fix a potential
> racewindow then the workqueue can be still round after finishing the
> commands.  It then also removed the queue freeing in release that is
> not needed to fix the dependency chain for that (which can't be
> reported by lockdep) as well.

[add xfs to cc list]

This fixes all the regressions I've been seeing in xfs/049 and xfs/073,
thank you.  I'll give this a spin with the rest of fstests overnight.

--D

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: yet another approach to fix loop autoclear for xfstets xfs/049
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
                   ` (8 preceding siblings ...)
  2022-01-26 19:38 ` yet another approach to fix loop autoclear for xfstets xfs/049 Darrick J. Wong
@ 2022-01-27  1:05 ` Tetsuo Handa
  2022-01-28  7:08   ` Christoph Hellwig
  9 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2022-01-27  1:05 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe; +Cc: Jan Kara, linux-block, Ming Lei

On 2022/01/27 0:50, Christoph Hellwig wrote:
> Hi Jens, hi Tetsuo,
> 
> this series uses the approach from Tetsuo to delay the destroy_workueue
> call, extended by a delayed teardown of the workers to fix a potential
> racewindow then the workqueue can be still round after finishing the
> commands.  It then also removed the queue freeing in release that is
> not needed to fix the dependency chain for that (which can't be
> reported by lockdep) as well.

I want to remove disk->open_mutex => lo->lo_mutex => I/O completion chain itself from
/proc/lockdep . Even if real deadlock does not happen due to lo->lo_refcnt exclusion,
I consider that disk->open_mutex => lo->lo_mutex dependency being recorded is a bad sign.
It makes difficult to find real possibility of deadlock when things change in future.
I consider that lo_release() is doing too much things under disk->open_mutex.

I tried to defer lo->lo_mutex in lo_release() as much as possible. But this chain
still remains.

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cf7830a68113..a9abd213b38d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1706,12 +1706,19 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
 	struct loop_device *lo = bdev->bd_disk->private_data;
 	int err;
 
+	/*
+	 * Since lo_open() and lo_release() are serialized by disk->open_mutex,
+	 * we can racelessly increment/decrement lo->lo_refcnt without holding
+	 * lo->lo_mutex.
+	 */
 	if (atomic_inc_return(&lo->lo_refcnt) > 1)
 		return 0;
 
 	err = mutex_lock_killable(&lo->lo_mutex);
-	if (err)
+	if (err) {
+		atomic_dec(&lo->lo_refcnt);
 		return err;
+	}
 	if (lo->lo_state == Lo_deleting) {
 		atomic_dec(&lo->lo_refcnt);
 		err = -ENXIO;
@@ -1727,16 +1734,18 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	if (!atomic_dec_and_test(&lo->lo_refcnt))
 		return;
 
-	mutex_lock(&lo->lo_mutex);
-	if (lo->lo_state == Lo_bound &&
-	    (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
-		lo->lo_state = Lo_rundown;
-		mutex_unlock(&lo->lo_mutex);
-		__loop_clr_fd(lo, true);
+	/*
+	 * Since lo_open() and lo_release() are serialized by disk->open_mutex,
+	 * and lo->refcnt == 0 means nobody is using this device, we can read
+	 * lo->lo_state and lo->lo_flags without holding lo->lo_mutex.
+	 */
+	if (lo->lo_state != Lo_bound || !(lo->lo_flags & LO_FLAGS_AUTOCLEAR))
 		return;
-	}
-
+	mutex_lock(&lo->lo_mutex);
+	lo->lo_state = Lo_rundown;
 	mutex_unlock(&lo->lo_mutex);
+	__loop_clr_fd(lo, true);
+	return;
 }
 
 static const struct block_device_operations lo_fops = {

In __loop_clr_fd(), it waits for loop_validate_mutex. loop_validate_mutex can be
held when loop_change_fd() calls blk_mq_freeze_queue(). Loop recursion interacts
with other loop devices.

While each loop device takes care of only single backing file, we can use multiple
loop devices for multiple backing files within the same mount point (e.g. /dev/loop0
is attached on /mnt/file0 and /dev/loop1 is attached on /mnt/file1), can't we?
But fsfreeze is per a mount point (e.g. /mnt/). That is, fsfreeze also interacts
with other loop devices, doesn't it?

disk->open_mutex is a per a loop device serialization, but loop_validate_mutex and
fsfreeze are global serialization. It is anxious to involve global serialization under
individual serialization, when we already know that involvement of sysfs + fsfreeze
causes possibility of deadlock. I expect that lo_open()/lo_release() are done without
holding disk->open_mutex.


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/8] loop: de-duplicate the idle worker freeing code
  2022-01-26 15:50 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
@ 2022-01-27  9:36   ` Jan Kara
  0 siblings, 0 replies; 32+ messages in thread
From: Jan Kara @ 2022-01-27  9:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:33, Christoph Hellwig wrote:
> Use a common helper for both timer based and unconditional freeing of idle
> workers.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Nice cleanup. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 73 +++++++++++++++++++++-----------------------
>  1 file changed, 35 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 01cbbfc4e9e24..b268bca6e4fb7 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -804,7 +804,6 @@ struct loop_worker {
>  
>  static void loop_workfn(struct work_struct *work);
>  static void loop_rootcg_workfn(struct work_struct *work);
> -static void loop_free_idle_workers(struct timer_list *timer);
>  
>  #ifdef CONFIG_BLK_CGROUP
>  static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
> @@ -888,6 +887,39 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
>  	spin_unlock_irq(&lo->lo_work_lock);
>  }
>  
> +static void loop_set_timer(struct loop_device *lo)
> +{
> +	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
> +}
> +
> +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all)
> +{
> +	struct loop_worker *pos, *worker;
> +
> +	spin_lock_irq(&lo->lo_work_lock);
> +	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
> +				idle_list) {
> +		if (!delete_all &&
> +		    time_is_after_jiffies(worker->last_ran_at +
> +					  LOOP_IDLE_WORKER_TIMEOUT))
> +			break;
> +		list_del(&worker->idle_list);
> +		rb_erase(&worker->rb_node, &lo->worker_tree);
> +		css_put(worker->blkcg_css);
> +		kfree(worker);
> +	}
> +	if (!list_empty(&lo->idle_worker_list))
> +		loop_set_timer(lo);
> +	spin_unlock_irq(&lo->lo_work_lock);
> +}
> +
> +static void loop_free_idle_workers_timer(struct timer_list *timer)
> +{
> +	struct loop_device *lo = container_of(timer, struct loop_device, timer);
> +
> +	return loop_free_idle_workers(lo, false);
> +}
> +
>  static void loop_update_rotational(struct loop_device *lo)
>  {
>  	struct file *file = lo->lo_backing_file;
> @@ -1022,7 +1054,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
>  	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
>  	INIT_LIST_HEAD(&lo->idle_worker_list);
>  	lo->worker_tree = RB_ROOT;
> -	timer_setup(&lo->timer, loop_free_idle_workers,
> +	timer_setup(&lo->timer, loop_free_idle_workers_timer,
>  		TIMER_DEFERRABLE);
>  	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
>  	lo->lo_device = bdev;
> @@ -1086,7 +1118,6 @@ static void __loop_clr_fd(struct loop_device *lo)
>  {
>  	struct file *filp;
>  	gfp_t gfp = lo->old_gfp_mask;
> -	struct loop_worker *pos, *worker;
>  
>  	/*
>  	 * Flush loop_configure() and loop_change_fd(). It is acceptable for
> @@ -1116,15 +1147,7 @@ static void __loop_clr_fd(struct loop_device *lo)
>  	blk_mq_freeze_queue(lo->lo_queue);
>  
>  	destroy_workqueue(lo->workqueue);
> -	spin_lock_irq(&lo->lo_work_lock);
> -	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
> -				idle_list) {
> -		list_del(&worker->idle_list);
> -		rb_erase(&worker->rb_node, &lo->worker_tree);
> -		css_put(worker->blkcg_css);
> -		kfree(worker);
> -	}
> -	spin_unlock_irq(&lo->lo_work_lock);
> +	loop_free_idle_workers(lo, true);
>  	del_timer_sync(&lo->timer);
>  
>  	spin_lock_irq(&lo->lo_lock);
> @@ -1871,11 +1894,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
>  	}
>  }
>  
> -static void loop_set_timer(struct loop_device *lo)
> -{
> -	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
> -}
> -
>  static void loop_process_work(struct loop_worker *worker,
>  			struct list_head *cmd_list, struct loop_device *lo)
>  {
> @@ -1924,27 +1942,6 @@ static void loop_rootcg_workfn(struct work_struct *work)
>  	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
>  }
>  
> -static void loop_free_idle_workers(struct timer_list *timer)
> -{
> -	struct loop_device *lo = container_of(timer, struct loop_device, timer);
> -	struct loop_worker *pos, *worker;
> -
> -	spin_lock_irq(&lo->lo_work_lock);
> -	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
> -				idle_list) {
> -		if (time_is_after_jiffies(worker->last_ran_at +
> -						LOOP_IDLE_WORKER_TIMEOUT))
> -			break;
> -		list_del(&worker->idle_list);
> -		rb_erase(&worker->rb_node, &lo->worker_tree);
> -		css_put(worker->blkcg_css);
> -		kfree(worker);
> -	}
> -	if (!list_empty(&lo->idle_worker_list))
> -		loop_set_timer(lo);
> -	spin_unlock_irq(&lo->lo_work_lock);
> -}
> -
>  static const struct blk_mq_ops loop_mq_ops = {
>  	.queue_rq       = loop_queue_rq,
>  	.complete	= lo_complete_rq,
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/8] loop: initialize the worker tracking fields once
  2022-01-26 15:50 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
@ 2022-01-27  9:45   ` Jan Kara
  0 siblings, 0 replies; 32+ messages in thread
From: Jan Kara @ 2022-01-27  9:45 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:34, Christoph Hellwig wrote:
> There is no need to reinitialize idle_worker_list, worker_tree and timer
> every time a loop device is configured.  Just initialize them once at
> allocation time.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index b268bca6e4fb7..6ec55a5d9dfc4 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1052,10 +1052,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
>  
>  	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
>  	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
> -	INIT_LIST_HEAD(&lo->idle_worker_list);
> -	lo->worker_tree = RB_ROOT;
> -	timer_setup(&lo->timer, loop_free_idle_workers_timer,
> -		TIMER_DEFERRABLE);
>  	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
>  	lo->lo_device = bdev;
>  	lo->lo_backing_file = file;
> @@ -1957,6 +1953,9 @@ static int loop_add(int i)
>  	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
>  	if (!lo)
>  		goto out;
> +	lo->worker_tree = RB_ROOT;
> +	INIT_LIST_HEAD(&lo->idle_worker_list);
> +	timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
>  	lo->lo_state = Lo_unbound;
>  
>  	err = mutex_lock_killable(&loop_ctl_mutex);
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-26 15:50 ` [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
@ 2022-01-27  9:47   ` Jan Kara
  2022-01-27  9:49     ` Christoph Hellwig
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kara @ 2022-01-27  9:47 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:35, Christoph Hellwig wrote:
> Nothing prevents a file system or userspace opener of the block device
> > from redirtying the page right after sync_blockdev returned.  Fortunately
> data in the page cache during a block device change is mostly harmless
> anyway.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

My understanding was these warnings are there to tell userspace it is doing
something wrong. Something like the warning we issue when DIO races with
buffered IO... I'm not sure how useful they are but I don't see strong
reason to remove them either...

								Honza

> ---
>  drivers/block/loop.c | 20 --------------------
>  1 file changed, 20 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 6ec55a5d9dfc4..d3a7f281ce1b6 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1278,15 +1278,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	/* I/O need to be drained during transfer transition */
>  	blk_mq_freeze_queue(lo->lo_queue);
>  
> -	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		/* If any pages were dirtied after invalidate_bdev(), try again */
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	prev_lo_flags = lo->lo_flags;
>  
>  	err = loop_set_status_from_info(lo, info);
> @@ -1497,21 +1488,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  	invalidate_bdev(lo->lo_device);
>  
>  	blk_mq_freeze_queue(lo->lo_queue);
> -
> -	/* invalidate_bdev should have truncated all the pages */
> -	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	blk_queue_logical_block_size(lo->lo_queue, arg);
>  	blk_queue_physical_block_size(lo->lo_queue, arg);
>  	blk_queue_io_min(lo->lo_queue, arg);
>  	loop_update_dio(lo);
> -out_unfreeze:
>  	blk_mq_unfreeze_queue(lo->lo_queue);
>  
>  	return err;
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-26 15:50 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
@ 2022-01-27  9:48   ` Jan Kara
  2022-01-27 10:19     ` Jan Kara
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kara @ 2022-01-27  9:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:36, Christoph Hellwig wrote:
> lo_refcnt is only incremented in lo_open and decremented in lo_release,
> and thus protected by open_mutex.  Only take lo_mutex when lo_release
> actually takes action for the final release.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Yup, good idea. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index d3a7f281ce1b6..43980ec69dfdd 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1740,10 +1740,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
>  {
>  	struct loop_device *lo = disk->private_data;
>  
> -	mutex_lock(&lo->lo_mutex);
> -	if (atomic_dec_return(&lo->lo_refcnt))
> -		goto out_unlock;
> +	if (!atomic_dec_and_test(&lo->lo_refcnt))
> +		return;
>  
> +	mutex_lock(&lo->lo_mutex);
>  	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
>  		if (lo->lo_state != Lo_bound)
>  			goto out_unlock;
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-27  9:47   ` Jan Kara
@ 2022-01-27  9:49     ` Christoph Hellwig
  2022-01-27 12:23       ` Jan Kara
  0 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-27  9:49 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

On Thu, Jan 27, 2022 at 10:47:37AM +0100, Jan Kara wrote:
> On Wed 26-01-22 16:50:35, Christoph Hellwig wrote:
> > Nothing prevents a file system or userspace opener of the block device
> > from redirtying the page right afte sync_blockdev returned.  Fortunately
> > data in the page cache during a block device change is mostly harmless
> > anyway.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> My understanding was these warnings are there to tell userspace it is doing
> something wrong. Something like the warning we issue when DIO races with
> buffered IO... I'm not sure how useful they are but I don't see strong
> reason to remove them either...

Well, it is not just a warning, but also fails the command.  With some of
the reduced synchronization blktests loop/002 can hit them pretty reliably.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-27  9:48   ` Jan Kara
@ 2022-01-27 10:19     ` Jan Kara
  2022-01-27 10:28       ` Christoph Hellwig
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kara @ 2022-01-27 10:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Thu 27-01-22 10:48:13, Jan Kara wrote:
> On Wed 26-01-22 16:50:36, Christoph Hellwig wrote:
> > lo_refcnt is only incremented in lo_open and decremented in lo_release,
> > and thus protected by open_mutex.  Only take lo_mutex when lo_release
> > actually takes action for the final release.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Yup, good idea. Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>

On a second thought I retract this... See below:

> > index d3a7f281ce1b6..43980ec69dfdd 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -1740,10 +1740,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
> >  {
> >  	struct loop_device *lo = disk->private_data;
> >  
> > -	mutex_lock(&lo->lo_mutex);
> > -	if (atomic_dec_return(&lo->lo_refcnt))
> > -		goto out_unlock;
> > +	if (!atomic_dec_and_test(&lo->lo_refcnt))
> > +		return;
> >  
> > +	mutex_lock(&lo->lo_mutex);

There's a subtle race here like:

Thread 1				Thread2 (mount)
lo_release()
  if (!atomic_dec_and_test(&lo->lo_refcnt))
  - sees we are last one
					lo_open()
					  mutex_lock_killable(&lo->lo_mutex);
					  atomic_inc(&lo->lo_refcnt);
					  mutex_unlock(&lo->lo_mutex);
					ioctl(LOOP_GET_STATUS)
					  sees everything is fine
  mutex_lock(&lo->lo_mutex);
  if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
    cleans up the device
  }
					  is unhappy, mount fails

Just after writing this I have realized that the above sequence is not
actually possible due to disk->open_mutex protecting us and serializing
lo_release() with lo_open() but it needs at least a comment to explain that
we rely on disk->open_mutex to avoid races with lo_open().

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-27 10:19     ` Jan Kara
@ 2022-01-27 10:28       ` Christoph Hellwig
  0 siblings, 0 replies; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-27 10:28 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

On Thu, Jan 27, 2022 at 11:19:57AM +0100, Jan Kara wrote:
> Just after writing this I have realized that the above sequence is not
> actually possible due to disk->open_mutex protecting us and serializing
> lo_release() with lo_open() but it needs at least a comment to explain that
> we rely on disk->open_mutex to avoid races with lo_open().

The commit message already notes this.  Not sure we really need a comment
in the code itself, but if you want I can add it.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-01-26 15:50 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
@ 2022-01-27 10:28   ` Jan Kara
  2022-01-27 10:31     ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kara @ 2022-01-27 10:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:37, Christoph Hellwig wrote:
> lo_refcnt is only incremented in lo_open and decremented in lo_release,
> and thus protected by open_mutex.  Only take lo_mutex when lo_open is
> called the first time, as only for the first open there is any affect
> on the driver state (incremental opens on partitions don't end up in
> lo_open at all already).
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/block/loop.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 43980ec69dfdd..4b0058a67c48e 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1725,13 +1725,16 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
>  	struct loop_device *lo = bdev->bd_disk->private_data;
>  	int err;
>  
> +	if (atomic_inc_return(&lo->lo_refcnt) > 1)
> +		return 0;
> +
>  	err = mutex_lock_killable(&lo->lo_mutex);
>  	if (err)
>  		return err;
> -	if (lo->lo_state == Lo_deleting)
> +	if (lo->lo_state == Lo_deleting) {
> +		atomic_dec(&lo->lo_refcnt);
>  		err = -ENXIO;
> -	else
> -		atomic_inc(&lo->lo_refcnt);
> +	}
>  	mutex_unlock(&lo->lo_mutex);
>  	return err;

So this also relies on disk->open_mutex for correctness. Otherwise a race
like:

Thread1			Thread2
lo_open()
  if (atomic_inc_return(&lo->lo_refcnt) > 1)
  mutex_lock_killable(&lo->lo_mutex);
			lo_open()
			if (atomic_inc_return(&lo->lo_refcnt) > 1)
			  return 0;
  ..

can result in Thread2 using the loop device before Thread1 actually did the
"first open" checks. So perhaps one common comment for lo_open + lo_release
explaining the locking?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-01-27 10:28   ` Jan Kara
@ 2022-01-27 10:31     ` Tetsuo Handa
  0 siblings, 0 replies; 32+ messages in thread
From: Tetsuo Handa @ 2022-01-27 10:31 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig; +Cc: Jens Axboe, linux-block, Ming Lei

On 2022/01/27 19:28, Jan Kara wrote:
> So this also relies on disk->open_mutex for correctness. Otherwise a race
> like:
> 
> Thread1			Thread2
> lo_open()
>   if (atomic_inc_return(&lo->lo_refcnt) > 1)
>   mutex_lock_killable(&lo->lo_mutex);
> 			lo_open()
> 			if (atomic_inc_return(&lo->lo_refcnt) > 1)
> 			  return 0;
>   ..
> 
> can result in Thread2 using the loop device before Thread1 actually did the
> "first open" checks. So perhaps one common comment for lo_open + lo_release
> explaining the locking?

Please see https://lkml.kernel.org/r/7bebf860-2415-7eb6-55a1-47dc4439d9e9@I-love.SAKURA.ne.jp .

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 6/8] loop: don't freeze the queue in lo_release
  2022-01-26 15:50 ` [PATCH 6/8] loop: don't freeze the queue in lo_release Christoph Hellwig
@ 2022-01-27 10:42   ` Jan Kara
  2022-01-28  6:46     ` Christoph Hellwig
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kara @ 2022-01-27 10:42 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:38, Christoph Hellwig wrote:
> By the time the final ->release is called there can't be outstanding I/O.
> For non-final ->release there is no need for driver action at all.
> 
> Thus remove the useless queue freeze.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good, just one nit below. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 4b0058a67c48e..d9a0e2205762f 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1758,13 +1758,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
>  		 */
>  		loop_schedule_rundown(lo);
>  		return;
> -	} else if (lo->lo_state == Lo_bound) {
> -		/*
> -		 * Otherwise keep thread (if running) and config,
> -		 * but flush possible ongoing bios in thread.
> -		 */
> -		blk_mq_freeze_queue(lo->lo_queue);
> -		blk_mq_unfreeze_queue(lo->lo_queue);
>  	}

Maybe worth a comment like:

	/*
	 * No IO can be running at this point since there are no openers
	 * (covers filesystems, stacked devices, AIO) and the page cache is
	 * evicted.
	 */

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed
  2022-01-26 15:50 ` [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
@ 2022-01-27 11:01   ` Jan Kara
  2022-01-28  6:48     ` Christoph Hellwig
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kara @ 2022-01-27 11:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:39, Christoph Hellwig wrote:
> ->release is only called after all outstanding I/O has completed, so only
> freeze the queue when clearing the backing file of a live loop device.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good. I was just wondering: Is there anything which prevents us from
moving blk_mq_freeze_queue() & blk_mq_unfreeze_queue() calls to
loop_clr_fd() around __loop_clr_fd() call?

Anyway feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index d9a0e2205762f..fc0cdd1612b1d 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1110,7 +1110,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
>  	return error;
>  }
>  
> -static void __loop_clr_fd(struct loop_device *lo)
> +static void __loop_clr_fd(struct loop_device *lo, bool release)
>  {
>  	struct file *filp;
>  	gfp_t gfp = lo->old_gfp_mask;
> @@ -1139,8 +1139,13 @@ static void __loop_clr_fd(struct loop_device *lo)
>  	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
>  		blk_queue_write_cache(lo->lo_queue, false, false);
>  
> -	/* freeze request queue during the transition */
> -	blk_mq_freeze_queue(lo->lo_queue);
> +	/*
> +	 * Freeze the request queue when unbinding on a live file descriptor and
> +	 * thus an open device.  When called from ->release we are guaranteed
> +	 * that there is no I/O in progress already.
> +	 */
> +	if (!release)
> +		blk_mq_freeze_queue(lo->lo_queue);
>  
>  	destroy_workqueue(lo->workqueue);
>  	loop_free_idle_workers(lo, true);
> @@ -1163,7 +1168,8 @@ static void __loop_clr_fd(struct loop_device *lo)
>  	/* let user-space know about this change */
>  	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
>  	mapping_set_gfp_mask(filp->f_mapping, gfp);
> -	blk_mq_unfreeze_queue(lo->lo_queue);
> +	if (!release)
> +		blk_mq_unfreeze_queue(lo->lo_queue);
>  
>  	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
>  
> @@ -1201,7 +1207,7 @@ static void loop_rundown_workfn(struct work_struct *work)
>  	struct block_device *bdev = lo->lo_device;
>  	struct gendisk *disk = lo->lo_disk;
>  
> -	__loop_clr_fd(lo);
> +	__loop_clr_fd(lo, true);
>  	kobject_put(&bdev->bd_device.kobj);
>  	module_put(disk->fops->owner);
>  	loop_rundown_completed(lo);
> @@ -1247,7 +1253,7 @@ static int loop_clr_fd(struct loop_device *lo)
>  	lo->lo_state = Lo_rundown;
>  	mutex_unlock(&lo->lo_mutex);
>  
> -	__loop_clr_fd(lo);
> +	__loop_clr_fd(lo, false);
>  	loop_rundown_completed(lo);
>  	return 0;
>  }
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 8/8] loop: make autoclear operation synchronous again
  2022-01-26 15:50 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig
@ 2022-01-27 11:04   ` Jan Kara
  0 siblings, 0 replies; 32+ messages in thread
From: Jan Kara @ 2022-01-27 11:04 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei,
	kernel test robot, syzbot

On Wed 26-01-22 16:50:40, Christoph Hellwig wrote:
> From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> 
> The kernel test robot is reporting that xfstest can fail at
> 
>   umount ext2 on xfs
>   umount xfs
> 
> sequence, for commit 322c4293ecc58110 ("loop: make autoclear operation
> asynchronous") broke what commit ("loop: Make explicit loop device
> destruction lazy") wanted to achieve.
> 
> Although we cannot guarantee that nobody is holding a reference when
> "umount xfs" is called, we should try to close a race window opened
> by asynchronous autoclear operation.
> 
> It turned out that there is no need to call flush_workqueue() from
> __loop_clr_fd(), for blk_mq_freeze_queue() blocks until all pending
> "struct work_struct" are processed by loop_process_work().
> 
> Revert commit 322c4293ecc58110 ("loop: make autoclear operation
> asynchronous"), and simply defer calling destroy_workqueue() till
> loop_remove() after ensuring that the worker rbtree and reaping
> timer are kept alive while the loop device exists.
> 
> Since a loop device is likely reused after once created thanks to
> ioctl(LOOP_CTL_GET_FREE), being unable to destroy corresponding WQ
> when ioctl(LOOP_CLR_FD) is called should be an acceptable waste.
> 
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Suggested-by: Christoph Hellwig <hch@lst.de>
> Tested-by: syzbot <syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> [hch: rebased, keep the work rbtree and timer around longer]
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good to me. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 71 ++++++++++++++------------------------------
>  drivers/block/loop.h |  1 -
>  2 files changed, 23 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index fc0cdd1612b1d..cf7830a68113a 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1038,10 +1038,10 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
>  	    !file->f_op->write_iter)
>  		lo->lo_flags |= LO_FLAGS_READ_ONLY;
>  
> -	lo->workqueue = alloc_workqueue("loop%d",
> -					WQ_UNBOUND | WQ_FREEZABLE,
> -					0,
> -					lo->lo_number);
> +	if (!lo->workqueue)
> +		lo->workqueue = alloc_workqueue("loop%d",
> +						WQ_UNBOUND | WQ_FREEZABLE,
> +						0, lo->lo_number);
>  	if (!lo->workqueue) {
>  		error = -ENOMEM;
>  		goto out_unlock;
> @@ -1147,10 +1147,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
>  	if (!release)
>  		blk_mq_freeze_queue(lo->lo_queue);
>  
> -	destroy_workqueue(lo->workqueue);
> -	loop_free_idle_workers(lo, true);
> -	del_timer_sync(&lo->timer);
> -
>  	spin_lock_irq(&lo->lo_lock);
>  	filp = lo->lo_backing_file;
>  	lo->lo_backing_file = NULL;
> @@ -1176,9 +1172,11 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
>  	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
>  		int err;
>  
> -		mutex_lock(&lo->lo_disk->open_mutex);
> +		if (!release)
> +			mutex_lock(&lo->lo_disk->open_mutex);
>  		err = bdev_disk_changed(lo->lo_disk, false);
> -		mutex_unlock(&lo->lo_disk->open_mutex);
> +		if (!release)
> +			mutex_unlock(&lo->lo_disk->open_mutex);
>  		if (err)
>  			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
>  				__func__, lo->lo_number, err);
> @@ -1189,39 +1187,17 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
>  	if (!part_shift)
>  		lo->lo_disk->flags |= GENHD_FL_NO_PART;
>  
> -	fput(filp);
> -}
> -
> -static void loop_rundown_completed(struct loop_device *lo)
> -{
>  	mutex_lock(&lo->lo_mutex);
>  	lo->lo_state = Lo_unbound;
>  	mutex_unlock(&lo->lo_mutex);
>  	module_put(THIS_MODULE);
> -}
> -
> -static void loop_rundown_workfn(struct work_struct *work)
> -{
> -	struct loop_device *lo = container_of(work, struct loop_device,
> -					      rundown_work);
> -	struct block_device *bdev = lo->lo_device;
> -	struct gendisk *disk = lo->lo_disk;
>  
> -	__loop_clr_fd(lo, true);
> -	kobject_put(&bdev->bd_device.kobj);
> -	module_put(disk->fops->owner);
> -	loop_rundown_completed(lo);
> -}
> -
> -static void loop_schedule_rundown(struct loop_device *lo)
> -{
> -	struct block_device *bdev = lo->lo_device;
> -	struct gendisk *disk = lo->lo_disk;
> -
> -	__module_get(disk->fops->owner);
> -	kobject_get(&bdev->bd_device.kobj);
> -	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
> -	queue_work(system_long_wq, &lo->rundown_work);
> +	/*
> +	 * Need not hold lo_mutex to fput backing file. Calling fput holding
> +	 * lo_mutex triggers a circular lock dependency possibility warning as
> +	 * fput can take open_mutex which is usually taken before lo_mutex.
> +	 */
> +	fput(filp);
>  }
>  
>  static int loop_clr_fd(struct loop_device *lo)
> @@ -1254,7 +1230,6 @@ static int loop_clr_fd(struct loop_device *lo)
>  	mutex_unlock(&lo->lo_mutex);
>  
>  	__loop_clr_fd(lo, false);
> -	loop_rundown_completed(lo);
>  	return 0;
>  }
>  
> @@ -1753,20 +1728,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
>  		return;
>  
>  	mutex_lock(&lo->lo_mutex);
> -	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
> -		if (lo->lo_state != Lo_bound)
> -			goto out_unlock;
> +	if (lo->lo_state == Lo_bound &&
> +	    (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
>  		lo->lo_state = Lo_rundown;
>  		mutex_unlock(&lo->lo_mutex);
> -		/*
> -		 * In autoclear mode, stop the loop thread
> -		 * and remove configuration after last close.
> -		 */
> -		loop_schedule_rundown(lo);
> +		__loop_clr_fd(lo, true);
>  		return;
>  	}
>  
> -out_unlock:
>  	mutex_unlock(&lo->lo_mutex);
>  }
>  
> @@ -2056,6 +2025,12 @@ static void loop_remove(struct loop_device *lo)
>  	mutex_lock(&loop_ctl_mutex);
>  	idr_remove(&loop_index_idr, lo->lo_number);
>  	mutex_unlock(&loop_ctl_mutex);
> +
> +	if (lo->workqueue)
> +		destroy_workqueue(lo->workqueue);
> +	loop_free_idle_workers(lo, true);
> +	del_timer_sync(&lo->timer);
> +
>  	/* There is no route which can find this loop device. */
>  	mutex_destroy(&lo->lo_mutex);
>  	kfree(lo);
> diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> index 918a7a2dc0259..082d4b6bfc6a6 100644
> --- a/drivers/block/loop.h
> +++ b/drivers/block/loop.h
> @@ -56,7 +56,6 @@ struct loop_device {
>  	struct gendisk		*lo_disk;
>  	struct mutex		lo_mutex;
>  	bool			idr_visible;
> -	struct work_struct      rundown_work;
>  };
>  
>  struct loop_cmd {
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-27  9:49     ` Christoph Hellwig
@ 2022-01-27 12:23       ` Jan Kara
  2022-01-28  7:26         ` Christoph Hellwig
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kara @ 2022-01-27 12:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

[-- Attachment #1: Type: text/plain, Size: 1105 bytes --]

On Thu 27-01-22 10:49:42, Christoph Hellwig wrote:
> On Thu, Jan 27, 2022 at 10:47:37AM +0100, Jan Kara wrote:
> > On Wed 26-01-22 16:50:35, Christoph Hellwig wrote:
> > > Nothing prevents a file system or userspace opener of the block device
> > > from redirtying the page right after sync_blockdev returned.  Fortunately
> > > data in the page cache during a block device change is mostly harmless
> > > anyway.
> > > 
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > 
> > My understanding was these warnings are there to tell userspace it is doing
> > something wrong. Something like the warning we issue when DIO races with
> > buffered IO... I'm not sure how useful they are but I don't see strong
> > reason to remove them either...
> 
> Well, it is not just a warning, but also fails the command.  With some of
> the reduced synchronization blktests loop/002 can hit them pretty reliably.

I see. I guess another place where using mapping->invalidate_lock would be
good to avoid these races... So maybe something like attached patch?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

[-- Attachment #2: 0001-loop-Protect-loop-device-invalidation-from-racing-pa.patch --]
[-- Type: text/x-patch, Size: 3151 bytes --]

From 3914760aa538f55012f41859857cfe75bdcfc6a2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 27 Jan 2022 12:43:26 +0100
Subject: [PATCH] loop: Protect loop device invalidation from racing page cache
 operations

Grab bdev->i_mutex and bdev->i_mapping->invalidate_lock to protect
operations invalidating loop device page cache from racing operations on
the page cache. As a result we can drop some warnings.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 drivers/block/loop.c | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 01cbbfc4e9e2..170e3dc0d8a9 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1252,6 +1252,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	if (lo->lo_offset != info->lo_offset ||
 	    lo->lo_sizelimit != info->lo_sizelimit) {
 		size_changed = true;
+		inode_lock(lo->lo_device->bd_inode);
+		filemap_invalidate_lock(lo->lo_device->bd_inode->i_mapping);
 		sync_blockdev(lo->lo_device);
 		invalidate_bdev(lo->lo_device);
 	}
@@ -1259,15 +1261,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
-	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
-		/* If any pages were dirtied after invalidate_bdev(), try again */
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	prev_lo_flags = lo->lo_flags;
 
 	err = loop_set_status_from_info(lo, info);
@@ -1285,6 +1278,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
 					   lo->lo_backing_file);
 		loop_set_size(lo, new_size);
+		filemap_invalidate_unlock(lo->lo_device->bd_inode->i_mapping);
+		inode_unlock(lo->lo_device->bd_inode);
 	}
 
 	loop_config_discard(lo);
@@ -1474,26 +1469,18 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	if (lo->lo_queue->limits.logical_block_size == arg)
 		return 0;
 
+	inode_lock(lo->lo_device->bd_inode);
+	filemap_invalidate_lock(lo->lo_device->bd_inode->i_mapping);
 	sync_blockdev(lo->lo_device);
 	invalidate_bdev(lo->lo_device);
-
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* invalidate_bdev should have truncated all the pages */
-	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
+	filemap_invalidate_unlock(lo->lo_device->bd_inode->i_mapping);
+	inode_unlock(lo->lo_device->bd_inode);
 
 	return err;
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: yet another approach to fix loop autoclear for xfstets xfs/049
  2022-01-26 19:38 ` yet another approach to fix loop autoclear for xfstets xfs/049 Darrick J. Wong
@ 2022-01-27 16:50   ` Darrick J. Wong
  0 siblings, 0 replies; 32+ messages in thread
From: Darrick J. Wong @ 2022-01-27 16:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei, xfs

On Wed, Jan 26, 2022 at 11:38:40AM -0800, Darrick J. Wong wrote:
> On Wed, Jan 26, 2022 at 04:50:32PM +0100, Christoph Hellwig wrote:
> > Hi Jens, hi Tetsuo,
> > 
> > this series uses the approach from Tetsuo to delay the destroy_workqueue
> > call, extended by a delayed teardown of the workers to fix a potential
> > race window when the workqueue can be still around after finishing the
> > commands.  It then also removes the queue freeing in release that is
> > not needed to fix the dependency chain for that (which can't be
> > reported by lockdep) as well.
> 
> [add xfs to cc list]
> 
> This fixes all the regressions I've been seeing in xfs/049 and xfs/073,
> thank you.  I'll give this a spin with the rest of fstests overnight.

After an overnight run with 5.17-rc1 + {xfs,iomap}-for-next + this patchset,
fstests is back to normal.

Tested-by: Darrick J. Wong <djwong@kernel.org>

--D

> 
> --D

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 6/8] loop: don't freeze the queue in lo_release
  2022-01-27 10:42   ` Jan Kara
@ 2022-01-28  6:46     ` Christoph Hellwig
  0 siblings, 0 replies; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-28  6:46 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

On Thu, Jan 27, 2022 at 11:42:56AM +0100, Jan Kara wrote:
> Maybe worth a comment like:
> 
> 	/*
> 	 * No IO can be running at this point since there are no openers
> 	 * (covers filesystems, stacked devices, AIO) and the page cache is
> 	 * evicted.
> 	 */

I really don't see the point to add this to a specific driver instance.
It is a block layer guarantee so maybe we should document it better
there, but certainly not duplicate it in all the instances.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed
  2022-01-27 11:01   ` Jan Kara
@ 2022-01-28  6:48     ` Christoph Hellwig
  0 siblings, 0 replies; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-28  6:48 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

On Thu, Jan 27, 2022 at 12:01:43PM +0100, Jan Kara wrote:
> On Wed 26-01-22 16:50:39, Christoph Hellwig wrote:
> > ->release is only called after all outstanding I/O has completed, so only
> > freeze the queue when clearing the backing file of a live loop device.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Looks good. I was just wondering: Is there anything which prevents us from
> moving blk_mq_freeze_queue() & blk_mq_unfreeze_queue() calls to
> loop_clr_fd() around __loop_clr_fd() call?

That would move more code into the freeze protection.  Including lo_mutex
which elsewhere is taken outside blk_freeze_queue.  The lo_mutex
acquisition is only for an assert, so we could skip that, so maybe we
can get there.  But the next patch adds another check for the release
flag so it doesn't seem worth it at this moment.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: yet another approach to fix loop autoclear for xfstets xfs/049
  2022-01-27  1:05 ` Tetsuo Handa
@ 2022-01-28  7:08   ` Christoph Hellwig
  2022-01-28  9:52     ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-28  7:08 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Christoph Hellwig, Jens Axboe, Jan Kara, linux-block, Ming Lei

[nit: please properly trim the lines in your mails, this needed a
 reformat to be readable at all]

On Thu, Jan 27, 2022 at 10:05:45AM +0900, Tetsuo Handa wrote:
> I want to remove disk->open_mutex => lo->lo_mutex => I/O completion chain
> itself from /proc/lockdep . Even if real deadlock does not happen due to
> lo->lo_refcnt exclusion, I consider that disk->open_mutex => lo->lo_mutex
> dependency being recorded is a bad sign.
> It makes difficult to find real possibility of deadlock when things change
> in future. I consider that lo_release() is doing too much things under
> disk->open_mutex.
> 
> I tried to defer lo->lo_mutex in lo_release() as much as possible.
> But this chain still remains.

Yes.  To completely remove it we'd need something like:

 - remove lo_refcnt and rely on bd_openers on the whole device bdev
 - add a block layer flag to temporarily disable a gendisk and fail
   all opens for it.

For now I'd really like to just fix the regression, though.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-27 12:23       ` Jan Kara
@ 2022-01-28  7:26         ` Christoph Hellwig
  2022-01-28 11:45           ` Jan Kara
  0 siblings, 1 reply; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-28  7:26 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

On Thu, Jan 27, 2022 at 01:23:27PM +0100, Jan Kara wrote:
> On Thu 27-01-22 10:49:42, Christoph Hellwig wrote:
> > On Thu, Jan 27, 2022 at 10:47:37AM +0100, Jan Kara wrote:
> > > On Wed 26-01-22 16:50:35, Christoph Hellwig wrote:
> > > > Nothing prevents a file system or userspace opener of the block device
> > > > from redirtying the page right after sync_blockdev returned.  Fortunately
> > > > data in the page cache during a block device change is mostly harmless
> > > > anyway.
> > > > 
> > > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > 
> > > My understanding was these warnings are there to tell userspace it is doing
> > > something wrong. Something like the warning we issue when DIO races with
> > > buffered IO... I'm not sure how useful they are but I don't see strong
> > > reason to remove them either...
> > 
> > Well, it is not just a warning, but also fails the command.  With some of
> > the reduced synchronization blktests loop/002 can hit them pretty reliably.
> 
> I see. I guess another place where using mapping->invalidate_lock would be
> good to avoid these races... So maybe something like attached patch?

So this looks sensible, but it does nest the inode lock and the
invalidate lock inside lo_mutex.  I wonder if that is going to create
more problems down the road.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: yet another approach to fix loop autoclear for xfstets xfs/049
  2022-01-28  7:08   ` Christoph Hellwig
@ 2022-01-28  9:52     ` Tetsuo Handa
  0 siblings, 0 replies; 32+ messages in thread
From: Tetsuo Handa @ 2022-01-28  9:52 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, Jan Kara, linux-block, Ming Lei

On 2022/01/28 16:08, Christoph Hellwig wrote:
> Yes.  To completely remove it we'd need something like:
> 
>  - remove lo_refcnt and rely on bd_openers on the whole device bdev
>  - add a block layer flag to temporarily disable a gendisk and fail
>    all opens for it.
> 
> For now I'd really like to just fix the regression, though.

lo_open() does not need to use lo->lo_mutex. For now just deferring
lo_open()/lo_release() to task work context will fix the regression.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-28  7:26         ` Christoph Hellwig
@ 2022-01-28 11:45           ` Jan Kara
  0 siblings, 0 replies; 32+ messages in thread
From: Jan Kara @ 2022-01-28 11:45 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

On Fri 28-01-22 08:26:14, Christoph Hellwig wrote:
> On Thu, Jan 27, 2022 at 01:23:27PM +0100, Jan Kara wrote:
> > On Thu 27-01-22 10:49:42, Christoph Hellwig wrote:
> > > On Thu, Jan 27, 2022 at 10:47:37AM +0100, Jan Kara wrote:
> > > > On Wed 26-01-22 16:50:35, Christoph Hellwig wrote:
> > > > > Nothing prevents a file system or userspace opener of the block device
> > > > > from redirtying the page right after sync_blockdev returned.  Fortunately
> > > > > data in the page cache during a block device change is mostly harmless
> > > > > anyway.
> > > > > 
> > > > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > > 
> > > > My understanding was these warnings are there to tell userspace it is doing
> > > > something wrong. Something like the warning we issue when DIO races with
> > > > buffered IO... I'm not sure how useful they are but I don't see strong
> > > > reason to remove them either...
> > > 
> > > Well, it is not just a warning, but also fails the command.  With some of
> > > the reduced synchronization blktests loop/002 can hit them pretty reliably.
> > 
> > I see. I guess another place where using mapping->invalidate_lock would be
> > good to avoid these races... So maybe something like attached patch?
> 
> So this looks sensible, but it does nest the inode lock and the
> invalidate lock inside lo_mutex.  I wonder if that is going to create
> more problems down the road.

Yeah, I was wondering about that a bit as well. Quick blktests run didn't
trigger any lockdep warning so I thought it's worth a try. But for now I
guess let's not complicate things, it's difficult enough already.

I have checked the code now and it does not seem to cause any obvious harm
if we have block device page cache while changing loop device parameters.
Just some IO may fail, trigger some warning (e.g. IO beyond end of device)
but nothing worse.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 8/8] loop: make autoclear operation synchronous again
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  0 siblings, 0 replies; 32+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Tetsuo Handa,
	kernel test robot, Darrick J . Wong, syzbot

From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

The kernel test robot is reporting that xfstest can fail at

  umount ext2 on xfs
  umount xfs

sequence, for commit 322c4293ecc58110 ("loop: make autoclear operation
asynchronous") broke what commit ("loop: Make explicit loop device
destruction lazy") wanted to achieve.

Although we cannot guarantee that nobody is holding a reference when
"umount xfs" is called, we should try to close a race window opened
by asynchronous autoclear operation.

It turned out that there is no need to call flush_workqueue() from
__loop_clr_fd(), for blk_mq_freeze_queue() blocks until all pending
"struct work_struct" are processed by loop_process_work().

Revert commit 322c4293ecc58110 ("loop: make autoclear operation
asynchronous"), and simply defer calling destroy_workqueue() till
loop_remove() after ensuring that the worker rbtree and reaping
timer are kept alive while the loop device exists.

Since a loop device is likely reused after once created thanks to
ioctl(LOOP_CTL_GET_FREE), being unable to destroy corresponding WQ
when ioctl(LOOP_CLR_FD) is called should be an acceptable waste.

Reported-by: kernel test robot <oliver.sang@intel.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[hch: rebased, keep the work rbtree and timer around longer]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
Tested-by: syzbot <syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com>
---
 drivers/block/loop.c | 71 ++++++++++++++------------------------------
 drivers/block/loop.h |  1 -
 2 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0cdbc60037642..68d01166f7c38 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1038,10 +1038,10 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	    !file->f_op->write_iter)
 		lo->lo_flags |= LO_FLAGS_READ_ONLY;
 
-	lo->workqueue = alloc_workqueue("loop%d",
-					WQ_UNBOUND | WQ_FREEZABLE,
-					0,
-					lo->lo_number);
+	if (!lo->workqueue)
+		lo->workqueue = alloc_workqueue("loop%d",
+						WQ_UNBOUND | WQ_FREEZABLE,
+						0, lo->lo_number);
 	if (!lo->workqueue) {
 		error = -ENOMEM;
 		goto out_unlock;
@@ -1147,10 +1147,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (!release)
 		blk_mq_freeze_queue(lo->lo_queue);
 
-	destroy_workqueue(lo->workqueue);
-	loop_free_idle_workers(lo, true);
-	del_timer_sync(&lo->timer);
-
 	spin_lock_irq(&lo->lo_lock);
 	filp = lo->lo_backing_file;
 	lo->lo_backing_file = NULL;
@@ -1176,9 +1172,11 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
 		int err;
 
-		mutex_lock(&lo->lo_disk->open_mutex);
+		if (!release)
+			mutex_lock(&lo->lo_disk->open_mutex);
 		err = bdev_disk_changed(lo->lo_disk, false);
-		mutex_unlock(&lo->lo_disk->open_mutex);
+		if (!release)
+			mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo->lo_number, err);
@@ -1189,39 +1187,17 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
 
-	fput(filp);
-}
-
-static void loop_rundown_completed(struct loop_device *lo)
-{
 	mutex_lock(&lo->lo_mutex);
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
 	module_put(THIS_MODULE);
-}
-
-static void loop_rundown_workfn(struct work_struct *work)
-{
-	struct loop_device *lo = container_of(work, struct loop_device,
-					      rundown_work);
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
 
-	__loop_clr_fd(lo, true);
-	kobject_put(&bdev->bd_device.kobj);
-	module_put(disk->fops->owner);
-	loop_rundown_completed(lo);
-}
-
-static void loop_schedule_rundown(struct loop_device *lo)
-{
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
-
-	__module_get(disk->fops->owner);
-	kobject_get(&bdev->bd_device.kobj);
-	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
-	queue_work(system_long_wq, &lo->rundown_work);
+	/*
+	 * Need not hold lo_mutex to fput backing file. Calling fput holding
+	 * lo_mutex triggers a circular lock dependency possibility warning as
+	 * fput can take open_mutex which is usually taken before lo_mutex.
+	 */
+	fput(filp);
 }
 
 static int loop_clr_fd(struct loop_device *lo)
@@ -1254,7 +1230,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	mutex_unlock(&lo->lo_mutex);
 
 	__loop_clr_fd(lo, false);
-	loop_rundown_completed(lo);
 	return 0;
 }
 
@@ -1761,20 +1736,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		return;
 
 	mutex_lock(&lo->lo_mutex);
-	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
-		if (lo->lo_state != Lo_bound)
-			goto out_unlock;
+	if (lo->lo_state == Lo_bound &&
+	    (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
 		lo->lo_state = Lo_rundown;
 		mutex_unlock(&lo->lo_mutex);
-		/*
-		 * In autoclear mode, stop the loop thread
-		 * and remove configuration after last close.
-		 */
-		loop_schedule_rundown(lo);
+		__loop_clr_fd(lo, true);
 		return;
 	}
 
-out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 }
 
@@ -2064,6 +2033,12 @@ static void loop_remove(struct loop_device *lo)
 	mutex_lock(&loop_ctl_mutex);
 	idr_remove(&loop_index_idr, lo->lo_number);
 	mutex_unlock(&loop_ctl_mutex);
+
+	if (lo->workqueue)
+		destroy_workqueue(lo->workqueue);
+	loop_free_idle_workers(lo, true);
+	del_timer_sync(&lo->timer);
+
 	/* There is no route which can find this loop device. */
 	mutex_destroy(&lo->lo_mutex);
 	kfree(lo);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 918a7a2dc0259..082d4b6bfc6a6 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,7 +56,6 @@ struct loop_device {
 	struct gendisk		*lo_disk;
 	struct mutex		lo_mutex;
 	bool			idr_visible;
-	struct work_struct      rundown_work;
 };
 
 struct loop_cmd {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2022-01-28 13:00 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
2022-01-26 15:50 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
2022-01-27  9:36   ` Jan Kara
2022-01-26 15:50 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
2022-01-27  9:45   ` Jan Kara
2022-01-26 15:50 ` [PATCH 3/8] block: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
2022-01-27  9:47   ` Jan Kara
2022-01-27  9:49     ` Christoph Hellwig
2022-01-27 12:23       ` Jan Kara
2022-01-28  7:26         ` Christoph Hellwig
2022-01-28 11:45           ` Jan Kara
2022-01-26 15:50 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
2022-01-27  9:48   ` Jan Kara
2022-01-27 10:19     ` Jan Kara
2022-01-27 10:28       ` Christoph Hellwig
2022-01-26 15:50 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
2022-01-27 10:28   ` Jan Kara
2022-01-27 10:31     ` Tetsuo Handa
2022-01-26 15:50 ` [PATCH 6/8] loop: don't freeze the queue in lo_release Christoph Hellwig
2022-01-27 10:42   ` Jan Kara
2022-01-28  6:46     ` Christoph Hellwig
2022-01-26 15:50 ` [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
2022-01-27 11:01   ` Jan Kara
2022-01-28  6:48     ` Christoph Hellwig
2022-01-26 15:50 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig
2022-01-27 11:04   ` Jan Kara
2022-01-26 19:38 ` yet another approach to fix loop autoclear for xfstets xfs/049 Darrick J. Wong
2022-01-27 16:50   ` Darrick J. Wong
2022-01-27  1:05 ` Tetsuo Handa
2022-01-28  7:08   ` Christoph Hellwig
2022-01-28  9:52     ` Tetsuo Handa
2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
2022-01-28 13:00 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.