yet another approach to fix loop autoclear for xfstets xfs/049 v2

All of lore.kernel.org
 help / color / mirror / Atom feed

* yet another approach to fix loop autoclear for xfstets xfs/049  v2
@ 2022-01-28 13:00 Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
                   ` (7 more replies)
  0 siblings, 8 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

Hi Jens, hi Tetsuo,

this series uses the approach from Tetsuo to delay the destroy_workueue
cll, extended by a delayed teardown of the workers to fix a potential
racewindow then the workqueue can be still round after finishing the
commands.  It then also removed the queue freeing in release that is
not needed to fix the dependency chain for that (which can't be
reported by lockdep) as well.

Changes since v1:
 - add comments to document the lo_refcnt synchronization
 - fix comment typos

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/8] loop: de-duplicate the idle worker freeing code
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Darrick J . Wong

Use a common helper for both timer based and uncoditional freeing of idle
workers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 73 +++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 38 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 01cbbfc4e9e24..b268bca6e4fb7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -804,7 +804,6 @@ struct loop_worker {
 
 static void loop_workfn(struct work_struct *work);
 static void loop_rootcg_workfn(struct work_struct *work);
-static void loop_free_idle_workers(struct timer_list *timer);
 
 #ifdef CONFIG_BLK_CGROUP
 static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
@@ -888,6 +887,39 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	spin_unlock_irq(&lo->lo_work_lock);
 }
 
+static void loop_set_timer(struct loop_device *lo)
+{
+	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_free_idle_workers(struct loop_device *lo, bool delete_all)
+{
+	struct loop_worker *pos, *worker;
+
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		if (!delete_all &&
+		    time_is_after_jiffies(worker->last_ran_at +
+					  LOOP_IDLE_WORKER_TIMEOUT))
+			break;
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->blkcg_css);
+		kfree(worker);
+	}
+	if (!list_empty(&lo->idle_worker_list))
+		loop_set_timer(lo);
+	spin_unlock_irq(&lo->lo_work_lock);
+}
+
+static void loop_free_idle_workers_timer(struct timer_list *timer)
+{
+	struct loop_device *lo = container_of(timer, struct loop_device, timer);
+
+	return loop_free_idle_workers(lo, false);
+}
+
 static void loop_update_rotational(struct loop_device *lo)
 {
 	struct file *file = lo->lo_backing_file;
@@ -1022,7 +1054,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	INIT_LIST_HEAD(&lo->idle_worker_list);
 	lo->worker_tree = RB_ROOT;
-	timer_setup(&lo->timer, loop_free_idle_workers,
+	timer_setup(&lo->timer, loop_free_idle_workers_timer,
 		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
@@ -1086,7 +1118,6 @@ static void __loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
-	struct loop_worker *pos, *worker;
 
 	/*
 	 * Flush loop_configure() and loop_change_fd(). It is acceptable for
@@ -1116,15 +1147,7 @@ static void __loop_clr_fd(struct loop_device *lo)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
-	spin_lock_irq(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	spin_unlock_irq(&lo->lo_work_lock);
+	loop_free_idle_workers(lo, true);
 	del_timer_sync(&lo->timer);
 
 	spin_lock_irq(&lo->lo_lock);
@@ -1871,11 +1894,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	}
 }
 
-static void loop_set_timer(struct loop_device *lo)
-{
-	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
-}
-
 static void loop_process_work(struct loop_worker *worker,
 			struct list_head *cmd_list, struct loop_device *lo)
 {
@@ -1924,27 +1942,6 @@ static void loop_rootcg_workfn(struct work_struct *work)
 	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
 }
 
-static void loop_free_idle_workers(struct timer_list *timer)
-{
-	struct loop_device *lo = container_of(timer, struct loop_device, timer);
-	struct loop_worker *pos, *worker;
-
-	spin_lock_irq(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		if (time_is_after_jiffies(worker->last_ran_at +
-						LOOP_IDLE_WORKER_TIMEOUT))
-			break;
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	if (!list_empty(&lo->idle_worker_list))
-		loop_set_timer(lo);
-	spin_unlock_irq(&lo->lo_work_lock);
-}
-
 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
 	.complete	= lo_complete_rq,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/8] loop: initialize the worker tracking fields once
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Darrick J . Wong

There is no need to reinitialize idle_worker_list, worker_tree and timer
every time a loop device is configured.  Just initialize them once at
allocation time.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b268bca6e4fb7..6ec55a5d9dfc4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1052,10 +1052,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 
 	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
-	INIT_LIST_HEAD(&lo->idle_worker_list);
-	lo->worker_tree = RB_ROOT;
-	timer_setup(&lo->timer, loop_free_idle_workers_timer,
-		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
 	lo->lo_backing_file = file;
@@ -1957,6 +1953,9 @@ static int loop_add(int i)
 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
 	if (!lo)
 		goto out;
+	lo->worker_tree = RB_ROOT;
+	INIT_LIST_HEAD(&lo->idle_worker_list);
+	timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
 	lo->lo_state = Lo_unbound;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  2022-01-28 13:36   ` Jan Kara
  2022-01-28 13:00 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Darrick J . Wong

Nothing prevents a file system or userspace opener of the block device
from redirtying the page right afte sync_blockdev returned.  Fortunately
data in the page cache during a block device change is mostly harmless
anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6ec55a5d9dfc4..d3a7f281ce1b6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1278,15 +1278,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
-	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
-		/* If any pages were dirtied after invalidate_bdev(), try again */
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	prev_lo_flags = lo->lo_flags;
 
 	err = loop_set_status_from_info(lo, info);
@@ -1497,21 +1488,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	invalidate_bdev(lo->lo_device);
 
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* invalidate_bdev should have truncated all the pages */
-	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	return err;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2022-01-28 13:00 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  2022-01-28 13:37   ` Jan Kara
  2022-01-28 13:00 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Darrick J . Wong

lo_refcnt is only incremented in lo_open and decremented in lo_release,
and thus protected by open_mutex.  Only take lo_mutex when lo_release
actually takes action for the final release.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d3a7f281ce1b6..b58dc95f80d96 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1740,10 +1740,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo = disk->private_data;
 
-	mutex_lock(&lo->lo_mutex);
-	if (atomic_dec_return(&lo->lo_refcnt))
-		goto out_unlock;
+	/*
+	 * Note: this requires disk->open_mutex to protect against races
+	 * with lo_open.
+	 */
+	if (!atomic_dec_and_test(&lo->lo_refcnt))
+		return;
 
+	mutex_lock(&lo->lo_mutex);
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
 		if (lo->lo_state != Lo_bound)
 			goto out_unlock;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2022-01-28 13:00 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  2022-01-28 13:13   ` Tetsuo Handa
  2022-01-28 13:38   ` Jan Kara
  2022-01-28 13:00 ` [PATCH 6/8] loop: don't freeze the queue in lo_release Christoph Hellwig
                   ` (2 subsequent siblings)
  7 siblings, 2 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Darrick J . Wong

lo_refcnt is only incremented in lo_open and decremented in lo_release,
and thus protected by open_mutex.  Only take lo_mutex when lo_open is
called the first time, as only for the first open there is any affect
on the driver state (incremental opens on partitions don't end up in
lo_open at all already).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b58dc95f80d96..f349ddfc0e84a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1725,13 +1725,20 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
 	struct loop_device *lo = bdev->bd_disk->private_data;
 	int err;
 
+	/*
+	 * Note: this requires disk->open_mutex to protect against races
+	 * with lo_release.
+	 */
+	if (atomic_inc_return(&lo->lo_refcnt) > 1)
+		return 0;
+
 	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
-	if (lo->lo_state == Lo_deleting)
+	if (lo->lo_state == Lo_deleting) {
+		atomic_dec(&lo->lo_refcnt);
 		err = -ENXIO;
-	else
-		atomic_inc(&lo->lo_refcnt);
+	}
 	mutex_unlock(&lo->lo_mutex);
 	return err;
 }
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 6/8] loop: don't freeze the queue in lo_release
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2022-01-28 13:00 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig
  7 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Darrick J . Wong

By the time the final ->release is called there can't be outstanding I/O.
For non-final ->release there is no need for driver action at all.

Thus remove the useless queue freeze.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f349ddfc0e84a..1ca70f735b5bc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1766,13 +1766,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 */
 		loop_schedule_rundown(lo);
 		return;
-	} else if (lo->lo_state == Lo_bound) {
-		/*
-		 * Otherwise keep thread (if running) and config,
-		 * but flush possible ongoing bios in thread.
-		 */
-		blk_mq_freeze_queue(lo->lo_queue);
-		blk_mq_unfreeze_queue(lo->lo_queue);
 	}
 
 out_unlock:
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2022-01-28 13:00 ` [PATCH 6/8] loop: don't freeze the queue in lo_release Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  2022-01-28 13:00 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig
  7 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Darrick J . Wong

->release is only called after all outstanding I/O has completed, so only
freeze the queue when clearing the backing file of a live loop device.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
---
 drivers/block/loop.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1ca70f735b5bc..0cdbc60037642 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1110,7 +1110,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	return error;
 }
 
-static void __loop_clr_fd(struct loop_device *lo)
+static void __loop_clr_fd(struct loop_device *lo, bool release)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
@@ -1139,8 +1139,13 @@ static void __loop_clr_fd(struct loop_device *lo)
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);
 
-	/* freeze request queue during the transition */
-	blk_mq_freeze_queue(lo->lo_queue);
+	/*
+	 * Freeze the request queue when unbinding on a live file descriptor and
+	 * thus an open device.  When called from ->release we are guaranteed
+	 * that there is no I/O in progress already.
+	 */
+	if (!release)
+		blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
 	loop_free_idle_workers(lo, true);
@@ -1163,7 +1168,8 @@ static void __loop_clr_fd(struct loop_device *lo)
 	/* let user-space know about this change */
 	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
 	mapping_set_gfp_mask(filp->f_mapping, gfp);
-	blk_mq_unfreeze_queue(lo->lo_queue);
+	if (!release)
+		blk_mq_unfreeze_queue(lo->lo_queue);
 
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
 
@@ -1201,7 +1207,7 @@ static void loop_rundown_workfn(struct work_struct *work)
 	struct block_device *bdev = lo->lo_device;
 	struct gendisk *disk = lo->lo_disk;
 
-	__loop_clr_fd(lo);
+	__loop_clr_fd(lo, true);
 	kobject_put(&bdev->bd_device.kobj);
 	module_put(disk->fops->owner);
 	loop_rundown_completed(lo);
@@ -1247,7 +1253,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_rundown;
 	mutex_unlock(&lo->lo_mutex);
 
-	__loop_clr_fd(lo);
+	__loop_clr_fd(lo, false);
 	loop_rundown_completed(lo);
 	return 0;
 }
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 8/8] loop: make autoclear operation synchronous again
  2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2022-01-28 13:00 ` [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
@ 2022-01-28 13:00 ` Christoph Hellwig
  7 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-28 13:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei, Tetsuo Handa,
	kernel test robot, Darrick J . Wong, syzbot

From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

The kernel test robot is reporting that xfstest can fail at

  umount ext2 on xfs
  umount xfs

sequence, for commit 322c4293ecc58110 ("loop: make autoclear operation
asynchronous") broke what commit ("loop: Make explicit loop device
destruction lazy") wanted to achieve.

Although we cannot guarantee that nobody is holding a reference when
"umount xfs" is called, we should try to close a race window opened
by asynchronous autoclear operation.

It turned out that there is no need to call flush_workqueue() from
__loop_clr_fd(), for blk_mq_freeze_queue() blocks until all pending
"struct work_struct" are processed by loop_process_work().

Revert commit 322c4293ecc58110 ("loop: make autoclear operation
asynchronous"), and simply defer calling destroy_workqueue() till
loop_remove() after ensuring that the worker rbtree and repaing
timer are kept alive while the loop device exists.

Since a loop device is likely reused after once created thanks to
ioctl(LOOP_CTL_GET_FREE), being unable to destroy corresponding WQ
when ioctl(LOOP_CLR_FD) is called should be an acceptable waste.

Reported-by: kernel test robot <oliver.sang@intel.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[hch: rebased, keep the work rbtree and timer around longer]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Darrick J. Wong <djwong@kernel.org>
Tested-by: syzbot <syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com>
---
 drivers/block/loop.c | 71 ++++++++++++++------------------------------
 drivers/block/loop.h |  1 -
 2 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0cdbc60037642..68d01166f7c38 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1038,10 +1038,10 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	    !file->f_op->write_iter)
 		lo->lo_flags |= LO_FLAGS_READ_ONLY;
 
-	lo->workqueue = alloc_workqueue("loop%d",
-					WQ_UNBOUND | WQ_FREEZABLE,
-					0,
-					lo->lo_number);
+	if (!lo->workqueue)
+		lo->workqueue = alloc_workqueue("loop%d",
+						WQ_UNBOUND | WQ_FREEZABLE,
+						0, lo->lo_number);
 	if (!lo->workqueue) {
 		error = -ENOMEM;
 		goto out_unlock;
@@ -1147,10 +1147,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (!release)
 		blk_mq_freeze_queue(lo->lo_queue);
 
-	destroy_workqueue(lo->workqueue);
-	loop_free_idle_workers(lo, true);
-	del_timer_sync(&lo->timer);
-
 	spin_lock_irq(&lo->lo_lock);
 	filp = lo->lo_backing_file;
 	lo->lo_backing_file = NULL;
@@ -1176,9 +1172,11 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
 		int err;
 
-		mutex_lock(&lo->lo_disk->open_mutex);
+		if (!release)
+			mutex_lock(&lo->lo_disk->open_mutex);
 		err = bdev_disk_changed(lo->lo_disk, false);
-		mutex_unlock(&lo->lo_disk->open_mutex);
+		if (!release)
+			mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo->lo_number, err);
@@ -1189,39 +1187,17 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
 
-	fput(filp);
-}
-
-static void loop_rundown_completed(struct loop_device *lo)
-{
 	mutex_lock(&lo->lo_mutex);
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
 	module_put(THIS_MODULE);
-}
-
-static void loop_rundown_workfn(struct work_struct *work)
-{
-	struct loop_device *lo = container_of(work, struct loop_device,
-					      rundown_work);
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
 
-	__loop_clr_fd(lo, true);
-	kobject_put(&bdev->bd_device.kobj);
-	module_put(disk->fops->owner);
-	loop_rundown_completed(lo);
-}
-
-static void loop_schedule_rundown(struct loop_device *lo)
-{
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
-
-	__module_get(disk->fops->owner);
-	kobject_get(&bdev->bd_device.kobj);
-	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
-	queue_work(system_long_wq, &lo->rundown_work);
+	/*
+	 * Need not hold lo_mutex to fput backing file. Calling fput holding
+	 * lo_mutex triggers a circular lock dependency possibility warning as
+	 * fput can take open_mutex which is usually taken before lo_mutex.
+	 */
+	fput(filp);
 }
 
 static int loop_clr_fd(struct loop_device *lo)
@@ -1254,7 +1230,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	mutex_unlock(&lo->lo_mutex);
 
 	__loop_clr_fd(lo, false);
-	loop_rundown_completed(lo);
 	return 0;
 }
 
@@ -1761,20 +1736,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		return;
 
 	mutex_lock(&lo->lo_mutex);
-	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
-		if (lo->lo_state != Lo_bound)
-			goto out_unlock;
+	if (lo->lo_state == Lo_bound &&
+	    (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
 		lo->lo_state = Lo_rundown;
 		mutex_unlock(&lo->lo_mutex);
-		/*
-		 * In autoclear mode, stop the loop thread
-		 * and remove configuration after last close.
-		 */
-		loop_schedule_rundown(lo);
+		__loop_clr_fd(lo, true);
 		return;
 	}
 
-out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 }
 
@@ -2064,6 +2033,12 @@ static void loop_remove(struct loop_device *lo)
 	mutex_lock(&loop_ctl_mutex);
 	idr_remove(&loop_index_idr, lo->lo_number);
 	mutex_unlock(&loop_ctl_mutex);
+
+	if (lo->workqueue)
+		destroy_workqueue(lo->workqueue);
+	loop_free_idle_workers(lo, true);
+	del_timer_sync(&lo->timer);
+
 	/* There is no route which can find this loop device. */
 	mutex_destroy(&lo->lo_mutex);
 	kfree(lo);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 918a7a2dc0259..082d4b6bfc6a6 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,7 +56,6 @@ struct loop_device {
 	struct gendisk		*lo_disk;
 	struct mutex		lo_mutex;
 	bool			idr_visible;
-	struct work_struct      rundown_work;
 };
 
 struct loop_cmd {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-01-28 13:00 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
@ 2022-01-28 13:13   ` Tetsuo Handa
  2022-02-05  0:28     ` Tetsuo Handa
  2022-01-28 13:38   ` Jan Kara
  1 sibling, 1 reply; 21+ messages in thread
From: Tetsuo Handa @ 2022-01-28 13:13 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe
  Cc: Jan Kara, linux-block, Ming Lei, Darrick J . Wong

On 2022/01/28 22:00, Christoph Hellwig wrote:
> +	if (atomic_inc_return(&lo->lo_refcnt) > 1)
> +		return 0;
> +
>  	err = mutex_lock_killable(&lo->lo_mutex);
>  	if (err)

You did not notice my diff here...

>  		return err;
> -	if (lo->lo_state == Lo_deleting)
> +	if (lo->lo_state == Lo_deleting) {
> +		atomic_dec(&lo->lo_refcnt);
>  		err = -ENXIO;
> -	else
> -		atomic_inc(&lo->lo_refcnt);
> +	}

Why do we need [PATCH 1/8] [PATCH 2/8] [PATCH 3/8] in this series?
Shouldn't we first make a clean revert, and keep the changes for
this release cycle as small as possible?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts
  2022-01-28 13:00 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
@ 2022-01-28 13:36   ` Jan Kara
  0 siblings, 0 replies; 21+ messages in thread
From: Jan Kara @ 2022-01-28 13:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei,
	Darrick J . Wong

On Fri 28-01-22 14:00:17, Christoph Hellwig wrote:
> Nothing prevents a file system or userspace opener of the block device
> from redirtying the page right afte sync_blockdev returned.  Fortunately
> data in the page cache during a block device change is mostly harmless
> anyway.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Darrick J. Wong <djwong@kernel.org>

OK. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 20 --------------------
>  1 file changed, 20 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 6ec55a5d9dfc4..d3a7f281ce1b6 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1278,15 +1278,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	/* I/O need to be drained during transfer transition */
>  	blk_mq_freeze_queue(lo->lo_queue);
>  
> -	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		/* If any pages were dirtied after invalidate_bdev(), try again */
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	prev_lo_flags = lo->lo_flags;
>  
>  	err = loop_set_status_from_info(lo, info);
> @@ -1497,21 +1488,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  	invalidate_bdev(lo->lo_device);
>  
>  	blk_mq_freeze_queue(lo->lo_queue);
> -
> -	/* invalidate_bdev should have truncated all the pages */
> -	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	blk_queue_logical_block_size(lo->lo_queue, arg);
>  	blk_queue_physical_block_size(lo->lo_queue, arg);
>  	blk_queue_io_min(lo->lo_queue, arg);
>  	loop_update_dio(lo);
> -out_unfreeze:
>  	blk_mq_unfreeze_queue(lo->lo_queue);
>  
>  	return err;
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-28 13:00 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
@ 2022-01-28 13:37   ` Jan Kara
  0 siblings, 0 replies; 21+ messages in thread
From: Jan Kara @ 2022-01-28 13:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei,
	Darrick J . Wong

On Fri 28-01-22 14:00:18, Christoph Hellwig wrote:
> lo_refcnt is only incremented in lo_open and decremented in lo_release,
> and thus protected by open_mutex.  Only take lo_mutex when lo_release
> actually takes action for the final release.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Darrick J. Wong <djwong@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 10 +++++++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index d3a7f281ce1b6..b58dc95f80d96 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1740,10 +1740,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
>  {
>  	struct loop_device *lo = disk->private_data;
>  
> -	mutex_lock(&lo->lo_mutex);
> -	if (atomic_dec_return(&lo->lo_refcnt))
> -		goto out_unlock;
> +	/*
> +	 * Note: this requires disk->open_mutex to protect against races
> +	 * with lo_open.
> +	 */
> +	if (!atomic_dec_and_test(&lo->lo_refcnt))
> +		return;
>  
> +	mutex_lock(&lo->lo_mutex);
>  	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
>  		if (lo->lo_state != Lo_bound)
>  			goto out_unlock;
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-01-28 13:00 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
  2022-01-28 13:13   ` Tetsuo Handa
@ 2022-01-28 13:38   ` Jan Kara
  1 sibling, 0 replies; 21+ messages in thread
From: Jan Kara @ 2022-01-28 13:38 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei,
	Darrick J . Wong

On Fri 28-01-22 14:00:19, Christoph Hellwig wrote:
> lo_refcnt is only incremented in lo_open and decremented in lo_release,
> and thus protected by open_mutex.  Only take lo_mutex when lo_open is
> called the first time, as only for the first open there is any affect
> on the driver state (incremental opens on partitions don't end up in
> lo_open at all already).
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Darrick J. Wong <djwong@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 13 ++++++++++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index b58dc95f80d96..f349ddfc0e84a 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1725,13 +1725,20 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
>  	struct loop_device *lo = bdev->bd_disk->private_data;
>  	int err;
>  
> +	/*
> +	 * Note: this requires disk->open_mutex to protect against races
> +	 * with lo_release.
> +	 */
> +	if (atomic_inc_return(&lo->lo_refcnt) > 1)
> +		return 0;
> +
>  	err = mutex_lock_killable(&lo->lo_mutex);
>  	if (err)
>  		return err;
> -	if (lo->lo_state == Lo_deleting)
> +	if (lo->lo_state == Lo_deleting) {
> +		atomic_dec(&lo->lo_refcnt);
>  		err = -ENXIO;
> -	else
> -		atomic_inc(&lo->lo_refcnt);
> +	}
>  	mutex_unlock(&lo->lo_mutex);
>  	return err;
>  }
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-01-28 13:13   ` Tetsuo Handa
@ 2022-02-05  0:28     ` Tetsuo Handa
  2022-02-06  7:10       ` Tetsuo Handa
  2022-02-08 13:45       ` Jan Kara
  0 siblings, 2 replies; 21+ messages in thread
From: Tetsuo Handa @ 2022-02-05  0:28 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe
  Cc: Jan Kara, linux-block, Ming Lei, Darrick J . Wong

Ping?

I sent https://lkml.kernel.org/r/20220129071500.3566-1-penguin-kernel@I-love.SAKURA.ne.jp
based on ideas from your series.

Since automated kernel tests are failing, can't we apply
[PATCH 1/7] loop: revert "make autoclear operation asynchronous"
for now if we can't come to a conclusion?

On 2022/01/28 22:13, Tetsuo Handa wrote:
> On 2022/01/28 22:00, Christoph Hellwig wrote:
>> +	if (atomic_inc_return(&lo->lo_refcnt) > 1)
>> +		return 0;
>> +
>>  	err = mutex_lock_killable(&lo->lo_mutex);
>>  	if (err)
> 
> You did not notice my diff here...

You need to drop lo->lo_refcnt before return.

But my latest series no longer uses task work context and no longer
holds lo->lo_mutex from lo_open()/lo_release().

> 
>>  		return err;
>> -	if (lo->lo_state == Lo_deleting)
>> +	if (lo->lo_state == Lo_deleting) {
>> +		atomic_dec(&lo->lo_refcnt);
>>  		err = -ENXIO;
>> -	else
>> -		atomic_inc(&lo->lo_refcnt);
>> +	}
> 
> Why do we need [PATCH 1/8] [PATCH 2/8] [PATCH 3/8] in this series?
> Shouldn't we first make a clean revert, and keep the changes for
> this release cycle as small as possible?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-02-05  0:28     ` Tetsuo Handa
@ 2022-02-06  7:10       ` Tetsuo Handa
  2022-02-08 13:45       ` Jan Kara
  1 sibling, 0 replies; 21+ messages in thread
From: Tetsuo Handa @ 2022-02-06  7:10 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Jan Kara
  Cc: linux-block, Ming Lei, Darrick J . Wong

On 2022/02/05 9:28, Tetsuo Handa wrote:
> Ping?
> 
> I sent https://lkml.kernel.org/r/20220129071500.3566-1-penguin-kernel@I-love.SAKURA.ne.jp
> based on ideas from your series.
> 

I tested my latest series and found that there are two problems.



One is that there still is disk->open_mutex => loop_validate_mutex =>
lo->lo_mutex chain. We would need to consider something like
https://lkml.kernel.org/r/fffda32f-848a-712b-f50e-8a6d7d086039@i-love.sakura.ne.jp
if we go with avoiding lo->lo_mutex while holding disk->open_mutex.



The other is that my latest series which does not hold lo->lo_mutex from
lo_open() causes occasional I/O error (due to

	if (lo->lo_state != Lo_bound)
		return BLK_STS_IOERR;

check in loop_queue_rq()) when systemd-udevd opens a loop device and
reads from it before loop_configure() updates lo->lo_state to Lo_bound.
(And I think the same problem exists for your series because we assumed
that we need to care about only loop_control_remove().)

If we want to avoid I/O error, open() of a loop device (currently lo_open())
needs to wait for loop_configure() to complete, by waiting for lo->lo_mutex
to be released. But this implies that we can't avoid lo->lo_mutex while holding
disk->open_mutex. Use of task work context (my previous series which is intended
for waiting for __loop_clr_fd() to complete) will allow waiting for lo->lo_mutex
without holding disk->open_mutex, but we want to avoid task work context if
possible...

[ 3069.404881] loop184: detected capacity change from 0 to 2048
[ 3069.452117] loop185: detected capacity change from 0 to 2048
[ 3069.457155] I/O error, dev loop185, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[ 3069.465309] loop186: detected capacity change from 0 to 2048
[ 3069.506238] loop187: detected capacity change from 0 to 2048
[ 3069.525483] loop188: detected capacity change from 0 to 2048
[ 3069.551447] loop189: detected capacity change from 0 to 2048
[ 3069.576062] loop190: detected capacity change from 0 to 2048
[ 3069.592206] loop191: detected capacity change from 0 to 2048
[ 3069.681413] loop192: detected capacity change from 0 to 2048
[ 3069.911000] loop193: detected capacity change from 0 to 2048
[ 3070.049889] loop194: detected capacity change from 0 to 2048
[ 3070.063847] loop195: detected capacity change from 0 to 2048
[ 3070.168926] loop196: detected capacity change from 0 to 2048
[ 3070.242869] loop197: detected capacity change from 0 to 2048
[ 3070.383477] loop198: detected capacity change from 0 to 2048
[ 3070.439035] loop199: detected capacity change from 0 to 2048
[ 3070.464548] loop200: detected capacity change from 0 to 2048
[ 3070.501891] loop201: detected capacity change from 0 to 2048
[ 3070.513600] loop202: detected capacity change from 0 to 2048
[ 3070.523923] I/O error, dev loop202, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[ 3070.531281] I/O error, dev loop202, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
[ 3070.541353] Buffer I/O error on dev loop202, logical block 0, async page read
[ 3070.684234] loop203: detected capacity change from 0 to 2048
[ 3070.705911] loop204: detected capacity change from 0 to 2048
[ 3070.716153] loop205: detected capacity change from 0 to 2048
[ 3070.735178] loop206: detected capacity change from 0 to 2048
[ 3070.833832] loop207: detected capacity change from 0 to 2048
[ 3070.835086] I/O error, dev loop207, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[ 3070.851433] loop208: detected capacity change from 0 to 2048
[ 3070.873609] loop209: detected capacity change from 0 to 2048
[ 3070.890193] loop210: detected capacity change from 0 to 2048


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-02-05  0:28     ` Tetsuo Handa
  2022-02-06  7:10       ` Tetsuo Handa
@ 2022-02-08 13:45       ` Jan Kara
  2022-02-23 14:24         ` Tetsuo Handa
  1 sibling, 1 reply; 21+ messages in thread
From: Jan Kara @ 2022-02-08 13:45 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Christoph Hellwig, Jens Axboe, Jan Kara, linux-block, Ming Lei,
	Darrick J . Wong

On Sat 05-02-22 09:28:33, Tetsuo Handa wrote:
> Ping?
> 
> I sent https://lkml.kernel.org/r/20220129071500.3566-1-penguin-kernel@I-love.SAKURA.ne.jp
> based on ideas from your series.
> 
> Since automated kernel tests are failing, can't we apply
> [PATCH 1/7] loop: revert "make autoclear operation asynchronous"
> for now if we can't come to a conclusion?

That's certainly a good start so feel free to add my Acked-by to the
revert. I agree it should be merged quickly as I think it is better to have
a theoretical deadlock in the code than userspace breakage hit in the wild.
I'll find some more time to look into this but it will take a while.

								Honza

> 
> On 2022/01/28 22:13, Tetsuo Handa wrote:
> > On 2022/01/28 22:00, Christoph Hellwig wrote:
> >> +	if (atomic_inc_return(&lo->lo_refcnt) > 1)
> >> +		return 0;
> >> +
> >>  	err = mutex_lock_killable(&lo->lo_mutex);
> >>  	if (err)
> > 
> > You did not notice my diff here...
> 
> You need to drop lo->lo_refcnt before return.
> 
> But my latest series no longer uses task work context and no longer
> holds lo->lo_mutex from lo_open()/lo_release().
> 
> > 
> >>  		return err;
> >> -	if (lo->lo_state == Lo_deleting)
> >> +	if (lo->lo_state == Lo_deleting) {
> >> +		atomic_dec(&lo->lo_refcnt);
> >>  		err = -ENXIO;
> >> -	else
> >> -		atomic_inc(&lo->lo_refcnt);
> >> +	}
> > 
> > Why do we need [PATCH 1/8] [PATCH 2/8] [PATCH 3/8] in this series?
> > Shouldn't we first make a clean revert, and keep the changes for
> > this release cycle as small as possible?
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open
  2022-02-08 13:45       ` Jan Kara
@ 2022-02-23 14:24         ` Tetsuo Handa
  0 siblings, 0 replies; 21+ messages in thread
From: Tetsuo Handa @ 2022-02-23 14:24 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, linux-block, Ming Lei, Darrick J . Wong

On 2022/02/08 22:45, Jan Kara wrote:
> On Sat 05-02-22 09:28:33, Tetsuo Handa wrote:
>> Ping?
>>
>> I sent https://lkml.kernel.org/r/20220129071500.3566-1-penguin-kernel@I-love.SAKURA.ne.jp
>> based on ideas from your series.
>>
>> Since automated kernel tests are failing, can't we apply
>> [PATCH 1/7] loop: revert "make autoclear operation asynchronous"
>> for now if we can't come to a conclusion?
> 
> That's certainly a good start so feel free to add my Acked-by to the
> revert. I agree it should be merged quickly as I think it is better to have
> a theoretical deadlock in the code than userspace breakage hit in the wild.
> I'll find some more time to look into this but it will take a while.

Did you get a chance to look into this? As far as I tested, I found two problems
( https://lkml.kernel.org/r/a72c59c6-298b-e4ba-b1f5-2275afab49a1@I-love.SAKURA.ne.jp ).
That is, waiting for lo->lo_mutex upon open is not only for /bin/mount but for other
programs.

I found that we can use anon_inodes approach (basic idea is shown below) as a way
to use task_work context. If we can agree with this approach, I can re-implement
https://lkml.kernel.org/r/20220121114006.3633-1-penguin-kernel@I-love.SAKURA.ne.jp
without exporting task_work_add().

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 19fe19eaa50e..6bd6af1836c6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -80,6 +80,7 @@
 #include <linux/blk-cgroup.h>
 #include <linux/sched/mm.h>
 #include <linux/statfs.h>
+#include <linux/anon_inodes.h>
 
 #include "loop.h"
 
@@ -1736,6 +1737,25 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
 	return err;
 }
 
+static int loop_no_open(struct inode *inode, struct file *file)
+{
+	return -ENXIO;
+}
+
+static int loop_post_release(struct inode *inode, struct file *file)
+{
+	struct loop_device *lo = file->private_data;
+
+	pr_info("Performing autoclear operation.\n");
+	__loop_clr_fd(lo, false);
+	return 0;
+}
+
+static const struct file_operations loop_close_fops = {
+	.open = loop_no_open,
+	.release = loop_post_release,
+};
+
 static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo = disk->private_data;
@@ -1745,6 +1765,8 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		goto out_unlock;
 
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
+		struct file *file;
+
 		if (lo->lo_state != Lo_bound)
 			goto out_unlock;
 		lo->lo_state = Lo_rundown;
@@ -1753,7 +1775,9 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		__loop_clr_fd(lo, true);
+		file = anon_inode_getfile("loop.close", &loop_close_fops, lo, O_CLOEXEC);
+		if (!IS_ERR(file))
+			fput(file);
 		return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-27 10:19     ` Jan Kara
@ 2022-01-27 10:28       ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-27 10:28 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Tetsuo Handa, linux-block, Ming Lei

On Thu, Jan 27, 2022 at 11:19:57AM +0100, Jan Kara wrote:
> Just after writing this I have realized that the above sequence is not
> actually possible due to disk->open_mutex protecting us and serializing
> lo_release() with lo_open() but it needs at least a comment to explain that
> we rely on disk->open_mutex to avoid races with lo_open().

The commit message already notes this.  Not sure we really need a comment
in the code itself, but if you want I can add it.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-27  9:48   ` Jan Kara
@ 2022-01-27 10:19     ` Jan Kara
  2022-01-27 10:28       ` Christoph Hellwig
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Kara @ 2022-01-27 10:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Thu 27-01-22 10:48:13, Jan Kara wrote:
> On Wed 26-01-22 16:50:36, Christoph Hellwig wrote:
> > lo_refcnt is only incremented in lo_open and decremented in lo_release,
> > and thus protected by open_mutex.  Only take lo_mutex when lo_release
> > actually takes action for the final release.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Yup, good idea. Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>

On a second thought I retract this... See below:

> > index d3a7f281ce1b6..43980ec69dfdd 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -1740,10 +1740,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
> >  {
> >  	struct loop_device *lo = disk->private_data;
> >  
> > -	mutex_lock(&lo->lo_mutex);
> > -	if (atomic_dec_return(&lo->lo_refcnt))
> > -		goto out_unlock;
> > +	if (!atomic_dec_and_test(&lo->lo_refcnt))
> > +		return;
> >  
> > +	mutex_lock(&lo->lo_mutex);

There's a subtle race here like:

Thread 1				Thread2 (mount)
lo_release()
  if (!atomic_dec_and_test(&lo->lo_refcnt))
  - sees we are last one
					lo_open()
					  mutex_lock_killable(&lo->lo_mutex);
					  atomic_inc(&lo->lo_refcnt);
					  mutex_unlock(&lo->lo_mutex);
					ioctl(LOOP_GET_STATUS)
					  sees everything is fine
  mutex_lock(&lo->lo_mutex);
  if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
    cleans up the device
  }
					  is unhappy, mount fails

Just after writing this I have realized that the above sequence is not
actually possible due to disk->open_mutex protecting us and serializing
lo_release() with lo_open() but it needs at least a comment to explain that
we rely on disk->open_mutex to avoid races with lo_open().

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-26 15:50 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
@ 2022-01-27  9:48   ` Jan Kara
  2022-01-27 10:19     ` Jan Kara
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Kara @ 2022-01-27  9:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tetsuo Handa, Jan Kara, linux-block, Ming Lei

On Wed 26-01-22 16:50:36, Christoph Hellwig wrote:
> lo_refcnt is only incremented in lo_open and decremented in lo_release,
> and thus protected by open_mutex.  Only take lo_mutex when lo_release
> actually takes action for the final release.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Yup, good idea. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/block/loop.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index d3a7f281ce1b6..43980ec69dfdd 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1740,10 +1740,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
>  {
>  	struct loop_device *lo = disk->private_data;
>  
> -	mutex_lock(&lo->lo_mutex);
> -	if (atomic_dec_return(&lo->lo_refcnt))
> -		goto out_unlock;
> +	if (!atomic_dec_and_test(&lo->lo_refcnt))
> +		return;
>  
> +	mutex_lock(&lo->lo_mutex);
>  	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
>  		if (lo->lo_state != Lo_bound)
>  			goto out_unlock;
> -- 
> 2.30.2
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release
  2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
@ 2022-01-26 15:50 ` Christoph Hellwig
  2022-01-27  9:48   ` Jan Kara
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2022-01-26 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tetsuo Handa, Jan Kara, linux-block, Ming Lei

lo_refcnt is only incremented in lo_open and decremented in lo_release,
and thus protected by open_mutex.  Only take lo_mutex when lo_release
actually takes action for the final release.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d3a7f281ce1b6..43980ec69dfdd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1740,10 +1740,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo = disk->private_data;
 
-	mutex_lock(&lo->lo_mutex);
-	if (atomic_dec_return(&lo->lo_refcnt))
-		goto out_unlock;
+	if (!atomic_dec_and_test(&lo->lo_refcnt))
+		return;
 
+	mutex_lock(&lo->lo_mutex);
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
 		if (lo->lo_state != Lo_bound)
 			goto out_unlock;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2022-02-23 14:24 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-28 13:00 yet another approach to fix loop autoclear for xfstets xfs/049 v2 Christoph Hellwig
2022-01-28 13:00 ` [PATCH 1/8] loop: de-duplicate the idle worker freeing code Christoph Hellwig
2022-01-28 13:00 ` [PATCH 2/8] loop: initialize the worker tracking fields once Christoph Hellwig
2022-01-28 13:00 ` [PATCH 3/8] loop: remove the racy bd_inode->i_mapping->nrpages asserts Christoph Hellwig
2022-01-28 13:36   ` Jan Kara
2022-01-28 13:00 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
2022-01-28 13:37   ` Jan Kara
2022-01-28 13:00 ` [PATCH 5/8] loop: only take lo_mutex for the first reference in lo_open Christoph Hellwig
2022-01-28 13:13   ` Tetsuo Handa
2022-02-05  0:28     ` Tetsuo Handa
2022-02-06  7:10       ` Tetsuo Handa
2022-02-08 13:45       ` Jan Kara
2022-02-23 14:24         ` Tetsuo Handa
2022-01-28 13:38   ` Jan Kara
2022-01-28 13:00 ` [PATCH 6/8] loop: don't freeze the queue in lo_release Christoph Hellwig
2022-01-28 13:00 ` [PATCH 7/8] loop: only freeze the queue in __loop_clr_fd when needed Christoph Hellwig
2022-01-28 13:00 ` [PATCH 8/8] loop: make autoclear operation synchronous again Christoph Hellwig
  -- strict thread matches above, loose matches on Subject: below --
2022-01-26 15:50 yet another approach to fix loop autoclear for xfstets xfs/049 Christoph Hellwig
2022-01-26 15:50 ` [PATCH 4/8] loop: only take lo_mutex for the last reference in lo_release Christoph Hellwig
2022-01-27  9:48   ` Jan Kara
2022-01-27 10:19     ` Jan Kara
2022-01-27 10:28       ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.