All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
To: Jan Kara <jack@suse.cz>
Cc: kernel test robot <oliver.sang@intel.com>,
	Jens Axboe <axboe@kernel.dk>,
	syzbot <syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com>,
	Christoph Hellwig <hch@infradead.org>,
	Christoph Hellwig <hch@lst.de>,
	LKML <linux-kernel@vger.kernel.org>,
	lkp@lists.01.org, lkp@intel.com,
	linux-block <linux-block@vger.kernel.org>
Subject: Re: [loop] 322c4293ec: xfstests.xfs.049.fail
Date: Mon, 20 Dec 2021 23:57:40 +0900	[thread overview]
Message-ID: <ee718d4e-7681-d09a-5d2a-4f9ac2172038@i-love.sakura.ne.jp> (raw)
In-Reply-To: <20211220115823.GB20005@quack2.suse.cz>

On 2021/12/20 20:58, Jan Kara wrote:
> On Mon 20-12-21 00:45:46, Tetsuo Handa wrote:
>> On 2021/12/20 0:09, kernel test robot wrote:
>>>     @@ -13,3 +13,5 @@
>>>      --- clean
>>>      --- umount ext2 on xfs
>>>      --- umount xfs
>>>     +!!! umount xfs failed
>>>     +(see /lkp/benchmarks/xfstests/results//xfs/049.full for details)
>>>     ...
>>>     (Run 'diff -u /lkp/benchmarks/xfstests/tests/xfs/049.out /lkp/benchmarks/xfstests/results//xfs/049.out.bad'  to see the entire diff)
>>
>> Yes, we know this race condition can happen.
>>
>> https://lkml.kernel.org/r/16c7d304-60ef-103f-1b2c-8592b48f47c6@i-love.sakura.ne.jp
>> https://lkml.kernel.org/r/YaYfu0H2k0PSQL6W@infradead.org
>>
>> Should we try to wait for autoclear operation to complete?
> 
> So I think we should try to fix this because as Dave writes in the
> changelog for a1ecac3b0656 ("loop: Make explicit loop device destruction
> lazy") which started all this, having random EBUSY failures (either from
> losetup or umount) is annoying and you need to work it around it lots of
> unexpected places.

OK. Let's wait for autoclear operation to complete.

> 
> We cannot easily wait for work completion in the loop device code without
> reintroducing the deadlock - whole lo_release() is called under
> disk->open_mutex which you also need to grab in __loop_clr_fd(). So to
> avoid holding backing file busy longer than expected, we could use
> task_work instead of ordinary work as I suggested - but you were right that
> we need to be somewhat careful and in case we are running in a kthread, we
> would still need to offload to a normal work (but in that case we don't
> care about delaying file release anyway).

Like fput_many() shows, it is not easy to implement fallback correctly.
I worry that exporting task_work_add() to loadable kernel modules results in
random abuse of task_work which does not implement fallback.

Instead of exporting task_work_add(), I think we can apply below diff.
I chose not to rely on WQ context, for there is fput(filp) in __loop_clr_fd()
which we should make sure that __fput(filp) is processed by a thread which
triggered autoclear operation. If this __fput(filp) is scheduled by other thread
because fput(filp) is called by a thread which did not trigger autoclear operation,
I think it is possible that a thread which triggered autoclear operation fails to
wait for completion of __fput(filp), and results in the same problem.

---
 block/bdev.c           |  2 ++
 drivers/block/loop.c   | 42 ++++++++++++------------------------------
 drivers/block/loop.h   |  2 +-
 include/linux/blkdev.h |  5 +++++
 4 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c80cfaefc0a8..b252b1d87471 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1227,6 +1227,11 @@ struct block_device_operations {
 	 * driver.
 	 */
 	int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector);
+	/*
+	 * Special callback for doing post-release callback without
+	 * disk->open_mutex held. Used by loop driver.
+	 */
+	void (*post_release)(struct gendisk *disk);
 };
 
 #ifdef CONFIG_COMPAT
diff --git a/block/bdev.c b/block/bdev.c
index 8bf93a19041b..0cb638d81a27 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -948,6 +948,8 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 	else
 		blkdev_put_whole(bdev, mode);
 	mutex_unlock(&disk->open_mutex);
+	if (bdev->bd_disk->fops->post_release)
+		bdev->bd_disk->fops->post_release(bdev->bd_disk);
 
 	module_put(disk->fops->owner);
 	blkdev_put_no_open(bdev);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 918a7a2dc025..f2e9f38694dc 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,7 +56,7 @@ struct loop_device {
 	struct gendisk		*lo_disk;
 	struct mutex		lo_mutex;
 	bool			idr_visible;
-	struct work_struct      rundown_work;
+	struct task_struct	*rundown_owner; /* current or NULL */
 };
 
 struct loop_cmd {
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b1b05c45c07c..faa3a3097b22 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1165,40 +1165,12 @@ static void __loop_clr_fd(struct loop_device *lo)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
 
 	fput(filp);
-}
-
-static void loop_rundown_completed(struct loop_device *lo)
-{
 	mutex_lock(&lo->lo_mutex);
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
 	module_put(THIS_MODULE);
 }
 
-static void loop_rundown_workfn(struct work_struct *work)
-{
-	struct loop_device *lo = container_of(work, struct loop_device,
-					      rundown_work);
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
-
-	__loop_clr_fd(lo);
-	kobject_put(&bdev->bd_device.kobj);
-	module_put(disk->fops->owner);
-	loop_rundown_completed(lo);
-}
-
-static void loop_schedule_rundown(struct loop_device *lo)
-{
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
-
-	__module_get(disk->fops->owner);
-	kobject_get(&bdev->bd_device.kobj);
-	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
-	queue_work(system_long_wq, &lo->rundown_work);
-}
-
 static int loop_clr_fd(struct loop_device *lo)
 {
 	int err;
@@ -1229,7 +1201,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	mutex_unlock(&lo->lo_mutex);
 
 	__loop_clr_fd(lo);
-	loop_rundown_completed(lo);
 	return 0;
 }
 
@@ -1754,7 +1725,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		loop_schedule_rundown(lo);
+		lo->rundown_owner = current;
 		return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*
@@ -1769,10 +1740,21 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	mutex_unlock(&lo->lo_mutex);
 }
 
+static void lo_post_release(struct gendisk *disk)
+{
+	struct loop_device *lo = disk->private_data;
+
+	if (lo->rundown_owner != current)
+		return;
+	lo->rundown_owner = NULL;
+	__loop_clr_fd(lo);
+}
+
 static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
 	.open =		lo_open,
 	.release =	lo_release,
+	.post_release = lo_post_release,
 	.ioctl =	lo_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	lo_compat_ioctl,
-- 
2.32.0


WARNING: multiple messages have this Message-ID (diff)
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
To: lkp@lists.01.org
Subject: Re: [loop] 322c4293ec: xfstests.xfs.049.fail
Date: Mon, 20 Dec 2021 23:57:40 +0900	[thread overview]
Message-ID: <ee718d4e-7681-d09a-5d2a-4f9ac2172038@i-love.sakura.ne.jp> (raw)
In-Reply-To: <20211220115823.GB20005@quack2.suse.cz>

[-- Attachment #1: Type: text/plain, Size: 6455 bytes --]

On 2021/12/20 20:58, Jan Kara wrote:
> On Mon 20-12-21 00:45:46, Tetsuo Handa wrote:
>> On 2021/12/20 0:09, kernel test robot wrote:
>>>     @@ -13,3 +13,5 @@
>>>      --- clean
>>>      --- umount ext2 on xfs
>>>      --- umount xfs
>>>     +!!! umount xfs failed
>>>     +(see /lkp/benchmarks/xfstests/results//xfs/049.full for details)
>>>     ...
>>>     (Run 'diff -u /lkp/benchmarks/xfstests/tests/xfs/049.out /lkp/benchmarks/xfstests/results//xfs/049.out.bad'  to see the entire diff)
>>
>> Yes, we know this race condition can happen.
>>
>> https://lkml.kernel.org/r/16c7d304-60ef-103f-1b2c-8592b48f47c6(a)i-love.sakura.ne.jp
>> https://lkml.kernel.org/r/YaYfu0H2k0PSQL6W(a)infradead.org
>>
>> Should we try to wait for autoclear operation to complete?
> 
> So I think we should try to fix this because as Dave writes in the
> changelog for a1ecac3b0656 ("loop: Make explicit loop device destruction
> lazy") which started all this, having random EBUSY failures (either from
> losetup or umount) is annoying and you need to work it around it lots of
> unexpected places.

OK. Let's wait for autoclear operation to complete.

> 
> We cannot easily wait for work completion in the loop device code without
> reintroducing the deadlock - whole lo_release() is called under
> disk->open_mutex which you also need to grab in __loop_clr_fd(). So to
> avoid holding backing file busy longer than expected, we could use
> task_work instead of ordinary work as I suggested - but you were right that
> we need to be somewhat careful and in case we are running in a kthread, we
> would still need to offload to a normal work (but in that case we don't
> care about delaying file release anyway).

Like fput_many() shows, it is not easy to implement fallback correctly.
I worry that exporting task_work_add() to loadable kernel modules results in
random abuse of task_work which does not implement fallback.

Instead of exporting task_work_add(), I think we can apply below diff.
I chose not to rely on WQ context, for there is fput(filp) in __loop_clr_fd()
which we should make sure that __fput(filp) is processed by a thread which
triggered autoclear operation. If this __fput(filp) is scheduled by other thread
because fput(filp) is called by a thread which did not trigger autoclear operation,
I think it is possible that a thread which triggered autoclear operation fails to
wait for completion of __fput(filp), and results in the same problem.

---
 block/bdev.c           |  2 ++
 drivers/block/loop.c   | 42 ++++++++++++------------------------------
 drivers/block/loop.h   |  2 +-
 include/linux/blkdev.h |  5 +++++
 4 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c80cfaefc0a8..b252b1d87471 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1227,6 +1227,11 @@ struct block_device_operations {
 	 * driver.
 	 */
 	int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector);
+	/*
+	 * Special callback for doing post-release callback without
+	 * disk->open_mutex held. Used by loop driver.
+	 */
+	void (*post_release)(struct gendisk *disk);
 };
 
 #ifdef CONFIG_COMPAT
diff --git a/block/bdev.c b/block/bdev.c
index 8bf93a19041b..0cb638d81a27 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -948,6 +948,8 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 	else
 		blkdev_put_whole(bdev, mode);
 	mutex_unlock(&disk->open_mutex);
+	if (bdev->bd_disk->fops->post_release)
+		bdev->bd_disk->fops->post_release(bdev->bd_disk);
 
 	module_put(disk->fops->owner);
 	blkdev_put_no_open(bdev);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 918a7a2dc025..f2e9f38694dc 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,7 +56,7 @@ struct loop_device {
 	struct gendisk		*lo_disk;
 	struct mutex		lo_mutex;
 	bool			idr_visible;
-	struct work_struct      rundown_work;
+	struct task_struct	*rundown_owner; /* current or NULL */
 };
 
 struct loop_cmd {
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b1b05c45c07c..faa3a3097b22 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1165,40 +1165,12 @@ static void __loop_clr_fd(struct loop_device *lo)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
 
 	fput(filp);
-}
-
-static void loop_rundown_completed(struct loop_device *lo)
-{
 	mutex_lock(&lo->lo_mutex);
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
 	module_put(THIS_MODULE);
 }
 
-static void loop_rundown_workfn(struct work_struct *work)
-{
-	struct loop_device *lo = container_of(work, struct loop_device,
-					      rundown_work);
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
-
-	__loop_clr_fd(lo);
-	kobject_put(&bdev->bd_device.kobj);
-	module_put(disk->fops->owner);
-	loop_rundown_completed(lo);
-}
-
-static void loop_schedule_rundown(struct loop_device *lo)
-{
-	struct block_device *bdev = lo->lo_device;
-	struct gendisk *disk = lo->lo_disk;
-
-	__module_get(disk->fops->owner);
-	kobject_get(&bdev->bd_device.kobj);
-	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
-	queue_work(system_long_wq, &lo->rundown_work);
-}
-
 static int loop_clr_fd(struct loop_device *lo)
 {
 	int err;
@@ -1229,7 +1201,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	mutex_unlock(&lo->lo_mutex);
 
 	__loop_clr_fd(lo);
-	loop_rundown_completed(lo);
 	return 0;
 }
 
@@ -1754,7 +1725,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		loop_schedule_rundown(lo);
+		lo->rundown_owner = current;
 		return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*
@@ -1769,10 +1740,21 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	mutex_unlock(&lo->lo_mutex);
 }
 
+static void lo_post_release(struct gendisk *disk)
+{
+	struct loop_device *lo = disk->private_data;
+
+	if (lo->rundown_owner != current)
+		return;
+	lo->rundown_owner = NULL;
+	__loop_clr_fd(lo);
+}
+
 static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
 	.open =		lo_open,
 	.release =	lo_release,
+	.post_release = lo_post_release,
 	.ioctl =	lo_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	lo_compat_ioctl,
-- 
2.32.0

  reply	other threads:[~2021-12-20 15:35 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-12-19 15:09 [loop] 322c4293ec: xfstests.xfs.049.fail kernel test robot
2021-12-19 15:09 ` kernel test robot
2021-12-19 15:45 ` Tetsuo Handa
2021-12-19 15:45   ` Tetsuo Handa
2021-12-20 11:58   ` Jan Kara
2021-12-20 11:58     ` Jan Kara
2021-12-20 14:57     ` Tetsuo Handa [this message]
2021-12-20 14:57       ` Tetsuo Handa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ee718d4e-7681-d09a-5d2a-4f9ac2172038@i-love.sakura.ne.jp \
    --to=penguin-kernel@i-love.sakura.ne.jp \
    --cc=axboe@kernel.dk \
    --cc=hch@infradead.org \
    --cc=hch@lst.de \
    --cc=jack@suse.cz \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lkp@intel.com \
    --cc=lkp@lists.01.org \
    --cc=oliver.sang@intel.com \
    --cc=syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.