All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] md: add support for REQ_NOWAIT
@ 2021-11-01 21:51 Vishal Verma
  2021-11-02  3:41 ` Li Feng
  2021-11-02  5:01 ` Song Liu
  0 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-01 21:51 UTC (permalink / raw)
  To: song, linux-raid, rgoldwyn; +Cc: Vishal Verma

commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
for checking whether a given bdev supports handling of REQ_NOWAIT or not.
Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
it for linear target") added support for REQ_NOWAIT for dm. This uses
a similar approach to incorporate REQ_NOWAIT for md based bios.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/md.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5111ed966947..51b2df32aed5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -419,6 +419,12 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
 	if (is_suspended(mddev, bio)) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
+			/* Bail out if REQ_NOWAIT is set for the bio */
+			if (bio->bi_opf & REQ_NOWAIT) {
+				bio_wouldblock_error(bio);
+				break;
+			}
+
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			if (!is_suspended(mddev, bio))
@@ -5792,6 +5798,7 @@ int md_run(struct mddev *mddev)
 	int err;
 	struct md_rdev *rdev;
 	struct md_personality *pers;
+	bool nowait = true;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -5862,8 +5869,14 @@ int md_run(struct mddev *mddev)
 			}
 		}
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
+		if (!blk_queue_nowait(bdev_get_queue(rdev->bdev)))
+			nowait = false;
 	}
 
+	/* Set the NOWAIT flags if all underlying devices support it */
+	if (nowait)
+		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
+
 	if (!bioset_initialized(&mddev->bio_set)) {
 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 		if (err)
@@ -7007,6 +7020,14 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	if (!mddev->thread)
 		md_update_sb(mddev, 1);
+	/* If the new disk does not support REQ_NOWAIT,
+	 * disable on the whole MD.
+	 */
+	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
+		pr_info("%s: Disabling nowait because %s does not support nowait\n",
+			mdname(mddev), bdevname(rdev->bdev, b));
+		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
+	}
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [PATCH] md: add support for REQ_NOWAIT
  2021-11-01 21:51 [PATCH] md: add support for REQ_NOWAIT Vishal Verma
@ 2021-11-02  3:41 ` Li Feng
  2021-11-02  5:01 ` Song Liu
  1 sibling, 0 replies; 86+ messages in thread
From: Li Feng @ 2021-11-02  3:41 UTC (permalink / raw)
  To: Vishal Verma; +Cc: song, linux-raid, rgoldwyn

On Tue, Nov 2, 2021 at 5:52 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/md.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 5111ed966947..51b2df32aed5 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -419,6 +419,12 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
>         if (is_suspended(mddev, bio)) {
>                 DEFINE_WAIT(__wait);
>                 for (;;) {
> +                       /* Bail out if REQ_NOWAIT is set for the bio */
> +                       if (bio->bi_opf & REQ_NOWAIT) {
> +                               bio_wouldblock_error(bio);
> +                               break;
> +                       }
> +
>                         prepare_to_wait(&mddev->sb_wait, &__wait,
>                                         TASK_UNINTERRUPTIBLE);
>                         if (!is_suspended(mddev, bio))
> @@ -5792,6 +5798,7 @@ int md_run(struct mddev *mddev)
>         int err;
>         struct md_rdev *rdev;
>         struct md_personality *pers;
> +       bool nowait = true;
>
>         if (list_empty(&mddev->disks))
>                 /* cannot run an array with no devices.. */
> @@ -5862,8 +5869,14 @@ int md_run(struct mddev *mddev)
>                         }
>                 }
>                 sysfs_notify_dirent_safe(rdev->sysfs_state);
> +               if (!blk_queue_nowait(bdev_get_queue(rdev->bdev)))
> +                       nowait = false;
I think this is more clear:
nowait = blk_queue_nowait(bdev_get_queue(rdev->bdev));

>         }
>
> +       /* Set the NOWAIT flags if all underlying devices support it */
> +       if (nowait)
> +               blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
> +
>         if (!bioset_initialized(&mddev->bio_set)) {
>                 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
>                 if (err)
> @@ -7007,6 +7020,14 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
>         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
>         if (!mddev->thread)
>                 md_update_sb(mddev, 1);
> +       /* If the new disk does not support REQ_NOWAIT,
> +        * disable on the whole MD.
> +        */
> +       if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
> +               pr_info("%s: Disabling nowait because %s does not support nowait\n",
> +                       mdname(mddev), bdevname(rdev->bdev, b));
> +               blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
> +       }
>         /*
>          * Kick recovery, maybe this spare has to be added to the
>          * array immediately.
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] md: add support for REQ_NOWAIT
  2021-11-01 21:51 [PATCH] md: add support for REQ_NOWAIT Vishal Verma
  2021-11-02  3:41 ` Li Feng
@ 2021-11-02  5:01 ` Song Liu
  2021-11-02 14:40   ` [PATCH v2] " Vishal Verma
  1 sibling, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-11-02  5:01 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn

On Mon, Nov 1, 2021 at 2:52 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/md.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 5111ed966947..51b2df32aed5 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -419,6 +419,12 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
>         if (is_suspended(mddev, bio)) {
>                 DEFINE_WAIT(__wait);
>                 for (;;) {
> +                       /* Bail out if REQ_NOWAIT is set for the bio */
> +                       if (bio->bi_opf & REQ_NOWAIT) {
> +                               bio_wouldblock_error(bio);
> +                               break;

This doesn't look right to me. We already run bio_endio() in
bio_wouldblock_error(), then we still feed the bio to make_request().
Did I misread the logic?

Please also explain how this patch was tested.

Thanks,
Song

> +                       }
> +
>                         prepare_to_wait(&mddev->sb_wait, &__wait,
>                                         TASK_UNINTERRUPTIBLE);
>                         if (!is_suspended(mddev, bio))


[...]

^ permalink raw reply	[flat|nested] 86+ messages in thread

* [PATCH v2] md: add support for REQ_NOWAIT
  2021-11-02  5:01 ` Song Liu
@ 2021-11-02 14:40   ` Vishal Verma
  2021-11-02 15:31     ` Jens Axboe
  2021-11-02 18:35     ` Song Liu
  0 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-02 14:40 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, Vishal Verma

commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
for checking whether a given bdev supports handling of REQ_NOWAIT or not.
Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
it for linear target") added support for REQ_NOWAIT for dm. This uses
a similar approach to incorporate REQ_NOWAIT for md based bios.

This patch was tested using t/io_uring tool within FIO. A nvme drive
was partitioned into 2 partitions and a simple raid 0 configuration
/dev/md0 was created.

md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
      937423872 blocks super 1.2 512k chunks

Before patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38396   38396 pts/2    00:00:00 io_uring
  38396   38397 pts/2    00:00:15 io_uring
  38396   38398 pts/2    00:00:13 iou-wrk-38397

We can see the iou-wrk-38397 io worker thread, which gets created
when io_uring sees that the underlying device (/dev/md0 in this case)
doesn't support nowait.

After patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38341   38341 pts/2    00:10:22 io_uring
  38341   38342 pts/2    00:10:37 io_uring

After running this patch, we don't see any io worker thread
being created, which indicates that io_uring saw that the
underlying device does support nowait. This is the exact behaviour
noticed on a dm device, which also supports nowait.

I also successfully tested this patch on various other
raid personalities (1, 6 and 10).

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/md.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5111ed966947..11174d32bfd7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5792,6 +5792,7 @@ int md_run(struct mddev *mddev)
 	int err;
 	struct md_rdev *rdev;
 	struct md_personality *pers;
+	bool nowait = true;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -5862,8 +5863,13 @@ int md_run(struct mddev *mddev)
 			}
 		}
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
+		nowait = blk_queue_nowait(bdev_get_queue(rdev->bdev));
 	}
 
+	/* Set the NOWAIT flags if all underlying devices support it */
+	if (nowait)
+		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
+
 	if (!bioset_initialized(&mddev->bio_set)) {
 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 		if (err)
@@ -7007,6 +7013,14 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	if (!mddev->thread)
 		md_update_sb(mddev, 1);
+	/* If the new disk does not support REQ_NOWAIT,
+	 * disable on the whole MD.
+	 */
+	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
+		pr_info("%s: Disabling nowait because %s does not support nowait\n",
+			mdname(mddev), bdevname(rdev->bdev, b));
+		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
+	}
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [PATCH v2] md: add support for REQ_NOWAIT
  2021-11-02 14:40   ` [PATCH v2] " Vishal Verma
@ 2021-11-02 15:31     ` Jens Axboe
  2021-11-02 18:35     ` Song Liu
  1 sibling, 0 replies; 86+ messages in thread
From: Jens Axboe @ 2021-11-02 15:31 UTC (permalink / raw)
  To: Vishal Verma, song, linux-raid

On 11/2/21 8:40 AM, Vishal Verma wrote:
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
> 
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
> 
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>       937423872 blocks super 1.2 512k chunks
> 
> Before patch:
> 
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
> 
> Running top while the above runs:
> 
> $ ps -eL | grep $(pidof io_uring)
> 
>   38396   38396 pts/2    00:00:00 io_uring
>   38396   38397 pts/2    00:00:15 io_uring
>   38396   38398 pts/2    00:00:13 iou-wrk-38397
> 
> We can see iou-wrk-38397 io worker thread created which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
> 
> After patch:
> 
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
> 
> Running top while the above runs:
> 
> $ ps -eL | grep $(pidof io_uring)
> 
>   38341   38341 pts/2    00:10:22 io_uring
>   38341   38342 pts/2    00:10:37 io_uring
> 
> After running this patch, we don't see any io worker thread
> being created which indicated that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device which also supports nowait.
> 
> I also successfully tested this patch on various other
> raid personalities (1, 6 and 10).

This seems incomplete. It looks like it's propagating the nowait
flag based on constituent devices, which is correct, but surely there
are cases off the make_request path that now need to check for NOWAIT
and return -EAGAIN if they need to block.

From a quick look, raid0 looks fine as-is. raid1 would need checking
for waiting on a read barrier, for example.

If we just add nowait and don't _actually_ fix the driver, then the
nowait exercise is pointless as it would then just mean that io_uring
would block attempting to submit. That's the broken aio behavior, and
that's certainly not the desired outcome here.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v2] md: add support for REQ_NOWAIT
  2021-11-02 14:40   ` [PATCH v2] " Vishal Verma
  2021-11-02 15:31     ` Jens Axboe
@ 2021-11-02 18:35     ` Song Liu
  2021-11-04  4:51       ` [PATCH v3 2/2] md: raid1 add nowait support Vishal Verma
  1 sibling, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-11-02 18:35 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe

On Tue, Nov 2, 2021 at 7:40 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
>
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>       937423872 blocks super 1.2 512k chunks
>
> Before patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38396   38396 pts/2    00:00:00 io_uring
>   38396   38397 pts/2    00:00:15 io_uring
>   38396   38398 pts/2    00:00:13 iou-wrk-38397
>
> We can see iou-wrk-38397 io worker thread created which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
>
> After patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38341   38341 pts/2    00:10:22 io_uring
>   38341   38342 pts/2    00:10:37 io_uring
>
> After running this patch, we don't see any io worker thread
> being created which indicated that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device which also supports nowait.
>
> I also successfully tested this patch on various other
> raid personalities (1, 6 and 10).
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/md.c | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 5111ed966947..11174d32bfd7 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5792,6 +5792,7 @@ int md_run(struct mddev *mddev)
>         int err;
>         struct md_rdev *rdev;
>         struct md_personality *pers;
> +       bool nowait = true;
>
>         if (list_empty(&mddev->disks))
>                 /* cannot run an array with no devices.. */
> @@ -5862,8 +5863,13 @@ int md_run(struct mddev *mddev)
>                         }
>                 }
>                 sysfs_notify_dirent_safe(rdev->sysfs_state);
> +               nowait = blk_queue_nowait(bdev_get_queue(rdev->bdev));

This doesn't look right to me. I think we need
    nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-02 18:35     ` Song Liu
@ 2021-11-04  4:51       ` Vishal Verma
  2021-11-04  4:51         ` [PATCH v3 1/2] md: add support for REQ_NOWAIT Vishal Verma
                           ` (2 more replies)
  0 siblings, 3 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-04  4:51 UTC (permalink / raw)
  To: song, linux-raid, rgoldwyn; +Cc: axboe, Vishal Verma

This adds nowait support to the RAID1 driver. It makes RAID1 driver
return with EAGAIN for situations where it could wait for eg:

  - Waiting for the barrier,
  - Array got frozen,
  - Too many pending I/Os to be queued.

The wait_barrier() function is modified to return bool so that it can
report errors for wait barriers. It returns true when it waited, or
when no wait was required; it returns false when a wait was required
but was not performed, in order to support nowait.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid1.c | 74 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7dc8026cf6ee..2e191fc2147b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 	wake_up(&conf->wait_barrier);
 }
 
-static void _wait_barrier(struct r1conf *conf, int idx)
+static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
 {
+	bool ret = true;
 	/*
 	 * We need to increase conf->nr_pending[idx] very early here,
 	 * then raise_barrier() can be blocked when it waits for
@@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	if (!READ_ONCE(conf->array_frozen) &&
 	    !atomic_read(&conf->barrier[idx]))
-		return;
+		return ret;
 
 	/*
 	 * After holding conf->resync_lock, conf->nr_pending[idx]
@@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for the barrier in same barrier unit bucket to drop. */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen &&
-			     !atomic_read(&conf->barrier[idx]),
-			    conf->resync_lock);
+	if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
+		/* Return false when nowait flag is set */
+		if (nowait)
+			ret = false;
+		else {
+			wait_event_lock_irq(conf->wait_barrier,
+					!conf->array_frozen &&
+					!atomic_read(&conf->barrier[idx]),
+					conf->resync_lock);
+		}
+	}
 	atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
+	bool ret = true;
 
 	/*
 	 * Very similar to _wait_barrier(). The difference is, for read
@@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	atomic_inc(&conf->nr_pending[idx]);
 
 	if (!READ_ONCE(conf->array_frozen))
-		return;
+		return ret;
 
 	spin_lock_irq(&conf->resync_lock);
 	atomic_inc(&conf->nr_waiting[idx]);
@@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for array to be unfrozen */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen,
-			    conf->resync_lock);
+	if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
+		if (nowait)
+			/* Return false when nowait flag is set */
+			ret = false;
+		else {
+			wait_event_lock_irq(conf->wait_barrier,
+					!conf->array_frozen,
+					conf->resync_lock);
+		}
+	}
 	atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
 
-	_wait_barrier(conf, idx);
+	return _wait_barrier(conf, idx, nowait);
 }
 
 static void _allow_barrier(struct r1conf *conf, int idx)
@@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	 * Still need barrier for READ in case that whole
 	 * array is frozen.
 	 */
-	wait_read_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	if (!r1_bio)
 		r1_bio = alloc_r1bio(mddev, bio);
@@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
 
 		DEFINE_WAIT(w);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		for (;;) {
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_IDLE);
@@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	wait_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait queued");
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
+
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
@@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
 		allow_barrier(conf, bio->bi_iter.bi_sector);
+
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf, bio->bi_iter.bi_sector);
+		wait_barrier(conf, bio->bi_iter.bi_sector, false);
 		goto retry_write;
 	}
 
@@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
 	int idx;
 
 	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
-		_wait_barrier(conf, idx);
+		_wait_barrier(conf, idx, false);
 		_allow_barrier(conf, idx);
 	}
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [PATCH v3 1/2] md: add support for REQ_NOWAIT
  2021-11-04  4:51       ` [PATCH v3 2/2] md: raid1 add nowait support Vishal Verma
@ 2021-11-04  4:51         ` Vishal Verma
  2021-11-06 15:38           ` Guoqing Jiang
  2021-11-08 22:17           ` Song Liu
  2021-11-06 15:24         ` [PATCH v3 2/2] md: raid1 add nowait support Guoqing Jiang
  2021-11-08 22:32         ` Song Liu
  2 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-04  4:51 UTC (permalink / raw)
  To: song, linux-raid, rgoldwyn; +Cc: axboe, Vishal Verma

commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
for checking whether a given bdev supports handling of REQ_NOWAIT or not.
Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
it for linear target") added support for REQ_NOWAIT for dm. This uses
a similar approach to incorporate REQ_NOWAIT for md based bios.

This patch was tested using t/io_uring tool within FIO. A nvme drive
was partitioned into 2 partitions and a simple raid 0 configuration
/dev/md0 was created.

md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
      937423872 blocks super 1.2 512k chunks

Before patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38396   38396 pts/2    00:00:00 io_uring
  38396   38397 pts/2    00:00:15 io_uring
  38396   38398 pts/2    00:00:13 iou-wrk-38397

We can see iou-wrk-38397 io worker thread created which gets created
when io_uring sees that the underlying device (/dev/md0 in this case)
doesn't support nowait.

After patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38341   38341 pts/2    00:10:22 io_uring
  38341   38342 pts/2    00:10:37 io_uring

After running this patch, we don't see any io worker thread
being created which indicated that io_uring saw that the
underlying device does support nowait. This is the exact behaviour
noticed on a dm device which also supports nowait.

For all the other raid personalities except raid0, we would need
to adjust the pieces that involve the make_request fn in order for
them to correctly handle REQ_NOWAIT.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/md.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5111ed966947..73089776475f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5792,6 +5792,7 @@ int md_run(struct mddev *mddev)
 	int err;
 	struct md_rdev *rdev;
 	struct md_personality *pers;
+	bool nowait = true;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -5862,8 +5863,13 @@ int md_run(struct mddev *mddev)
 			}
 		}
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
+		nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
 	}
 
+	/* Set the NOWAIT flags if all underlying devices support it */
+	if (nowait)
+		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
+
 	if (!bioset_initialized(&mddev->bio_set)) {
 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 		if (err)
@@ -7007,6 +7013,14 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	if (!mddev->thread)
 		md_update_sb(mddev, 1);
+	/* If the new disk does not support REQ_NOWAIT,
+	 * disable on the whole MD.
+	 */
+	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
+		pr_info("%s: Disabling nowait because %s does not support nowait\n",
+			mdname(mddev), bdevname(rdev->bdev, b));
+		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
+	}
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-04  4:51       ` [PATCH v3 2/2] md: raid1 add nowait support Vishal Verma
  2021-11-04  4:51         ` [PATCH v3 1/2] md: add support for REQ_NOWAIT Vishal Verma
@ 2021-11-06 15:24         ` Guoqing Jiang
  2021-11-07  0:18           ` Vishal Verma
  2021-11-08 22:32         ` Song Liu
  2 siblings, 1 reply; 86+ messages in thread
From: Guoqing Jiang @ 2021-11-06 15:24 UTC (permalink / raw)
  To: Vishal Verma, song, linux-raid, rgoldwyn; +Cc: axboe



On 11/4/21 12:51 PM, Vishal Verma wrote:
> This adds nowait support to the RAID1 driver.

What about raid10 and raid456?

Thanks,
Guoqing

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 1/2] md: add support for REQ_NOWAIT
  2021-11-04  4:51         ` [PATCH v3 1/2] md: add support for REQ_NOWAIT Vishal Verma
@ 2021-11-06 15:38           ` Guoqing Jiang
  2021-11-07  0:16             ` Vishal Verma
  2021-11-08 22:17           ` Song Liu
  1 sibling, 1 reply; 86+ messages in thread
From: Guoqing Jiang @ 2021-11-06 15:38 UTC (permalink / raw)
  To: Vishal Verma, song, linux-raid, rgoldwyn; +Cc: axboe



On 11/4/21 12:51 PM, Vishal Verma wrote:
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
>
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>        937423872 blocks super 1.2 512k chunks
>
> Before patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>    38396   38396 pts/2    00:00:00 io_uring
>    38396   38397 pts/2    00:00:15 io_uring
>    38396   38398 pts/2    00:00:13 iou-wrk-38397
>
> We can see iou-wrk-38397 io worker thread created which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
>
> After patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>    38341   38341 pts/2    00:10:22 io_uring
>    38341   38342 pts/2    00:10:37 io_uring
>
> After running this patch, we don't see any io worker thread
> being created which indicated that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device which also supports nowait.
>
> For all the other raid personalities except raid0, we would need
> to train pieces which involves make_request fn in order for them
> to correctly handle REQ_NOWAIT.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>   drivers/md/md.c | 14 ++++++++++++++
>   1 file changed, 14 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 5111ed966947..73089776475f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5792,6 +5792,7 @@ int md_run(struct mddev *mddev)
>   	int err;
>   	struct md_rdev *rdev;
>   	struct md_personality *pers;
> +	bool nowait = true;
>   
>   	if (list_empty(&mddev->disks))
>   		/* cannot run an array with no devices.. */
> @@ -5862,8 +5863,13 @@ int md_run(struct mddev *mddev)
>   			}
>   		}
>   		sysfs_notify_dirent_safe(rdev->sysfs_state);
> +		nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
>   	}
>   
> +	/* Set the NOWAIT flags if all underlying devices support it */
> +	if (nowait)
> +		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
> +
>   	if (!bioset_initialized(&mddev->bio_set)) {
>   		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
>   		if (err)
> @@ -7007,6 +7013,14 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
>   	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
>   	if (!mddev->thread)
>   		md_update_sb(mddev, 1);
> +	/* If the new disk does not support REQ_NOWAIT,
> +	 * disable on the whole MD.
> +	 */

The comment style is

/*
  * xxx
  */

> +	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
> +		pr_info("%s: Disabling nowait because %s does not support nowait\n",
> +			mdname(mddev), bdevname(rdev->bdev, b));

Use %pg to print block device name will be more popular though md has 
lots of legacy code
with bdevname.

Thanks,
Guoqing

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 1/2] md: add support for REQ_NOWAIT
  2021-11-06 15:38           ` Guoqing Jiang
@ 2021-11-07  0:16             ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-07  0:16 UTC (permalink / raw)
  To: Guoqing Jiang, song, linux-raid, rgoldwyn; +Cc: axboe


On 11/6/21 8:38 AM, Guoqing Jiang wrote:
>
>
> On 11/4/21 12:51 PM, Vishal Verma wrote:
>> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
>> for checking whether a given bdev supports handling of REQ_NOWAIT or 
>> not.
>> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and 
>> enable
>> it for linear target") added support for REQ_NOWAIT for dm. This uses
>> a similar approach to incorporate REQ_NOWAIT for md based bios.
>>
>> This patch was tested using t/io_uring tool within FIO. A nvme drive
>> was partitioned into 2 partitions and a simple raid 0 configuration
>> /dev/md0 was created.
>>
>> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>>        937423872 blocks super 1.2 512k chunks
>>
>> Before patch:
>>
>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>
>> Running top while the above runs:
>>
>> $ ps -eL | grep $(pidof io_uring)
>>
>>    38396   38396 pts/2    00:00:00 io_uring
>>    38396   38397 pts/2    00:00:15 io_uring
>>    38396   38398 pts/2    00:00:13 iou-wrk-38397
>>
>> We can see that the iou-wrk-38397 io worker thread gets created
>> when io_uring sees that the underlying device (/dev/md0 in this case)
>> doesn't support nowait.
>>
>> After patch:
>>
>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>
>> Running top while the above runs:
>>
>> $ ps -eL | grep $(pidof io_uring)
>>
>>    38341   38341 pts/2    00:10:22 io_uring
>>    38341   38342 pts/2    00:10:37 io_uring
>>
>> After running this patch, we don't see any io worker thread
>> being created, which indicates that io_uring saw that the
>> underlying device does support nowait. This is the exact behaviour
>> noticed on a dm device which also supports nowait.
>>
>> For all the other raid personalities except raid0, we would need
>> to train pieces which involves make_request fn in order for them
>> to correctly handle REQ_NOWAIT.
>>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>> ---
>>   drivers/md/md.c | 14 ++++++++++++++
>>   1 file changed, 14 insertions(+)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 5111ed966947..73089776475f 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -5792,6 +5792,7 @@ int md_run(struct mddev *mddev)
>>       int err;
>>       struct md_rdev *rdev;
>>       struct md_personality *pers;
>> +    bool nowait = true;
>>         if (list_empty(&mddev->disks))
>>           /* cannot run an array with no devices.. */
>> @@ -5862,8 +5863,13 @@ int md_run(struct mddev *mddev)
>>               }
>>           }
>>           sysfs_notify_dirent_safe(rdev->sysfs_state);
>> +        nowait = nowait && 
>> blk_queue_nowait(bdev_get_queue(rdev->bdev));
>>       }
>>   +    /* Set the NOWAIT flags if all underlying devices support it */
>> +    if (nowait)
>> +        blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
>> +
>>       if (!bioset_initialized(&mddev->bio_set)) {
>>           err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, 
>> BIOSET_NEED_BVECS);
>>           if (err)
>> @@ -7007,6 +7013,14 @@ static int hot_add_disk(struct mddev *mddev, 
>> dev_t dev)
>>       set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
>>       if (!mddev->thread)
>>           md_update_sb(mddev, 1);
>> +    /* If the new disk does not support REQ_NOWAIT,
>> +     * disable on the whole MD.
>> +     */
>
> The comment style is
>
> /*
>  * xxx
>  */
> Ack, will fix it.
>> +    if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
>> +        pr_info("%s: Disabling nowait because %s does not support 
>> nowait\n",
>> +            mdname(mddev), bdevname(rdev->bdev, b));
>
> Use %pg to print block device name will be more popular though md has 
> lots of legacy code
> with bdevname.
>
> Thanks for pointing, will fix it!
>
> Thanks,
> Guoqing

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-06 15:24         ` [PATCH v3 2/2] md: raid1 add nowait support Guoqing Jiang
@ 2021-11-07  0:18           ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-07  0:18 UTC (permalink / raw)
  To: Guoqing Jiang, song, linux-raid, rgoldwyn; +Cc: axboe

Yes, that will be next.
Sorry I wasn't clearer, but I wanted to see if the changes so far for 
raid1 make sense or not.

On 11/6/21 8:24 AM, Guoqing Jiang wrote:
>
>
> On 11/4/21 12:51 PM, Vishal Verma wrote:
>> This adds nowait support to the RAID1 driver.
>
> What about raid10 and raid456?
>
> Thanks,
> Guoqing

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 1/2] md: add support for REQ_NOWAIT
  2021-11-04  4:51         ` [PATCH v3 1/2] md: add support for REQ_NOWAIT Vishal Verma
  2021-11-06 15:38           ` Guoqing Jiang
@ 2021-11-08 22:17           ` Song Liu
  2021-11-08 22:36             ` Vishal Verma
  1 sibling, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-11-08 22:17 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Wed, Nov 3, 2021 at 9:52 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
>
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>       937423872 blocks super 1.2 512k chunks
>
> Before patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38396   38396 pts/2    00:00:00 io_uring
>   38396   38397 pts/2    00:00:15 io_uring
>   38396   38398 pts/2    00:00:13 iou-wrk-38397
>
> We can see iou-wrk-38397 io worker thread created which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
>
> After patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38341   38341 pts/2    00:10:22 io_uring
>   38341   38342 pts/2    00:10:37 io_uring
>
> After running this patch, we don't see any io worker thread
> being created which indicated that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device which also supports nowait.
>
> For all the other raid personalities except raid0, we would need
> to train pieces which involves make_request fn in order for them
> to correctly handle REQ_NOWAIT.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>

I think we still need the logic in md_handle_request() similar to v1?

Thanks,
Song

> ---
>  drivers/md/md.c | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 5111ed966947..73089776475f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5792,6 +5792,7 @@ int md_run(struct mddev *mddev)
>         int err;
>         struct md_rdev *rdev;
>         struct md_personality *pers;
> +       bool nowait = true;
>
>         if (list_empty(&mddev->disks))
>                 /* cannot run an array with no devices.. */
> @@ -5862,8 +5863,13 @@ int md_run(struct mddev *mddev)
>                         }
>                 }
>                 sysfs_notify_dirent_safe(rdev->sysfs_state);
> +               nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
>         }
>
> +       /* Set the NOWAIT flags if all underlying devices support it */
> +       if (nowait)
> +               blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
> +
>         if (!bioset_initialized(&mddev->bio_set)) {
>                 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
>                 if (err)
> @@ -7007,6 +7013,14 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
>         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
>         if (!mddev->thread)
>                 md_update_sb(mddev, 1);
> +       /* If the new disk does not support REQ_NOWAIT,
> +        * disable on the whole MD.
> +        */
> +       if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
> +               pr_info("%s: Disabling nowait because %s does not support nowait\n",
> +                       mdname(mddev), bdevname(rdev->bdev, b));
> +               blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
> +       }
>         /*
>          * Kick recovery, maybe this spare has to be added to the
>          * array immediately.
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-04  4:51       ` [PATCH v3 2/2] md: raid1 add nowait support Vishal Verma
  2021-11-04  4:51         ` [PATCH v3 1/2] md: add support for REQ_NOWAIT Vishal Verma
  2021-11-06 15:24         ` [PATCH v3 2/2] md: raid1 add nowait support Guoqing Jiang
@ 2021-11-08 22:32         ` Song Liu
  2021-11-08 22:39           ` Vishal Verma
  2021-11-10 18:14           ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Vishal Verma
  2 siblings, 2 replies; 86+ messages in thread
From: Song Liu @ 2021-11-08 22:32 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Wed, Nov 3, 2021 at 9:52 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> This adds nowait support to the RAID1 driver. It makes RAID1 driver
> return with EAGAIN for situations where it could wait for eg:
>
>   - Waiting for the barrier,
>   - Array got frozen,
>   - Too many pending I/Os to be queued.
>
> wait_barrier() fn is modified to return bool to support error for
> wait barriers. It returns true in case of wait or if wait is not
> required and returns false if wait was required but not performed
> to support nowait.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/raid1.c | 74 +++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 7dc8026cf6ee..2e191fc2147b 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
>         wake_up(&conf->wait_barrier);
>  }
>
> -static void _wait_barrier(struct r1conf *conf, int idx)
> +static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
>  {
> +       bool ret = true;
>         /*
>          * We need to increase conf->nr_pending[idx] very early here,
>          * then raise_barrier() can be blocked when it waits for
> @@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>          */
>         if (!READ_ONCE(conf->array_frozen) &&
>             !atomic_read(&conf->barrier[idx]))
> -               return;
> +               return ret;
>
>         /*
>          * After holding conf->resync_lock, conf->nr_pending[idx]
> @@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>          */
>         wake_up(&conf->wait_barrier);
>         /* Wait for the barrier in same barrier unit bucket to drop. */
> -       wait_event_lock_irq(conf->wait_barrier,
> -                           !conf->array_frozen &&
> -                            !atomic_read(&conf->barrier[idx]),
> -                           conf->resync_lock);
> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
> +               /* Return false when nowait flag is set */
> +               if (nowait)
> +                       ret = false;
> +               else {
> +                       wait_event_lock_irq(conf->wait_barrier,
> +                                       !conf->array_frozen &&
> +                                       !atomic_read(&conf->barrier[idx]),
> +                                       conf->resync_lock);
> +               }
> +       }
>         atomic_inc(&conf->nr_pending[idx]);
>         atomic_dec(&conf->nr_waiting[idx]);
>         spin_unlock_irq(&conf->resync_lock);
> +       return ret;
>  }
>
> -static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
> +static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>  {
>         int idx = sector_to_idx(sector_nr);
> +       bool ret = true;
>
>         /*
>          * Very similar to _wait_barrier(). The difference is, for read
> @@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>         atomic_inc(&conf->nr_pending[idx]);
>
>         if (!READ_ONCE(conf->array_frozen))
> -               return;
> +               return ret;
>
>         spin_lock_irq(&conf->resync_lock);
>         atomic_inc(&conf->nr_waiting[idx]);
> @@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>          */
>         wake_up(&conf->wait_barrier);
>         /* Wait for array to be unfrozen */
> -       wait_event_lock_irq(conf->wait_barrier,
> -                           !conf->array_frozen,
> -                           conf->resync_lock);
> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
> +               if (nowait)
> +                       /* Return false when nowait flag is set */
> +                       ret = false;
> +               else {
> +                       wait_event_lock_irq(conf->wait_barrier,
> +                                       !conf->array_frozen,
> +                                       conf->resync_lock);
> +               }
> +       }
>         atomic_inc(&conf->nr_pending[idx]);
>         atomic_dec(&conf->nr_waiting[idx]);
>         spin_unlock_irq(&conf->resync_lock);
> +       return ret;
>  }
>
> -static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
> +static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>  {
>         int idx = sector_to_idx(sector_nr);
>
> -       _wait_barrier(conf, idx);
> +       return _wait_barrier(conf, idx, nowait);
>  }
>
>  static void _allow_barrier(struct r1conf *conf, int idx)
> @@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>          * Still need barrier for READ in case that whole
>          * array is frozen.
>          */
> -       wait_read_barrier(conf, bio->bi_iter.bi_sector);
> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
> +                               bio->bi_opf & REQ_NOWAIT)) {
> +               bio_wouldblock_error(bio);
> +               return;
> +       }
>
>         if (!r1_bio)
>                 r1_bio = alloc_r1bio(mddev, bio);
> @@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>                      bio->bi_iter.bi_sector, bio_end_sector(bio))) {
>
>                 DEFINE_WAIT(w);
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 for (;;) {
>                         prepare_to_wait(&conf->wait_barrier,
>                                         &w, TASK_IDLE);
> @@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>          * thread has put up a bar for new requests.
>          * Continue immediately if no resync is active currently.
>          */
> -       wait_barrier(conf, bio->bi_iter.bi_sector);
> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,

We change wait_barrier to wait_read_barrier here, I guess this is a typo?

Please include changes in raid10 and raid456 (or don't set QUEUE_FLAG_NOWAIT
for these personalities) and resend the patch. We will target it for
the next merge
window (5.17).

Thanks,
Song


> +                               bio->bi_opf & REQ_NOWAIT)) {
> +               bio_wouldblock_error(bio);
> +               return;
> +       }
>
>         r1_bio = alloc_r1bio(mddev, bio);
>         r1_bio->sectors = max_write_sectors;
>
>         if (conf->pending_count >= max_queued_requests) {
>                 md_wakeup_thread(mddev->thread);
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 raid1_log(mddev, "wait queued");
>                 wait_event(conf->wait_barrier,
>                            conf->pending_count < max_queued_requests);
>         }
> +
>         /* first select target devices under rcu_lock and
>          * inc refcount on their rdev.  Record them by setting
>          * bios[x] to bio
> @@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>                                 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
>                 r1_bio->state = 0;
>                 allow_barrier(conf, bio->bi_iter.bi_sector);
> +
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
>                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
> -               wait_barrier(conf, bio->bi_iter.bi_sector);
> +               wait_barrier(conf, bio->bi_iter.bi_sector, false);
>                 goto retry_write;
>         }
>
> @@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
>         int idx;
>
>         for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
> -               _wait_barrier(conf, idx);
> +               _wait_barrier(conf, idx, false);
>                 _allow_barrier(conf, idx);
>         }
>
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 1/2] md: add support for REQ_NOWAIT
  2021-11-08 22:17           ` Song Liu
@ 2021-11-08 22:36             ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-08 22:36 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 11/8/21 3:17 PM, Song Liu wrote:
> On Wed, Nov 3, 2021 at 9:52 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
>> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
>> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
>> it for linear target") added support for REQ_NOWAIT for dm. This uses
>> a similar approach to incorporate REQ_NOWAIT for md based bios.
>>
>> This patch was tested using t/io_uring tool within FIO. A nvme drive
>> was partitioned into 2 partitions and a simple raid 0 configuration
>> /dev/md0 was created.
>>
>> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>>        937423872 blocks super 1.2 512k chunks
>>
>> Before patch:
>>
>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>
>> Running top while the above runs:
>>
>> $ ps -eL | grep $(pidof io_uring)
>>
>>    38396   38396 pts/2    00:00:00 io_uring
>>    38396   38397 pts/2    00:00:15 io_uring
>>    38396   38398 pts/2    00:00:13 iou-wrk-38397
>>
>> We can see iou-wrk-38397 io worker thread created which gets created
>> when io_uring sees that the underlying device (/dev/md0 in this case)
>> doesn't support nowait.
>>
>> After patch:
>>
>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>
>> Running top while the above runs:
>>
>> $ ps -eL | grep $(pidof io_uring)
>>
>>    38341   38341 pts/2    00:10:22 io_uring
>>    38341   38342 pts/2    00:10:37 io_uring
>>
>> After running this patch, we don't see any io worker thread
>> being created which indicated that io_uring saw that the
>> underlying device does support nowait. This is the exact behaviour
>> noticed on a dm device which also supports nowait.
>>
>> For all the other raid personalities except raid0, we would need
>> to train pieces which involves make_request fn in order for them
>> to correctly handle REQ_NOWAIT.
>>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> I think we still need the logic in md_handle_request() similar to v1?
>
> Thanks,
> Song
>
> Yes, I believe so. I misunderstood your earlier comment in v1 regarding bio_endio().
> Will fix it.
>> ---
>>   drivers/md/md.c | 14 ++++++++++++++
>>   1 file changed, 14 insertions(+)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 5111ed966947..73089776475f 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -5792,6 +5792,7 @@ int md_run(struct mddev *mddev)
>>          int err;
>>          struct md_rdev *rdev;
>>          struct md_personality *pers;
>> +       bool nowait = true;
>>
>>          if (list_empty(&mddev->disks))
>>                  /* cannot run an array with no devices.. */
>> @@ -5862,8 +5863,13 @@ int md_run(struct mddev *mddev)
>>                          }
>>                  }
>>                  sysfs_notify_dirent_safe(rdev->sysfs_state);
>> +               nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
>>          }
>>
>> +       /* Set the NOWAIT flags if all underlying devices support it */
>> +       if (nowait)
>> +               blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
>> +
>>          if (!bioset_initialized(&mddev->bio_set)) {
>>                  err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
>>                  if (err)
>> @@ -7007,6 +7013,14 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
>>          set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
>>          if (!mddev->thread)
>>                  md_update_sb(mddev, 1);
>> +       /* If the new disk does not support REQ_NOWAIT,
>> +        * disable on the whole MD.
>> +        */
>> +       if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
>> +               pr_info("%s: Disabling nowait because %s does not support nowait\n",
>> +                       mdname(mddev), bdevname(rdev->bdev, b));
>> +               blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
>> +       }
>>          /*
>>           * Kick recovery, maybe this spare has to be added to the
>>           * array immediately.
>> --
>> 2.17.1
>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-08 22:32         ` Song Liu
@ 2021-11-08 22:39           ` Vishal Verma
  2021-11-09 20:59             ` Vishal Verma
  2021-11-10 18:14           ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Vishal Verma
  1 sibling, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-11-08 22:39 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 11/8/21 3:32 PM, Song Liu wrote:
> On Wed, Nov 3, 2021 at 9:52 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> This adds nowait support to the RAID1 driver. It makes RAID1 driver
>> return with EAGAIN for situations where it could wait for eg:
>>
>>    - Waiting for the barrier,
>>    - Array got frozen,
>>    - Too many pending I/Os to be queued.
>>
>> wait_barrier() fn is modified to return bool to support error for
>> wait barriers. It returns true in case of wait or if wait is not
>> required and returns false if wait was required but not performed
>> to support nowait.
>>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>> ---
>>   drivers/md/raid1.c | 74 +++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 57 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 7dc8026cf6ee..2e191fc2147b 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
>>          wake_up(&conf->wait_barrier);
>>   }
>>
>> -static void _wait_barrier(struct r1conf *conf, int idx)
>> +static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
>>   {
>> +       bool ret = true;
>>          /*
>>           * We need to increase conf->nr_pending[idx] very early here,
>>           * then raise_barrier() can be blocked when it waits for
>> @@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>>           */
>>          if (!READ_ONCE(conf->array_frozen) &&
>>              !atomic_read(&conf->barrier[idx]))
>> -               return;
>> +               return ret;
>>
>>          /*
>>           * After holding conf->resync_lock, conf->nr_pending[idx]
>> @@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>>           */
>>          wake_up(&conf->wait_barrier);
>>          /* Wait for the barrier in same barrier unit bucket to drop. */
>> -       wait_event_lock_irq(conf->wait_barrier,
>> -                           !conf->array_frozen &&
>> -                            !atomic_read(&conf->barrier[idx]),
>> -                           conf->resync_lock);
>> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
>> +               /* Return false when nowait flag is set */
>> +               if (nowait)
>> +                       ret = false;
>> +               else {
>> +                       wait_event_lock_irq(conf->wait_barrier,
>> +                                       !conf->array_frozen &&
>> +                                       !atomic_read(&conf->barrier[idx]),
>> +                                       conf->resync_lock);
>> +               }
>> +       }
>>          atomic_inc(&conf->nr_pending[idx]);
>>          atomic_dec(&conf->nr_waiting[idx]);
>>          spin_unlock_irq(&conf->resync_lock);
>> +       return ret;
>>   }
>>
>> -static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>> +static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>>   {
>>          int idx = sector_to_idx(sector_nr);
>> +       bool ret = true;
>>
>>          /*
>>           * Very similar to _wait_barrier(). The difference is, for read
>> @@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>>          atomic_inc(&conf->nr_pending[idx]);
>>
>>          if (!READ_ONCE(conf->array_frozen))
>> -               return;
>> +               return ret;
>>
>>          spin_lock_irq(&conf->resync_lock);
>>          atomic_inc(&conf->nr_waiting[idx]);
>> @@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>>           */
>>          wake_up(&conf->wait_barrier);
>>          /* Wait for array to be unfrozen */
>> -       wait_event_lock_irq(conf->wait_barrier,
>> -                           !conf->array_frozen,
>> -                           conf->resync_lock);
>> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
>> +               if (nowait)
>> +                       /* Return false when nowait flag is set */
>> +                       ret = false;
>> +               else {
>> +                       wait_event_lock_irq(conf->wait_barrier,
>> +                                       !conf->array_frozen,
>> +                                       conf->resync_lock);
>> +               }
>> +       }
>>          atomic_inc(&conf->nr_pending[idx]);
>>          atomic_dec(&conf->nr_waiting[idx]);
>>          spin_unlock_irq(&conf->resync_lock);
>> +       return ret;
>>   }
>>
>> -static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
>> +static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>>   {
>>          int idx = sector_to_idx(sector_nr);
>>
>> -       _wait_barrier(conf, idx);
>> +       return _wait_barrier(conf, idx, nowait);
>>   }
>>
>>   static void _allow_barrier(struct r1conf *conf, int idx)
>> @@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>           * Still need barrier for READ in case that whole
>>           * array is frozen.
>>           */
>> -       wait_read_barrier(conf, bio->bi_iter.bi_sector);
>> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
>> +                               bio->bi_opf & REQ_NOWAIT)) {
>> +               bio_wouldblock_error(bio);
>> +               return;
>> +       }
>>
>>          if (!r1_bio)
>>                  r1_bio = alloc_r1bio(mddev, bio);
>> @@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>                       bio->bi_iter.bi_sector, bio_end_sector(bio))) {
>>
>>                  DEFINE_WAIT(w);
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  for (;;) {
>>                          prepare_to_wait(&conf->wait_barrier,
>>                                          &w, TASK_IDLE);
>> @@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>           * thread has put up a bar for new requests.
>>           * Continue immediately if no resync is active currently.
>>           */
>> -       wait_barrier(conf, bio->bi_iter.bi_sector);
>> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
> We change wait_barrier to wait_read_barrier here, I guess this is a typo?
>
> Please include changes in raid10 and raid456 (or don't set QUEUE_FLAG_NOWAIT
> for these personalities) and resend the patch. We will target it for
> the next merge
> window (5.17).
>
> Thanks,
> Song
>
> Thanks Song. I am almost done with the raid10 change and will do the
> raid456 soon and resend.
>> +                               bio->bi_opf & REQ_NOWAIT)) {
>> +               bio_wouldblock_error(bio);
>> +               return;
>> +       }
>>
>>          r1_bio = alloc_r1bio(mddev, bio);
>>          r1_bio->sectors = max_write_sectors;
>>
>>          if (conf->pending_count >= max_queued_requests) {
>>                  md_wakeup_thread(mddev->thread);
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  raid1_log(mddev, "wait queued");
>>                  wait_event(conf->wait_barrier,
>>                             conf->pending_count < max_queued_requests);
>>          }
>> +
>>          /* first select target devices under rcu_lock and
>>           * inc refcount on their rdev.  Record them by setting
>>           * bios[x] to bio
>> @@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>                                  rdev_dec_pending(conf->mirrors[j].rdev, mddev);
>>                  r1_bio->state = 0;
>>                  allow_barrier(conf, bio->bi_iter.bi_sector);
>> +
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
>>                  md_wait_for_blocked_rdev(blocked_rdev, mddev);
>> -               wait_barrier(conf, bio->bi_iter.bi_sector);
>> +               wait_barrier(conf, bio->bi_iter.bi_sector, false);
>>                  goto retry_write;
>>          }
>>
>> @@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
>>          int idx;
>>
>>          for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
>> -               _wait_barrier(conf, idx);
>> +               _wait_barrier(conf, idx, false);
>>                  _allow_barrier(conf, idx);
>>          }
>>
>> --
>> 2.17.1
>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-08 22:39           ` Vishal Verma
@ 2021-11-09 20:59             ` Vishal Verma
  2021-11-10 17:02               ` Song Liu
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-11-09 20:59 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe

Hi Song,

I did modify raid456 and raid10 with nowait support, but unfortunately 
I have been running into kernel task hangs and panics while doing write 
IO, and I am having a hard time trying to debug them.

Shall I post the patches to get feedback, or go with an alternative 
route and add a flag to enable nowait only for raid1 for now?

Thanks,

Vishal

On 11/8/21 3:39 PM, Vishal Verma wrote:
>
> On 11/8/21 3:32 PM, Song Liu wrote:
>> On Wed, Nov 3, 2021 at 9:52 PM Vishal Verma <vverma@digitalocean.com> 
>> wrote:
>>> This adds nowait support to the RAID1 driver. It makes RAID1 driver
>>> return with EAGAIN for situations where it could wait for eg:
>>>
>>>    - Waiting for the barrier,
>>>    - Array got frozen,
>>>    - Too many pending I/Os to be queued.
>>>
>>> wait_barrier() fn is modified to return bool to support error for
>>> wait barriers. It returns true in case of wait or if wait is not
>>> required and returns false if wait was required but not performed
>>> to support nowait.
>>>
>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>> ---
>>>   drivers/md/raid1.c | 74 
>>> +++++++++++++++++++++++++++++++++++-----------
>>>   1 file changed, 57 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>>> index 7dc8026cf6ee..2e191fc2147b 100644
>>> --- a/drivers/md/raid1.c
>>> +++ b/drivers/md/raid1.c
>>> @@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, 
>>> sector_t sector_nr)
>>>          wake_up(&conf->wait_barrier);
>>>   }
>>>
>>> -static void _wait_barrier(struct r1conf *conf, int idx)
>>> +static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
>>>   {
>>> +       bool ret = true;
>>>          /*
>>>           * We need to increase conf->nr_pending[idx] very early here,
>>>           * then raise_barrier() can be blocked when it waits for
>>> @@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, 
>>> int idx)
>>>           */
>>>          if (!READ_ONCE(conf->array_frozen) &&
>>>              !atomic_read(&conf->barrier[idx]))
>>> -               return;
>>> +               return ret;
>>>
>>>          /*
>>>           * After holding conf->resync_lock, conf->nr_pending[idx]
>>> @@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, 
>>> int idx)
>>>           */
>>>          wake_up(&conf->wait_barrier);
>>>          /* Wait for the barrier in same barrier unit bucket to 
>>> drop. */
>>> -       wait_event_lock_irq(conf->wait_barrier,
>>> -                           !conf->array_frozen &&
>>> - !atomic_read(&conf->barrier[idx]),
>>> -                           conf->resync_lock);
>>> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
>>> +               /* Return false when nowait flag is set */
>>> +               if (nowait)
>>> +                       ret = false;
>>> +               else {
>>> + wait_event_lock_irq(conf->wait_barrier,
>>> +                                       !conf->array_frozen &&
>>> + !atomic_read(&conf->barrier[idx]),
>>> +                                       conf->resync_lock);
>>> +               }
>>> +       }
>>>          atomic_inc(&conf->nr_pending[idx]);
>>>          atomic_dec(&conf->nr_waiting[idx]);
>>>          spin_unlock_irq(&conf->resync_lock);
>>> +       return ret;
>>>   }
>>>
>>> -static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>>> +static bool wait_read_barrier(struct r1conf *conf, sector_t 
>>> sector_nr, bool nowait)
>>>   {
>>>          int idx = sector_to_idx(sector_nr);
>>> +       bool ret = true;
>>>
>>>          /*
>>>           * Very similar to _wait_barrier(). The difference is, for 
>>> read
>>> @@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf 
>>> *conf, sector_t sector_nr)
>>>          atomic_inc(&conf->nr_pending[idx]);
>>>
>>>          if (!READ_ONCE(conf->array_frozen))
>>> -               return;
>>> +               return ret;
>>>
>>>          spin_lock_irq(&conf->resync_lock);
>>>          atomic_inc(&conf->nr_waiting[idx]);
>>> @@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf 
>>> *conf, sector_t sector_nr)
>>>           */
>>>          wake_up(&conf->wait_barrier);
>>>          /* Wait for array to be unfrozen */
>>> -       wait_event_lock_irq(conf->wait_barrier,
>>> -                           !conf->array_frozen,
>>> -                           conf->resync_lock);
>>> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
>>> +               if (nowait)
>>> +                       /* Return false when nowait flag is set */
>>> +                       ret = false;
>>> +               else {
>>> + wait_event_lock_irq(conf->wait_barrier,
>>> + !conf->array_frozen,
>>> +                                       conf->resync_lock);
>>> +               }
>>> +       }
>>>          atomic_inc(&conf->nr_pending[idx]);
>>>          atomic_dec(&conf->nr_waiting[idx]);
>>>          spin_unlock_irq(&conf->resync_lock);
>>> +       return ret;
>>>   }
>>>
>>> -static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
>>> +static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, 
>>> bool nowait)
>>>   {
>>>          int idx = sector_to_idx(sector_nr);
>>>
>>> -       _wait_barrier(conf, idx);
>>> +       return _wait_barrier(conf, idx, nowait);
>>>   }
>>>
>>>   static void _allow_barrier(struct r1conf *conf, int idx)
>>> @@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>           * Still need barrier for READ in case that whole
>>>           * array is frozen.
>>>           */
>>> -       wait_read_barrier(conf, bio->bi_iter.bi_sector);
>>> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
>>> +                               bio->bi_opf & REQ_NOWAIT)) {
>>> +               bio_wouldblock_error(bio);
>>> +               return;
>>> +       }
>>>
>>>          if (!r1_bio)
>>>                  r1_bio = alloc_r1bio(mddev, bio);
>>> @@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>                       bio->bi_iter.bi_sector, bio_end_sector(bio))) {
>>>
>>>                  DEFINE_WAIT(w);
>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>> +                       bio_wouldblock_error(bio);
>>> +                       return;
>>> +               }
>>>                  for (;;) {
>>> prepare_to_wait(&conf->wait_barrier,
>>>                                          &w, TASK_IDLE);
>>> @@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>           * thread has put up a bar for new requests.
>>>           * Continue immediately if no resync is active currently.
>>>           */
>>> -       wait_barrier(conf, bio->bi_iter.bi_sector);
>>> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
>> We change wait_barrier to wait_read_barrier here, I guess this is a 
>> typo?
>>
>> Please include changes in raid10 and raid456 (or don't set 
>> QUEUE_FLAG_NOWAIT
>> for these personalities) and resend the patch. We will target it for
>> the next merge
>> window (5.17).
>>
>> Thanks,
>> Song
>>
>> Thanks Song. I am almost done with the raid10 change and will do the
>> raid456 soon and resend.
>>> + bio->bi_opf & REQ_NOWAIT)) {
>>> +               bio_wouldblock_error(bio);
>>> +               return;
>>> +       }
>>>
>>>          r1_bio = alloc_r1bio(mddev, bio);
>>>          r1_bio->sectors = max_write_sectors;
>>>
>>>          if (conf->pending_count >= max_queued_requests) {
>>>                  md_wakeup_thread(mddev->thread);
>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>> +                       bio_wouldblock_error(bio);
>>> +                       return;
>>> +               }
>>>                  raid1_log(mddev, "wait queued");
>>>                  wait_event(conf->wait_barrier,
>>>                             conf->pending_count < max_queued_requests);
>>>          }
>>> +
>>>          /* first select target devices under rcu_lock and
>>>           * inc refcount on their rdev.  Record them by setting
>>>           * bios[x] to bio
>>> @@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>> rdev_dec_pending(conf->mirrors[j].rdev, mddev);
>>>                  r1_bio->state = 0;
>>>                  allow_barrier(conf, bio->bi_iter.bi_sector);
>>> +
>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>> +                       bio_wouldblock_error(bio);
>>> +                       return;
>>> +               }
>>>                  raid1_log(mddev, "wait rdev %d blocked", 
>>> blocked_rdev->raid_disk);
>>>                  md_wait_for_blocked_rdev(blocked_rdev, mddev);
>>> -               wait_barrier(conf, bio->bi_iter.bi_sector);
>>> +               wait_barrier(conf, bio->bi_iter.bi_sector, false);
>>>                  goto retry_write;
>>>          }
>>>
>>> @@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
>>>          int idx;
>>>
>>>          for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
>>> -               _wait_barrier(conf, idx);
>>> +               _wait_barrier(conf, idx, false);
>>>                  _allow_barrier(conf, idx);
>>>          }
>>>
>>> -- 
>>> 2.17.1
>>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-09 20:59             ` Vishal Verma
@ 2021-11-10 17:02               ` Song Liu
  2021-11-10 17:04                 ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-11-10 17:02 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Tue, Nov 9, 2021 at 12:59 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> Hi Song,
>
> I did modify raid456 and raid10 with nowait support, but unfortunately
> have been running into kernel task hung and panics while doing write IO
> and having hard time trying to debug.
>
> Shall I post the patches in to get the feedback or go with an
> alternative route to have a flag to only enable nowait for raid1 for now?
>

There is still sufficient time before 5.17, so I would recommend we do all
personalities together. You can always post patches as RFC for feedback.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v3 2/2] md: raid1 add nowait support
  2021-11-10 17:02               ` Song Liu
@ 2021-11-10 17:04                 ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-10 17:04 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe

Ack, thanks!

On 11/10/21 10:02 AM, Song Liu wrote:
> On Tue, Nov 9, 2021 at 12:59 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> Hi Song,
>>
>> I did modify raid456 and raid10 with nowait support, but unfortunately
>> have been running into kernel task hung and panics while doing write IO
>> and having hard time trying to debug.
>>
>> Shall I post the patches in to get the feedback or go with an
>> alternative route to have a flag to only enable nowait for raid1 for now?
>>
> There is still sufficient time before 5.17, so I would recommend we do all
> personalities together. You can always post patches as RFC for feedback.
>
> Thanks,
> Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT
  2021-11-08 22:32         ` Song Liu
  2021-11-08 22:39           ` Vishal Verma
@ 2021-11-10 18:14           ` Vishal Verma
  2021-11-10 18:14             ` [RFC PATCH v4 2/4] md: raid1 add nowait support Vishal Verma
                               ` (3 more replies)
  1 sibling, 4 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-10 18:14 UTC (permalink / raw)
  To: song, linux-raid, rgoldwyn; +Cc: axboe, Vishal Verma

commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
for checking whether a given bdev supports handling of REQ_NOWAIT or not.
Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
it for linear target") added support for REQ_NOWAIT for dm. This uses
a similar approach to incorporate REQ_NOWAIT for md based bios.

This patch was tested using t/io_uring tool within FIO. A nvme drive
was partitioned into 2 partitions and a simple raid 0 configuration
/dev/md0 was created.

md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
      937423872 blocks super 1.2 512k chunks

Before patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38396   38396 pts/2    00:00:00 io_uring
  38396   38397 pts/2    00:00:15 io_uring
  38396   38398 pts/2    00:00:13 iou-wrk-38397

We can see the iou-wrk-38397 io worker thread, which gets created
when io_uring sees that the underlying device (/dev/md0 in this case)
doesn't support nowait.

After patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38341   38341 pts/2    00:10:22 io_uring
  38341   38342 pts/2    00:10:37 io_uring

After running this patch, we don't see any io worker thread
being created, which indicates that io_uring saw that the
underlying device does support nowait. This is the exact behaviour
noticed on a dm device, which also supports nowait.

For all the other raid personalities except raid0, we would need
to train the pieces which involve the make_request fn in order for
them to correctly handle REQ_NOWAIT.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/md.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5111ed966947..a30c78afcab6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -419,6 +419,11 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
 	if (is_suspended(mddev, bio)) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
+			/* Bail out if REQ_NOWAIT is set for the bio */
+			if (bio->bi_opf & REQ_NOWAIT) {
+				bio_wouldblock_error(bio);
+				return;
+			}
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			if (!is_suspended(mddev, bio))
@@ -5792,6 +5797,7 @@ int md_run(struct mddev *mddev)
 	int err;
 	struct md_rdev *rdev;
 	struct md_personality *pers;
+	bool nowait = true;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -5862,8 +5868,13 @@ int md_run(struct mddev *mddev)
 			}
 		}
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
+		nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
 	}
 
+	/* Set the NOWAIT flags if all underlying devices support it */
+	if (nowait)
+		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
+
 	if (!bioset_initialized(&mddev->bio_set)) {
 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 		if (err)
@@ -7007,6 +7018,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	if (!mddev->thread)
 		md_update_sb(mddev, 1);
+	/*
+	 * If the new disk does not support REQ_NOWAIT,
+	 * disable on the whole MD.
+	 */
+	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
+		pr_info("%s: Disabling nowait because %s does not support nowait\n",
+			mdname(mddev), bdevname(rdev->bdev, b));
+		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
+	}
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [RFC PATCH v4 2/4] md: raid1 add nowait support
  2021-11-10 18:14           ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Vishal Verma
@ 2021-11-10 18:14             ` Vishal Verma
  2021-11-10 18:14             ` [RFC PATCH v4 3/4] md: raid10 " Vishal Verma
                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-10 18:14 UTC (permalink / raw)
  To: song, linux-raid, rgoldwyn; +Cc: axboe, Vishal Verma

This adds nowait support to the RAID1 driver. It makes RAID1 driver
return with EAGAIN for situations where it could wait for eg:

  - Waiting for the barrier,
  - Array got frozen,
  - Too many pending I/Os to be queued.

wait_barrier() fn is modified to return bool to support error for
wait barriers. It returns true in case of wait or if wait is not
required and returns false if wait was required but not performed
to support nowait.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid1.c | 74 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7dc8026cf6ee..727d31de5694 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 	wake_up(&conf->wait_barrier);
 }
 
-static void _wait_barrier(struct r1conf *conf, int idx)
+static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
 {
+	bool ret = true;
 	/*
 	 * We need to increase conf->nr_pending[idx] very early here,
 	 * then raise_barrier() can be blocked when it waits for
@@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	if (!READ_ONCE(conf->array_frozen) &&
 	    !atomic_read(&conf->barrier[idx]))
-		return;
+		return ret;
 
 	/*
 	 * After holding conf->resync_lock, conf->nr_pending[idx]
@@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for the barrier in same barrier unit bucket to drop. */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen &&
-			     !atomic_read(&conf->barrier[idx]),
-			    conf->resync_lock);
+	if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
+		/* Return false when nowait flag is set */
+		if (nowait)
+			ret = false;
+		else {
+			wait_event_lock_irq(conf->wait_barrier,
+					!conf->array_frozen &&
+					!atomic_read(&conf->barrier[idx]),
+					conf->resync_lock);
+		}
+	}
 	atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
+	bool ret = true;
 
 	/*
 	 * Very similar to _wait_barrier(). The difference is, for read
@@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	atomic_inc(&conf->nr_pending[idx]);
 
 	if (!READ_ONCE(conf->array_frozen))
-		return;
+		return ret;
 
 	spin_lock_irq(&conf->resync_lock);
 	atomic_inc(&conf->nr_waiting[idx]);
@@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for array to be unfrozen */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen,
-			    conf->resync_lock);
+	if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
+		if (nowait)
+			/* Return false when nowait flag is set */
+			ret = false;
+		else {
+			wait_event_lock_irq(conf->wait_barrier,
+					!conf->array_frozen,
+					conf->resync_lock);
+		}
+	}
 	atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
 
-	_wait_barrier(conf, idx);
+	return _wait_barrier(conf, idx, nowait);
 }
 
 static void _allow_barrier(struct r1conf *conf, int idx)
@@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	 * Still need barrier for READ in case that whole
 	 * array is frozen.
 	 */
-	wait_read_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	if (!r1_bio)
 		r1_bio = alloc_r1bio(mddev, bio);
@@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
 
 		DEFINE_WAIT(w);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		for (;;) {
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_IDLE);
@@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	wait_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait queued");
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
+
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
@@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
 		allow_barrier(conf, bio->bi_iter.bi_sector);
+
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf, bio->bi_iter.bi_sector);
+		wait_barrier(conf, bio->bi_iter.bi_sector, false);
 		goto retry_write;
 	}
 
@@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
 	int idx;
 
 	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
-		_wait_barrier(conf, idx);
+		_wait_barrier(conf, idx, false);
 		_allow_barrier(conf, idx);
 	}
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [RFC PATCH v4 3/4] md: raid10 add nowait support
  2021-11-10 18:14           ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Vishal Verma
  2021-11-10 18:14             ` [RFC PATCH v4 2/4] md: raid1 add nowait support Vishal Verma
@ 2021-11-10 18:14             ` Vishal Verma
  2021-12-14  0:32               ` Song Liu
  2021-11-10 18:14             ` [RFC PATCH v4 4/4] md: raid456 " Vishal Verma
  2021-12-13 23:50             ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Song Liu
  3 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-11-10 18:14 UTC (permalink / raw)
  To: song, linux-raid, rgoldwyn; +Cc: axboe, Vishal Verma

This adds nowait support to the RAID10 driver. Very similar to
raid1 driver changes. It makes RAID10 driver return with EAGAIN
for situations where it could wait for eg:

  - Waiting for the barrier,
  - Too many pending I/Os to be queued,
  - Reshape operation,
  - Discard operation.

wait_barrier() fn is modified to return bool to support error for
wait barriers. It returns true in case of wait or if wait is not
required and returns false if wait was required but not performed
to support nowait.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid10.c | 87 +++++++++++++++++++++++++++++++--------------
 1 file changed, 60 insertions(+), 27 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index dde98f65bd04..03983146d31a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -952,8 +952,9 @@ static void lower_barrier(struct r10conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r10conf *conf)
+static bool wait_barrier(struct r10conf *conf, bool nowait)
 {
+	bool ret = true;
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
 		struct bio_list *bio_list = current->bio_list;
@@ -968,26 +969,33 @@ static void wait_barrier(struct r10conf *conf)
 		 * count down.
 		 */
 		raid10_log(conf->mddev, "wait barrier");
-		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (atomic_read(&conf->nr_pending) &&
-				     bio_list &&
-				     (!bio_list_empty(&bio_list[0]) ||
-				      !bio_list_empty(&bio_list[1]))) ||
-				     /* move on if recovery thread is
-				      * blocked by us
-				      */
-				     (conf->mddev->thread->tsk == current &&
-				      test_bit(MD_RECOVERY_RUNNING,
-					       &conf->mddev->recovery) &&
-				      conf->nr_queued > 0),
-				    conf->resync_lock);
+		/* Return false when nowait flag is set */
+		if (nowait)
+			ret = false;
+		else
+			wait_event_lock_irq(conf->wait_barrier,
+					    !conf->barrier ||
+					    (atomic_read(&conf->nr_pending) &&
+					     bio_list &&
+					     (!bio_list_empty(&bio_list[0]) ||
+					      !bio_list_empty(&bio_list[1]))) ||
+					     /* move on if recovery thread is
+					      * blocked by us
+					      */
+					     (conf->mddev->thread->tsk == current &&
+					      test_bit(MD_RECOVERY_RUNNING,
+						       &conf->mddev->recovery) &&
+					      conf->nr_queued > 0),
+					    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
 			wake_up(&conf->wait_barrier);
 	}
-	atomic_inc(&conf->nr_pending);
+	/* Only increment nr_pending when we wait */
+	if (ret)
+		atomic_inc(&conf->nr_pending);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
 static void allow_barrier(struct r10conf *conf)
@@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
 				 struct bio *bio, sector_t sectors)
 {
-	wait_barrier(conf);
+	/* Bail out if REQ_NOWAIT is set for the bio */
+	if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
 	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
 		raid10_log(conf->mddev, "wait reshape");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		allow_barrier(conf);
 		wait_event(conf->wait_barrier,
 			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
 			   conf->reshape_progress >= bio->bi_iter.bi_sector +
 			   sectors);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 }
 
@@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		bio = split;
 		r10_bio->master_bio = bio;
 		r10_bio->sectors = max_sectors;
@@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
 		raid10_log(conf->mddev, "%s wait rdev %d blocked",
 				__func__, blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		goto retry_wait;
 	}
 }
@@ -1357,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 					    bio_end_sector(bio)))) {
 		DEFINE_WAIT(w);
 		for (;;) {
+			/* Bail out if REQ_NOWAIT is set for the bio */
+			if (bio->bi_opf & REQ_NOWAIT) {
+				bio_wouldblock_error(bio);
+				return;
+			}
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_IDLE);
 			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1381,6 +1402,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 			      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		raid10_log(conf->mddev, "wait reshape metadata");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 
@@ -1390,6 +1415,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid10_log(mddev, "wait queued");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
@@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		bio = split;
 		r10_bio->master_bio = bio;
 	}
@@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		return -EAGAIN;
 
-	wait_barrier(conf);
+	if (bio->bi_opf & REQ_NOWAIT) {
+		bio_wouldblock_error(bio);
+		return 0;
+	}
+	wait_barrier(conf, false);
 
 	/*
 	 * Check reshape again to avoid reshape happens after checking
@@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		allow_barrier(conf);
 		/* Resend the fist split part */
 		submit_bio_noacct(split);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 	div_u64_rem(bio_end, stripe_size, &remainder);
 	if (remainder) {
@@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		/* Resend the second split part */
 		submit_bio_noacct(bio);
 		bio = split;
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 
 	bio_start = bio->bi_iter.bi_sector;
@@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		end_disk_offset += geo->stride;
 		atomic_inc(&first_r10bio->remaining);
 		raid_end_discard_bio(r10_bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		goto retry_discard;
 	}
 
@@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
 
 static void close_sync(struct r10conf *conf)
 {
-	wait_barrier(conf);
+	wait_barrier(conf, false);
 	allow_barrier(conf);
 
 	mempool_exit(&conf->r10buf_pool);
@@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	if (need_flush ||
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
 		/* Need to update reshape_position in metadata */
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		mddev->reshape_position = conf->reshape_progress;
 		if (mddev->reshape_backwards)
 			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-11-10 18:14           ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Vishal Verma
  2021-11-10 18:14             ` [RFC PATCH v4 2/4] md: raid1 add nowait support Vishal Verma
  2021-11-10 18:14             ` [RFC PATCH v4 3/4] md: raid10 " Vishal Verma
@ 2021-11-10 18:14             ` Vishal Verma
  2021-11-11 21:42               ` Song Liu
  2021-12-10  2:16               ` Song Liu
  2021-12-13 23:50             ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Song Liu
  3 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-10 18:14 UTC (permalink / raw)
  To: song, linux-raid, rgoldwyn; +Cc: axboe, Vishal Verma

Returns EAGAIN in case the raid456 driver would block
waiting for situations like:

  - Reshape operation,
  - Discard operation.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid5.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9c1a5877cf9f..fa64ee315241 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		int d;
 	again:
 		sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
+		/* Bail out if REQ_NOWAIT is set */
+		if (bi->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bi);
+			return;
+		}
 		prepare_to_wait(&conf->wait_for_overlap, &w,
 				TASK_UNINTERRUPTIBLE);
 		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	bi->bi_next = NULL;
 
 	md_account_bio(mddev, &bi);
+	/* Bail out if REQ_NOWAIT is set */
+	if (bi->bi_opf & REQ_NOWAIT &&
+	    conf->reshape_progress != MaxSector &&
+	    mddev->reshape_backwards
+	    ? logical_sector < conf->reshape_safe
+	    : logical_sector >= conf->reshape_safe) {
+		bio_wouldblock_error(bi);
+		return true;
+	}
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
 		int previous;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-11-10 18:14             ` [RFC PATCH v4 4/4] md: raid456 " Vishal Verma
@ 2021-11-11 21:42               ` Song Liu
       [not found]                 ` <f8c2a2bc-a885-8254-2b39-fc0c969ac70d@digitalocean.com>
  2021-12-10  2:16               ` Song Liu
  1 sibling, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-11-11 21:42 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
> Returns EAGAIN in case the raid456 driver would block
> waiting for situations like:
>
>   - Reshape operation,
>   - Discard operation.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>

I think you mentioned there are some task hung issues, could you
please provide some
information about them?

btw: I am taking vacation this week. So I may not have time to try it
out until next week.

Thanks,
Song

> ---
>  drivers/md/raid5.c | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 9c1a5877cf9f..fa64ee315241 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>                 int d;
>         again:
>                 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
> +               /* Bail out if REQ_NOWAIT is set */
> +               if (bi->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bi);
> +                       return;
> +               }
>                 prepare_to_wait(&conf->wait_for_overlap, &w,
>                                 TASK_UNINTERRUPTIBLE);
>                 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
> @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
>         bi->bi_next = NULL;
>
>         md_account_bio(mddev, &bi);
> +       /* Bail out if REQ_NOWAIT is set */
> +       if (bi->bi_opf & REQ_NOWAIT &&
> +           conf->reshape_progress != MaxSector &&
> +           mddev->reshape_backwards
> +           ? logical_sector < conf->reshape_safe
> +           : logical_sector >= conf->reshape_safe) {
> +               bio_wouldblock_error(bi);
> +               return true;
> +       }
>         prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
>         for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
>                 int previous;
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
       [not found]                 ` <f8c2a2bc-a885-8254-2b39-fc0c969ac70d@digitalocean.com>
@ 2021-11-19  4:07                   ` Song Liu
  2021-11-19  4:20                     ` Vishal Verma
  2021-12-09 16:53                     ` Vishal Verma
  0 siblings, 2 replies; 86+ messages in thread
From: Song Liu @ 2021-11-19  4:07 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Thu, Nov 11, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> Yes, with raid10 the task hung happened when doing write IO using FIO where FIO just gets stuck  after like 30s or so and no I/O happens afterwards.
> This was on a test nvme based raid10: (tried with both io_uring and aio, same issue)
>
> [ 1818.677686] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 1818.685512] task:fio             state:D stack:    0 pid:14314 ppid:     1 flags:0x00020004
> [ 1818.685516] Call Trace:
> [ 1818.685519]  __schedule+0x295/0x840
> [ 1818.685525]  ? wbt_cleanup_cb+0x20/0x20
> [ 1818.685528]  schedule+0x4e/0xb0
> [ 1818.685529]  io_schedule+0x3f/0x70
> [ 1818.685531]  rq_qos_wait+0xb9/0x130
> [ 1818.685535]  ? sysv68_partition+0x280/0x280
> [ 1818.685537]  ? wbt_cleanup_cb+0x20/0x20
> [ 1818.685538]  wbt_wait+0x92/0xc0
> [ 1818.685539]  __rq_qos_throttle+0x25/0x40
> [ 1818.685541]  blk_mq_submit_bio+0xc6/0x5d0
> [ 1818.685544]  ? submit_bio_checks+0x39e/0x5f0
> [ 1818.685547]  __submit_bio+0x1bc/0x1d0
> [ 1818.685549]  submit_bio_noacct+0x256/0x2a0
> [ 1818.685550]  ? bio_associate_blkg+0x29/0x70
> [ 1818.685553]  0xffffffffc028d38a
> [ 1818.685555]  blk_flush_plug+0xc3/0x130
> [ 1818.685558]  blk_finish_plug+0x26/0x40
> [ 1818.685560]  blkdev_write_iter+0xf8/0x160
> [ 1818.685561]  io_write+0x153/0x2e0
> [ 1818.685564]  ? blk_mq_put_tags+0x1d/0x20
> [ 1818.685566]  ? blk_mq_end_request_batch+0x295/0x2e0
> [ 1818.685568]  ? sysvec_apic_timer_interrupt+0x46/0x80
> [ 1818.685570]  io_issue_sqe+0x579/0x1990
> [ 1818.685571]  ? io_req_prep+0x6a9/0xe60
> [ 1818.685573]  ? __fget_files+0x56/0x80
> [ 1818.685576]  ? fget+0x2a/0x30
> [ 1818.685577]  io_submit_sqes+0x28c/0x930
> [ 1818.685578]  ? __io_submit_flush_completions+0xdc/0x150
> [ 1818.685580]  ? ctx_flush_and_put+0x4b/0x70
> [ 1818.685581]  __x64_sys_io_uring_enter+0x1db/0x8e0
> [ 1818.685583]  ? exit_to_user_mode_prepare+0x3e/0x1e0
> [ 1818.685586]  ? exit_to_user_mode_prepare+0x3e/0x1e0
> [ 1818.685588]  do_syscall_64+0x38/0x90
> [ 1818.685591]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [ 1818.685593] RIP: 0033:0x7f8a41c1889d
> [ 1818.685594] RSP: 002b:00007ffe390d5af8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
> [ 1818.685596] RAX: ffffffffffffffda RBX: 00007ffe390d5b20 RCX: 00007f8a41c1889d
> [ 1818.685597] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000006
> [ 1818.685597] RBP: 000055de073b6ef0 R08: 0000000000000000 R09: 0000000000000000
> [ 1818.685598] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f8a38400000
> [ 1818.685599] R13: 0000000000000001 R14: 0000000000875bc1 R15: 0000000000000000
>
> For raid456, running into this as soon as I try to create a raid5 volume:
>
> [ 5338.620661] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.627457] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.634250] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.641043] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.647836] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.654632] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.661424] Dev md5: unable to read RDB block 0
> [ 5338.665957] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.672746] Buffer I/O error on dev md5, logical block 0, async page read
> [ 5338.679540] Buffer I/O error on dev md5, logical block 3, async page read

I am sorry that I haven't got time to look into this, and I will be on
vacation again from
tomorrow. If you make progress, please share your finding and/or
updated version.
I will try to look into this after Thanksgiving.

Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-11-19  4:07                   ` Song Liu
@ 2021-11-19  4:20                     ` Vishal Verma
  2021-12-09 16:53                     ` Vishal Verma
  1 sibling, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-11-19  4:20 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 11/18/21 9:07 PM, Song Liu wrote:
> On Thu, Nov 11, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> Yes, with raid10 the task hung happened when doing write IO using FIO where FIO just gets stuck  after like 30s or so and no I/O happens afterwards.
>> This was on a test nvme based raid10: (tried with both io_uring and aio, same issue)
>>
>> [ 1818.677686] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [ 1818.685512] task:fio             state:D stack:    0 pid:14314 ppid:     1 flags:0x00020004
>> [ 1818.685516] Call Trace:
>> [ 1818.685519]  __schedule+0x295/0x840
>> [ 1818.685525]  ? wbt_cleanup_cb+0x20/0x20
>> [ 1818.685528]  schedule+0x4e/0xb0
>> [ 1818.685529]  io_schedule+0x3f/0x70
>> [ 1818.685531]  rq_qos_wait+0xb9/0x130
>> [ 1818.685535]  ? sysv68_partition+0x280/0x280
>> [ 1818.685537]  ? wbt_cleanup_cb+0x20/0x20
>> [ 1818.685538]  wbt_wait+0x92/0xc0
>> [ 1818.685539]  __rq_qos_throttle+0x25/0x40
>> [ 1818.685541]  blk_mq_submit_bio+0xc6/0x5d0
>> [ 1818.685544]  ? submit_bio_checks+0x39e/0x5f0
>> [ 1818.685547]  __submit_bio+0x1bc/0x1d0
>> [ 1818.685549]  submit_bio_noacct+0x256/0x2a0
>> [ 1818.685550]  ? bio_associate_blkg+0x29/0x70
>> [ 1818.685553]  0xffffffffc028d38a
>> [ 1818.685555]  blk_flush_plug+0xc3/0x130
>> [ 1818.685558]  blk_finish_plug+0x26/0x40
>> [ 1818.685560]  blkdev_write_iter+0xf8/0x160
>> [ 1818.685561]  io_write+0x153/0x2e0
>> [ 1818.685564]  ? blk_mq_put_tags+0x1d/0x20
>> [ 1818.685566]  ? blk_mq_end_request_batch+0x295/0x2e0
>> [ 1818.685568]  ? sysvec_apic_timer_interrupt+0x46/0x80
>> [ 1818.685570]  io_issue_sqe+0x579/0x1990
>> [ 1818.685571]  ? io_req_prep+0x6a9/0xe60
>> [ 1818.685573]  ? __fget_files+0x56/0x80
>> [ 1818.685576]  ? fget+0x2a/0x30
>> [ 1818.685577]  io_submit_sqes+0x28c/0x930
>> [ 1818.685578]  ? __io_submit_flush_completions+0xdc/0x150
>> [ 1818.685580]  ? ctx_flush_and_put+0x4b/0x70
>> [ 1818.685581]  __x64_sys_io_uring_enter+0x1db/0x8e0
>> [ 1818.685583]  ? exit_to_user_mode_prepare+0x3e/0x1e0
>> [ 1818.685586]  ? exit_to_user_mode_prepare+0x3e/0x1e0
>> [ 1818.685588]  do_syscall_64+0x38/0x90
>> [ 1818.685591]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>> [ 1818.685593] RIP: 0033:0x7f8a41c1889d
>> [ 1818.685594] RSP: 002b:00007ffe390d5af8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
>> [ 1818.685596] RAX: ffffffffffffffda RBX: 00007ffe390d5b20 RCX: 00007f8a41c1889d
>> [ 1818.685597] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000006
>> [ 1818.685597] RBP: 000055de073b6ef0 R08: 0000000000000000 R09: 0000000000000000
>> [ 1818.685598] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f8a38400000
>> [ 1818.685599] R13: 0000000000000001 R14: 0000000000875bc1 R15: 0000000000000000
>>
>> For raid456, running into this as soon as I try to create a raid5 volume:
>>
>> [ 5338.620661] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.627457] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.634250] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.641043] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.647836] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.654632] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.661424] Dev md5: unable to read RDB block 0
>> [ 5338.665957] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.672746] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.679540] Buffer I/O error on dev md5, logical block 3, async page read
> I am sorry that I haven't got time to look into this, and I will be on
> vacation again from
> tomorrow. If you make progress, please share your finding and/or
> updated version.
> I will try to look into this after Thanksgiving.

Hi Song,

Unfortunately, I didn't make any progress after posting my previous 
email. I'll definitely share if I make any further progress (will be out 
next week too).

> Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-11-19  4:07                   ` Song Liu
  2021-11-19  4:20                     ` Vishal Verma
@ 2021-12-09 16:53                     ` Vishal Verma
  2021-12-09 16:59                       ` Song Liu
  1 sibling, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-09 16:53 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 11/18/21 9:07 PM, Song Liu wrote:
> On Thu, Nov 11, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> Yes, with raid10 the task hung happened when doing write IO using FIO where FIO just gets stuck  after like 30s or so and no I/O happens afterwards.
>> This was on a test nvme based raid10: (tried with both io_uring and aio, same issue)
>>
>> [ 1818.677686] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [ 1818.685512] task:fio             state:D stack:    0 pid:14314 ppid:     1 flags:0x00020004
>> [ 1818.685516] Call Trace:
>> [ 1818.685519]  __schedule+0x295/0x840
>> [ 1818.685525]  ? wbt_cleanup_cb+0x20/0x20
>> [ 1818.685528]  schedule+0x4e/0xb0
>> [ 1818.685529]  io_schedule+0x3f/0x70
>> [ 1818.685531]  rq_qos_wait+0xb9/0x130
>> [ 1818.685535]  ? sysv68_partition+0x280/0x280
>> [ 1818.685537]  ? wbt_cleanup_cb+0x20/0x20
>> [ 1818.685538]  wbt_wait+0x92/0xc0
>> [ 1818.685539]  __rq_qos_throttle+0x25/0x40
>> [ 1818.685541]  blk_mq_submit_bio+0xc6/0x5d0
>> [ 1818.685544]  ? submit_bio_checks+0x39e/0x5f0
>> [ 1818.685547]  __submit_bio+0x1bc/0x1d0
>> [ 1818.685549]  submit_bio_noacct+0x256/0x2a0
>> [ 1818.685550]  ? bio_associate_blkg+0x29/0x70
>> [ 1818.685553]  0xffffffffc028d38a
>> [ 1818.685555]  blk_flush_plug+0xc3/0x130
>> [ 1818.685558]  blk_finish_plug+0x26/0x40
>> [ 1818.685560]  blkdev_write_iter+0xf8/0x160
>> [ 1818.685561]  io_write+0x153/0x2e0
>> [ 1818.685564]  ? blk_mq_put_tags+0x1d/0x20
>> [ 1818.685566]  ? blk_mq_end_request_batch+0x295/0x2e0
>> [ 1818.685568]  ? sysvec_apic_timer_interrupt+0x46/0x80
>> [ 1818.685570]  io_issue_sqe+0x579/0x1990
>> [ 1818.685571]  ? io_req_prep+0x6a9/0xe60
>> [ 1818.685573]  ? __fget_files+0x56/0x80
>> [ 1818.685576]  ? fget+0x2a/0x30
>> [ 1818.685577]  io_submit_sqes+0x28c/0x930
>> [ 1818.685578]  ? __io_submit_flush_completions+0xdc/0x150
>> [ 1818.685580]  ? ctx_flush_and_put+0x4b/0x70
>> [ 1818.685581]  __x64_sys_io_uring_enter+0x1db/0x8e0
>> [ 1818.685583]  ? exit_to_user_mode_prepare+0x3e/0x1e0
>> [ 1818.685586]  ? exit_to_user_mode_prepare+0x3e/0x1e0
>> [ 1818.685588]  do_syscall_64+0x38/0x90
>> [ 1818.685591]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>> [ 1818.685593] RIP: 0033:0x7f8a41c1889d
>> [ 1818.685594] RSP: 002b:00007ffe390d5af8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
>> [ 1818.685596] RAX: ffffffffffffffda RBX: 00007ffe390d5b20 RCX: 00007f8a41c1889d
>> [ 1818.685597] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000006
>> [ 1818.685597] RBP: 000055de073b6ef0 R08: 0000000000000000 R09: 0000000000000000
>> [ 1818.685598] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f8a38400000
>> [ 1818.685599] R13: 0000000000000001 R14: 0000000000875bc1 R15: 0000000000000000
>>
>> For raid456, running into this as soon as I try to create a raid5 volume:
>>
>> [ 5338.620661] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.627457] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.634250] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.641043] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.647836] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.654632] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.661424] Dev md5: unable to read RDB block 0
>> [ 5338.665957] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.672746] Buffer I/O error on dev md5, logical block 0, async page read
>> [ 5338.679540] Buffer I/O error on dev md5, logical block 3, async page read
> I am sorry that I haven't got time to look into this, and I will be on
> vacation again from
> tomorrow. If you make progress, please share your finding and/or
> updated version.
> I will try to look into this after Thanksgiving.
>
> Song
>
> Hi Song,
> Did you get chance to look into this? It looks like I am bit stuck here. The other option I am thinking is if we just add a flag for enabling nowait and enable it by default for raid1?
>
> Thanks,
> Vishal

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-09 16:53                     ` Vishal Verma
@ 2021-12-09 16:59                       ` Song Liu
  2021-12-09 17:01                         ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-09 16:59 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Thu, Dec 9, 2021 at 8:53 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
[...]
> >
> > Song
> >
> > Hi Song,
> > Did you get chance to look into this? It looks like I am bit stuck here. The other option I am thinking is if we just add a flag for enabling nowait and enable it by default for raid1?
> >

I am sorry for the delay. I will look into this and get back to you
early next week (or sooner).

Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-09 16:59                       ` Song Liu
@ 2021-12-09 17:01                         ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-09 17:01 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 12/9/21 9:59 AM, Song Liu wrote:
> On Thu, Dec 9, 2021 at 8:53 AM Vishal Verma <vverma@digitalocean.com> wrote:
> [...]
>>> Song
>>>
>>> Hi Song,
>>> Did you get chance to look into this? It looks like I am bit stuck here. The other option I am thinking is if we just add a flag for enabling nowait and enable it by default for raid1?
>>>
> I am sorry for the delay. I will look into this and get back to you
> early next week (or sooner).
>
> Song
>
>
> Np, thanks!

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-11-10 18:14             ` [RFC PATCH v4 4/4] md: raid456 " Vishal Verma
  2021-11-11 21:42               ` Song Liu
@ 2021-12-10  2:16               ` Song Liu
  2021-12-10  7:18                 ` Song Liu
  2021-12-10 18:26                 ` Vishal Verma
  1 sibling, 2 replies; 86+ messages in thread
From: Song Liu @ 2021-12-10  2:16 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
> Returns EAGAIN in case the raid456 driver would block
> waiting for situations like:
>
>   - Reshape operation,
>   - Discard operation.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/raid5.c | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 9c1a5877cf9f..fa64ee315241 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>                 int d;
>         again:
>                 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
> +               /* Bail out if REQ_NOWAIT is set */
> +               if (bi->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bi);
> +                       return;
> +               }

This is not right. raid5_get_active_stripe() gets refcount on the sh,
we cannot simply
return here. I think we need the logic after raid5_release_stripe()
and before schedule().

>                 prepare_to_wait(&conf->wait_for_overlap, &w,
>                                 TASK_UNINTERRUPTIBLE);
>                 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
> @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
>         bi->bi_next = NULL;
>
>         md_account_bio(mddev, &bi);
> +       /* Bail out if REQ_NOWAIT is set */
> +       if (bi->bi_opf & REQ_NOWAIT &&
> +           conf->reshape_progress != MaxSector &&
> +           mddev->reshape_backwards
> +           ? logical_sector < conf->reshape_safe
> +           : logical_sector >= conf->reshape_safe) {
> +               bio_wouldblock_error(bi);
> +               return true;
> +       }

This is also problematic, and is the trigger of those error messages.
We only want to trigger -EAGAIN when logical_sector is between
reshape_progress and reshape_safe.

Please let me know if these make sense.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-10  2:16               ` Song Liu
@ 2021-12-10  7:18                 ` Song Liu
  2021-12-10 18:26                 ` Vishal Verma
  1 sibling, 0 replies; 86+ messages in thread
From: Song Liu @ 2021-12-10  7:18 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Thu, Dec 9, 2021 at 6:16 PM Song Liu <song@kernel.org> wrote:
>
> On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
> >
> > Returns EAGAIN in case the raid456 driver would block
> > waiting for situations like:
> >
> >   - Reshape operation,
> >   - Discard operation.
> >
> > Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> > ---
> >  drivers/md/raid5.c | 14 ++++++++++++++
> >  1 file changed, 14 insertions(+)
> >
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 9c1a5877cf9f..fa64ee315241 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
> >                 int d;
> >         again:
> >                 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
> > +               /* Bail out if REQ_NOWAIT is set */
> > +               if (bi->bi_opf & REQ_NOWAIT) {
> > +                       bio_wouldblock_error(bi);
> > +                       return;
> > +               }
>
> This is not right. raid5_get_active_stripe() gets refcount on the sh,
> we cannot simply
> return here. I think we need the logic after raid5_release_stripe()
> and before schedule().
>
> >                 prepare_to_wait(&conf->wait_for_overlap, &w,
> >                                 TASK_UNINTERRUPTIBLE);
> >                 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
> > @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
> >         bi->bi_next = NULL;
> >
> >         md_account_bio(mddev, &bi);
> > +       /* Bail out if REQ_NOWAIT is set */
> > +       if (bi->bi_opf & REQ_NOWAIT &&
> > +           conf->reshape_progress != MaxSector &&
> > +           mddev->reshape_backwards
> > +           ? logical_sector < conf->reshape_safe
> > +           : logical_sector >= conf->reshape_safe) {

There is also an Operator Precedence bug here. "&&" goes before "?
:", so we need
"()" around the "? :" block.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-10  2:16               ` Song Liu
  2021-12-10  7:18                 ` Song Liu
@ 2021-12-10 18:26                 ` Vishal Verma
  2021-12-13  5:56                   ` Song Liu
  1 sibling, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-10 18:26 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 12/9/21 7:16 PM, Song Liu wrote:
> On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>> Returns EAGAIN in case the raid456 driver would block
>> waiting for situations like:
>>
>>    - Reshape operation,
>>    - Discard operation.
>>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>> ---
>>   drivers/md/raid5.c | 14 ++++++++++++++
>>   1 file changed, 14 insertions(+)
>>
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 9c1a5877cf9f..fa64ee315241 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>>                  int d;
>>          again:
>>                  sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
>> +               /* Bail out if REQ_NOWAIT is set */
>> +               if (bi->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bi);
>> +                       return;
>> +               }
> This is not right. raid5_get_active_stripe() gets refcount on the sh,
> we cannot simply
> return here. I think we need the logic after raid5_release_stripe()
> and before schedule().
>
>>                  prepare_to_wait(&conf->wait_for_overlap, &w,
>>                                  TASK_UNINTERRUPTIBLE);
>>                  set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
>> @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
>>          bi->bi_next = NULL;
>>
>>          md_account_bio(mddev, &bi);
>> +       /* Bail out if REQ_NOWAIT is set */
>> +       if (bi->bi_opf & REQ_NOWAIT &&
>> +           conf->reshape_progress != MaxSector &&
>> +           mddev->reshape_backwards
>> +           ? logical_sector < conf->reshape_safe
>> +           : logical_sector >= conf->reshape_safe) {
>> +               bio_wouldblock_error(bi);
>> +               return true;
>> +       }
> This is also problematic, and is the trigger of those error messages.
> We only want to trigger -EAGAIN when logical_sector is between
> reshape_progress and reshape_safe.
>
> Just to clarify, did you mean doing something like:
> if (bi->bi_opf & REQ_NOWAIT &&
> +           conf->reshape_progress != MaxSector &&
> +           (mddev->reshape_backwards
> +           ? (logical_sector > conf->reshape_progress && logical_sector < conf->reshape_safe)
> +           : logical_sector >= conf->reshape_safe)) {
> +               bio_wouldblock_error(bi);
> +               return true;
> +
>
> Please let me know if these make sense.
>
> Thanks,
> Song
>
>
> Makes sense. Thanks for your feedback. I'll incorporate it and test.

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-10 18:26                 ` Vishal Verma
@ 2021-12-13  5:56                   ` Song Liu
  2021-12-13 22:43                     ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-13  5:56 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Fri, Dec 10, 2021 at 10:26 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
>
> On 12/9/21 7:16 PM, Song Liu wrote:
> > On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
> >> Returns EAGAIN in case the raid456 driver would block
> >> waiting for situations like:
> >>
> >>    - Reshape operation,
> >>    - Discard operation.
> >>
> >> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> >> ---
> >>   drivers/md/raid5.c | 14 ++++++++++++++
> >>   1 file changed, 14 insertions(+)
> >>
> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> >> index 9c1a5877cf9f..fa64ee315241 100644
> >> --- a/drivers/md/raid5.c
> >> +++ b/drivers/md/raid5.c
> >> @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
> >>                  int d;
> >>          again:
> >>                  sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
> >> +               /* Bail out if REQ_NOWAIT is set */
> >> +               if (bi->bi_opf & REQ_NOWAIT) {
> >> +                       bio_wouldblock_error(bi);
> >> +                       return;
> >> +               }
> > This is not right. raid5_get_active_stripe() gets refcount on the sh,
> > we cannot simply
> > return here. I think we need the logic after raid5_release_stripe()
> > and before schedule().
> >
> >>                  prepare_to_wait(&conf->wait_for_overlap, &w,
> >>                                  TASK_UNINTERRUPTIBLE);
> >>                  set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
> >> @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
> >>          bi->bi_next = NULL;
> >>
> >>          md_account_bio(mddev, &bi);
> >> +       /* Bail out if REQ_NOWAIT is set */
> >> +       if (bi->bi_opf & REQ_NOWAIT &&
> >> +           conf->reshape_progress != MaxSector &&
> >> +           mddev->reshape_backwards
> >> +           ? logical_sector < conf->reshape_safe
> >> +           : logical_sector >= conf->reshape_safe) {
> >> +               bio_wouldblock_error(bi);
> >> +               return true;
> >> +       }
> > This is also problematic, and is the trigger of those error messages.
> > We only want to trigger -EAGAIN when logical_sector is between
> > reshape_progress and reshape_safe.
> >
> > Just to clarify, did you mean doing something like:
> > if (bi->bi_opf & REQ_NOWAIT &&
> > +           conf->reshape_progress != MaxSector &&
> > +           (mddev->reshape_backwards
> > +           ? (logical_sector > conf->reshape_progress && logical_sector < conf->reshape_safe)
> > +           : logical_sector >= conf->reshape_safe)) {

I think this should be
  :   (logical_sector >= conf->reshape_safe && logical_sector <
conf->reshape_progress)

> > +               bio_wouldblock_error(bi);
> > +               return true;
> > +
> >
> > Please let me know if these make sense.
> >
> > Thanks,
> > Song
> >
> >
> > Makes sense. Thanks for your feedback. I'll incorporate it and test.

When testing, please make sure we hit all different conditions with REQ_NOWAIT.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-13  5:56                   ` Song Liu
@ 2021-12-13 22:43                     ` Vishal Verma
  2021-12-13 23:35                       ` Jens Axboe
  2021-12-14  0:36                       ` [RFC PATCH v4 4/4] md: raid456 add nowait support Song Liu
  0 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-13 22:43 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 12/12/21 10:56 PM, Song Liu wrote:
> On Fri, Dec 10, 2021 at 10:26 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>
>> On 12/9/21 7:16 PM, Song Liu wrote:
>>> On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>>> Returns EAGAIN in case the raid456 driver would block
>>>> waiting for situations like:
>>>>
>>>>     - Reshape operation,
>>>>     - Discard operation.
>>>>
>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>> ---
>>>>    drivers/md/raid5.c | 14 ++++++++++++++
>>>>    1 file changed, 14 insertions(+)
>>>>
>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>> index 9c1a5877cf9f..fa64ee315241 100644
>>>> --- a/drivers/md/raid5.c
>>>> +++ b/drivers/md/raid5.c
>>>> @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>>>>                   int d;
>>>>           again:
>>>>                   sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
>>>> +               /* Bail out if REQ_NOWAIT is set */
>>>> +               if (bi->bi_opf & REQ_NOWAIT) {
>>>> +                       bio_wouldblock_error(bi);
>>>> +                       return;
>>>> +               }
>>> This is not right. raid5_get_active_stripe() gets refcount on the sh,
>>> we cannot simply
>>> return here. I think we need the logic after raid5_release_stripe()
>>> and before schedule().
>>>
>>>>                   prepare_to_wait(&conf->wait_for_overlap, &w,
>>>>                                   TASK_UNINTERRUPTIBLE);
>>>>                   set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
>>>> @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
>>>>           bi->bi_next = NULL;
>>>>
>>>>           md_account_bio(mddev, &bi);
>>>> +       /* Bail out if REQ_NOWAIT is set */
>>>> +       if (bi->bi_opf & REQ_NOWAIT &&
>>>> +           conf->reshape_progress != MaxSector &&
>>>> +           mddev->reshape_backwards
>>>> +           ? logical_sector < conf->reshape_safe
>>>> +           : logical_sector >= conf->reshape_safe) {
>>>> +               bio_wouldblock_error(bi);
>>>> +               return true;
>>>> +       }
>>> This is also problematic, and is the trigger of those error messages.
>>> We only want to trigger -EAGAIN when logical_sector is between
>>> reshape_progress and reshape_safe.
>>>
>>> Just to clarify, did you mean doing something like:
>>> if (bi->bi_opf & REQ_NOWAIT &&
>>> +           conf->reshape_progress != MaxSector &&
>>> +           (mddev->reshape_backwards
>>> +           ? (logical_sector > conf->reshape_progress && logical_sector < conf->reshape_safe)
>>> +           : logical_sector >= conf->reshape_safe)) {
> I think this should be
>    :   (logical_sector >= conf->reshape_safe && logical_sector <
> conf->reshape_progress)


if (bi->bi_opf & REQ_NOWAIT &&
                 conf->reshape_progress != MaxSector &&
                 (mddev->reshape_backwards
                 ? (logical_sector > conf->reshape_progress && 
logical_sector <= conf->reshape_safe)
                 : (logical_sector >= conf->reshape_safe && 
logical_sector < conf->reshape_progress))) {
                         bio_wouldblock_error(bi);
                         return true;
         }

After making this change along with other changes, I ran some tests with 
100% reads, 70%read30%writes and 100% writes on a clean raid5 array.

Unfortunately, I ran into this following task hung with 100% writes 
(with both libaio and io_uring):

[21876.856692] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" 
disables this message.
[21876.864518] task:md5_raid5       state:D stack:    0 pid:11675 
ppid:     2 flags:0x00004000
[21876.864522] Call Trace:
[21876.864526]  __schedule+0x2d4/0x970
[21876.864532]  ? wbt_cleanup_cb+0x20/0x20
[21876.864535]  schedule+0x4e/0xb0
[21876.864537]  io_schedule+0x3f/0x70
[21876.864539]  rq_qos_wait+0xb9/0x130
[21876.864542]  ? sysv68_partition+0x280/0x280
[21876.864543]  ? wbt_cleanup_cb+0x20/0x20
[21876.864545]  wbt_wait+0x92/0xc0
[21876.864546]  __rq_qos_throttle+0x25/0x40
[21876.864548]  blk_mq_submit_bio+0xc6/0x5d0
[21876.864551]  ? submit_bio_checks+0x39e/0x5f0
[21876.864554]  __submit_bio+0x1bc/0x1d0
[21876.864555]  ? kmem_cache_free+0x378/0x3c0
[21876.864558]  ? mempool_free_slab+0x17/0x20
[21876.864562]  submit_bio_noacct+0x256/0x2a0
[21876.864565]  0xffffffffc01fa6d9
[21876.864568]  ? 0xffffffffc01f5d01
[21876.864569]  raid5_get_active_stripe+0x16c0/0x3e00 [raid456]
[21876.864571]  ? __wake_up_common_lock+0x8a/0xc0
[21876.864575]  raid5_get_active_stripe+0x2839/0x3e00 [raid456]
[21876.864577]  raid5_get_active_stripe+0x2d6e/0x3e00 [raid456]
[21876.864579]  md_thread+0xae/0x170
[21876.864581]  ? wait_woken+0x60/0x60
[21876.864582]  ? md_start_sync+0x60/0x60
[21876.864584]  kthread+0x127/0x150
[21876.864586]  ? set_kthread_struct+0x40/0x40
[21876.864588]  ret_from_fork+0x1f/0x30

>>> +               bio_wouldblock_error(bi);
>>> +               return true;
>>> +
>>>
>>> Please let me know if these make sense.
>>>
>>> Thanks,
>>> Song
>>>
>>>
>>> Makes sense. Thanks for your feedback. I'll incorporate it and test.
> When testing, please make sure we hit all different conditions with REQ_NOWAIT.
>
> Thanks,
> Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-13 22:43                     ` Vishal Verma
@ 2021-12-13 23:35                       ` Jens Axboe
       [not found]                         ` <78d5f029-791e-6d3f-4871-263ec6b5c09b@digitalocean.com>
  2021-12-14  0:36                       ` [RFC PATCH v4 4/4] md: raid456 add nowait support Song Liu
  1 sibling, 1 reply; 86+ messages in thread
From: Jens Axboe @ 2021-12-13 23:35 UTC (permalink / raw)
  To: Vishal Verma, Song Liu; +Cc: linux-raid, rgoldwyn

On 12/13/21 3:43 PM, Vishal Verma wrote:
> 
> On 12/12/21 10:56 PM, Song Liu wrote:
>> On Fri, Dec 10, 2021 at 10:26 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>>
>>> On 12/9/21 7:16 PM, Song Liu wrote:
>>>> On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>>>> Returns EAGAIN in case the raid456 driver would block
>>>>> waiting for situations like:
>>>>>
>>>>>     - Reshape operation,
>>>>>     - Discard operation.
>>>>>
>>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>>> ---
>>>>>    drivers/md/raid5.c | 14 ++++++++++++++
>>>>>    1 file changed, 14 insertions(+)
>>>>>
>>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>>> index 9c1a5877cf9f..fa64ee315241 100644
>>>>> --- a/drivers/md/raid5.c
>>>>> +++ b/drivers/md/raid5.c
>>>>> @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
>>>>>                   int d;
>>>>>           again:
>>>>>                   sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
>>>>> +               /* Bail out if REQ_NOWAIT is set */
>>>>> +               if (bi->bi_opf & REQ_NOWAIT) {
>>>>> +                       bio_wouldblock_error(bi);
>>>>> +                       return;
>>>>> +               }
>>>> This is not right. raid5_get_active_stripe() gets refcount on the sh,
>>>> we cannot simply
>>>> return here. I think we need the logic after raid5_release_stripe()
>>>> and before schedule().
>>>>
>>>>>                   prepare_to_wait(&conf->wait_for_overlap, &w,
>>>>>                                   TASK_UNINTERRUPTIBLE);
>>>>>                   set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
>>>>> @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
>>>>>           bi->bi_next = NULL;
>>>>>
>>>>>           md_account_bio(mddev, &bi);
>>>>> +       /* Bail out if REQ_NOWAIT is set */
>>>>> +       if (bi->bi_opf & REQ_NOWAIT &&
>>>>> +           conf->reshape_progress != MaxSector &&
>>>>> +           mddev->reshape_backwards
>>>>> +           ? logical_sector < conf->reshape_safe
>>>>> +           : logical_sector >= conf->reshape_safe) {
>>>>> +               bio_wouldblock_error(bi);
>>>>> +               return true;
>>>>> +       }
>>>> This is also problematic, and is the trigger of those error messages.
>>>> We only want to trigger -EAGAIN when logical_sector is between
>>>> reshape_progress and reshape_safe.
>>>>
>>>> Just to clarify, did you mean doing something like:
>>>> if (bi->bi_opf & REQ_NOWAIT &&
>>>> +           conf->reshape_progress != MaxSector &&
>>>> +           (mddev->reshape_backwards
>>>> +           ? (logical_sector > conf->reshape_progress && logical_sector < conf->reshape_safe)
>>>> +           : logical_sector >= conf->reshape_safe)) {
>> I think this should be
>>    :   (logical_sector >= conf->reshape_safe && logical_sector <
>> conf->reshape_progress)
> 
> 
> if (bi->bi_opf & REQ_NOWAIT &&
>                  conf->reshape_progress != MaxSector &&
>                  (mddev->reshape_backwards
>                  ? (logical_sector > conf->reshape_progress && 
> logical_sector <= conf->reshape_safe)
>                  : (logical_sector >= conf->reshape_safe && 
> logical_sector < conf->reshape_progress))) {
>                          bio_wouldblock_error(bi);
>                          return true;
>          }
> 
> After making this change along with other changes, I ran some tests with 
> 100% reads, 70%read30%writes and 100% writes on a clean raid5 array.
> 
> Unfortunately, I ran into this following task hung with 100% writes 
> (with both libaio and io_uring):
> 
> [21876.856692] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" 
> disables this message.
> [21876.864518] task:md5_raid5       state:D stack:    0 pid:11675 
> ppid:     2 flags:0x00004000
> [21876.864522] Call Trace:
> [21876.864526]  __schedule+0x2d4/0x970
> [21876.864532]  ? wbt_cleanup_cb+0x20/0x20
> [21876.864535]  schedule+0x4e/0xb0
> [21876.864537]  io_schedule+0x3f/0x70
> [21876.864539]  rq_qos_wait+0xb9/0x130
> [21876.864542]  ? sysv68_partition+0x280/0x280
> [21876.864543]  ? wbt_cleanup_cb+0x20/0x20
> [21876.864545]  wbt_wait+0x92/0xc0
> [21876.864546]  __rq_qos_throttle+0x25/0x40
> [21876.864548]  blk_mq_submit_bio+0xc6/0x5d0
> [21876.864551]  ? submit_bio_checks+0x39e/0x5f0
> [21876.864554]  __submit_bio+0x1bc/0x1d0
> [21876.864555]  ? kmem_cache_free+0x378/0x3c0
> [21876.864558]  ? mempool_free_slab+0x17/0x20
> [21876.864562]  submit_bio_noacct+0x256/0x2a0
> [21876.864565]  0xffffffffc01fa6d9
> [21876.864568]  ? 0xffffffffc01f5d01
> [21876.864569]  raid5_get_active_stripe+0x16c0/0x3e00 [raid456]
> [21876.864571]  ? __wake_up_common_lock+0x8a/0xc0
> [21876.864575]  raid5_get_active_stripe+0x2839/0x3e00 [raid456]
> [21876.864577]  raid5_get_active_stripe+0x2d6e/0x3e00 [raid456]
> [21876.864579]  md_thread+0xae/0x170
> [21876.864581]  ? wait_woken+0x60/0x60
> [21876.864582]  ? md_start_sync+0x60/0x60
> [21876.864584]  kthread+0x127/0x150
> [21876.864586]  ? set_kthread_struct+0x40/0x40
> [21876.864588]  ret_from_fork+0x1f/0x30

What kernel base are you using for your patches?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT
  2021-11-10 18:14           ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Vishal Verma
                               ` (2 preceding siblings ...)
  2021-11-10 18:14             ` [RFC PATCH v4 4/4] md: raid456 " Vishal Verma
@ 2021-12-13 23:50             ` Song Liu
  3 siblings, 0 replies; 86+ messages in thread
From: Song Liu @ 2021-12-13 23:50 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
>
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>       937423872 blocks super 1.2 512k chunks
>
> Before patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38396   38396 pts/2    00:00:00 io_uring
>   38396   38397 pts/2    00:00:15 io_uring
>   38396   38398 pts/2    00:00:13 iou-wrk-38397
>
> We can see iou-wrk-38397 io worker thread created which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
>
> After patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38341   38341 pts/2    00:10:22 io_uring
>   38341   38342 pts/2    00:10:37 io_uring
>
> After running this patch, we don't see any io worker thread
> being created which indicated that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device which also supports nowait.
>
> For all the other raid personalities except raid0, we would need
> to modify the pieces that involve the make_request fn in order for
> them to correctly handle REQ_NOWAIT.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/md.c | 20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 5111ed966947..a30c78afcab6 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -419,6 +419,11 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
>         if (is_suspended(mddev, bio)) {
>                 DEFINE_WAIT(__wait);
>                 for (;;) {
> +                       /* Bail out if REQ_NOWAIT is set for the bio */
> +                       if (bio->bi_opf & REQ_NOWAIT) {
> +                               bio_wouldblock_error(bio);
> +                               return;
> +                       }

We need a rcu_read_unlock() before bio_wouldbock_error(bio).

>                         prepare_to_wait(&mddev->sb_wait, &__wait,
>                                         TASK_UNINTERRUPTIBLE);
>                         if (!is_suspended(mddev, bio))
> @@ -5792,6 +5797,7 @@ int md_run(struct mddev *mddev)
>         int err;
>         struct md_rdev *rdev;
>         struct md_personality *pers;
> +       bool nowait = true;
>
>         if (list_empty(&mddev->disks))
>                 /* cannot run an array with no devices.. */
> @@ -5862,8 +5868,13 @@ int md_run(struct mddev *mddev)
>                         }
>                 }
>                 sysfs_notify_dirent_safe(rdev->sysfs_state);
> +               nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
>         }
>
> +       /* Set the NOWAIT flags if all underlying devices support it */
> +       if (nowait)
> +               blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
> +
>         if (!bioset_initialized(&mddev->bio_set)) {
>                 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
>                 if (err)
> @@ -7007,6 +7018,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
>         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
>         if (!mddev->thread)
>                 md_update_sb(mddev, 1);
> +       /*
> +        * If the new disk does not support REQ_NOWAIT,
> +        * disable on the whole MD.
> +        */
> +       if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
> +               pr_info("%s: Disabling nowait because %s does not support nowait\n",
> +                       mdname(mddev), bdevname(rdev->bdev, b));
> +               blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
> +       }
>         /*
>          * Kick recovery, maybe this spare has to be added to the
>          * array immediately.
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 3/4] md: raid10 add nowait support
  2021-11-10 18:14             ` [RFC PATCH v4 3/4] md: raid10 " Vishal Verma
@ 2021-12-14  0:32               ` Song Liu
  2021-12-14 15:27                 ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-14  0:32 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
> This adds nowait support to the RAID10 driver. Very similar to
> raid1 driver changes. It makes RAID10 driver return with EAGAIN
> for situations where it could wait for eg:
>
>   - Waiting for the barrier,
>   - Too many pending I/Os to be queued,
>   - Reshape operation,
>   - Discard operation.
>
> wait_barrier() fn is modified to return bool to support error for
> wait barriers. It returns true in case of wait or if wait is not
> required and returns false if wait was required but not performed
> to support nowait.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/raid10.c | 87 +++++++++++++++++++++++++++++++--------------
>  1 file changed, 60 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index dde98f65bd04..03983146d31a 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -952,8 +952,9 @@ static void lower_barrier(struct r10conf *conf)
>         wake_up(&conf->wait_barrier);
>  }
>
> -static void wait_barrier(struct r10conf *conf)
> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>  {
> +       bool ret = true;
>         spin_lock_irq(&conf->resync_lock);
>         if (conf->barrier) {
>                 struct bio_list *bio_list = current->bio_list;
> @@ -968,26 +969,33 @@ static void wait_barrier(struct r10conf *conf)
>                  * count down.
>                  */
>                 raid10_log(conf->mddev, "wait barrier");
> -               wait_event_lock_irq(conf->wait_barrier,
> -                                   !conf->barrier ||
> -                                   (atomic_read(&conf->nr_pending) &&
> -                                    bio_list &&
> -                                    (!bio_list_empty(&bio_list[0]) ||
> -                                     !bio_list_empty(&bio_list[1]))) ||
> -                                    /* move on if recovery thread is
> -                                     * blocked by us
> -                                     */
> -                                    (conf->mddev->thread->tsk == current &&
> -                                     test_bit(MD_RECOVERY_RUNNING,
> -                                              &conf->mddev->recovery) &&
> -                                     conf->nr_queued > 0),
> -                                   conf->resync_lock);
> +               /* Return false when nowait flag is set */
> +               if (nowait)
> +                       ret = false;
> +               else
> +                       wait_event_lock_irq(conf->wait_barrier,
> +                                           !conf->barrier ||
> +                                           (atomic_read(&conf->nr_pending) &&
> +                                            bio_list &&
> +                                            (!bio_list_empty(&bio_list[0]) ||
> +                                             !bio_list_empty(&bio_list[1]))) ||
> +                                            /* move on if recovery thread is
> +                                             * blocked by us
> +                                             */
> +                                            (conf->mddev->thread->tsk == current &&
> +                                             test_bit(MD_RECOVERY_RUNNING,
> +                                                      &conf->mddev->recovery) &&
> +                                             conf->nr_queued > 0),
> +                                           conf->resync_lock);
>                 conf->nr_waiting--;
>                 if (!conf->nr_waiting)
>                         wake_up(&conf->wait_barrier);
>         }
> -       atomic_inc(&conf->nr_pending);
> +       /* Only increment nr_pending when we wait */
> +       if (ret)
> +               atomic_inc(&conf->nr_pending);
>         spin_unlock_irq(&conf->resync_lock);
> +       return ret;
>  }

I guess something like this would simplify the code:

static bool wait_barrier(struct r10conf *conf, bool nowait)
{
        bool ret = true;
        spin_lock_irq(&conf->resync_lock);
        if (conf->barrier) {
                struct bio_list *bio_list = current->bio_list;

                if (nowait) {
                        spin_unlock_irq(&conf->resync_lock);
                        return false;
                }

>
>  static void allow_barrier(struct r10conf *conf)
> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
>  static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
>                                  struct bio *bio, sector_t sectors)
>  {
> -       wait_barrier(conf);
> +       /* Bail out if REQ_NOWAIT is set for the bio */
> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
> +               bio_wouldblock_error(bio);
> +               return;
> +       }
>         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>             bio->bi_iter.bi_sector < conf->reshape_progress &&
>             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
>                 raid10_log(conf->mddev, "wait reshape");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 allow_barrier(conf);
>                 wait_event(conf->wait_barrier,
>                            conf->reshape_progress <= bio->bi_iter.bi_sector ||
>                            conf->reshape_progress >= bio->bi_iter.bi_sector +
>                            sectors);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>  }
>
> @@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>                 bio_chain(split, bio);
>                 allow_barrier(conf);
>                 submit_bio_noacct(bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 bio = split;
>                 r10_bio->master_bio = bio;
>                 r10_bio->sectors = max_sectors;
> @@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
>                 raid10_log(conf->mddev, "%s wait rdev %d blocked",
>                                 __func__, blocked_rdev->raid_disk);
>                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 goto retry_wait;
>         }
>  }
> @@ -1357,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                                             bio_end_sector(bio)))) {
>                 DEFINE_WAIT(w);
>                 for (;;) {
> +                       /* Bail out if REQ_NOWAIT is set for the bio */
> +                       if (bio->bi_opf & REQ_NOWAIT) {
> +                               bio_wouldblock_error(bio);
> +                               return;
> +                       }
>                         prepare_to_wait(&conf->wait_barrier,
>                                         &w, TASK_IDLE);
>                         if (!md_cluster_ops->area_resyncing(mddev, WRITE,
> @@ -1381,6 +1402,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                               BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
>                 md_wakeup_thread(mddev->thread);
>                 raid10_log(conf->mddev, "wait reshape metadata");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 wait_event(mddev->sb_wait,
>                            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>
> @@ -1390,6 +1415,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>         if (conf->pending_count >= max_queued_requests) {
>                 md_wakeup_thread(mddev->thread);
>                 raid10_log(mddev, "wait queued");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 wait_event(conf->wait_barrier,
>                            conf->pending_count < max_queued_requests);
>         }
> @@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                 bio_chain(split, bio);
>                 allow_barrier(conf);
>                 submit_bio_noacct(bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 bio = split;
>                 r10_bio->master_bio = bio;
>         }
> @@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
>                 return -EAGAIN;
>
> -       wait_barrier(conf);
> +       if (bio->bi_opf & REQ_NOWAIT) {
> +               bio_wouldblock_error(bio);
> +               return 0;
> +       }

Does this mean we always bail out on discard?

> +       wait_barrier(conf, false);
>
>         /*
>          * Check reshape again to avoid reshape happens after checking
> @@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 allow_barrier(conf);
> >                 /* Resend the first split part */
>                 submit_bio_noacct(split);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>         div_u64_rem(bio_end, stripe_size, &remainder);
>         if (remainder) {
> @@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 /* Resend the second split part */
>                 submit_bio_noacct(bio);
>                 bio = split;
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>
>         bio_start = bio->bi_iter.bi_sector;
> @@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 end_disk_offset += geo->stride;
>                 atomic_inc(&first_r10bio->remaining);
>                 raid_end_discard_bio(r10_bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 goto retry_discard;
>         }
>
> @@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
>
>  static void close_sync(struct r10conf *conf)
>  {
> -       wait_barrier(conf);
> +       wait_barrier(conf, false);
>         allow_barrier(conf);
>
>         mempool_exit(&conf->r10buf_pool);
> @@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>         if (need_flush ||
>             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
>                 /* Need to update reshape_position in metadata */
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 mddev->reshape_position = conf->reshape_progress;
>                 if (mddev->reshape_backwards)
>                         mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-13 22:43                     ` Vishal Verma
  2021-12-13 23:35                       ` Jens Axboe
@ 2021-12-14  0:36                       ` Song Liu
  1 sibling, 0 replies; 86+ messages in thread
From: Song Liu @ 2021-12-14  0:36 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, rgoldwyn, Jens Axboe

On Mon, Dec 13, 2021 at 2:43 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
>
> On 12/12/21 10:56 PM, Song Liu wrote:
> > On Fri, Dec 10, 2021 at 10:26 AM Vishal Verma <vverma@digitalocean.com> wrote:
> >>
> >> On 12/9/21 7:16 PM, Song Liu wrote:
> >>> On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
> >>>> Returns EAGAIN in case the raid456 driver would block
> >>>> waiting for situations like:
> >>>>
> >>>>     - Reshape operation,
> >>>>     - Discard operation.
> >>>>
> >>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> >>>> ---
> >>>>    drivers/md/raid5.c | 14 ++++++++++++++
> >>>>    1 file changed, 14 insertions(+)
> >>>>
> >>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> >>>> index 9c1a5877cf9f..fa64ee315241 100644
> >>>> --- a/drivers/md/raid5.c
> >>>> +++ b/drivers/md/raid5.c
> >>>> @@ -5710,6 +5710,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
> >>>>                   int d;
> >>>>           again:
> >>>>                   sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
> >>>> +               /* Bail out if REQ_NOWAIT is set */
> >>>> +               if (bi->bi_opf & REQ_NOWAIT) {
> >>>> +                       bio_wouldblock_error(bi);
> >>>> +                       return;
> >>>> +               }
> >>> This is not right. raid5_get_active_stripe() gets refcount on the sh,
> >>> we cannot simply
> >>> return here. I think we need the logic after raid5_release_stripe()
> >>> and before schedule().
> >>>
> >>>>                   prepare_to_wait(&conf->wait_for_overlap, &w,
> >>>>                                   TASK_UNINTERRUPTIBLE);
> >>>>                   set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
> >>>> @@ -5820,6 +5825,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
> >>>>           bi->bi_next = NULL;
> >>>>
> >>>>           md_account_bio(mddev, &bi);
> >>>> +       /* Bail out if REQ_NOWAIT is set */
> >>>> +       if (bi->bi_opf & REQ_NOWAIT &&
> >>>> +           conf->reshape_progress != MaxSector &&
> >>>> +           mddev->reshape_backwards
> >>>> +           ? logical_sector < conf->reshape_safe
> >>>> +           : logical_sector >= conf->reshape_safe) {
> >>>> +               bio_wouldblock_error(bi);
> >>>> +               return true;
> >>>> +       }
> >>> This is also problematic, and is the trigger of those error messages.
> >>> We only want to trigger -EAGAIN when logical_sector is between
> >>> reshape_progress and reshape_safe.
> >>>
> >>> Just to clarify, did you mean doing something like:
> >>> if (bi->bi_opf & REQ_NOWAIT &&
> >>> +           conf->reshape_progress != MaxSector &&
> >>> +           (mddev->reshape_backwards
> >>> +           ? (logical_sector > conf->reshape_progress && logical_sector < conf->reshape_safe)
> >>> +           : logical_sector >= conf->reshape_safe)) {
> > I think this should be
> >    :   (logical_sector >= conf->reshape_safe && logical_sector <
> > conf->reshape_progress)
>
>
> if (bi->bi_opf & REQ_NOWAIT &&
>                  conf->reshape_progress != MaxSector &&
>                  (mddev->reshape_backwards
>                  ? (logical_sector > conf->reshape_progress &&
> logical_sector <= conf->reshape_safe)
>                  : (logical_sector >= conf->reshape_safe &&
> logical_sector < conf->reshape_progress))) {
>                          bio_wouldblock_error(bi);
>                          return true;
>          }
>
> After making this change along with other changes, I ran some tests with
> 100% reads, 70%read30%writes and 100% writes on a clean raid5 array.
>
> Unfortunately, I ran into this following task hung with 100% writes
> (with both libaio and io_uring):
>
> [21876.856692] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
> disables this message.
> [21876.864518] task:md5_raid5       state:D stack:    0 pid:11675
> ppid:     2 flags:0x00004000
> [21876.864522] Call Trace:
> [21876.864526]  __schedule+0x2d4/0x970
> [21876.864532]  ? wbt_cleanup_cb+0x20/0x20
> [21876.864535]  schedule+0x4e/0xb0
> [21876.864537]  io_schedule+0x3f/0x70
> [21876.864539]  rq_qos_wait+0xb9/0x130
> [21876.864542]  ? sysv68_partition+0x280/0x280
> [21876.864543]  ? wbt_cleanup_cb+0x20/0x20
> [21876.864545]  wbt_wait+0x92/0xc0
> [21876.864546]  __rq_qos_throttle+0x25/0x40
> [21876.864548]  blk_mq_submit_bio+0xc6/0x5d0
> [21876.864551]  ? submit_bio_checks+0x39e/0x5f0
> [21876.864554]  __submit_bio+0x1bc/0x1d0
> [21876.864555]  ? kmem_cache_free+0x378/0x3c0
> [21876.864558]  ? mempool_free_slab+0x17/0x20
> [21876.864562]  submit_bio_noacct+0x256/0x2a0
> [21876.864565]  0xffffffffc01fa6d9
> [21876.864568]  ? 0xffffffffc01f5d01
> [21876.864569]  raid5_get_active_stripe+0x16c0/0x3e00 [raid456]
> [21876.864571]  ? __wake_up_common_lock+0x8a/0xc0
> [21876.864575]  raid5_get_active_stripe+0x2839/0x3e00 [raid456]
> [21876.864577]  raid5_get_active_stripe+0x2d6e/0x3e00 [raid456]
> [21876.864579]  md_thread+0xae/0x170
> [21876.864581]  ? wait_woken+0x60/0x60
> [21876.864582]  ? md_start_sync+0x60/0x60
> [21876.864584]  kthread+0x127/0x150
> [21876.864586]  ? set_kthread_struct+0x40/0x40
> [21876.864588]  ret_from_fork+0x1f/0x30

While there is something suspicious with raid10_handle_discard(), I don't
think it is related to this error. I couldn't reproduce this in my tests.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
       [not found]                         ` <78d5f029-791e-6d3f-4871-263ec6b5c09b@digitalocean.com>
@ 2021-12-14  1:11                           ` Song Liu
  2021-12-14  1:12                             ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-14  1:11 UTC (permalink / raw)
  To: Vishal Verma; +Cc: Jens Axboe, linux-raid, rgoldwyn

On Mon, Dec 13, 2021 at 4:53 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
[...]
>
> What kernel base are you using for your patches?
>
> These were based out of for-5.16-tag (037c50bfb)

Please rebase on top of md-next branch from here:

https://git.kernel.org/pub/scm/linux/kernel/git/song/md.git

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-14  1:11                           ` Song Liu
@ 2021-12-14  1:12                             ` Vishal Verma
  2021-12-14 15:30                               ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-14  1:12 UTC (permalink / raw)
  To: Song Liu; +Cc: Jens Axboe, linux-raid, rgoldwyn


On 12/13/21 6:11 PM, Song Liu wrote:
> On Mon, Dec 13, 2021 at 4:53 PM Vishal Verma <vverma@digitalocean.com> wrote:
> [...]
>> What kernel base are you using for your patches?
>>
>> These were based out of for-5.16-tag (037c50bfb)
> Please rebase on top of md-next branch from here:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
>
> Thanks,
> Song
Ack, will do!

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 3/4] md: raid10 add nowait support
  2021-12-14  0:32               ` Song Liu
@ 2021-12-14 15:27                 ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-14 15:27 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, rgoldwyn, Jens Axboe


On 12/13/21 5:32 PM, Song Liu wrote:
> On Wed, Nov 10, 2021 at 10:15 AM Vishal Verma <vverma@digitalocean.com> wrote:
>> This adds nowait support to the RAID10 driver. Very similar to
>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>> for situations where it could wait for eg:
>>
>>    - Waiting for the barrier,
>>    - Too many pending I/Os to be queued,
>>    - Reshape operation,
>>    - Discard operation.
>>
>> wait_barrier() fn is modified to return bool to support error for
>> wait barriers. It returns true in case of wait or if wait is not
>> required and returns false if wait was required but not performed
>> to support nowait.
>>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>> ---
>>   drivers/md/raid10.c | 87 +++++++++++++++++++++++++++++++--------------
>>   1 file changed, 60 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index dde98f65bd04..03983146d31a 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -952,8 +952,9 @@ static void lower_barrier(struct r10conf *conf)
>>          wake_up(&conf->wait_barrier);
>>   }
>>
>> -static void wait_barrier(struct r10conf *conf)
>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>   {
>> +       bool ret = true;
>>          spin_lock_irq(&conf->resync_lock);
>>          if (conf->barrier) {
>>                  struct bio_list *bio_list = current->bio_list;
>> @@ -968,26 +969,33 @@ static void wait_barrier(struct r10conf *conf)
>>                   * count down.
>>                   */
>>                  raid10_log(conf->mddev, "wait barrier");
>> -               wait_event_lock_irq(conf->wait_barrier,
>> -                                   !conf->barrier ||
>> -                                   (atomic_read(&conf->nr_pending) &&
>> -                                    bio_list &&
>> -                                    (!bio_list_empty(&bio_list[0]) ||
>> -                                     !bio_list_empty(&bio_list[1]))) ||
>> -                                    /* move on if recovery thread is
>> -                                     * blocked by us
>> -                                     */
>> -                                    (conf->mddev->thread->tsk == current &&
>> -                                     test_bit(MD_RECOVERY_RUNNING,
>> -                                              &conf->mddev->recovery) &&
>> -                                     conf->nr_queued > 0),
>> -                                   conf->resync_lock);
>> +               /* Return false when nowait flag is set */
>> +               if (nowait)
>> +                       ret = false;
>> +               else
>> +                       wait_event_lock_irq(conf->wait_barrier,
>> +                                           !conf->barrier ||
>> +                                           (atomic_read(&conf->nr_pending) &&
>> +                                            bio_list &&
>> +                                            (!bio_list_empty(&bio_list[0]) ||
>> +                                             !bio_list_empty(&bio_list[1]))) ||
>> +                                            /* move on if recovery thread is
>> +                                             * blocked by us
>> +                                             */
>> +                                            (conf->mddev->thread->tsk == current &&
>> +                                             test_bit(MD_RECOVERY_RUNNING,
>> +                                                      &conf->mddev->recovery) &&
>> +                                             conf->nr_queued > 0),
>> +                                           conf->resync_lock);
>>                  conf->nr_waiting--;
>>                  if (!conf->nr_waiting)
>>                          wake_up(&conf->wait_barrier);
>>          }
>> -       atomic_inc(&conf->nr_pending);
>> +       /* Only increment nr_pending when we wait */
>> +       if (ret)
>> +               atomic_inc(&conf->nr_pending);
>>          spin_unlock_irq(&conf->resync_lock);
>> +       return ret;
>>   }
> I guess something like this would simplify the code:
>
> static bool wait_barrier(struct r10conf *conf, bool nowait)
> {
>          bool ret = true;
>          spin_lock_irq(&conf->resync_lock);
>          if (conf->barrier) {
>                  struct bio_list *bio_list = current->bio_list;
>
>                  if (nowait) {
>                          spin_unlock_irq(&conf->resync_lock);
>                          return false;
>                  }
Ack, makes sense to me.
>>   static void allow_barrier(struct r10conf *conf)
>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
>>   static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
>>                                   struct bio *bio, sector_t sectors)
>>   {
>> -       wait_barrier(conf);
>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>> +               bio_wouldblock_error(bio);
>> +               return;
>> +       }
>>          while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>>              bio->bi_iter.bi_sector < conf->reshape_progress &&
>>              bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
>>                  raid10_log(conf->mddev, "wait reshape");
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  allow_barrier(conf);
>>                  wait_event(conf->wait_barrier,
>>                             conf->reshape_progress <= bio->bi_iter.bi_sector ||
>>                             conf->reshape_progress >= bio->bi_iter.bi_sector +
>>                             sectors);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>          }
>>   }
>>
>> @@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>>                  bio_chain(split, bio);
>>                  allow_barrier(conf);
>>                  submit_bio_noacct(bio);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  bio = split;
>>                  r10_bio->master_bio = bio;
>>                  r10_bio->sectors = max_sectors;
>> @@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
>>                  raid10_log(conf->mddev, "%s wait rdev %d blocked",
>>                                  __func__, blocked_rdev->raid_disk);
>>                  md_wait_for_blocked_rdev(blocked_rdev, mddev);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  goto retry_wait;
>>          }
>>   }
>> @@ -1357,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>                                              bio_end_sector(bio)))) {
>>                  DEFINE_WAIT(w);
>>                  for (;;) {
>> +                       /* Bail out if REQ_NOWAIT is set for the bio */
>> +                       if (bio->bi_opf & REQ_NOWAIT) {
>> +                               bio_wouldblock_error(bio);
>> +                               return;
>> +                       }
>>                          prepare_to_wait(&conf->wait_barrier,
>>                                          &w, TASK_IDLE);
>>                          if (!md_cluster_ops->area_resyncing(mddev, WRITE,
>> @@ -1381,6 +1402,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>                                BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
>>                  md_wakeup_thread(mddev->thread);
>>                  raid10_log(conf->mddev, "wait reshape metadata");
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  wait_event(mddev->sb_wait,
>>                             !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>>
>> @@ -1390,6 +1415,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>          if (conf->pending_count >= max_queued_requests) {
>>                  md_wakeup_thread(mddev->thread);
>>                  raid10_log(mddev, "wait queued");
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  wait_event(conf->wait_barrier,
>>                             conf->pending_count < max_queued_requests);
>>          }
>> @@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>                  bio_chain(split, bio);
>>                  allow_barrier(conf);
>>                  submit_bio_noacct(bio);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  bio = split;
>>                  r10_bio->master_bio = bio;
>>          }
>> @@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>          if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
>>                  return -EAGAIN;
>>
>> -       wait_barrier(conf);
>> +       if (bio->bi_opf & REQ_NOWAIT) {
>> +               bio_wouldblock_error(bio);
>> +               return 0;
>> +       }
> Does this mean we always bail out on discard?
Yeah, I wanted your feedback specifically for this case. I see that
wait_barrier() happens just after this
>> +       wait_barrier(conf, false);
>>
>>          /*
>>           * Check reshape again to avoid reshape happens after checking
>> @@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>                  allow_barrier(conf);
>>                 /* Resend the first split part */
>>                  submit_bio_noacct(split);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>          }
>>          div_u64_rem(bio_end, stripe_size, &remainder);
>>          if (remainder) {
>> @@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>                  /* Resend the second split part */
>>                  submit_bio_noacct(bio);
>>                  bio = split;
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>          }
>>
>>          bio_start = bio->bi_iter.bi_sector;
>> @@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>                  end_disk_offset += geo->stride;
>>                  atomic_inc(&first_r10bio->remaining);
>>                  raid_end_discard_bio(r10_bio);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  goto retry_discard;
>>          }
>>
>> @@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
>>
>>   static void close_sync(struct r10conf *conf)
>>   {
>> -       wait_barrier(conf);
>> +       wait_barrier(conf, false);
>>          allow_barrier(conf);
>>
>>          mempool_exit(&conf->r10buf_pool);
>> @@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>>          if (need_flush ||
>>              time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
>>                  /* Need to update reshape_position in metadata */
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  mddev->reshape_position = conf->reshape_progress;
>>                  if (mddev->reshape_backwards)
>>                          mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
>> --
>> 2.17.1
>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-14  1:12                             ` Vishal Verma
@ 2021-12-14 15:30                               ` Vishal Verma
  2021-12-14 17:08                                 ` Song Liu
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-14 15:30 UTC (permalink / raw)
  To: Song Liu; +Cc: Jens Axboe, linux-raid, rgoldwyn


On 12/13/21 6:12 PM, Vishal Verma wrote:
>
> On 12/13/21 6:11 PM, Song Liu wrote:
>> On Mon, Dec 13, 2021 at 4:53 PM Vishal Verma 
>> <vverma@digitalocean.com> wrote:
>> [...]
>>> What kernel base are you using for your patches?
>>>
>>> These were based out of for-5.16-tag (037c50bfb)
>> Please rebase on top of md-next branch from here:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
>>
>> Thanks,
>> Song
> Ack, will do!
>
After rebasing to the md-next branch and re-running the tests (100% W, 100% 
R, 70%R30%W) with both io_uring and libaio, I don't see any issue. Thank you!

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-14 15:30                               ` Vishal Verma
@ 2021-12-14 17:08                                 ` Song Liu
  2021-12-14 18:09                                   ` Vishal Verma
  2021-12-15  6:09                                   ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Vishal Verma
  0 siblings, 2 replies; 86+ messages in thread
From: Song Liu @ 2021-12-14 17:08 UTC (permalink / raw)
  To: Vishal Verma; +Cc: Jens Axboe, linux-raid, rgoldwyn

On Tue, Dec 14, 2021 at 7:30 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
>
> On 12/13/21 6:12 PM, Vishal Verma wrote:
> >
> > On 12/13/21 6:11 PM, Song Liu wrote:
> >> On Mon, Dec 13, 2021 at 4:53 PM Vishal Verma
> >> <vverma@digitalocean.com> wrote:
> >> [...]
> >>> What kernel base are you using for your patches?
> >>>
> >>> These were based out of for-5.16-tag (037c50bfb)
> >> Please rebase on top of md-next branch from here:
> >>
> >> https://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
> >>
> >> Thanks,
> >> Song
> > Ack, will do!
> >
> After rebasing to md-next branch and re-running the tests 100% W, 100%
> R, 70%R30%W with both io_uring and libaio I don't see any issue. Thank you!

That's great! Please address all the feedback and submit v5.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC PATCH v4 4/4] md: raid456 add nowait support
  2021-12-14 17:08                                 ` Song Liu
@ 2021-12-14 18:09                                   ` Vishal Verma
  2021-12-15  6:09                                   ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Vishal Verma
  1 sibling, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-14 18:09 UTC (permalink / raw)
  To: Song Liu; +Cc: Jens Axboe, linux-raid, rgoldwyn


On 12/14/21 10:08 AM, Song Liu wrote:
> On Tue, Dec 14, 2021 at 7:30 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>
>> On 12/13/21 6:12 PM, Vishal Verma wrote:
>>> On 12/13/21 6:11 PM, Song Liu wrote:
>>>> On Mon, Dec 13, 2021 at 4:53 PM Vishal Verma
>>>> <vverma@digitalocean.com> wrote:
>>>> [...]
>>>>> What kernel base are you using for your patches?
>>>>>
>>>>> These were based out of for-5.16-tag (037c50bfb)
>>>> Please rebase on top of md-next branch from here:
>>>>
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
>>>>
>>>> Thanks,
>>>> Song
>>> Ack, will do!
>>>
>> After rebasing to md-next branch and re-running the tests 100% W, 100%
>> R, 70%R30%W with both io_uring and libaio I don't see any issue. Thank you!
> That's great! Please address all the feedback and submit v5.
>
> Thanks,
> Song
Yup, will do later today or tomorrow. I need to test raid10 with similar 
cases, and I'm not 100% sure about the discard case.

^ permalink raw reply	[flat|nested] 86+ messages in thread

* [PATCH v5 1/4] md: add support for REQ_NOWAIT
  2021-12-14 17:08                                 ` Song Liu
  2021-12-14 18:09                                   ` Vishal Verma
@ 2021-12-15  6:09                                   ` Vishal Verma
  2021-12-15  6:09                                     ` [PATCH v5 2/4] md: raid1 add nowait support Vishal Verma
                                                       ` (3 more replies)
  1 sibling, 4 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-15  6:09 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
for checking whether a given bdev supports handling of REQ_NOWAIT or not.
Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
it for linear target") added support for REQ_NOWAIT for dm. This uses
a similar approach to incorporate REQ_NOWAIT for md based bios.

This patch was tested using t/io_uring tool within FIO. A nvme drive
was partitioned into 2 partitions and a simple raid 0 configuration
/dev/md0 was created.

md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
  937423872 blocks super 1.2 512k chunks

Before patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

38396   38396 pts/2    00:00:00 io_uring
38396   38397 pts/2    00:00:15 io_uring
38396   38398 pts/2    00:00:13 iou-wrk-38397

We can see the iou-wrk-38397 io worker thread, which gets created
when io_uring sees that the underlying device (/dev/md0 in this case)
doesn't support nowait.

After patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

38341   38341 pts/2    00:10:22 io_uring
38341   38342 pts/2    00:10:37 io_uring

After running this patch, we don't see any io worker thread
being created which indicated that io_uring saw that the
underlying device does support nowait. This is the exact behaviour
noticed on a dm device which also supports nowait.

For all the other raid personalities except raid0, we would need
to train pieces which involves make_request fn in order for them
to correctly handle REQ_NOWAIT.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/md.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7fbf6f0ac01b..5b4c28e0e1db 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -419,6 +419,12 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
 	if (is_suspended(mddev, bio)) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
+			/* Bail out if REQ_NOWAIT is set for the bio */
+			if (bio->bi_opf & REQ_NOWAIT) {
+				rcu_read_unlock();
+				bio_wouldblock_error(bio);
+				return;
+			}
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			if (!is_suspended(mddev, bio))
@@ -5787,6 +5793,7 @@ int md_run(struct mddev *mddev)
 	int err;
 	struct md_rdev *rdev;
 	struct md_personality *pers;
+	bool nowait = true;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -5857,8 +5864,13 @@ int md_run(struct mddev *mddev)
 			}
 		}
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
+		nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
 	}
 
+	/* Set the NOWAIT flags if all underlying devices support it */
+	if (nowait)
+		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
+
 	if (!bioset_initialized(&mddev->bio_set)) {
 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 		if (err)
@@ -7002,6 +7014,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	if (!mddev->thread)
 		md_update_sb(mddev, 1);
+	/*
+	 * If the new disk does not support REQ_NOWAIT,
+	 * disable on the whole MD.
+	 */
+	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
+		pr_info("%s: Disabling nowait because %s does not support nowait\n",
+			mdname(mddev), bdevname(rdev->bdev, b));
+		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
+	}
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [PATCH v5 2/4] md: raid1 add nowait support
  2021-12-15  6:09                                   ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Vishal Verma
@ 2021-12-15  6:09                                     ` Vishal Verma
  2021-12-15 20:33                                       ` Song Liu
  2021-12-15  6:09                                     ` [PATCH v5 3/4] md: raid10 add nowait support Vishal Verma
                                                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-15  6:09 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

This adds nowait support to the RAID1 driver. It makes RAID1 driver
return with EAGAIN for situations where it could wait for eg:

- Waiting for the barrier,
- Array got frozen,
- Too many pending I/Os to be queued.

wait_barrier() fn is modified to return bool to support error for
wait barriers. It returns true in case of wait or if wait is not
required and returns false if wait was required but not performed
to support nowait.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid1.c | 74 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7dc8026cf6ee..727d31de5694 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 	wake_up(&conf->wait_barrier);
 }
 
-static void _wait_barrier(struct r1conf *conf, int idx)
+static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
 {
+	bool ret = true;
 	/*
 	 * We need to increase conf->nr_pending[idx] very early here,
 	 * then raise_barrier() can be blocked when it waits for
@@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	if (!READ_ONCE(conf->array_frozen) &&
 	    !atomic_read(&conf->barrier[idx]))
-		return;
+		return ret;
 
 	/*
 	 * After holding conf->resync_lock, conf->nr_pending[idx]
@@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for the barrier in same barrier unit bucket to drop. */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen &&
-			     !atomic_read(&conf->barrier[idx]),
-			    conf->resync_lock);
+	if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
+		/* Return false when nowait flag is set */
+		if (nowait)
+			ret = false;
+		else {
+			wait_event_lock_irq(conf->wait_barrier,
+					!conf->array_frozen &&
+					!atomic_read(&conf->barrier[idx]),
+					conf->resync_lock);
+		}
+	}
 	atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
+	bool ret = true;
 
 	/*
 	 * Very similar to _wait_barrier(). The difference is, for read
@@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	atomic_inc(&conf->nr_pending[idx]);
 
 	if (!READ_ONCE(conf->array_frozen))
-		return;
+		return ret;
 
 	spin_lock_irq(&conf->resync_lock);
 	atomic_inc(&conf->nr_waiting[idx]);
@@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for array to be unfrozen */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen,
-			    conf->resync_lock);
+	if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
+		if (nowait)
+			/* Return false when nowait flag is set */
+			ret = false;
+		else {
+			wait_event_lock_irq(conf->wait_barrier,
+					!conf->array_frozen,
+					conf->resync_lock);
+		}
+	}
 	atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
 
-	_wait_barrier(conf, idx);
+	return _wait_barrier(conf, idx, nowait);
 }
 
 static void _allow_barrier(struct r1conf *conf, int idx)
@@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	 * Still need barrier for READ in case that whole
 	 * array is frozen.
 	 */
-	wait_read_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	if (!r1_bio)
 		r1_bio = alloc_r1bio(mddev, bio);
@@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
 
 		DEFINE_WAIT(w);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		for (;;) {
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_IDLE);
@@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	wait_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait queued");
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
+
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
@@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
 		allow_barrier(conf, bio->bi_iter.bi_sector);
+
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf, bio->bi_iter.bi_sector);
+		wait_barrier(conf, bio->bi_iter.bi_sector, false);
 		goto retry_write;
 	}
 
@@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
 	int idx;
 
 	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
-		_wait_barrier(conf, idx);
+		_wait_barrier(conf, idx, false);
 		_allow_barrier(conf, idx);
 	}
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-15  6:09                                   ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Vishal Verma
  2021-12-15  6:09                                     ` [PATCH v5 2/4] md: raid1 add nowait support Vishal Verma
@ 2021-12-15  6:09                                     ` Vishal Verma
  2021-12-15 20:42                                       ` Song Liu
  2021-12-15  6:09                                     ` [PATCH v5 4/4] md: raid456 " Vishal Verma
  2021-12-15 20:02                                     ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Song Liu
  3 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-15  6:09 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

This adds nowait support to the RAID10 driver. Very similar to
raid1 driver changes. It makes RAID10 driver return with EAGAIN
for situations where it could wait for eg:

- Waiting for the barrier,
- Too many pending I/Os to be queued,
- Reshape operation,
- Discard operation.

wait_barrier() fn is modified to return bool to support error for
wait barriers. It returns true in case of wait or if wait is not
required and returns false if wait was required but not performed
to support nowait.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid10.c | 57 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index dde98f65bd04..f6c73987e9ac 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r10conf *conf)
+static bool wait_barrier(struct r10conf *conf, bool nowait)
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
 		struct bio_list *bio_list = current->bio_list;
+
+		/* Return false when nowait flag is set */
+		if (nowait) {
+			spin_unlock_irq(&conf->resync_lock);
+			return false;
+		}
+
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
 	}
 	atomic_inc(&conf->nr_pending);
 	spin_unlock_irq(&conf->resync_lock);
+	return true;
 }
 
 static void allow_barrier(struct r10conf *conf)
@@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
 				 struct bio *bio, sector_t sectors)
 {
-	wait_barrier(conf);
+	/* Bail out if REQ_NOWAIT is set for the bio */
+	if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
 	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
 		raid10_log(conf->mddev, "wait reshape");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		allow_barrier(conf);
 		wait_event(conf->wait_barrier,
 			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
 			   conf->reshape_progress >= bio->bi_iter.bi_sector +
 			   sectors);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 }
 
@@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		bio = split;
 		r10_bio->master_bio = bio;
 		r10_bio->sectors = max_sectors;
@@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
 		raid10_log(conf->mddev, "%s wait rdev %d blocked",
 				__func__, blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		goto retry_wait;
 	}
 }
@@ -1357,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 					    bio_end_sector(bio)))) {
 		DEFINE_WAIT(w);
 		for (;;) {
+			/* Bail out if REQ_NOWAIT is set for the bio */
+			if (bio->bi_opf & REQ_NOWAIT) {
+				bio_wouldblock_error(bio);
+				return;
+			}
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_IDLE);
 			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1381,6 +1402,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 			      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		raid10_log(conf->mddev, "wait reshape metadata");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 
@@ -1390,6 +1415,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid10_log(mddev, "wait queued");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
@@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		bio = split;
 		r10_bio->master_bio = bio;
 	}
@@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		return -EAGAIN;
 
-	wait_barrier(conf);
+	if (bio->bi_opf & REQ_NOWAIT) {
+		bio_wouldblock_error(bio);
+		return 0;
+	}
+	wait_barrier(conf, false);
 
 	/*
 	 * Check reshape again to avoid reshape happens after checking
@@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		allow_barrier(conf);
 		/* Resend the first split part */
 		submit_bio_noacct(split);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 	div_u64_rem(bio_end, stripe_size, &remainder);
 	if (remainder) {
@@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		/* Resend the second split part */
 		submit_bio_noacct(bio);
 		bio = split;
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 
 	bio_start = bio->bi_iter.bi_sector;
@@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		end_disk_offset += geo->stride;
 		atomic_inc(&first_r10bio->remaining);
 		raid_end_discard_bio(r10_bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		goto retry_discard;
 	}
 
@@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
 
 static void close_sync(struct r10conf *conf)
 {
-	wait_barrier(conf);
+	wait_barrier(conf, false);
 	allow_barrier(conf);
 
 	mempool_exit(&conf->r10buf_pool);
@@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	if (need_flush ||
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
 		/* Need to update reshape_position in metadata */
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		mddev->reshape_position = conf->reshape_progress;
 		if (mddev->reshape_backwards)
 			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [PATCH v5 4/4] md: raid456 add nowait support
  2021-12-15  6:09                                   ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Vishal Verma
  2021-12-15  6:09                                     ` [PATCH v5 2/4] md: raid1 add nowait support Vishal Verma
  2021-12-15  6:09                                     ` [PATCH v5 3/4] md: raid10 add nowait support Vishal Verma
@ 2021-12-15  6:09                                     ` Vishal Verma
  2021-12-15 20:02                                     ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Song Liu
  3 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-15  6:09 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

Returns EAGAIN in case the raid456 driver would block
waiting for situations like:

- Reshape operation,
- Discard operation.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid5.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1240a5c16af8..b505e4cec777 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5715,6 +5715,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
 		if (test_bit(STRIPE_SYNCING, &sh->state)) {
 			raid5_release_stripe(sh);
+			/* Bail out if REQ_NOWAIT is set */
+			if (bi->bi_opf & REQ_NOWAIT) {
+				bio_wouldblock_error(bi);
+				return;
+			}
 			schedule();
 			goto again;
 		}
@@ -5727,6 +5732,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				set_bit(R5_Overlap, &sh->dev[d].flags);
 				spin_unlock_irq(&sh->stripe_lock);
 				raid5_release_stripe(sh);
+				/* Bail out if REQ_NOWAIT is set */
+				if (bi->bi_opf & REQ_NOWAIT) {
+					bio_wouldblock_error(bi);
+					return;
+				}
 				schedule();
 				goto again;
 			}
@@ -5820,6 +5830,16 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	bi->bi_next = NULL;
 
 	md_account_bio(mddev, &bi);
+	/* Bail out if REQ_NOWAIT is set */
+	if ((bi->bi_opf & REQ_NOWAIT) &&
+	    (conf->reshape_progress != MaxSector) &&
+	    (mddev->reshape_backwards
+	    ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
+	    : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
+		bio_wouldblock_error(bi);
+		return true;
+	}
+
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
 		int previous;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 1/4] md: add support for REQ_NOWAIT
  2021-12-15  6:09                                   ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Vishal Verma
                                                       ` (2 preceding siblings ...)
  2021-12-15  6:09                                     ` [PATCH v5 4/4] md: raid456 " Vishal Verma
@ 2021-12-15 20:02                                     ` Song Liu
  3 siblings, 0 replies; 86+ messages in thread
From: Song Liu @ 2021-12-15 20:02 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
[...]
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/md.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 7fbf6f0ac01b..5b4c28e0e1db 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -419,6 +419,12 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
>         if (is_suspended(mddev, bio)) {
>                 DEFINE_WAIT(__wait);
>                 for (;;) {
> +                       /* Bail out if REQ_NOWAIT is set for the bio */
> +                       if (bio->bi_opf & REQ_NOWAIT) {
> +                               rcu_read_unlock();
> +                               bio_wouldblock_error(bio);
> +                               return;
> +                       }

I moved this part to before the for (;;) loop. And applied to md-next.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 2/4] md: raid1 add nowait support
  2021-12-15  6:09                                     ` [PATCH v5 2/4] md: raid1 add nowait support Vishal Verma
@ 2021-12-15 20:33                                       ` Song Liu
  2021-12-15 22:20                                         ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-15 20:33 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> This adds nowait support to the RAID1 driver. It makes RAID1 driver
> return with EAGAIN for situations where it could wait for eg:
>
> - Waiting for the barrier,
> - Array got frozen,
> - Too many pending I/Os to be queued.
>
> wait_barrier() fn is modified to return bool to support error for
> wait barriers. It returns true in case of wait or if wait is not
> required and returns false if wait was required but not performed
> to support nowait.

Please see some detailed comments below. But a general and more important
question: were you able to trigger these conditions (path that lead to
bio_wouldblock_error) in the tests?

Ideally, we should test all these conditions. If something is really
hard to trigger,
please highlight that in the commit log, so that I can run more tests on them.

Thanks,
Song

>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/raid1.c | 74 +++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 7dc8026cf6ee..727d31de5694 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
>         wake_up(&conf->wait_barrier);
>  }
>
> -static void _wait_barrier(struct r1conf *conf, int idx)
> +static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
>  {
> +       bool ret = true;
>         /*
>          * We need to increase conf->nr_pending[idx] very early here,
>          * then raise_barrier() can be blocked when it waits for
> @@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>          */
>         if (!READ_ONCE(conf->array_frozen) &&
>             !atomic_read(&conf->barrier[idx]))
> -               return;
> +               return ret;
>
>         /*
>          * After holding conf->resync_lock, conf->nr_pending[idx]
> @@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>          */
>         wake_up(&conf->wait_barrier);
>         /* Wait for the barrier in same barrier unit bucket to drop. */
> -       wait_event_lock_irq(conf->wait_barrier,
> -                           !conf->array_frozen &&
> -                            !atomic_read(&conf->barrier[idx]),
> -                           conf->resync_lock);
> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {

Do we really need this check?

> +               /* Return false when nowait flag is set */
> +               if (nowait)
> +                       ret = false;
> +               else {
> +                       wait_event_lock_irq(conf->wait_barrier,
> +                                       !conf->array_frozen &&
> +                                       !atomic_read(&conf->barrier[idx]),
> +                                       conf->resync_lock);
> +               }
> +       }
>         atomic_inc(&conf->nr_pending[idx]);

Were you able to trigger the condition in the tests? I think we should
only increase
nr_pending for ret == true. Otherwise, we will leak a nr_pending.

>         atomic_dec(&conf->nr_waiting[idx]);
>         spin_unlock_irq(&conf->resync_lock);
> +       return ret;
>  }
>
> -static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
> +static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>  {
>         int idx = sector_to_idx(sector_nr);
> +       bool ret = true;
>
>         /*
>          * Very similar to _wait_barrier(). The difference is, for read
> @@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>         atomic_inc(&conf->nr_pending[idx]);
>
>         if (!READ_ONCE(conf->array_frozen))
> -               return;
> +               return ret;
>
>         spin_lock_irq(&conf->resync_lock);
>         atomic_inc(&conf->nr_waiting[idx]);
> @@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>          */
>         wake_up(&conf->wait_barrier);
>         /* Wait for array to be unfrozen */
> -       wait_event_lock_irq(conf->wait_barrier,
> -                           !conf->array_frozen,
> -                           conf->resync_lock);
> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {

I guess we don't need this either. Also, the condition there is not identical
to wait_barrier (no need to check conf->barrier[idx]).

> +               if (nowait)
> +                       /* Return false when nowait flag is set */
> +                       ret = false;
> +               else {
> +                       wait_event_lock_irq(conf->wait_barrier,
> +                                       !conf->array_frozen,
> +                                       conf->resync_lock);
> +               }
> +       }
>         atomic_inc(&conf->nr_pending[idx]);

ditto on nr_pending.

>         atomic_dec(&conf->nr_waiting[idx]);
>         spin_unlock_irq(&conf->resync_lock);
> +       return ret;
>  }
>
> -static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
> +static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>  {
>         int idx = sector_to_idx(sector_nr);
>
> -       _wait_barrier(conf, idx);
> +       return _wait_barrier(conf, idx, nowait);
>  }
>
>  static void _allow_barrier(struct r1conf *conf, int idx)
> @@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>          * Still need barrier for READ in case that whole
>          * array is frozen.
>          */
> -       wait_read_barrier(conf, bio->bi_iter.bi_sector);
> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
> +                               bio->bi_opf & REQ_NOWAIT)) {
> +               bio_wouldblock_error(bio);
> +               return;
> +       }
>
>         if (!r1_bio)
>                 r1_bio = alloc_r1bio(mddev, bio);
> @@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>                      bio->bi_iter.bi_sector, bio_end_sector(bio))) {
>
>                 DEFINE_WAIT(w);
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 for (;;) {
>                         prepare_to_wait(&conf->wait_barrier,
>                                         &w, TASK_IDLE);
> @@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>          * thread has put up a bar for new requests.
>          * Continue immediately if no resync is active currently.
>          */
> -       wait_barrier(conf, bio->bi_iter.bi_sector);
> +       if (!wait_barrier(conf, bio->bi_iter.bi_sector,
> +                               bio->bi_opf & REQ_NOWAIT)) {
> +               bio_wouldblock_error(bio);
> +               return;
> +       }
>
>         r1_bio = alloc_r1bio(mddev, bio);
>         r1_bio->sectors = max_write_sectors;
>
>         if (conf->pending_count >= max_queued_requests) {
>                 md_wakeup_thread(mddev->thread);
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);

I think we need to fix conf->nr_pending before returning.

> +                       return;
> +               }
>                 raid1_log(mddev, "wait queued");
>                 wait_event(conf->wait_barrier,
>                            conf->pending_count < max_queued_requests);
>         }
> +
>         /* first select target devices under rcu_lock and
>          * inc refcount on their rdev.  Record them by setting
>          * bios[x] to bio
> @@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>                                 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
>                 r1_bio->state = 0;
>                 allow_barrier(conf, bio->bi_iter.bi_sector);
> +
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
>                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
> -               wait_barrier(conf, bio->bi_iter.bi_sector);
> +               wait_barrier(conf, bio->bi_iter.bi_sector, false);
>                 goto retry_write;
>         }
>
> @@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
>         int idx;
>
>         for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
> -               _wait_barrier(conf, idx);
> +               _wait_barrier(conf, idx, false);
>                 _allow_barrier(conf, idx);
>         }
>
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-15  6:09                                     ` [PATCH v5 3/4] md: raid10 add nowait support Vishal Verma
@ 2021-12-15 20:42                                       ` Song Liu
  2021-12-15 22:20                                         ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-15 20:42 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> This adds nowait support to the RAID10 driver. Very similar to
> raid1 driver changes. It makes RAID10 driver return with EAGAIN
> for situations where it could wait for eg:
>
> - Waiting for the barrier,
> - Too many pending I/Os to be queued,
> - Reshape operation,
> - Discard operation.
>
> wait_barrier() fn is modified to return bool to support error for
> wait barriers. It returns true in case of wait or if wait is not
> required and returns false if wait was required but not performed
> to support nowait.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/raid10.c | 57 +++++++++++++++++++++++++++++++++++----------
>  1 file changed, 45 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index dde98f65bd04..f6c73987e9ac 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>         wake_up(&conf->wait_barrier);
>  }
>
> -static void wait_barrier(struct r10conf *conf)
> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>  {
>         spin_lock_irq(&conf->resync_lock);
>         if (conf->barrier) {
>                 struct bio_list *bio_list = current->bio_list;
> +
> +               /* Return false when nowait flag is set */
> +               if (nowait) {
> +                       spin_unlock_irq(&conf->resync_lock);
> +                       return false;
> +               }
> +
>                 conf->nr_waiting++;
>                 /* Wait for the barrier to drop.
>                  * However if there are already pending
> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>         }
>         atomic_inc(&conf->nr_pending);
>         spin_unlock_irq(&conf->resync_lock);
> +       return true;
>  }
>
>  static void allow_barrier(struct r10conf *conf)
> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
>  static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
>                                  struct bio *bio, sector_t sectors)
>  {
> -       wait_barrier(conf);
> +       /* Bail out if REQ_NOWAIT is set for the bio */
> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
> +               bio_wouldblock_error(bio);
> +               return;
> +       }

I think we also need regular_request_wait to return bool and handle it properly.

Thanks,
Song

>         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>             bio->bi_iter.bi_sector < conf->reshape_progress &&
>             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
>                 raid10_log(conf->mddev, "wait reshape");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 allow_barrier(conf);
>                 wait_event(conf->wait_barrier,
>                            conf->reshape_progress <= bio->bi_iter.bi_sector ||
>                            conf->reshape_progress >= bio->bi_iter.bi_sector +
>                            sectors);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>  }
>
> @@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>                 bio_chain(split, bio);
>                 allow_barrier(conf);
>                 submit_bio_noacct(bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 bio = split;
>                 r10_bio->master_bio = bio;
>                 r10_bio->sectors = max_sectors;
> @@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
>                 raid10_log(conf->mddev, "%s wait rdev %d blocked",
>                                 __func__, blocked_rdev->raid_disk);
>                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 goto retry_wait;
>         }
>  }
> @@ -1357,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                                             bio_end_sector(bio)))) {
>                 DEFINE_WAIT(w);
>                 for (;;) {
> +                       /* Bail out if REQ_NOWAIT is set for the bio */
> +                       if (bio->bi_opf & REQ_NOWAIT) {
> +                               bio_wouldblock_error(bio);
> +                               return;
> +                       }
>                         prepare_to_wait(&conf->wait_barrier,
>                                         &w, TASK_IDLE);
>                         if (!md_cluster_ops->area_resyncing(mddev, WRITE,
> @@ -1381,6 +1402,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                               BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
>                 md_wakeup_thread(mddev->thread);
>                 raid10_log(conf->mddev, "wait reshape metadata");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 wait_event(mddev->sb_wait,
>                            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>
> @@ -1390,6 +1415,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>         if (conf->pending_count >= max_queued_requests) {
>                 md_wakeup_thread(mddev->thread);
>                 raid10_log(mddev, "wait queued");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 wait_event(conf->wait_barrier,
>                            conf->pending_count < max_queued_requests);
>         }
> @@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                 bio_chain(split, bio);
>                 allow_barrier(conf);
>                 submit_bio_noacct(bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 bio = split;
>                 r10_bio->master_bio = bio;
>         }
> @@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
>                 return -EAGAIN;
>
> -       wait_barrier(conf);
> +       if (bio->bi_opf & REQ_NOWAIT) {
> +               bio_wouldblock_error(bio);
> +               return 0;
> +       }
> +       wait_barrier(conf, false);
>
>         /*
>          * Check reshape again to avoid reshape happens after checking
> @@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 allow_barrier(conf);
> 		/* Resend the first split part */
>                 submit_bio_noacct(split);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>         div_u64_rem(bio_end, stripe_size, &remainder);
>         if (remainder) {
> @@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 /* Resend the second split part */
>                 submit_bio_noacct(bio);
>                 bio = split;
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>
>         bio_start = bio->bi_iter.bi_sector;
> @@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 end_disk_offset += geo->stride;
>                 atomic_inc(&first_r10bio->remaining);
>                 raid_end_discard_bio(r10_bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 goto retry_discard;
>         }
>
> @@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
>
>  static void close_sync(struct r10conf *conf)
>  {
> -       wait_barrier(conf);
> +       wait_barrier(conf, false);
>         allow_barrier(conf);
>
>         mempool_exit(&conf->r10buf_pool);
> @@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>         if (need_flush ||
>             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
>                 /* Need to update reshape_position in metadata */
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 mddev->reshape_position = conf->reshape_progress;
>                 if (mddev->reshape_backwards)
>                         mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 2/4] md: raid1 add nowait support
  2021-12-15 20:33                                       ` Song Liu
@ 2021-12-15 22:20                                         ` Vishal Verma
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-15 22:20 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/15/21 1:33 PM, Song Liu wrote:
> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> This adds nowait support to the RAID1 driver. It makes RAID1 driver
>> return with EAGAIN for situations where it could wait for eg:
>>
>> - Waiting for the barrier,
>> - Array got frozen,
>> - Too many pending I/Os to be queued.
>>
>> wait_barrier() fn is modified to return bool to support error for
>> wait barriers. It returns true in case of wait or if wait is not
>> required and returns false if wait was required but not performed
>> to support nowait.
> Please see some detailed comments below. But a general and more important
> question: were you able to trigger these conditions (path that lead to
> bio_wouldblock_error) in the tests?
>
> Ideally, we should test all these conditions. If something is really
> hard to trigger,
> please highlight that in the commit log, so that I can run more tests on them.
>
> Thanks,
> Song
>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>> ---
>>   drivers/md/raid1.c | 74 +++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 57 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 7dc8026cf6ee..727d31de5694 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
>>          wake_up(&conf->wait_barrier);
>>   }
>>
>> -static void _wait_barrier(struct r1conf *conf, int idx)
>> +static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
>>   {
>> +       bool ret = true;
>>          /*
>>           * We need to increase conf->nr_pending[idx] very early here,
>>           * then raise_barrier() can be blocked when it waits for
>> @@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>>           */
>>          if (!READ_ONCE(conf->array_frozen) &&
>>              !atomic_read(&conf->barrier[idx]))
>> -               return;
>> +               return ret;
>>
>>          /*
>>           * After holding conf->resync_lock, conf->nr_pending[idx]
>> @@ -979,18 +980,27 @@ static void _wait_barrier(struct r1conf *conf, int idx)
>>           */
>>          wake_up(&conf->wait_barrier);
>>          /* Wait for the barrier in same barrier unit bucket to drop. */
>> -       wait_event_lock_irq(conf->wait_barrier,
>> -                           !conf->array_frozen &&
>> -                            !atomic_read(&conf->barrier[idx]),
>> -                           conf->resync_lock);
>> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
> Do we really need this check?
This check was added based on the conditions used in wait_event_lock_irq().
I am not entirely sure it is necessary.
>> +               /* Return false when nowait flag is set */
>> +               if (nowait)
>> +                       ret = false;
>> +               else {
>> +                       wait_event_lock_irq(conf->wait_barrier,
>> +                                       !conf->array_frozen &&
>> +                                       !atomic_read(&conf->barrier[idx]),
>> +                                       conf->resync_lock);
>> +               }
>> +       }
>>          atomic_inc(&conf->nr_pending[idx]);
> Were you able to trigger the condition in the tests? I think we should
> only increase
> nr_pending for ret == true. Otherwise, we will leak a nr_pending.
No I wasn't able to. Makes sense about nr_pending. Thanks for catching.
>
>>          atomic_dec(&conf->nr_waiting[idx]);
>>          spin_unlock_irq(&conf->resync_lock);
>> +       return ret;
>>   }
>>
>> -static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>> +static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>>   {
>>          int idx = sector_to_idx(sector_nr);
>> +       bool ret = true;
>>
>>          /*
>>           * Very similar to _wait_barrier(). The difference is, for read
>> @@ -1002,7 +1012,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>>          atomic_inc(&conf->nr_pending[idx]);
>>
>>          if (!READ_ONCE(conf->array_frozen))
>> -               return;
>> +               return ret;
>>
>>          spin_lock_irq(&conf->resync_lock);
>>          atomic_inc(&conf->nr_waiting[idx]);
>> @@ -1013,19 +1023,27 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>>           */
>>          wake_up(&conf->wait_barrier);
>>          /* Wait for array to be unfrozen */
>> -       wait_event_lock_irq(conf->wait_barrier,
>> -                           !conf->array_frozen,
>> -                           conf->resync_lock);
>> +       if (conf->array_frozen || atomic_read(&conf->barrier[idx])) {
> I guess we don't need this either. Also, the condition there is not identical
> to wait_barrier (no need to check conf->barrier[idx]).
OK
>> +               if (nowait)
>> +                       /* Return false when nowait flag is set */
>> +                       ret = false;
>> +               else {
>> +                       wait_event_lock_irq(conf->wait_barrier,
>> +                                       !conf->array_frozen,
>> +                                       conf->resync_lock);
>> +               }
>> +       }
>>          atomic_inc(&conf->nr_pending[idx]);
> ditto on nr_pending.
OK
>
>>          atomic_dec(&conf->nr_waiting[idx]);
>>          spin_unlock_irq(&conf->resync_lock);
>> +       return ret;
>>   }
>>
>> -static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
>> +static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
>>   {
>>          int idx = sector_to_idx(sector_nr);
>>
>> -       _wait_barrier(conf, idx);
>> +       return _wait_barrier(conf, idx, nowait);
>>   }
>>
>>   static void _allow_barrier(struct r1conf *conf, int idx)
>> @@ -1236,7 +1254,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>           * Still need barrier for READ in case that whole
>>           * array is frozen.
>>           */
>> -       wait_read_barrier(conf, bio->bi_iter.bi_sector);
>> +       if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
>> +                               bio->bi_opf & REQ_NOWAIT)) {
>> +               bio_wouldblock_error(bio);
>> +               return;
>> +       }
>>
>>          if (!r1_bio)
>>                  r1_bio = alloc_r1bio(mddev, bio);
>> @@ -1336,6 +1358,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>                       bio->bi_iter.bi_sector, bio_end_sector(bio))) {
>>
>>                  DEFINE_WAIT(w);
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  for (;;) {
>>                          prepare_to_wait(&conf->wait_barrier,
>>                                          &w, TASK_IDLE);
>> @@ -1353,17 +1379,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>           * thread has put up a bar for new requests.
>>           * Continue immediately if no resync is active currently.
>>           */
>> -       wait_barrier(conf, bio->bi_iter.bi_sector);
>> +       if (!wait_barrier(conf, bio->bi_iter.bi_sector,
>> +                               bio->bi_opf & REQ_NOWAIT)) {
>> +               bio_wouldblock_error(bio);
>> +               return;
>> +       }
>>
>>          r1_bio = alloc_r1bio(mddev, bio);
>>          r1_bio->sectors = max_write_sectors;
>>
>>          if (conf->pending_count >= max_queued_requests) {
>>                  md_wakeup_thread(mddev->thread);
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
> I think we need to fix conf->nr_pending before returning.
OK, this one I am not sure about. You mean decrement conf->nr_pending?
>> +                       return;
>> +               }
>>                  raid1_log(mddev, "wait queued");
>>                  wait_event(conf->wait_barrier,
>>                             conf->pending_count < max_queued_requests);
>>          }
>> +
>>          /* first select target devices under rcu_lock and
>>           * inc refcount on their rdev.  Record them by setting
>>           * bios[x] to bio
>> @@ -1458,9 +1493,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>                                  rdev_dec_pending(conf->mirrors[j].rdev, mddev);
>>                  r1_bio->state = 0;
>>                  allow_barrier(conf, bio->bi_iter.bi_sector);
>> +
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
>>                  md_wait_for_blocked_rdev(blocked_rdev, mddev);
>> -               wait_barrier(conf, bio->bi_iter.bi_sector);
>> +               wait_barrier(conf, bio->bi_iter.bi_sector, false);
>>                  goto retry_write;
>>          }
>>
>> @@ -1687,7 +1727,7 @@ static void close_sync(struct r1conf *conf)
>>          int idx;
>>
>>          for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
>> -               _wait_barrier(conf, idx);
>> +               _wait_barrier(conf, idx, false);
>>                  _allow_barrier(conf, idx);
>>          }
>>
>> --
>> 2.17.1
>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-15 20:42                                       ` Song Liu
@ 2021-12-15 22:20                                         ` Vishal Verma
  2021-12-16  0:30                                           ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-15 22:20 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/15/21 1:42 PM, Song Liu wrote:
> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> This adds nowait support to the RAID10 driver. Very similar to
>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>> for situations where it could wait for eg:
>>
>> - Waiting for the barrier,
>> - Too many pending I/Os to be queued,
>> - Reshape operation,
>> - Discard operation.
>>
>> wait_barrier() fn is modified to return bool to support error for
>> wait barriers. It returns true in case of wait or if wait is not
>> required and returns false if wait was required but not performed
>> to support nowait.
>>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>> ---
>>   drivers/md/raid10.c | 57 +++++++++++++++++++++++++++++++++++----------
>>   1 file changed, 45 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index dde98f65bd04..f6c73987e9ac 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>          wake_up(&conf->wait_barrier);
>>   }
>>
>> -static void wait_barrier(struct r10conf *conf)
>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>   {
>>          spin_lock_irq(&conf->resync_lock);
>>          if (conf->barrier) {
>>                  struct bio_list *bio_list = current->bio_list;
>> +
>> +               /* Return false when nowait flag is set */
>> +               if (nowait) {
>> +                       spin_unlock_irq(&conf->resync_lock);
>> +                       return false;
>> +               }
>> +
>>                  conf->nr_waiting++;
>>                  /* Wait for the barrier to drop.
>>                   * However if there are already pending
>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>          }
>>          atomic_inc(&conf->nr_pending);
>>          spin_unlock_irq(&conf->resync_lock);
>> +       return true;
>>   }
>>
>>   static void allow_barrier(struct r10conf *conf)
>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
>>   static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
>>                                   struct bio *bio, sector_t sectors)
>>   {
>> -       wait_barrier(conf);
>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>> +               bio_wouldblock_error(bio);
>> +               return;
>> +       }
> I think we also need regular_request_wait to return bool and handle it properly.
>
> Thanks,
> Song
>
Ack, will fix it. Thanks!
>>          while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>>              bio->bi_iter.bi_sector < conf->reshape_progress &&
>>              bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
>>                  raid10_log(conf->mddev, "wait reshape");
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  allow_barrier(conf);
>>                  wait_event(conf->wait_barrier,
>>                             conf->reshape_progress <= bio->bi_iter.bi_sector ||
>>                             conf->reshape_progress >= bio->bi_iter.bi_sector +
>>                             sectors);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>          }
>>   }
>>
>> @@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>>                  bio_chain(split, bio);
>>                  allow_barrier(conf);
>>                  submit_bio_noacct(bio);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  bio = split;
>>                  r10_bio->master_bio = bio;
>>                  r10_bio->sectors = max_sectors;
>> @@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
>>                  raid10_log(conf->mddev, "%s wait rdev %d blocked",
>>                                  __func__, blocked_rdev->raid_disk);
>>                  md_wait_for_blocked_rdev(blocked_rdev, mddev);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  goto retry_wait;
>>          }
>>   }
>> @@ -1357,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>                                              bio_end_sector(bio)))) {
>>                  DEFINE_WAIT(w);
>>                  for (;;) {
>> +                       /* Bail out if REQ_NOWAIT is set for the bio */
>> +                       if (bio->bi_opf & REQ_NOWAIT) {
>> +                               bio_wouldblock_error(bio);
>> +                               return;
>> +                       }
>>                          prepare_to_wait(&conf->wait_barrier,
>>                                          &w, TASK_IDLE);
>>                          if (!md_cluster_ops->area_resyncing(mddev, WRITE,
>> @@ -1381,6 +1402,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>                                BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
>>                  md_wakeup_thread(mddev->thread);
>>                  raid10_log(conf->mddev, "wait reshape metadata");
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  wait_event(mddev->sb_wait,
>>                             !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>>
>> @@ -1390,6 +1415,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>          if (conf->pending_count >= max_queued_requests) {
>>                  md_wakeup_thread(mddev->thread);
>>                  raid10_log(mddev, "wait queued");
>> +               if (bio->bi_opf & REQ_NOWAIT) {
>> +                       bio_wouldblock_error(bio);
>> +                       return;
>> +               }
>>                  wait_event(conf->wait_barrier,
>>                             conf->pending_count < max_queued_requests);
>>          }
>> @@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>                  bio_chain(split, bio);
>>                  allow_barrier(conf);
>>                  submit_bio_noacct(bio);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  bio = split;
>>                  r10_bio->master_bio = bio;
>>          }
>> @@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>          if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
>>                  return -EAGAIN;
>>
>> -       wait_barrier(conf);
>> +       if (bio->bi_opf & REQ_NOWAIT) {
>> +               bio_wouldblock_error(bio);
>> +               return 0;
>> +       }
>> +       wait_barrier(conf, false);
>>
>>          /*
>>           * Check reshape again to avoid reshape happens after checking
>> @@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>                  allow_barrier(conf);
>>                  /* Resend the fist split part */
>>                  submit_bio_noacct(split);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>          }
>>          div_u64_rem(bio_end, stripe_size, &remainder);
>>          if (remainder) {
>> @@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>                  /* Resend the second split part */
>>                  submit_bio_noacct(bio);
>>                  bio = split;
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>          }
>>
>>          bio_start = bio->bi_iter.bi_sector;
>> @@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>>                  end_disk_offset += geo->stride;
>>                  atomic_inc(&first_r10bio->remaining);
>>                  raid_end_discard_bio(r10_bio);
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  goto retry_discard;
>>          }
>>
>> @@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
>>
>>   static void close_sync(struct r10conf *conf)
>>   {
>> -       wait_barrier(conf);
>> +       wait_barrier(conf, false);
>>          allow_barrier(conf);
>>
>>          mempool_exit(&conf->r10buf_pool);
>> @@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>>          if (need_flush ||
>>              time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
>>                  /* Need to update reshape_position in metadata */
>> -               wait_barrier(conf);
>> +               wait_barrier(conf, false);
>>                  mddev->reshape_position = conf->reshape_progress;
>>                  if (mddev->reshape_backwards)
>>                          mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
>> --
>> 2.17.1
>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-15 22:20                                         ` Vishal Verma
@ 2021-12-16  0:30                                           ` Vishal Verma
  2021-12-16 16:40                                             ` Vishal Verma
  2021-12-16 16:42                                             ` Jens Axboe
  0 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-16  0:30 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/15/21 3:20 PM, Vishal Verma wrote:
>
> On 12/15/21 1:42 PM, Song Liu wrote:
>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma 
>> <vverma@digitalocean.com> wrote:
>>> This adds nowait support to the RAID10 driver. Very similar to
>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>> for situations where it could wait for eg:
>>>
>>> - Waiting for the barrier,
>>> - Too many pending I/Os to be queued,
>>> - Reshape operation,
>>> - Discard operation.
>>>
>>> wait_barrier() fn is modified to return bool to support error for
>>> wait barriers. It returns true in case of wait or if wait is not
>>> required and returns false if wait was required but not performed
>>> to support nowait.
>>>
>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>> ---
>>>   drivers/md/raid10.c | 57 
>>> +++++++++++++++++++++++++++++++++++----------
>>>   1 file changed, 45 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>> index dde98f65bd04..f6c73987e9ac 100644
>>> --- a/drivers/md/raid10.c
>>> +++ b/drivers/md/raid10.c
>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>          wake_up(&conf->wait_barrier);
>>>   }
>>>
>>> -static void wait_barrier(struct r10conf *conf)
>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>   {
>>>          spin_lock_irq(&conf->resync_lock);
>>>          if (conf->barrier) {
>>>                  struct bio_list *bio_list = current->bio_list;
>>> +
>>> +               /* Return false when nowait flag is set */
>>> +               if (nowait) {
>>> + spin_unlock_irq(&conf->resync_lock);
>>> +                       return false;
>>> +               }
>>> +
>>>                  conf->nr_waiting++;
>>>                  /* Wait for the barrier to drop.
>>>                   * However if there are already pending
>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>          }
>>>          atomic_inc(&conf->nr_pending);
>>>          spin_unlock_irq(&conf->resync_lock);
>>> +       return true;
>>>   }
>>>
>>>   static void allow_barrier(struct r10conf *conf)
>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb 
>>> *cb, bool from_schedule)
>>>   static void regular_request_wait(struct mddev *mddev, struct 
>>> r10conf *conf,
>>>                                   struct bio *bio, sector_t sectors)
>>>   {
>>> -       wait_barrier(conf);
>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>> +               bio_wouldblock_error(bio);
>>> +               return;
>>> +       }
>> I think we also need regular_request_wait to return bool and handle 
>> it properly.
>>
>> Thanks,
>> Song
>>
> Ack, will fix it. Thanks!

Ran into this while running with io_uring, with the current v5 (raid10 
patch) applied on top of the md-next branch.
./t/io_uring -a 0 -d 256 </dev/raid10>

It didn't trigger with aio (-a 1).

[  248.128661] BUG: kernel NULL pointer dereference, address: 
00000000000000b8
[  248.135628] #PF: supervisor read access in kernel mode
[  248.140762] #PF: error_code(0x0000) - not-present page
[  248.145903] PGD 0 P4D 0
[  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
[  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not 
tainted 5.16.0-rc3+ #2
[  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS 
1.3.8 08/31/2021
[  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
[  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a 
10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d 
00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
[  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
[  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX: 
ffff8e5d89747000
[  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI: 
ffff8e1e00044900
[  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09: 
0000000000000001
[  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12: 
ffff8e5da2a1ae00
[  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15: 
0000000000001000
[  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000) 
knlGS:0000000000000000
[  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4: 
0000000000770ee0
[  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[  248.269240] PKRU: 55555554
[  248.271953] Call Trace:
[  248.274406]  <IRQ>
[  248.276425]  bio_endio+0xf6/0x170
[  248.279743]  blk_update_request+0x12d/0x470
[  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
[  248.288809]  blk_mq_end_request_batch+0x76/0x490
[  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
[  248.297786]  ? smp_call_function_single_async+0x46/0x70
[  248.303015]  ? mempool_kfree+0xe/0x10
[  248.306680]  ? mempool_kfree+0xe/0x10
[  248.310345]  nvme_pci_complete_batch+0x26/0xb0
[  248.314792]  nvme_irq+0x298/0x2f0
[  248.318110]  ? nvme_unmap_data+0xf0/0xf0
[  248.322038]  __handle_irq_event_percpu+0x3f/0x190
[  248.326744]  handle_irq_event_percpu+0x33/0x80
[  248.331190]  handle_irq_event+0x39/0x60
[  248.335028]  handle_edge_irq+0xbe/0x1e0
[  248.338869]  __common_interrupt+0x6b/0x110
[  248.342967]  common_interrupt+0xbd/0xe0
[  248.346808]  </IRQ>
[  248.348912]  <TASK>
[  248.351018]  asm_common_interrupt+0x1e/0x40
[  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
[  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00 
55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00 
00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00

[  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
[  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX: 
ffff8e5d8973fd10
[  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 
0000000000000001
[  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09: 
ffffb1c3a429b970
[  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12: 
0000000000000001
[  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15: 
0000000000000003
[  248.420415]  __wake_up_common_lock+0x8a/0xc0
[  248.424686]  __wake_up+0x13/0x20
[  248.427919]  raid10_make_request+0x101/0x170 [raid10]
[  248.432971]  md_handle_request+0x179/0x1e0
[  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
[  248.441345]  md_submit_bio+0x6d/0xa0
[  248.444924]  __submit_bio+0x94/0x140
[  248.448504]  submit_bio_noacct+0xe1/0x2a0
[  248.452515]  submit_bio+0x48/0x120
[  248.455923]  blkdev_direct_IO+0x220/0x540
[  248.459935]  ? __fsnotify_parent+0xff/0x330
[  248.464121]  ? __fsnotify_parent+0x10f/0x330
[  248.468393]  ? common_interrupt+0x73/0xe0
[  248.472408]  generic_file_read_iter+0xa5/0x160
[  248.476852]  blkdev_read_iter+0x38/0x70
[  248.480693]  io_read+0x119/0x420
[  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
[  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
[  248.493684]  io_issue_sqe+0x7ec/0x19c0
[  248.497436]  ? io_req_prep+0x6a9/0xe60
[  248.501190]  io_submit_sqes+0x2a0/0x9f0
[  248.505030]  ? __fget_files+0x6a/0x90
[  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
[  248.513401]  do_syscall_64+0x38/0x90
[  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  248.522033] RIP: 0033:0x7fc26b19b89d
[  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 
48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX: 
00000000000001aa
[  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX: 
00007fc26b19b89d
[  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI: 
0000000000000004
[  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09: 
0000000000000000
[  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12: 
00005623a4b7a2a0
[  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15: 
0000000000000020
[  248.587591]  </TASK>

>>>          while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>>>              bio->bi_iter.bi_sector < conf->reshape_progress &&
>>>              bio->bi_iter.bi_sector + sectors > 
>>> conf->reshape_progress) {
>>>                  raid10_log(conf->mddev, "wait reshape");
>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>> +                       bio_wouldblock_error(bio);
>>> +                       return;
>>> +               }
>>>                  allow_barrier(conf);
>>>                  wait_event(conf->wait_barrier,
>>>                             conf->reshape_progress <= 
>>> bio->bi_iter.bi_sector ||
>>>                             conf->reshape_progress >= 
>>> bio->bi_iter.bi_sector +
>>>                             sectors);
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>          }
>>>   }
>>>
>>> @@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>                  bio_chain(split, bio);
>>>                  allow_barrier(conf);
>>>                  submit_bio_noacct(bio);
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>                  bio = split;
>>>                  r10_bio->master_bio = bio;
>>>                  r10_bio->sectors = max_sectors;
>>> @@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev 
>>> *mddev, struct r10bio *r10_bio)
>>>                  raid10_log(conf->mddev, "%s wait rdev %d blocked",
>>>                                  __func__, blocked_rdev->raid_disk);
>>>                  md_wait_for_blocked_rdev(blocked_rdev, mddev);
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>                  goto retry_wait;
>>>          }
>>>   }
>>> @@ -1357,6 +1373,11 @@ static void raid10_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>> bio_end_sector(bio)))) {
>>>                  DEFINE_WAIT(w);
>>>                  for (;;) {
>>> +                       /* Bail out if REQ_NOWAIT is set for the bio */
>>> +                       if (bio->bi_opf & REQ_NOWAIT) {
>>> +                               bio_wouldblock_error(bio);
>>> +                               return;
>>> +                       }
>>> prepare_to_wait(&conf->wait_barrier,
>>>                                          &w, TASK_IDLE);
>>>                          if (!md_cluster_ops->area_resyncing(mddev, 
>>> WRITE,
>>> @@ -1381,6 +1402,10 @@ static void raid10_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>                                BIT(MD_SB_CHANGE_DEVS) | 
>>> BIT(MD_SB_CHANGE_PENDING));
>>>                  md_wakeup_thread(mddev->thread);
>>>                  raid10_log(conf->mddev, "wait reshape metadata");
>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>> +                       bio_wouldblock_error(bio);
>>> +                       return;
>>> +               }
>>>                  wait_event(mddev->sb_wait,
>>>                             !test_bit(MD_SB_CHANGE_PENDING, 
>>> &mddev->sb_flags));
>>>
>>> @@ -1390,6 +1415,10 @@ static void raid10_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>          if (conf->pending_count >= max_queued_requests) {
>>>                  md_wakeup_thread(mddev->thread);
>>>                  raid10_log(mddev, "wait queued");
>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>> +                       bio_wouldblock_error(bio);
>>> +                       return;
>>> +               }
>>>                  wait_event(conf->wait_barrier,
>>>                             conf->pending_count < max_queued_requests);
>>>          }
>>> @@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>                  bio_chain(split, bio);
>>>                  allow_barrier(conf);
>>>                  submit_bio_noacct(bio);
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>                  bio = split;
>>>                  r10_bio->master_bio = bio;
>>>          }
>>> @@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct mddev 
>>> *mddev, struct bio *bio)
>>>          if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
>>>                  return -EAGAIN;
>>>
>>> -       wait_barrier(conf);
>>> +       if (bio->bi_opf & REQ_NOWAIT) {
>>> +               bio_wouldblock_error(bio);
>>> +               return 0;
>>> +       }
>>> +       wait_barrier(conf, false);
>>>
>>>          /*
>>>           * Check reshape again to avoid reshape happens after checking
>>> @@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev 
>>> *mddev, struct bio *bio)
>>>                  allow_barrier(conf);
>>>                  /* Resend the fist split part */
>>>                  submit_bio_noacct(split);
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>          }
>>>          div_u64_rem(bio_end, stripe_size, &remainder);
>>>          if (remainder) {
>>> @@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev 
>>> *mddev, struct bio *bio)
>>>                  /* Resend the second split part */
>>>                  submit_bio_noacct(bio);
>>>                  bio = split;
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>          }
>>>
>>>          bio_start = bio->bi_iter.bi_sector;
>>> @@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev 
>>> *mddev, struct bio *bio)
>>>                  end_disk_offset += geo->stride;
>>>                  atomic_inc(&first_r10bio->remaining);
>>>                  raid_end_discard_bio(r10_bio);
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>                  goto retry_discard;
>>>          }
>>>
>>> @@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
>>>
>>>   static void close_sync(struct r10conf *conf)
>>>   {
>>> -       wait_barrier(conf);
>>> +       wait_barrier(conf, false);
>>>          allow_barrier(conf);
>>>
>>>          mempool_exit(&conf->r10buf_pool);
>>> @@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev 
>>> *mddev, sector_t sector_nr,
>>>          if (need_flush ||
>>>              time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
>>>                  /* Need to update reshape_position in metadata */
>>> -               wait_barrier(conf);
>>> +               wait_barrier(conf, false);
>>>                  mddev->reshape_position = conf->reshape_progress;
>>>                  if (mddev->reshape_backwards)
>>>                          mddev->curr_resync_completed = 
>>> raid10_size(mddev, 0, 0)
>>> -- 
>>> 2.17.1
>>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16  0:30                                           ` Vishal Verma
@ 2021-12-16 16:40                                             ` Vishal Verma
  2021-12-16 16:42                                             ` Jens Axboe
  1 sibling, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-16 16:40 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/15/21 5:30 PM, Vishal Verma wrote:
>
> On 12/15/21 3:20 PM, Vishal Verma wrote:
>>
>> On 12/15/21 1:42 PM, Song Liu wrote:
>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma 
>>> <vverma@digitalocean.com> wrote:
>>>> This adds nowait support to the RAID10 driver. Very similar to
>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>>> for situations where it could wait for eg:
>>>>
>>>> - Waiting for the barrier,
>>>> - Too many pending I/Os to be queued,
>>>> - Reshape operation,
>>>> - Discard operation.
>>>>
>>>> wait_barrier() fn is modified to return bool to support error for
>>>> wait barriers. It returns true in case of wait or if wait is not
>>>> required and returns false if wait was required but not performed
>>>> to support nowait.
>>>>
>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>> ---
>>>>   drivers/md/raid10.c | 57 
>>>> +++++++++++++++++++++++++++++++++++----------
>>>>   1 file changed, 45 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>> index dde98f65bd04..f6c73987e9ac 100644
>>>> --- a/drivers/md/raid10.c
>>>> +++ b/drivers/md/raid10.c
>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>>          wake_up(&conf->wait_barrier);
>>>>   }
>>>>
>>>> -static void wait_barrier(struct r10conf *conf)
>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>>   {
>>>>          spin_lock_irq(&conf->resync_lock);
>>>>          if (conf->barrier) {
>>>>                  struct bio_list *bio_list = current->bio_list;
>>>> +
>>>> +               /* Return false when nowait flag is set */
>>>> +               if (nowait) {
>>>> + spin_unlock_irq(&conf->resync_lock);
>>>> +                       return false;
>>>> +               }
>>>> +
>>>>                  conf->nr_waiting++;
>>>>                  /* Wait for the barrier to drop.
>>>>                   * However if there are already pending
>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>>          }
>>>>          atomic_inc(&conf->nr_pending);
>>>>          spin_unlock_irq(&conf->resync_lock);
>>>> +       return true;
>>>>   }
>>>>
>>>>   static void allow_barrier(struct r10conf *conf)
>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct 
>>>> blk_plug_cb *cb, bool from_schedule)
>>>>   static void regular_request_wait(struct mddev *mddev, struct 
>>>> r10conf *conf,
>>>>                                   struct bio *bio, sector_t sectors)
>>>>   {
>>>> -       wait_barrier(conf);
>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>>> +               bio_wouldblock_error(bio);
>>>> +               return;
>>>> +       }
>>> I think we also need regular_request_wait to return bool and handle 
>>> it properly.
>>>
>>> Thanks,
>>> Song
>>>
>> Ack, will fix it. Thanks!
>
> Ran into this while running with io_uring. With the current v5 (raid10 
> patch) on top of md-next branch.
> ./t/io_uring -a 0 -d 256 </dev/raid10>
>
> It didn't trigger with aio (-a 1)
>
> [  248.128661] BUG: kernel NULL pointer dereference, address: 
> 00000000000000b8
> [  248.135628] #PF: supervisor read access in kernel mode
> [  248.140762] #PF: error_code(0x0000) - not-present page
> [  248.145903] PGD 0 P4D 0
> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not 
> tainted 5.16.0-rc3+ #2
> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS 
> 1.3.8 08/31/2021
> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 
> 4a 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 
> 8b 4d 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 
> 04 84
> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX: 
> ffff8e5d89747000
> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI: 
> ffff8e1e00044900
> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09: 
> 0000000000000001
> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12: 
> ffff8e5da2a1ae00
> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15: 
> 0000000000001000
> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000) 
> knlGS:0000000000000000
> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4: 
> 0000000000770ee0
> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
> 0000000000000400
> [  248.269240] PKRU: 55555554
> [  248.271953] Call Trace:
> [  248.274406]  <IRQ>
> [  248.276425]  bio_endio+0xf6/0x170
> [  248.279743]  blk_update_request+0x12d/0x470
> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
> [  248.303015]  ? mempool_kfree+0xe/0x10
> [  248.306680]  ? mempool_kfree+0xe/0x10
> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
> [  248.314792]  nvme_irq+0x298/0x2f0
> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
> [  248.326744]  handle_irq_event_percpu+0x33/0x80
> [  248.331190]  handle_irq_event+0x39/0x60
> [  248.335028]  handle_edge_irq+0xbe/0x1e0
> [  248.338869]  __common_interrupt+0x6b/0x110
> [  248.342967]  common_interrupt+0xbd/0xe0
> [  248.346808]  </IRQ>
> [  248.348912]  <TASK>
> [  248.351018]  asm_common_interrupt+0x1e/0x40
> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 
> 00 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 
> 00 00 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 
> 44 00
>
> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX: 
> ffff8e5d8973fd10
> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 
> 0000000000000001
> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09: 
> ffffb1c3a429b970
> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12: 
> 0000000000000001
> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15: 
> 0000000000000003
> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
> [  248.424686]  __wake_up+0x13/0x20
> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
> [  248.432971]  md_handle_request+0x179/0x1e0
> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
> [  248.441345]  md_submit_bio+0x6d/0xa0
> [  248.444924]  __submit_bio+0x94/0x140
> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
> [  248.452515]  submit_bio+0x48/0x120
> [  248.455923]  blkdev_direct_IO+0x220/0x540
> [  248.459935]  ? __fsnotify_parent+0xff/0x330
> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
> [  248.468393]  ? common_interrupt+0x73/0xe0
> [  248.472408]  generic_file_read_iter+0xa5/0x160
> [  248.476852]  blkdev_read_iter+0x38/0x70
> [  248.480693]  io_read+0x119/0x420
> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
> [  248.497436]  ? io_req_prep+0x6a9/0xe60
> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
> [  248.505030]  ? __fget_files+0x6a/0x90
> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
> [  248.513401]  do_syscall_64+0x38/0x90
> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  248.522033] RIP: 0033:0x7fc26b19b89d
> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e 
> fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 
> 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 
> 01 48
> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX: 
> 00000000000001aa
> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX: 
> 00007fc26b19b89d
> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI: 
> 0000000000000004
> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09: 
> 0000000000000000
> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12: 
> 00005623a4b7a2a0
> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15: 
> 0000000000000020
> [  248.587591]  </TASK>
>
>
It seems this issue triggers even with just the "md: add support 
for REQ_NOWAIT" patch applied, when running t/io_uring against a raid10 
volume with a very high iodepth (256).
>>>>          while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>>>>              bio->bi_iter.bi_sector < conf->reshape_progress &&
>>>>              bio->bi_iter.bi_sector + sectors > 
>>>> conf->reshape_progress) {
>>>>                  raid10_log(conf->mddev, "wait reshape");
>>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>>> +                       bio_wouldblock_error(bio);
>>>> +                       return;
>>>> +               }
>>>>                  allow_barrier(conf);
>>>>                  wait_event(conf->wait_barrier,
>>>>                             conf->reshape_progress <= 
>>>> bio->bi_iter.bi_sector ||
>>>>                             conf->reshape_progress >= 
>>>> bio->bi_iter.bi_sector +
>>>>                             sectors);
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>          }
>>>>   }
>>>>
>>>> @@ -1179,7 +1195,7 @@ static void raid10_read_request(struct mddev 
>>>> *mddev, struct bio *bio,
>>>>                  bio_chain(split, bio);
>>>>                  allow_barrier(conf);
>>>>                  submit_bio_noacct(bio);
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>                  bio = split;
>>>>                  r10_bio->master_bio = bio;
>>>>                  r10_bio->sectors = max_sectors;
>>>> @@ -1338,7 +1354,7 @@ static void wait_blocked_dev(struct mddev 
>>>> *mddev, struct r10bio *r10_bio)
>>>>                  raid10_log(conf->mddev, "%s wait rdev %d blocked",
>>>>                                  __func__, blocked_rdev->raid_disk);
>>>>                  md_wait_for_blocked_rdev(blocked_rdev, mddev);
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>                  goto retry_wait;
>>>>          }
>>>>   }
>>>> @@ -1357,6 +1373,11 @@ static void raid10_write_request(struct 
>>>> mddev *mddev, struct bio *bio,
>>>> bio_end_sector(bio)))) {
>>>>                  DEFINE_WAIT(w);
>>>>                  for (;;) {
>>>> +                       /* Bail out if REQ_NOWAIT is set for the 
>>>> bio */
>>>> +                       if (bio->bi_opf & REQ_NOWAIT) {
>>>> +                               bio_wouldblock_error(bio);
>>>> +                               return;
>>>> +                       }
>>>> prepare_to_wait(&conf->wait_barrier,
>>>>                                          &w, TASK_IDLE);
>>>>                          if (!md_cluster_ops->area_resyncing(mddev, 
>>>> WRITE,
>>>> @@ -1381,6 +1402,10 @@ static void raid10_write_request(struct 
>>>> mddev *mddev, struct bio *bio,
>>>>                                BIT(MD_SB_CHANGE_DEVS) | 
>>>> BIT(MD_SB_CHANGE_PENDING));
>>>>                  md_wakeup_thread(mddev->thread);
>>>>                  raid10_log(conf->mddev, "wait reshape metadata");
>>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>>> +                       bio_wouldblock_error(bio);
>>>> +                       return;
>>>> +               }
>>>>                  wait_event(mddev->sb_wait,
>>>>                             !test_bit(MD_SB_CHANGE_PENDING, 
>>>> &mddev->sb_flags));
>>>>
>>>> @@ -1390,6 +1415,10 @@ static void raid10_write_request(struct 
>>>> mddev *mddev, struct bio *bio,
>>>>          if (conf->pending_count >= max_queued_requests) {
>>>>                  md_wakeup_thread(mddev->thread);
>>>>                  raid10_log(mddev, "wait queued");
>>>> +               if (bio->bi_opf & REQ_NOWAIT) {
>>>> +                       bio_wouldblock_error(bio);
>>>> +                       return;
>>>> +               }
>>>>                  wait_event(conf->wait_barrier,
>>>>                             conf->pending_count < 
>>>> max_queued_requests);
>>>>          }
>>>> @@ -1482,7 +1511,7 @@ static void raid10_write_request(struct mddev 
>>>> *mddev, struct bio *bio,
>>>>                  bio_chain(split, bio);
>>>>                  allow_barrier(conf);
>>>>                  submit_bio_noacct(bio);
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>                  bio = split;
>>>>                  r10_bio->master_bio = bio;
>>>>          }
>>>> @@ -1607,7 +1636,11 @@ static int raid10_handle_discard(struct 
>>>> mddev *mddev, struct bio *bio)
>>>>          if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
>>>>                  return -EAGAIN;
>>>>
>>>> -       wait_barrier(conf);
>>>> +       if (bio->bi_opf & REQ_NOWAIT) {
>>>> +               bio_wouldblock_error(bio);
>>>> +               return 0;
>>>> +       }
>>>> +       wait_barrier(conf, false);
>>>>
>>>>          /*
>>>>           * Check reshape again to avoid reshape happens after 
>>>> checking
>>>> @@ -1649,7 +1682,7 @@ static int raid10_handle_discard(struct mddev 
>>>> *mddev, struct bio *bio)
>>>>                  allow_barrier(conf);
>>>>                  /* Resend the first split part */
>>>>                  submit_bio_noacct(split);
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>          }
>>>>          div_u64_rem(bio_end, stripe_size, &remainder);
>>>>          if (remainder) {
>>>> @@ -1660,7 +1693,7 @@ static int raid10_handle_discard(struct mddev 
>>>> *mddev, struct bio *bio)
>>>>                  /* Resend the second split part */
>>>>                  submit_bio_noacct(bio);
>>>>                  bio = split;
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>          }
>>>>
>>>>          bio_start = bio->bi_iter.bi_sector;
>>>> @@ -1816,7 +1849,7 @@ static int raid10_handle_discard(struct mddev 
>>>> *mddev, struct bio *bio)
>>>>                  end_disk_offset += geo->stride;
>>>> atomic_inc(&first_r10bio->remaining);
>>>>                  raid_end_discard_bio(r10_bio);
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>                  goto retry_discard;
>>>>          }
>>>>
>>>> @@ -2011,7 +2044,7 @@ static void print_conf(struct r10conf *conf)
>>>>
>>>>   static void close_sync(struct r10conf *conf)
>>>>   {
>>>> -       wait_barrier(conf);
>>>> +       wait_barrier(conf, false);
>>>>          allow_barrier(conf);
>>>>
>>>>          mempool_exit(&conf->r10buf_pool);
>>>> @@ -4819,7 +4852,7 @@ static sector_t reshape_request(struct mddev 
>>>> *mddev, sector_t sector_nr,
>>>>          if (need_flush ||
>>>>              time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
>>>>                  /* Need to update reshape_position in metadata */
>>>> -               wait_barrier(conf);
>>>> +               wait_barrier(conf, false);
>>>>                  mddev->reshape_position = conf->reshape_progress;
>>>>                  if (mddev->reshape_backwards)
>>>>                          mddev->curr_resync_completed = 
>>>> raid10_size(mddev, 0, 0)
>>>> -- 
>>>> 2.17.1
>>>>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16  0:30                                           ` Vishal Verma
  2021-12-16 16:40                                             ` Vishal Verma
@ 2021-12-16 16:42                                             ` Jens Axboe
  2021-12-16 16:45                                               ` Vishal Verma
  2021-12-16 18:14                                               ` Vishal Verma
  1 sibling, 2 replies; 86+ messages in thread
From: Jens Axboe @ 2021-12-16 16:42 UTC (permalink / raw)
  To: Vishal Verma, Song Liu; +Cc: linux-raid, rgoldwyn

On 12/15/21 5:30 PM, Vishal Verma wrote:
> 
> On 12/15/21 3:20 PM, Vishal Verma wrote:
>>
>> On 12/15/21 1:42 PM, Song Liu wrote:
>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma 
>>> <vverma@digitalocean.com> wrote:
>>>> This adds nowait support to the RAID10 driver. Very similar to
>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>>> for situations where it could wait for eg:
>>>>
>>>> - Waiting for the barrier,
>>>> - Too many pending I/Os to be queued,
>>>> - Reshape operation,
>>>> - Discard operation.
>>>>
>>>> wait_barrier() fn is modified to return bool to support error for
>>>> wait barriers. It returns true in case of wait or if wait is not
>>>> required and returns false if wait was required but not performed
>>>> to support nowait.
>>>>
>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>> ---
>>>>   drivers/md/raid10.c | 57 
>>>> +++++++++++++++++++++++++++++++++++----------
>>>>   1 file changed, 45 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>> index dde98f65bd04..f6c73987e9ac 100644
>>>> --- a/drivers/md/raid10.c
>>>> +++ b/drivers/md/raid10.c
>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>>          wake_up(&conf->wait_barrier);
>>>>   }
>>>>
>>>> -static void wait_barrier(struct r10conf *conf)
>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>>   {
>>>>          spin_lock_irq(&conf->resync_lock);
>>>>          if (conf->barrier) {
>>>>                  struct bio_list *bio_list = current->bio_list;
>>>> +
>>>> +               /* Return false when nowait flag is set */
>>>> +               if (nowait) {
>>>> + spin_unlock_irq(&conf->resync_lock);
>>>> +                       return false;
>>>> +               }
>>>> +
>>>>                  conf->nr_waiting++;
>>>>                  /* Wait for the barrier to drop.
>>>>                   * However if there are already pending
>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>>          }
>>>>          atomic_inc(&conf->nr_pending);
>>>>          spin_unlock_irq(&conf->resync_lock);
>>>> +       return true;
>>>>   }
>>>>
>>>>   static void allow_barrier(struct r10conf *conf)
>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb 
>>>> *cb, bool from_schedule)
>>>>   static void regular_request_wait(struct mddev *mddev, struct 
>>>> r10conf *conf,
>>>>                                   struct bio *bio, sector_t sectors)
>>>>   {
>>>> -       wait_barrier(conf);
>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>>> +               bio_wouldblock_error(bio);
>>>> +               return;
>>>> +       }
>>> I think we also need regular_request_wait to return bool and handle 
>>> it properly.
>>>
>>> Thanks,
>>> Song
>>>
>> Ack, will fix it. Thanks!
> 
> Ran into this while running with io_uring. With the current v5 (raid10 
> patch) on top of md-next branch.
> ./t/io_uring -a 0 -d 256 </dev/raid10>
> 
> It didn't trigger with aio (-a 1)
> 
> [  248.128661] BUG: kernel NULL pointer dereference, address: 
> 00000000000000b8
> [  248.135628] #PF: supervisor read access in kernel mode
> [  248.140762] #PF: error_code(0x0000) - not-present page
> [  248.145903] PGD 0 P4D 0
> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not 
> tainted 5.16.0-rc3+ #2
> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS 
> 1.3.8 08/31/2021
> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a 
> 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d 
> 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX: 
> ffff8e5d89747000
> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI: 
> ffff8e1e00044900
> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09: 
> 0000000000000001
> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12: 
> ffff8e5da2a1ae00
> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15: 
> 0000000000001000
> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000) 
> knlGS:0000000000000000
> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4: 
> 0000000000770ee0
> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
> 0000000000000400
> [  248.269240] PKRU: 55555554
> [  248.271953] Call Trace:
> [  248.274406]  <IRQ>
> [  248.276425]  bio_endio+0xf6/0x170
> [  248.279743]  blk_update_request+0x12d/0x470
> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
> [  248.303015]  ? mempool_kfree+0xe/0x10
> [  248.306680]  ? mempool_kfree+0xe/0x10
> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
> [  248.314792]  nvme_irq+0x298/0x2f0
> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
> [  248.326744]  handle_irq_event_percpu+0x33/0x80
> [  248.331190]  handle_irq_event+0x39/0x60
> [  248.335028]  handle_edge_irq+0xbe/0x1e0
> [  248.338869]  __common_interrupt+0x6b/0x110
> [  248.342967]  common_interrupt+0xbd/0xe0
> [  248.346808]  </IRQ>
> [  248.348912]  <TASK>
> [  248.351018]  asm_common_interrupt+0x1e/0x40
> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00 
> 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00 
> 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00
> 
> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX: 
> ffff8e5d8973fd10
> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 
> 0000000000000001
> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09: 
> ffffb1c3a429b970
> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12: 
> 0000000000000001
> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15: 
> 0000000000000003
> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
> [  248.424686]  __wake_up+0x13/0x20
> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
> [  248.432971]  md_handle_request+0x179/0x1e0
> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
> [  248.441345]  md_submit_bio+0x6d/0xa0
> [  248.444924]  __submit_bio+0x94/0x140
> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
> [  248.452515]  submit_bio+0x48/0x120
> [  248.455923]  blkdev_direct_IO+0x220/0x540
> [  248.459935]  ? __fsnotify_parent+0xff/0x330
> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
> [  248.468393]  ? common_interrupt+0x73/0xe0
> [  248.472408]  generic_file_read_iter+0xa5/0x160
> [  248.476852]  blkdev_read_iter+0x38/0x70
> [  248.480693]  io_read+0x119/0x420
> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
> [  248.497436]  ? io_req_prep+0x6a9/0xe60
> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
> [  248.505030]  ? __fget_files+0x6a/0x90
> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
> [  248.513401]  do_syscall_64+0x38/0x90
> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  248.522033] RIP: 0033:0x7fc26b19b89d
> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 
> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 
> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX: 
> 00000000000001aa
> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX: 
> 00007fc26b19b89d
> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI: 
> 0000000000000004
> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09: 
> 0000000000000000
> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12: 
> 00005623a4b7a2a0
> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15: 
> 0000000000000020
> [  248.587591]  </TASK>

Do you have:

commit 75feae73a28020e492fbad2323245455ef69d687
Author: Pavel Begunkov <asml.silence@gmail.com>
Date:   Tue Dec 7 20:16:36 2021 +0000

    block: fix single bio async DIO error handling

in your tree?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16 16:42                                             ` Jens Axboe
@ 2021-12-16 16:45                                               ` Vishal Verma
  2021-12-16 18:49                                                 ` Jens Axboe
  2021-12-16 18:14                                               ` Vishal Verma
  1 sibling, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-16 16:45 UTC (permalink / raw)
  To: Jens Axboe, Song Liu; +Cc: linux-raid, rgoldwyn


On 12/16/21 9:42 AM, Jens Axboe wrote:
> On 12/15/21 5:30 PM, Vishal Verma wrote:
>> On 12/15/21 3:20 PM, Vishal Verma wrote:
>>> On 12/15/21 1:42 PM, Song Liu wrote:
>>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma
>>>> <vverma@digitalocean.com> wrote:
>>>>> This adds nowait support to the RAID10 driver. Very similar to
>>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>>>> for situations where it could wait for eg:
>>>>>
>>>>> - Waiting for the barrier,
>>>>> - Too many pending I/Os to be queued,
>>>>> - Reshape operation,
>>>>> - Discard operation.
>>>>>
>>>>> wait_barrier() fn is modified to return bool to support error for
>>>>> wait barriers. It returns true in case of wait or if wait is not
>>>>> required and returns false if wait was required but not performed
>>>>> to support nowait.
>>>>>
>>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>>> ---
>>>>>    drivers/md/raid10.c | 57
>>>>> +++++++++++++++++++++++++++++++++++----------
>>>>>    1 file changed, 45 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>>> index dde98f65bd04..f6c73987e9ac 100644
>>>>> --- a/drivers/md/raid10.c
>>>>> +++ b/drivers/md/raid10.c
>>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>>>           wake_up(&conf->wait_barrier);
>>>>>    }
>>>>>
>>>>> -static void wait_barrier(struct r10conf *conf)
>>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>>>    {
>>>>>           spin_lock_irq(&conf->resync_lock);
>>>>>           if (conf->barrier) {
>>>>>                   struct bio_list *bio_list = current->bio_list;
>>>>> +
>>>>> +               /* Return false when nowait flag is set */
>>>>> +               if (nowait) {
>>>>> + spin_unlock_irq(&conf->resync_lock);
>>>>> +                       return false;
>>>>> +               }
>>>>> +
>>>>>                   conf->nr_waiting++;
>>>>>                   /* Wait for the barrier to drop.
>>>>>                    * However if there are already pending
>>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>>>           }
>>>>>           atomic_inc(&conf->nr_pending);
>>>>>           spin_unlock_irq(&conf->resync_lock);
>>>>> +       return true;
>>>>>    }
>>>>>
>>>>>    static void allow_barrier(struct r10conf *conf)
>>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb
>>>>> *cb, bool from_schedule)
>>>>>    static void regular_request_wait(struct mddev *mddev, struct
>>>>> r10conf *conf,
>>>>>                                    struct bio *bio, sector_t sectors)
>>>>>    {
>>>>> -       wait_barrier(conf);
>>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>>>> +               bio_wouldblock_error(bio);
>>>>> +               return;
>>>>> +       }
>>>> I think we also need regular_request_wait to return bool and handle
>>>> it properly.
>>>>
>>>> Thanks,
>>>> Song
>>>>
>>> Ack, will fix it. Thanks!
>> Ran into this while running with io_uring. With the current v5 (raid10
>> patch) on top of md-next branch.
>> ./t/io_uring -a 0 -d 256 </dev/raid10>
>>
>> It didn't trigger with aio (-a 1)
>>
>> [  248.128661] BUG: kernel NULL pointer dereference, address:
>> 00000000000000b8
>> [  248.135628] #PF: supervisor read access in kernel mode
>> [  248.140762] #PF: error_code(0x0000) - not-present page
>> [  248.145903] PGD 0 P4D 0
>> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
>> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not
>> tainted 5.16.0-rc3+ #2
>> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
>> 1.3.8 08/31/2021
>> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
>> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a
>> 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d
>> 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
>> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
>> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX:
>> ffff8e5d89747000
>> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI:
>> ffff8e1e00044900
>> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09:
>> 0000000000000001
>> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12:
>> ffff8e5da2a1ae00
>> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15:
>> 0000000000001000
>> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000)
>> knlGS:0000000000000000
>> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4:
>> 0000000000770ee0
>> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>> 0000000000000000
>> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>> 0000000000000400
>> [  248.269240] PKRU: 55555554
>> [  248.271953] Call Trace:
>> [  248.274406]  <IRQ>
>> [  248.276425]  bio_endio+0xf6/0x170
>> [  248.279743]  blk_update_request+0x12d/0x470
>> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
>> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
>> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
>> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
>> [  248.303015]  ? mempool_kfree+0xe/0x10
>> [  248.306680]  ? mempool_kfree+0xe/0x10
>> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
>> [  248.314792]  nvme_irq+0x298/0x2f0
>> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
>> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
>> [  248.326744]  handle_irq_event_percpu+0x33/0x80
>> [  248.331190]  handle_irq_event+0x39/0x60
>> [  248.335028]  handle_edge_irq+0xbe/0x1e0
>> [  248.338869]  __common_interrupt+0x6b/0x110
>> [  248.342967]  common_interrupt+0xbd/0xe0
>> [  248.346808]  </IRQ>
>> [  248.348912]  <TASK>
>> [  248.351018]  asm_common_interrupt+0x1e/0x40
>> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
>> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00
>> 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00
>> 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00
>>
>> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
>> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX:
>> ffff8e5d8973fd10
>> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI:
>> 0000000000000001
>> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09:
>> ffffb1c3a429b970
>> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12:
>> 0000000000000001
>> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15:
>> 0000000000000003
>> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
>> [  248.424686]  __wake_up+0x13/0x20
>> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
>> [  248.432971]  md_handle_request+0x179/0x1e0
>> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
>> [  248.441345]  md_submit_bio+0x6d/0xa0
>> [  248.444924]  __submit_bio+0x94/0x140
>> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
>> [  248.452515]  submit_bio+0x48/0x120
>> [  248.455923]  blkdev_direct_IO+0x220/0x540
>> [  248.459935]  ? __fsnotify_parent+0xff/0x330
>> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
>> [  248.468393]  ? common_interrupt+0x73/0xe0
>> [  248.472408]  generic_file_read_iter+0xa5/0x160
>> [  248.476852]  blkdev_read_iter+0x38/0x70
>> [  248.480693]  io_read+0x119/0x420
>> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
>> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
>> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
>> [  248.497436]  ? io_req_prep+0x6a9/0xe60
>> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
>> [  248.505030]  ? __fget_files+0x6a/0x90
>> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
>> [  248.513401]  do_syscall_64+0x38/0x90
>> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>> [  248.522033] RIP: 0033:0x7fc26b19b89d
>> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
>> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX:
>> 00000000000001aa
>> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX:
>> 00007fc26b19b89d
>> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI:
>> 0000000000000004
>> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09:
>> 0000000000000000
>> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12:
>> 00005623a4b7a2a0
>> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15:
>> 0000000000000020
>> [  248.587591]  </TASK>
> Do you have:
>
> commit 75feae73a28020e492fbad2323245455ef69d687
> Author: Pavel Begunkov <asml.silence@gmail.com>
> Date:   Tue Dec 7 20:16:36 2021 +0000
>
>      block: fix single bio async DIO error handling
>
> in your tree?
>
Nope. I will get it in and test. Thanks!

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16 16:42                                             ` Jens Axboe
  2021-12-16 16:45                                               ` Vishal Verma
@ 2021-12-16 18:14                                               ` Vishal Verma
  1 sibling, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-16 18:14 UTC (permalink / raw)
  To: Jens Axboe, Song Liu; +Cc: linux-raid, rgoldwyn


On 12/16/21 9:42 AM, Jens Axboe wrote:
> On 12/15/21 5:30 PM, Vishal Verma wrote:
>> On 12/15/21 3:20 PM, Vishal Verma wrote:
>>> On 12/15/21 1:42 PM, Song Liu wrote:
>>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma
>>>> <vverma@digitalocean.com> wrote:
>>>>> This adds nowait support to the RAID10 driver. Very similar to
>>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>>>> for situations where it could wait, e.g.:
>>>>>
>>>>> - Waiting for the barrier,
>>>>> - Too many pending I/Os to be queued,
>>>>> - Reshape operation,
>>>>> - Discard operation.
>>>>>
>>>>> The wait_barrier() function is modified to return bool so that an
>>>>> error can be reported for wait barriers. It returns true when the
>>>>> wait completed (or no wait was required), and false when a wait was
>>>>> required but skipped in order to support nowait.
>>>>>
>>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>>> ---
>>>>>    drivers/md/raid10.c | 57
>>>>> +++++++++++++++++++++++++++++++++++----------
>>>>>    1 file changed, 45 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>>> index dde98f65bd04..f6c73987e9ac 100644
>>>>> --- a/drivers/md/raid10.c
>>>>> +++ b/drivers/md/raid10.c
>>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>>>           wake_up(&conf->wait_barrier);
>>>>>    }
>>>>>
>>>>> -static void wait_barrier(struct r10conf *conf)
>>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>>>    {
>>>>>           spin_lock_irq(&conf->resync_lock);
>>>>>           if (conf->barrier) {
>>>>>                   struct bio_list *bio_list = current->bio_list;
>>>>> +
>>>>> +               /* Return false when nowait flag is set */
>>>>> +               if (nowait) {
>>>>> + spin_unlock_irq(&conf->resync_lock);
>>>>> +                       return false;
>>>>> +               }
>>>>> +
>>>>>                   conf->nr_waiting++;
>>>>>                   /* Wait for the barrier to drop.
>>>>>                    * However if there are already pending
>>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>>>           }
>>>>>           atomic_inc(&conf->nr_pending);
>>>>>           spin_unlock_irq(&conf->resync_lock);
>>>>> +       return true;
>>>>>    }
>>>>>
>>>>>    static void allow_barrier(struct r10conf *conf)
>>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb
>>>>> *cb, bool from_schedule)
>>>>>    static void regular_request_wait(struct mddev *mddev, struct
>>>>> r10conf *conf,
>>>>>                                    struct bio *bio, sector_t sectors)
>>>>>    {
>>>>> -       wait_barrier(conf);
>>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>>>> +               bio_wouldblock_error(bio);
>>>>> +               return;
>>>>> +       }
>>>> I think we also need regular_request_wait to return bool and handle
>>>> it properly.
>>>>
>>>> Thanks,
>>>> Song
>>>>
>>> Ack, will fix it. Thanks!
>> Ran into this while running with io_uring. With the current v5 (raid10
>> patch) on top of md-next branch.
>> ./t/io_uring -a 0 -d 256 </dev/raid10>
>>
>> It didn't trigger with aio (-a 1)
>>
>> [  248.128661] BUG: kernel NULL pointer dereference, address:
>> 00000000000000b8
>> [  248.135628] #PF: supervisor read access in kernel mode
>> [  248.140762] #PF: error_code(0x0000) - not-present page
>> [  248.145903] PGD 0 P4D 0
>> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
>> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not
>> tainted 5.16.0-rc3+ #2
>> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
>> 1.3.8 08/31/2021
>> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
>> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a
>> 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d
>> 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
>> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
>> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX:
>> ffff8e5d89747000
>> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI:
>> ffff8e1e00044900
>> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09:
>> 0000000000000001
>> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12:
>> ffff8e5da2a1ae00
>> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15:
>> 0000000000001000
>> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000)
>> knlGS:0000000000000000
>> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4:
>> 0000000000770ee0
>> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>> 0000000000000000
>> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>> 0000000000000400
>> [  248.269240] PKRU: 55555554
>> [  248.271953] Call Trace:
>> [  248.274406]  <IRQ>
>> [  248.276425]  bio_endio+0xf6/0x170
>> [  248.279743]  blk_update_request+0x12d/0x470
>> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
>> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
>> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
>> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
>> [  248.303015]  ? mempool_kfree+0xe/0x10
>> [  248.306680]  ? mempool_kfree+0xe/0x10
>> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
>> [  248.314792]  nvme_irq+0x298/0x2f0
>> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
>> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
>> [  248.326744]  handle_irq_event_percpu+0x33/0x80
>> [  248.331190]  handle_irq_event+0x39/0x60
>> [  248.335028]  handle_edge_irq+0xbe/0x1e0
>> [  248.338869]  __common_interrupt+0x6b/0x110
>> [  248.342967]  common_interrupt+0xbd/0xe0
>> [  248.346808]  </IRQ>
>> [  248.348912]  <TASK>
>> [  248.351018]  asm_common_interrupt+0x1e/0x40
>> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
>> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00
>> 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00
>> 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00
>>
>> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
>> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX:
>> ffff8e5d8973fd10
>> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI:
>> 0000000000000001
>> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09:
>> ffffb1c3a429b970
>> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12:
>> 0000000000000001
>> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15:
>> 0000000000000003
>> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
>> [  248.424686]  __wake_up+0x13/0x20
>> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
>> [  248.432971]  md_handle_request+0x179/0x1e0
>> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
>> [  248.441345]  md_submit_bio+0x6d/0xa0
>> [  248.444924]  __submit_bio+0x94/0x140
>> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
>> [  248.452515]  submit_bio+0x48/0x120
>> [  248.455923]  blkdev_direct_IO+0x220/0x540
>> [  248.459935]  ? __fsnotify_parent+0xff/0x330
>> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
>> [  248.468393]  ? common_interrupt+0x73/0xe0
>> [  248.472408]  generic_file_read_iter+0xa5/0x160
>> [  248.476852]  blkdev_read_iter+0x38/0x70
>> [  248.480693]  io_read+0x119/0x420
>> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
>> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
>> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
>> [  248.497436]  ? io_req_prep+0x6a9/0xe60
>> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
>> [  248.505030]  ? __fget_files+0x6a/0x90
>> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
>> [  248.513401]  do_syscall_64+0x38/0x90
>> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>> [  248.522033] RIP: 0033:0x7fc26b19b89d
>> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
>> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX:
>> 00000000000001aa
>> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX:
>> 00007fc26b19b89d
>> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI:
>> 0000000000000004
>> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09:
>> 0000000000000000
>> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12:
>> 00005623a4b7a2a0
>> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15:
>> 0000000000000020
>> [  248.587591]  </TASK>
> Do you have:
>
> commit 75feae73a28020e492fbad2323245455ef69d687
> Author: Pavel Begunkov <asml.silence@gmail.com>
> Date:   Tue Dec 7 20:16:36 2021 +0000
>
>      block: fix single bio async DIO error handling
>
> in your tree?
>
Hmm, it is still triggering...

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16 16:45                                               ` Vishal Verma
@ 2021-12-16 18:49                                                 ` Jens Axboe
  2021-12-16 19:40                                                   ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Jens Axboe @ 2021-12-16 18:49 UTC (permalink / raw)
  To: Vishal Verma, Song Liu; +Cc: linux-raid, rgoldwyn

On 12/16/21 9:45 AM, Vishal Verma wrote:
> 
> On 12/16/21 9:42 AM, Jens Axboe wrote:
>> On 12/15/21 5:30 PM, Vishal Verma wrote:
>>> On 12/15/21 3:20 PM, Vishal Verma wrote:
>>>> On 12/15/21 1:42 PM, Song Liu wrote:
>>>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma
>>>>> <vverma@digitalocean.com> wrote:
>>>>>> This adds nowait support to the RAID10 driver. Very similar to
>>>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>>>>> for situations where it could wait, e.g.:
>>>>>>
>>>>>> - Waiting for the barrier,
>>>>>> - Too many pending I/Os to be queued,
>>>>>> - Reshape operation,
>>>>>> - Discard operation.
>>>>>>
>>>>>> The wait_barrier() function is modified to return bool so that an
>>>>>> error can be reported for wait barriers. It returns true when the
>>>>>> wait completed (or no wait was required), and false when a wait was
>>>>>> required but skipped in order to support nowait.
>>>>>>
>>>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>>>> ---
>>>>>>    drivers/md/raid10.c | 57
>>>>>> +++++++++++++++++++++++++++++++++++----------
>>>>>>    1 file changed, 45 insertions(+), 12 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>>>> index dde98f65bd04..f6c73987e9ac 100644
>>>>>> --- a/drivers/md/raid10.c
>>>>>> +++ b/drivers/md/raid10.c
>>>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>>>>           wake_up(&conf->wait_barrier);
>>>>>>    }
>>>>>>
>>>>>> -static void wait_barrier(struct r10conf *conf)
>>>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>>>>    {
>>>>>>           spin_lock_irq(&conf->resync_lock);
>>>>>>           if (conf->barrier) {
>>>>>>                   struct bio_list *bio_list = current->bio_list;
>>>>>> +
>>>>>> +               /* Return false when nowait flag is set */
>>>>>> +               if (nowait) {
>>>>>> + spin_unlock_irq(&conf->resync_lock);
>>>>>> +                       return false;
>>>>>> +               }
>>>>>> +
>>>>>>                   conf->nr_waiting++;
>>>>>>                   /* Wait for the barrier to drop.
>>>>>>                    * However if there are already pending
>>>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>>>>           }
>>>>>>           atomic_inc(&conf->nr_pending);
>>>>>>           spin_unlock_irq(&conf->resync_lock);
>>>>>> +       return true;
>>>>>>    }
>>>>>>
>>>>>>    static void allow_barrier(struct r10conf *conf)
>>>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb
>>>>>> *cb, bool from_schedule)
>>>>>>    static void regular_request_wait(struct mddev *mddev, struct
>>>>>> r10conf *conf,
>>>>>>                                    struct bio *bio, sector_t sectors)
>>>>>>    {
>>>>>> -       wait_barrier(conf);
>>>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>>>>> +               bio_wouldblock_error(bio);
>>>>>> +               return;
>>>>>> +       }
>>>>> I think we also need regular_request_wait to return bool and handle
>>>>> it properly.
>>>>>
>>>>> Thanks,
>>>>> Song
>>>>>
>>>> Ack, will fix it. Thanks!
>>> Ran into this while running with io_uring. With the current v5 (raid10
>>> patch) on top of md-next branch.
>>> ./t/io_uring -a 0 -d 256 </dev/raid10>
>>>
>>> It didn't trigger with aio (-a 1)
>>>
>>> [  248.128661] BUG: kernel NULL pointer dereference, address:
>>> 00000000000000b8
>>> [  248.135628] #PF: supervisor read access in kernel mode
>>> [  248.140762] #PF: error_code(0x0000) - not-present page
>>> [  248.145903] PGD 0 P4D 0
>>> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
>>> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not
>>> tainted 5.16.0-rc3+ #2
>>> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
>>> 1.3.8 08/31/2021
>>> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
>>> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a
>>> 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d
>>> 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
>>> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
>>> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX:
>>> ffff8e5d89747000
>>> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI:
>>> ffff8e1e00044900
>>> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09:
>>> 0000000000000001
>>> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12:
>>> ffff8e5da2a1ae00
>>> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15:
>>> 0000000000001000
>>> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000)
>>> knlGS:0000000000000000
>>> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4:
>>> 0000000000770ee0
>>> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>>> 0000000000000000
>>> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>> 0000000000000400
>>> [  248.269240] PKRU: 55555554
>>> [  248.271953] Call Trace:
>>> [  248.274406]  <IRQ>
>>> [  248.276425]  bio_endio+0xf6/0x170
>>> [  248.279743]  blk_update_request+0x12d/0x470
>>> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
>>> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
>>> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
>>> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
>>> [  248.303015]  ? mempool_kfree+0xe/0x10
>>> [  248.306680]  ? mempool_kfree+0xe/0x10
>>> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
>>> [  248.314792]  nvme_irq+0x298/0x2f0
>>> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
>>> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
>>> [  248.326744]  handle_irq_event_percpu+0x33/0x80
>>> [  248.331190]  handle_irq_event+0x39/0x60
>>> [  248.335028]  handle_edge_irq+0xbe/0x1e0
>>> [  248.338869]  __common_interrupt+0x6b/0x110
>>> [  248.342967]  common_interrupt+0xbd/0xe0
>>> [  248.346808]  </IRQ>
>>> [  248.348912]  <TASK>
>>> [  248.351018]  asm_common_interrupt+0x1e/0x40
>>> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
>>> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00
>>> 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00
>>> 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00
>>>
>>> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
>>> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX:
>>> ffff8e5d8973fd10
>>> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI:
>>> 0000000000000001
>>> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09:
>>> ffffb1c3a429b970
>>> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12:
>>> 0000000000000001
>>> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15:
>>> 0000000000000003
>>> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
>>> [  248.424686]  __wake_up+0x13/0x20
>>> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
>>> [  248.432971]  md_handle_request+0x179/0x1e0
>>> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
>>> [  248.441345]  md_submit_bio+0x6d/0xa0
>>> [  248.444924]  __submit_bio+0x94/0x140
>>> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
>>> [  248.452515]  submit_bio+0x48/0x120
>>> [  248.455923]  blkdev_direct_IO+0x220/0x540
>>> [  248.459935]  ? __fsnotify_parent+0xff/0x330
>>> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
>>> [  248.468393]  ? common_interrupt+0x73/0xe0
>>> [  248.472408]  generic_file_read_iter+0xa5/0x160
>>> [  248.476852]  blkdev_read_iter+0x38/0x70
>>> [  248.480693]  io_read+0x119/0x420
>>> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
>>> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
>>> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
>>> [  248.497436]  ? io_req_prep+0x6a9/0xe60
>>> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
>>> [  248.505030]  ? __fget_files+0x6a/0x90
>>> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
>>> [  248.513401]  do_syscall_64+0x38/0x90
>>> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>>> [  248.522033] RIP: 0033:0x7fc26b19b89d
>>> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
>>> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
>>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>>> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX:
>>> 00000000000001aa
>>> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX:
>>> 00007fc26b19b89d
>>> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI:
>>> 0000000000000004
>>> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09:
>>> 0000000000000000
>>> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12:
>>> 00005623a4b7a2a0
>>> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15:
>>> 0000000000000020
>>> [  248.587591]  </TASK>
>> Do you have:
>>
>> commit 75feae73a28020e492fbad2323245455ef69d687
>> Author: Pavel Begunkov <asml.silence@gmail.com>
>> Date:   Tue Dec 7 20:16:36 2021 +0000
>>
>>      block: fix single bio async DIO error handling
>>
>> in your tree?
>>
> Nope. I will get it in and test. Thanks!

Might be worth re-running with KASAN enabled in your config to see if
that triggers anything.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16 18:49                                                 ` Jens Axboe
@ 2021-12-16 19:40                                                   ` Vishal Verma
  2021-12-16 20:18                                                     ` Song Liu
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-16 19:40 UTC (permalink / raw)
  To: Jens Axboe, Song Liu; +Cc: linux-raid, rgoldwyn


On 12/16/21 11:49 AM, Jens Axboe wrote:
> On 12/16/21 9:45 AM, Vishal Verma wrote:
>> On 12/16/21 9:42 AM, Jens Axboe wrote:
>>> On 12/15/21 5:30 PM, Vishal Verma wrote:
>>>> On 12/15/21 3:20 PM, Vishal Verma wrote:
>>>>> On 12/15/21 1:42 PM, Song Liu wrote:
>>>>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma
>>>>>> <vverma@digitalocean.com> wrote:
>>>>>>> This adds nowait support to the RAID10 driver. Very similar to
>>>>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>>>>>> for situations where it could wait, e.g.:
>>>>>>>
>>>>>>> - Waiting for the barrier,
>>>>>>> - Too many pending I/Os to be queued,
>>>>>>> - Reshape operation,
>>>>>>> - Discard operation.
>>>>>>>
>>>>>>> The wait_barrier() function is modified to return bool so that an
>>>>>>> error can be reported for wait barriers. It returns true when the
>>>>>>> wait completed (or no wait was required), and false when a wait was
>>>>>>> required but skipped in order to support nowait.
>>>>>>>
>>>>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>>>>> ---
>>>>>>>     drivers/md/raid10.c | 57
>>>>>>> +++++++++++++++++++++++++++++++++++----------
>>>>>>>     1 file changed, 45 insertions(+), 12 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>>>>> index dde98f65bd04..f6c73987e9ac 100644
>>>>>>> --- a/drivers/md/raid10.c
>>>>>>> +++ b/drivers/md/raid10.c
>>>>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>>>>>            wake_up(&conf->wait_barrier);
>>>>>>>     }
>>>>>>>
>>>>>>> -static void wait_barrier(struct r10conf *conf)
>>>>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>>>>>     {
>>>>>>>            spin_lock_irq(&conf->resync_lock);
>>>>>>>            if (conf->barrier) {
>>>>>>>                    struct bio_list *bio_list = current->bio_list;
>>>>>>> +
>>>>>>> +               /* Return false when nowait flag is set */
>>>>>>> +               if (nowait) {
>>>>>>> + spin_unlock_irq(&conf->resync_lock);
>>>>>>> +                       return false;
>>>>>>> +               }
>>>>>>> +
>>>>>>>                    conf->nr_waiting++;
>>>>>>>                    /* Wait for the barrier to drop.
>>>>>>>                     * However if there are already pending
>>>>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>>>>>            }
>>>>>>>            atomic_inc(&conf->nr_pending);
>>>>>>>            spin_unlock_irq(&conf->resync_lock);
>>>>>>> +       return true;
>>>>>>>     }
>>>>>>>
>>>>>>>     static void allow_barrier(struct r10conf *conf)
>>>>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb
>>>>>>> *cb, bool from_schedule)
>>>>>>>     static void regular_request_wait(struct mddev *mddev, struct
>>>>>>> r10conf *conf,
>>>>>>>                                     struct bio *bio, sector_t sectors)
>>>>>>>     {
>>>>>>> -       wait_barrier(conf);
>>>>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>>>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>>>>>> +               bio_wouldblock_error(bio);
>>>>>>> +               return;
>>>>>>> +       }
>>>>>> I think we also need regular_request_wait to return bool and handle
>>>>>> it properly.
>>>>>>
>>>>>> Thanks,
>>>>>> Song
>>>>>>
>>>>> Ack, will fix it. Thanks!
>>>> Ran into this while running with io_uring. With the current v5 (raid10
>>>> patch) on top of md-next branch.
>>>> ./t/io_uring -a 0 -d 256 </dev/raid10>
>>>>
>>>> It didn't trigger with aio (-a 1)
>>>>
>>>> [  248.128661] BUG: kernel NULL pointer dereference, address:
>>>> 00000000000000b8
>>>> [  248.135628] #PF: supervisor read access in kernel mode
>>>> [  248.140762] #PF: error_code(0x0000) - not-present page
>>>> [  248.145903] PGD 0 P4D 0
>>>> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
>>>> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not
>>>> tainted 5.16.0-rc3+ #2
>>>> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
>>>> 1.3.8 08/31/2021
>>>> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
>>>> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a
>>>> 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d
>>>> 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
>>>> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
>>>> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX:
>>>> ffff8e5d89747000
>>>> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI:
>>>> ffff8e1e00044900
>>>> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09:
>>>> 0000000000000001
>>>> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12:
>>>> ffff8e5da2a1ae00
>>>> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15:
>>>> 0000000000001000
>>>> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000)
>>>> knlGS:0000000000000000
>>>> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4:
>>>> 0000000000770ee0
>>>> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>>>> 0000000000000000
>>>> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>> 0000000000000400
>>>> [  248.269240] PKRU: 55555554
>>>> [  248.271953] Call Trace:
>>>> [  248.274406]  <IRQ>
>>>> [  248.276425]  bio_endio+0xf6/0x170
>>>> [  248.279743]  blk_update_request+0x12d/0x470
>>>> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
>>>> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
>>>> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
>>>> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
>>>> [  248.303015]  ? mempool_kfree+0xe/0x10
>>>> [  248.306680]  ? mempool_kfree+0xe/0x10
>>>> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
>>>> [  248.314792]  nvme_irq+0x298/0x2f0
>>>> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
>>>> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
>>>> [  248.326744]  handle_irq_event_percpu+0x33/0x80
>>>> [  248.331190]  handle_irq_event+0x39/0x60
>>>> [  248.335028]  handle_edge_irq+0xbe/0x1e0
>>>> [  248.338869]  __common_interrupt+0x6b/0x110
>>>> [  248.342967]  common_interrupt+0xbd/0xe0
>>>> [  248.346808]  </IRQ>
>>>> [  248.348912]  <TASK>
>>>> [  248.351018]  asm_common_interrupt+0x1e/0x40
>>>> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
>>>> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00
>>>> 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00
>>>> 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00
>>>>
>>>> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
>>>> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX:
>>>> ffff8e5d8973fd10
>>>> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI:
>>>> 0000000000000001
>>>> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09:
>>>> ffffb1c3a429b970
>>>> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12:
>>>> 0000000000000001
>>>> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15:
>>>> 0000000000000003
>>>> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
>>>> [  248.424686]  __wake_up+0x13/0x20
>>>> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
>>>> [  248.432971]  md_handle_request+0x179/0x1e0
>>>> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
>>>> [  248.441345]  md_submit_bio+0x6d/0xa0
>>>> [  248.444924]  __submit_bio+0x94/0x140
>>>> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
>>>> [  248.452515]  submit_bio+0x48/0x120
>>>> [  248.455923]  blkdev_direct_IO+0x220/0x540
>>>> [  248.459935]  ? __fsnotify_parent+0xff/0x330
>>>> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
>>>> [  248.468393]  ? common_interrupt+0x73/0xe0
>>>> [  248.472408]  generic_file_read_iter+0xa5/0x160
>>>> [  248.476852]  blkdev_read_iter+0x38/0x70
>>>> [  248.480693]  io_read+0x119/0x420
>>>> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
>>>> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
>>>> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
>>>> [  248.497436]  ? io_req_prep+0x6a9/0xe60
>>>> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
>>>> [  248.505030]  ? __fget_files+0x6a/0x90
>>>> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
>>>> [  248.513401]  do_syscall_64+0x38/0x90
>>>> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>> [  248.522033] RIP: 0033:0x7fc26b19b89d
>>>> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
>>>> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
>>>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>>>> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX:
>>>> 00000000000001aa
>>>> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX:
>>>> 00007fc26b19b89d
>>>> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI:
>>>> 0000000000000004
>>>> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09:
>>>> 0000000000000000
>>>> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12:
>>>> 00005623a4b7a2a0
>>>> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15:
>>>> 0000000000000020
>>>> [  248.587591]  </TASK>
>>> Do you have:
>>>
>>> commit 75feae73a28020e492fbad2323245455ef69d687
>>> Author: Pavel Begunkov <asml.silence@gmail.com>
>>> Date:   Tue Dec 7 20:16:36 2021 +0000
>>>
>>>       block: fix single bio async DIO error handling
>>>
>>> in your tree?
>>>
>> Nope. I will get it in and test. Thanks!
> Might be worth re-running with KASAN enabled in your config to see if
> that triggers anything.
>
Got this:
[  739.336669] CPU: 63 PID: 10373 Comm: io_uring Kdump: loaded Not 
tainted 5.16.0-rc3+ #8
[  739.344583] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS 
1.3.8 08/31/2021
[  739.352236] Call Trace:
[  739.354687]  <IRQ>
[  739.356705]  dump_stack_lvl+0x38/0x49
[  739.360381]  print_address_description.constprop.0+0x28/0x150
[  739.366136]  ? 0xffffffffc046f3df
[  739.369455]  kasan_report.cold+0x82/0xdb
[  739.373383]  ? 0xffffffffc046f3df
[  739.376700]  __asan_load8+0x69/0x90
[  739.380194]  0xffffffffc046f3df
[  739.383339]  ? 0xffffffffc046f340
[  739.386659]  ? blkcg_iolatency_done_bio+0x26/0x390
[  739.391461]  ? __rcu_read_unlock+0x5b/0x270
[  739.395655]  ? kmem_cache_alloc+0x143/0x460
[  739.399841]  ? mempool_alloc_slab+0x17/0x20
[  739.404027]  ? bio_uninit+0x6c/0xf0
[  739.407522]  bio_endio+0x27f/0x2a0
[  739.410926]  blk_update_request+0x1e8/0x750
[  739.415112]  blk_mq_end_request_batch+0x10b/0x9b0
[  739.419818]  ? blk_mq_end_request+0x460/0x460
[  739.424179]  ? kfree+0xa0/0x400
[  739.427322]  ? mempool_kfree+0xe/0x10
[  739.430989]  ? generic_file_write_iter+0xf0/0xf0
[  739.435609]  ? dma_unmap_page_attrs+0x15f/0x2c0
[  739.440144]  nvme_pci_complete_batch+0x34/0x160
[  739.444684]  ? blk_mq_complete_request_remote+0x1ca/0x2d0
[  739.450084]  nvme_irq+0x5fa/0x630
[  739.453404]  ? nvme_timeout+0x370/0x370
[  739.457242]  ? nvme_unmap_data+0x1e0/0x1e0
[  739.461340]  ? __kasan_check_write+0x14/0x20
[  739.465614]  ? credit_entropy_bits.constprop.0+0x76/0x190
[  739.471015]  ? nvme_timeout+0x370/0x370
[  739.474853]  __handle_irq_event_percpu+0x69/0x260
[  739.479568]  handle_irq_event_percpu+0x70/0xf0
[  739.484015]  ? __handle_irq_event_percpu+0x260/0x260
[  739.488981]  ? __kasan_check_write+0x14/0x20
[  739.493261]  ? _raw_spin_lock+0x88/0xe0
[  739.497109]  ? _raw_spin_lock_irqsave+0xf0/0xf0
[  739.501643]  handle_irq_event+0x5a/0x90
[  739.505480]  handle_edge_irq+0x148/0x320
[  739.509407]  __common_interrupt+0x75/0x130
[  739.513514]  common_interrupt+0xae/0xd0
[  739.517354]  </IRQ>
[  739.519460]  <TASK>
[  739.521567]  asm_common_interrupt+0x1e/0x40
[  739.525753] RIP: 0010:__asan_store8+0x37/0x90
[  739.530111] Code: 4f 48 b8 ff ff ff ff ff 7f ff ff 48 39 c7 76 40 48 
8d 47 07 48 89 c2 83 e2 07 48 83 fa 07 75 18 48 ba 00 00 00 00 00 fc ff 
df <48> c1 e8 03 0f b6 04 10 84 c0 75 2b 5d c3 48 be 00 00 00 00 00 fc
[  739.548865] RSP: 0018:ffffc900307c7100 EFLAGS: 00000246
[  739.554092] RAX: ffffc900307c7237 RBX: ffffc900307c7780 RCX: 
ffffffff82c9b461
[  739.561227] RDX: dffffc0000000000 RSI: ffffc900307c7790 RDI: 
ffffc900307c7230
[  739.568358] RBP: ffffc900307c7100 R08: ffffffff82c9b202 R09: 
ffff8882d96c0000
[  739.575490] R10: ffffc900307c7257 R11: fffff520060f8e4a R12: 
0000000082c9b201
[  739.582624] R13: 0000000000000000 R14: ffffc900307c7238 R15: 
ffffc900307c71e8
[  739.589760]  ? update_stack_state+0x22/0x2c0
[  739.594038]  ? update_stack_state+0x281/0x2c0
[  739.598398]  update_stack_state+0x281/0x2c0
[  739.602585]  unwind_next_frame.part.0+0xe0/0x360
[  739.607204]  ? bio_alloc_bioset+0x223/0x2f0
[  739.611389]  ? create_prof_cpu_mask+0x30/0x30
[  739.615749]  ? mempool_alloc_slab+0x17/0x20
[  739.619935]  unwind_next_frame+0x23/0x30
[  739.623860]  arch_stack_walk+0x88/0xf0
[  739.627613]  ? bio_alloc_bioset+0x223/0x2f0
[  739.631800]  stack_trace_save+0x94/0xc0
[  739.635640]  ? filter_irq_stacks+0x70/0x70
[  739.639738]  ? blk_mq_put_tag+0x80/0x80
[  739.643576]  ? _raw_spin_unlock_irqrestore+0x23/0x40
[  739.648545]  ? __wake_up_common_lock+0xfd/0x150
[  739.653084]  kasan_save_stack+0x26/0x60
[  739.656924]  ? kasan_save_stack+0x26/0x60
[  739.660938]  ? __kasan_slab_alloc+0x6d/0x90
[  739.665123]  ? kmem_cache_alloc+0x143/0x460
[  739.669308]  ? mempool_alloc_slab+0x17/0x20
[  739.673494]  ? mempool_alloc+0xef/0x280
[  739.677333]  ? bio_alloc_bioset+0x223/0x2f0
[  739.681519]  ? blk_mq_rq_ctx_init.isra.0+0x28a/0x3c0
[  739.686489]  ? __blk_mq_alloc_requests+0x655/0x680
[  739.691288]  ? blkcg_iolatency_throttle+0x5d/0x760
[  739.696081]  ? bio_to_wbt_flags+0x47/0xf0
[  739.700093]  ? update_io_ticks+0x5e/0xd0
[  739.704021]  ? preempt_count_sub+0x18/0xc0
[  739.708120]  ? __kasan_check_read+0x11/0x20
[  739.712305]  ? blk_mq_submit_bio+0x740/0xce0
[  739.716580]  ? blk_mq_try_issue_list_directly+0x1b0/0x1b0
[  739.721987]  ? kasan_poison+0x3c/0x50
[  739.725650]  ? kasan_unpoison+0x28/0x50
[  739.729491]  __kasan_slab_alloc+0x6d/0x90
[  739.733503]  kmem_cache_alloc+0x143/0x460
[  739.737518]  mempool_alloc_slab+0x17/0x20
[  739.741529]  mempool_alloc+0xef/0x280
[  739.745194]  ? mempool_free+0x170/0x170
[  739.749035]  ? mempool_destroy+0x30/0x30
[  739.752961]  ? __fsnotify_update_child_dentry_flags.part.0+0x170/0x170
[  739.759498]  bio_alloc_bioset+0x223/0x2f0
[  739.763517]  ? __this_cpu_preempt_check+0x13/0x20
[  739.768223]  ? bvec_alloc+0xd0/0xd0
[  739.771715]  ? __fsnotify_parent+0x1ed/0x590
[  739.775988]  ? do_direct_IO+0x150/0x1880
[  739.779916]  ? submit_bio+0xb0/0x220
[  739.783496]  bio_alloc_kiocb+0x185/0x1c0
[  739.787430]  blkdev_direct_IO+0x114/0x400
[  739.791441]  generic_file_read_iter+0x152/0x250
[  739.795974]  blkdev_read_iter+0x84/0xd0
[  739.799815]  io_read+0x1ec/0x770
[  739.803056]  ? __rcu_read_unlock+0x5b/0x270
[  739.807240]  ? io_setup_async_rw+0x270/0x270
[  739.811515]  ? __sbq_wake_up+0x2d/0x1b0
[  739.815352]  ? __rcu_read_unlock+0x5b/0x270
[  739.819537]  ? sbitmap_queue_clear+0xc9/0xe0
[  739.823813]  ? blk_queue_exit+0x35/0x90
[  739.827653]  ? __blk_mq_free_request+0x111/0x160
[  739.832280]  io_issue_sqe+0xcac/0x27f0
[  739.836031]  ? blk_mq_free_plug_rqs+0x3f/0x50
[  739.840394]  ? io_poll_add.isra.0+0x290/0x290
[  739.844760]  ? io_req_prep+0xcc2/0x1bb0
[  739.848598]  ? io_submit_sqes+0x43b/0x1260
[  739.852701]  io_submit_sqes+0x5e5/0x1260
[  739.856634]  ? io_do_iopoll+0x561/0x720
[  739.860474]  ? io_wq_submit_work+0x230/0x230
[  739.864746]  ? __kasan_check_write+0x14/0x20
[  739.869016]  ? mutex_lock+0x8f/0xe0
[  739.872510]  ? __mutex_lock_slowpath+0x20/0x20
[  739.876956]  ? __rcu_read_unlock+0x5b/0x270
[  739.881144]  __x64_sys_io_uring_enter+0x367/0xef0
[  739.885859]  ? io_submit_sqes+0x1260/0x1260
[  739.890044]  ? __this_cpu_preempt_check+0x13/0x20
[  739.894749]  ? xfd_validate_state+0x3c/0xd0
[  739.898936]  ? __schedule+0x5be/0x10c0
[  739.902687]  ? restore_fpregs_from_fpstate+0xa2/0x170
[  739.907741]  ? kernel_fpu_begin_mask+0x170/0x170
[  739.912362]  ? debug_smp_processor_id+0x17/0x20
[  739.916903]  ? debug_smp_processor_id+0x17/0x20
[  739.921434]  ? fpregs_assert_state_consistent+0x5f/0x70
[  739.926662]  ? exit_to_user_mode_prepare+0x4b/0x1e0
[  739.931549]  do_syscall_64+0x38/0x90
[  739.935129]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  739.940182] RIP: 0033:0x7f7345d7d89d
[  739.943759] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 
48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
[  739.962513] RSP: 002b:00007f7345c5ee98 EFLAGS: 00000246 ORIG_RAX: 
00000000000001aa
[  739.970081] RAX: ffffffffffffffda RBX: 00007f7345fd2fc0 RCX: 
00007f7345d7d89d
[  739.977216] RDX: 0000000000000000 RSI: 0000000000000020 RDI: 
0000000000000004
[  739.984349] RBP: 0000000000000020 R08: 0000000000000000 R09: 
0000000000000000
[  739.991481] R10: 0000000000000000 R11: 0000000000000246 R12: 
000055be1d11e2a0
[  739.998612] R13: 0000000000000020 R14: 0000000000000000 R15: 
0000000000000020
[  740.005749]  </TASK>
[  740.007945]
[  740.009444] Allocated by task 10373:
[  740.013084]
[  740.014583] Freed by task 10373:
[  740.017879]
[  740.019378] The buggy address belongs to the object at ffff88c1be016e00
[  740.019378]  which belongs to the cache kmalloc-256 of size 256
[  740.031886] The buggy address is located 40 bytes inside of
[  740.031886]  256-byte region [ffff88c1be016e00, ffff88c1be016f00)
[  740.043534] The buggy address belongs to the page:
[  740.048345]
[  740.049840] Memory state around the buggy address:
[  740.054636]  ffff88c1be016d00: fc fc fc fc fc fc fc fc fc fc fc fc fc 
fc fc fc
[  740.061854]  ffff88c1be016d80: fc fc fc fc fc fc fc fc fc fc fc fc fc 
fc fc fc
[  740.069074] >ffff88c1be016e00: fa fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb
[  740.076294]                                   ^
[  740.080827]  ffff88c1be016e80: fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb fb fb
[  740.088045]  ffff88c1be016f00: fc fc fc fc fc fc fc fc fc fc fc fc fc 
fc fc fc
[  740.095265] 
==================================================================
[  740.102497] kernel BUG at mm/slub.c:379!
[  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16 19:40                                                   ` Vishal Verma
@ 2021-12-16 20:18                                                     ` Song Liu
  2021-12-16 20:37                                                       ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-16 20:18 UTC (permalink / raw)
  To: Vishal Verma; +Cc: Jens Axboe, linux-raid, rgoldwyn

On Thu, Dec 16, 2021 at 11:40 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
>
> On 12/16/21 11:49 AM, Jens Axboe wrote:
> > On 12/16/21 9:45 AM, Vishal Verma wrote:
> >> On 12/16/21 9:42 AM, Jens Axboe wrote:
> >>> On 12/15/21 5:30 PM, Vishal Verma wrote:
> >>>> On 12/15/21 3:20 PM, Vishal Verma wrote:
> >>>>> On 12/15/21 1:42 PM, Song Liu wrote:
> >>>>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma
> >>>>>> <vverma@digitalocean.com> wrote:
> >>>>>>> This adds nowait support to the RAID10 driver. Very similar to
> >>>>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
> >>>>>>> for situations where it could wait for eg:
> >>>>>>>
> >>>>>>> - Waiting for the barrier,
> >>>>>>> - Too many pending I/Os to be queued,
> >>>>>>> - Reshape operation,
> >>>>>>> - Discard operation.
> >>>>>>>
> >>>>>>> wait_barrier() fn is modified to return bool to support error for
> >>>>>>> wait barriers. It returns true in case of wait or if wait is not
> >>>>>>> required and returns false if wait was required but not performed
> >>>>>>> to support nowait.
> >>>>>>>
> >>>>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> >>>>>>> ---
> >>>>>>>     drivers/md/raid10.c | 57
> >>>>>>> +++++++++++++++++++++++++++++++++++----------
> >>>>>>>     1 file changed, 45 insertions(+), 12 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> >>>>>>> index dde98f65bd04..f6c73987e9ac 100644
> >>>>>>> --- a/drivers/md/raid10.c
> >>>>>>> +++ b/drivers/md/raid10.c
> >>>>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
> >>>>>>>            wake_up(&conf->wait_barrier);
> >>>>>>>     }
> >>>>>>>
> >>>>>>> -static void wait_barrier(struct r10conf *conf)
> >>>>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
> >>>>>>>     {
> >>>>>>>            spin_lock_irq(&conf->resync_lock);
> >>>>>>>            if (conf->barrier) {
> >>>>>>>                    struct bio_list *bio_list = current->bio_list;
> >>>>>>> +
> >>>>>>> +               /* Return false when nowait flag is set */
> >>>>>>> +               if (nowait) {
> >>>>>>> + spin_unlock_irq(&conf->resync_lock);
> >>>>>>> +                       return false;
> >>>>>>> +               }
> >>>>>>> +
> >>>>>>>                    conf->nr_waiting++;
> >>>>>>>                    /* Wait for the barrier to drop.
> >>>>>>>                     * However if there are already pending
> >>>>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
> >>>>>>>            }
> >>>>>>>            atomic_inc(&conf->nr_pending);
> >>>>>>>            spin_unlock_irq(&conf->resync_lock);
> >>>>>>> +       return true;
> >>>>>>>     }
> >>>>>>>
> >>>>>>>     static void allow_barrier(struct r10conf *conf)
> >>>>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb
> >>>>>>> *cb, bool from_schedule)
> >>>>>>>     static void regular_request_wait(struct mddev *mddev, struct
> >>>>>>> r10conf *conf,
> >>>>>>>                                     struct bio *bio, sector_t sectors)
> >>>>>>>     {
> >>>>>>> -       wait_barrier(conf);
> >>>>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
> >>>>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
> >>>>>>> +               bio_wouldblock_error(bio);
> >>>>>>> +               return;
> >>>>>>> +       }
> >>>>>> I think we also need regular_request_wait to return bool and handle
> >>>>>> it properly.
> >>>>>>
> >>>>>> Thanks,
> >>>>>> Song
> >>>>>>
> >>>>> Ack, will fix it. Thanks!
> >>>> Ran into this while running with io_uring. With the current v5 (raid10
> >>>> patch) on top of md-next branch.
> >>>> ./t/io_uring -a 0 -d 256 </dev/raid10>
> >>>>
> >>>> It didn't trigger with aio (-a 1)
> >>>>
> >>>> [  248.128661] BUG: kernel NULL pointer dereference, address:
> >>>> 00000000000000b8
> >>>> [  248.135628] #PF: supervisor read access in kernel mode
> >>>> [  248.140762] #PF: error_code(0x0000) - not-present page
> >>>> [  248.145903] PGD 0 P4D 0
> >>>> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
> >>>> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not
> >>>> tainted 5.16.0-rc3+ #2
> >>>> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
> >>>> 1.3.8 08/31/2021
> >>>> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
> >>>> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a
> >>>> 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d
> >>>> 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
> >>>> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
> >>>> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX:
> >>>> ffff8e5d89747000
> >>>> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI:
> >>>> ffff8e1e00044900
> >>>> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09:
> >>>> 0000000000000001
> >>>> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12:
> >>>> ffff8e5da2a1ae00
> >>>> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15:
> >>>> 0000000000001000
> >>>> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000)
> >>>> knlGS:0000000000000000
> >>>> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >>>> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4:
> >>>> 0000000000770ee0
> >>>> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> >>>> 0000000000000000
> >>>> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> >>>> 0000000000000400
> >>>> [  248.269240] PKRU: 55555554
> >>>> [  248.271953] Call Trace:
> >>>> [  248.274406]  <IRQ>
> >>>> [  248.276425]  bio_endio+0xf6/0x170
> >>>> [  248.279743]  blk_update_request+0x12d/0x470
> >>>> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
> >>>> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
> >>>> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
> >>>> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
> >>>> [  248.303015]  ? mempool_kfree+0xe/0x10
> >>>> [  248.306680]  ? mempool_kfree+0xe/0x10
> >>>> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
> >>>> [  248.314792]  nvme_irq+0x298/0x2f0
> >>>> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
> >>>> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
> >>>> [  248.326744]  handle_irq_event_percpu+0x33/0x80
> >>>> [  248.331190]  handle_irq_event+0x39/0x60
> >>>> [  248.335028]  handle_edge_irq+0xbe/0x1e0
> >>>> [  248.338869]  __common_interrupt+0x6b/0x110
> >>>> [  248.342967]  common_interrupt+0xbd/0xe0
> >>>> [  248.346808]  </IRQ>
> >>>> [  248.348912]  <TASK>
> >>>> [  248.351018]  asm_common_interrupt+0x1e/0x40
> >>>> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
> >>>> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00
> >>>> 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00
> >>>> 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00
> >>>>
> >>>> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
> >>>> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX:
> >>>> ffff8e5d8973fd10
> >>>> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI:
> >>>> 0000000000000001
> >>>> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09:
> >>>> ffffb1c3a429b970
> >>>> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12:
> >>>> 0000000000000001
> >>>> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15:
> >>>> 0000000000000003
> >>>> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
> >>>> [  248.424686]  __wake_up+0x13/0x20
> >>>> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
> >>>> [  248.432971]  md_handle_request+0x179/0x1e0
> >>>> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
> >>>> [  248.441345]  md_submit_bio+0x6d/0xa0
> >>>> [  248.444924]  __submit_bio+0x94/0x140
> >>>> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
> >>>> [  248.452515]  submit_bio+0x48/0x120
> >>>> [  248.455923]  blkdev_direct_IO+0x220/0x540
> >>>> [  248.459935]  ? __fsnotify_parent+0xff/0x330
> >>>> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
> >>>> [  248.468393]  ? common_interrupt+0x73/0xe0
> >>>> [  248.472408]  generic_file_read_iter+0xa5/0x160
> >>>> [  248.476852]  blkdev_read_iter+0x38/0x70
> >>>> [  248.480693]  io_read+0x119/0x420
> >>>> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
> >>>> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
> >>>> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
> >>>> [  248.497436]  ? io_req_prep+0x6a9/0xe60
> >>>> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
> >>>> [  248.505030]  ? __fget_files+0x6a/0x90
> >>>> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
> >>>> [  248.513401]  do_syscall_64+0x38/0x90
> >>>> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> >>>> [  248.522033] RIP: 0033:0x7fc26b19b89d
> >>>> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
> >>>> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
> >>>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
> >>>> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX:
> >>>> 00000000000001aa
> >>>> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX:
> >>>> 00007fc26b19b89d
> >>>> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI:
> >>>> 0000000000000004
> >>>> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09:
> >>>> 0000000000000000
> >>>> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12:
> >>>> 00005623a4b7a2a0
> >>>> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15:
> >>>> 0000000000000020
> >>>> [  248.587591]  </TASK>
> >>> Do you have:
> >>>
> >>> commit 75feae73a28020e492fbad2323245455ef69d687
> >>> Author: Pavel Begunkov <asml.silence@gmail.com>
> >>> Date:   Tue Dec 7 20:16:36 2021 +0000
> >>>
> >>>       block: fix single bio async DIO error handling
> >>>
> >>> in your tree?
> >>>
> >> Nope. I will get it in and test. Thanks!
> > Might be worth re-running with KASAN enabled in your config to see if
> > that triggers anything.
> >
> Got this:
> [  739.336669] CPU: 63 PID: 10373 Comm: io_uring Kdump: loaded Not
> tainted 5.16.0-rc3+ #8
> [  739.344583] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
> 1.3.8 08/31/2021
> [  739.352236] Call Trace:
> [  739.354687]  <IRQ>
> [  739.356705]  dump_stack_lvl+0x38/0x49
> [  739.360381]  print_address_description.constprop.0+0x28/0x150
> [  739.366136]  ? 0xffffffffc046f3df
> [  739.369455]  kasan_report.cold+0x82/0xdb
> [  739.373383]  ? 0xffffffffc046f3df
> [  739.376700]  __asan_load8+0x69/0x90
> [  739.380194]  0xffffffffc046f3df
> [  739.383339]  ? 0xffffffffc046f340
> [  739.386659]  ? blkcg_iolatency_done_bio+0x26/0x390
> [  739.391461]  ? __rcu_read_unlock+0x5b/0x270
> [  739.395655]  ? kmem_cache_alloc+0x143/0x460
> [  739.399841]  ? mempool_alloc_slab+0x17/0x20
> [  739.404027]  ? bio_uninit+0x6c/0xf0
> [  739.407522]  bio_endio+0x27f/0x2a0
> [  739.410926]  blk_update_request+0x1e8/0x750
> [  739.415112]  blk_mq_end_request_batch+0x10b/0x9b0
> [  739.419818]  ? blk_mq_end_request+0x460/0x460
> [  739.424179]  ? kfree+0xa0/0x400
> [  739.427322]  ? mempool_kfree+0xe/0x10
> [  739.430989]  ? generic_file_write_iter+0xf0/0xf0
> [  739.435609]  ? dma_unmap_page_attrs+0x15f/0x2c0
> [  739.440144]  nvme_pci_complete_batch+0x34/0x160
> [  739.444684]  ? blk_mq_complete_request_remote+0x1ca/0x2d0
> [  739.450084]  nvme_irq+0x5fa/0x630
> [  739.453404]  ? nvme_timeout+0x370/0x370
> [  739.457242]  ? nvme_unmap_data+0x1e0/0x1e0
> [  739.461340]  ? __kasan_check_write+0x14/0x20
> [  739.465614]  ? credit_entropy_bits.constprop.0+0x76/0x190
> [  739.471015]  ? nvme_timeout+0x370/0x370
> [  739.474853]  __handle_irq_event_percpu+0x69/0x260
> [  739.479568]  handle_irq_event_percpu+0x70/0xf0
> [  739.484015]  ? __handle_irq_event_percpu+0x260/0x260
> [  739.488981]  ? __kasan_check_write+0x14/0x20
> [  739.493261]  ? _raw_spin_lock+0x88/0xe0
> [  739.497109]  ? _raw_spin_lock_irqsave+0xf0/0xf0
> [  739.501643]  handle_irq_event+0x5a/0x90
> [  739.505480]  handle_edge_irq+0x148/0x320
> [  739.509407]  __common_interrupt+0x75/0x130
> [  739.513514]  common_interrupt+0xae/0xd0
> [  739.517354]  </IRQ>
> [  739.519460]  <TASK>
> [  739.521567]  asm_common_interrupt+0x1e/0x40
> [  739.525753] RIP: 0010:__asan_store8+0x37/0x90
> [  739.530111] Code: 4f 48 b8 ff ff ff ff ff 7f ff ff 48 39 c7 76 40 48
> 8d 47 07 48 89 c2 83 e2 07 48 83 fa 07 75 18 48 ba 00 00 00 00 00 fc ff
> df <48> c1 e8 03 0f b6 04 10 84 c0 75 2b 5d c3 48 be 00 00 00 00 00 fc
> [  739.548865] RSP: 0018:ffffc900307c7100 EFLAGS: 00000246
> [  739.554092] RAX: ffffc900307c7237 RBX: ffffc900307c7780 RCX:
> ffffffff82c9b461
> [  739.561227] RDX: dffffc0000000000 RSI: ffffc900307c7790 RDI:
> ffffc900307c7230
> [  739.568358] RBP: ffffc900307c7100 R08: ffffffff82c9b202 R09:
> ffff8882d96c0000
> [  739.575490] R10: ffffc900307c7257 R11: fffff520060f8e4a R12:
> 0000000082c9b201
> [  739.582624] R13: 0000000000000000 R14: ffffc900307c7238 R15:
> ffffc900307c71e8
> [  739.589760]  ? update_stack_state+0x22/0x2c0
> [  739.594038]  ? update_stack_state+0x281/0x2c0
> [  739.598398]  update_stack_state+0x281/0x2c0
> [  739.602585]  unwind_next_frame.part.0+0xe0/0x360
> [  739.607204]  ? bio_alloc_bioset+0x223/0x2f0
> [  739.611389]  ? create_prof_cpu_mask+0x30/0x30
> [  739.615749]  ? mempool_alloc_slab+0x17/0x20
> [  739.619935]  unwind_next_frame+0x23/0x30
> [  739.623860]  arch_stack_walk+0x88/0xf0
> [  739.627613]  ? bio_alloc_bioset+0x223/0x2f0
> [  739.631800]  stack_trace_save+0x94/0xc0
> [  739.635640]  ? filter_irq_stacks+0x70/0x70
> [  739.639738]  ? blk_mq_put_tag+0x80/0x80
> [  739.643576]  ? _raw_spin_unlock_irqrestore+0x23/0x40
> [  739.648545]  ? __wake_up_common_lock+0xfd/0x150
> [  739.653084]  kasan_save_stack+0x26/0x60
> [  739.656924]  ? kasan_save_stack+0x26/0x60
> [  739.660938]  ? __kasan_slab_alloc+0x6d/0x90
> [  739.665123]  ? kmem_cache_alloc+0x143/0x460
> [  739.669308]  ? mempool_alloc_slab+0x17/0x20
> [  739.673494]  ? mempool_alloc+0xef/0x280
> [  739.677333]  ? bio_alloc_bioset+0x223/0x2f0
> [  739.681519]  ? blk_mq_rq_ctx_init.isra.0+0x28a/0x3c0
> [  739.686489]  ? __blk_mq_alloc_requests+0x655/0x680
> [  739.691288]  ? blkcg_iolatency_throttle+0x5d/0x760
> [  739.696081]  ? bio_to_wbt_flags+0x47/0xf0
> [  739.700093]  ? update_io_ticks+0x5e/0xd0
> [  739.704021]  ? preempt_count_sub+0x18/0xc0
> [  739.708120]  ? __kasan_check_read+0x11/0x20
> [  739.712305]  ? blk_mq_submit_bio+0x740/0xce0
> [  739.716580]  ? blk_mq_try_issue_list_directly+0x1b0/0x1b0
> [  739.721987]  ? kasan_poison+0x3c/0x50
> [  739.725650]  ? kasan_unpoison+0x28/0x50
> [  739.729491]  __kasan_slab_alloc+0x6d/0x90
> [  739.733503]  kmem_cache_alloc+0x143/0x460
> [  739.737518]  mempool_alloc_slab+0x17/0x20
> [  739.741529]  mempool_alloc+0xef/0x280
> [  739.745194]  ? mempool_free+0x170/0x170
> [  739.749035]  ? mempool_destroy+0x30/0x30
> [  739.752961]  ? __fsnotify_update_child_dentry_flags.part.0+0x170/0x170
> [  739.759498]  bio_alloc_bioset+0x223/0x2f0
> [  739.763517]  ? __this_cpu_preempt_check+0x13/0x20
> [  739.768223]  ? bvec_alloc+0xd0/0xd0
> [  739.771715]  ? __fsnotify_parent+0x1ed/0x590
> [  739.775988]  ? do_direct_IO+0x150/0x1880
> [  739.779916]  ? submit_bio+0xb0/0x220
> [  739.783496]  bio_alloc_kiocb+0x185/0x1c0
> [  739.787430]  blkdev_direct_IO+0x114/0x400
> [  739.791441]  generic_file_read_iter+0x152/0x250
> [  739.795974]  blkdev_read_iter+0x84/0xd0
> [  739.799815]  io_read+0x1ec/0x770
> [  739.803056]  ? __rcu_read_unlock+0x5b/0x270
> [  739.807240]  ? io_setup_async_rw+0x270/0x270
> [  739.811515]  ? __sbq_wake_up+0x2d/0x1b0
> [  739.815352]  ? __rcu_read_unlock+0x5b/0x270
> [  739.819537]  ? sbitmap_queue_clear+0xc9/0xe0
> [  739.823813]  ? blk_queue_exit+0x35/0x90
> [  739.827653]  ? __blk_mq_free_request+0x111/0x160
> [  739.832280]  io_issue_sqe+0xcac/0x27f0
> [  739.836031]  ? blk_mq_free_plug_rqs+0x3f/0x50
> [  739.840394]  ? io_poll_add.isra.0+0x290/0x290
> [  739.844760]  ? io_req_prep+0xcc2/0x1bb0
> [  739.848598]  ? io_submit_sqes+0x43b/0x1260
> [  739.852701]  io_submit_sqes+0x5e5/0x1260
> [  739.856634]  ? io_do_iopoll+0x561/0x720
> [  739.860474]  ? io_wq_submit_work+0x230/0x230
> [  739.864746]  ? __kasan_check_write+0x14/0x20
> [  739.869016]  ? mutex_lock+0x8f/0xe0
> [  739.872510]  ? __mutex_lock_slowpath+0x20/0x20
> [  739.876956]  ? __rcu_read_unlock+0x5b/0x270
> [  739.881144]  __x64_sys_io_uring_enter+0x367/0xef0
> [  739.885859]  ? io_submit_sqes+0x1260/0x1260
> [  739.890044]  ? __this_cpu_preempt_check+0x13/0x20
> [  739.894749]  ? xfd_validate_state+0x3c/0xd0
> [  739.898936]  ? __schedule+0x5be/0x10c0
> [  739.902687]  ? restore_fpregs_from_fpstate+0xa2/0x170
> [  739.907741]  ? kernel_fpu_begin_mask+0x170/0x170
> [  739.912362]  ? debug_smp_processor_id+0x17/0x20
> [  739.916903]  ? debug_smp_processor_id+0x17/0x20
> [  739.921434]  ? fpregs_assert_state_consistent+0x5f/0x70
> [  739.926662]  ? exit_to_user_mode_prepare+0x4b/0x1e0
> [  739.931549]  do_syscall_64+0x38/0x90
> [  739.935129]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  739.940182] RIP: 0033:0x7f7345d7d89d
> [  739.943759] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
> [  739.962513] RSP: 002b:00007f7345c5ee98 EFLAGS: 00000246 ORIG_RAX:
> 00000000000001aa
> [  739.970081] RAX: ffffffffffffffda RBX: 00007f7345fd2fc0 RCX:
> 00007f7345d7d89d
> [  739.977216] RDX: 0000000000000000 RSI: 0000000000000020 RDI:
> 0000000000000004
> [  739.984349] RBP: 0000000000000020 R08: 0000000000000000 R09:
> 0000000000000000
> [  739.991481] R10: 0000000000000000 R11: 0000000000000246 R12:
> 000055be1d11e2a0
> [  739.998612] R13: 0000000000000020 R14: 0000000000000000 R15:
> 0000000000000020
> [  740.005749]  </TASK>
> [  740.007945]
> [  740.009444] Allocated by task 10373:
> [  740.013084]
> [  740.014583] Freed by task 10373:
> [  740.017879]
> [  740.019378] The buggy address belongs to the object at ffff88c1be016e00
> [  740.019378]  which belongs to the cache kmalloc-256 of size 256
> [  740.031886] The buggy address is located 40 bytes inside of
> [  740.031886]  256-byte region [ffff88c1be016e00, ffff88c1be016f00)
> [  740.043534] The buggy address belongs to the page:
> [  740.048345]
> [  740.049840] Memory state around the buggy address:
> [  740.054636]  ffff88c1be016d00: fc fc fc fc fc fc fc fc fc fc fc fc fc
> fc fc fc
> [  740.061854]  ffff88c1be016d80: fc fc fc fc fc fc fc fc fc fc fc fc fc
> fc fc fc
> [  740.069074] >ffff88c1be016e00: fa fb fb fb fb fb fb fb fb fb fb fb fb
> fb fb fb
> [  740.076294]                                   ^
> [  740.080827]  ffff88c1be016e80: fb fb fb fb fb fb fb fb fb fb fb fb fb
> fb fb fb
> [  740.088045]  ffff88c1be016f00: fc fc fc fc fc fc fc fc fc fc fc fc fc
> fc fc fc
> [  740.095265]
> ==================================================================
> [  740.102497] kernel BUG at mm/slub.c:379!
> [  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
>

What's the exact command line that triggers this? I am not able to
trigger it with
either fio or t/io_uring.

Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16 20:18                                                     ` Song Liu
@ 2021-12-16 20:37                                                       ` Vishal Verma
  2021-12-16 23:50                                                         ` Song Liu
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-16 20:37 UTC (permalink / raw)
  To: Song Liu; +Cc: Jens Axboe, linux-raid, rgoldwyn


On 12/16/21 1:18 PM, Song Liu wrote:
> On Thu, Dec 16, 2021 at 11:40 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>
>> On 12/16/21 11:49 AM, Jens Axboe wrote:
>>> On 12/16/21 9:45 AM, Vishal Verma wrote:
>>>> On 12/16/21 9:42 AM, Jens Axboe wrote:
>>>>> On 12/15/21 5:30 PM, Vishal Verma wrote:
>>>>>> On 12/15/21 3:20 PM, Vishal Verma wrote:
>>>>>>> On 12/15/21 1:42 PM, Song Liu wrote:
>>>>>>>> On Tue, Dec 14, 2021 at 10:09 PM Vishal Verma
>>>>>>>> <vverma@digitalocean.com> wrote:
>>>>>>>>> This adds nowait support to the RAID10 driver. Very similar to
>>>>>>>>> raid1 driver changes. It makes RAID10 driver return with EAGAIN
>>>>>>>>> for situations where it could wait for eg:
>>>>>>>>>
>>>>>>>>> - Waiting for the barrier,
>>>>>>>>> - Too many pending I/Os to be queued,
>>>>>>>>> - Reshape operation,
>>>>>>>>> - Discard operation.
>>>>>>>>>
>>>>>>>>> wait_barrier() fn is modified to return bool to support error for
>>>>>>>>> wait barriers. It returns true in case of wait or if wait is not
>>>>>>>>> required and returns false if wait was required but not performed
>>>>>>>>> to support nowait.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>>>>>>>>> ---
>>>>>>>>>      drivers/md/raid10.c | 57
>>>>>>>>> +++++++++++++++++++++++++++++++++++----------
>>>>>>>>>      1 file changed, 45 insertions(+), 12 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>>>>>>> index dde98f65bd04..f6c73987e9ac 100644
>>>>>>>>> --- a/drivers/md/raid10.c
>>>>>>>>> +++ b/drivers/md/raid10.c
>>>>>>>>> @@ -952,11 +952,18 @@ static void lower_barrier(struct r10conf *conf)
>>>>>>>>>             wake_up(&conf->wait_barrier);
>>>>>>>>>      }
>>>>>>>>>
>>>>>>>>> -static void wait_barrier(struct r10conf *conf)
>>>>>>>>> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>>>>>>>>>      {
>>>>>>>>>             spin_lock_irq(&conf->resync_lock);
>>>>>>>>>             if (conf->barrier) {
>>>>>>>>>                     struct bio_list *bio_list = current->bio_list;
>>>>>>>>> +
>>>>>>>>> +               /* Return false when nowait flag is set */
>>>>>>>>> +               if (nowait) {
>>>>>>>>> + spin_unlock_irq(&conf->resync_lock);
>>>>>>>>> +                       return false;
>>>>>>>>> +               }
>>>>>>>>> +
>>>>>>>>>                     conf->nr_waiting++;
>>>>>>>>>                     /* Wait for the barrier to drop.
>>>>>>>>>                      * However if there are already pending
>>>>>>>>> @@ -988,6 +995,7 @@ static void wait_barrier(struct r10conf *conf)
>>>>>>>>>             }
>>>>>>>>>             atomic_inc(&conf->nr_pending);
>>>>>>>>>             spin_unlock_irq(&conf->resync_lock);
>>>>>>>>> +       return true;
>>>>>>>>>      }
>>>>>>>>>
>>>>>>>>>      static void allow_barrier(struct r10conf *conf)
>>>>>>>>> @@ -1101,17 +1109,25 @@ static void raid10_unplug(struct blk_plug_cb
>>>>>>>>> *cb, bool from_schedule)
>>>>>>>>>      static void regular_request_wait(struct mddev *mddev, struct
>>>>>>>>> r10conf *conf,
>>>>>>>>>                                      struct bio *bio, sector_t sectors)
>>>>>>>>>      {
>>>>>>>>> -       wait_barrier(conf);
>>>>>>>>> +       /* Bail out if REQ_NOWAIT is set for the bio */
>>>>>>>>> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
>>>>>>>>> +               bio_wouldblock_error(bio);
>>>>>>>>> +               return;
>>>>>>>>> +       }
>>>>>>>> I think we also need regular_request_wait to return bool and handle
>>>>>>>> it properly.
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Song
>>>>>>>>
>>>>>>> Ack, will fix it. Thanks!
>>>>>> Ran into this while running with io_uring. With the current v5 (raid10
>>>>>> patch) on top of md-next branch.
>>>>>> ./t/io_uring -a 0 -d 256 </dev/raid10>
>>>>>>
>>>>>> It didn't trigger with aio (-a 1)
>>>>>>
>>>>>> [  248.128661] BUG: kernel NULL pointer dereference, address:
>>>>>> 00000000000000b8
>>>>>> [  248.135628] #PF: supervisor read access in kernel mode
>>>>>> [  248.140762] #PF: error_code(0x0000) - not-present page
>>>>>> [  248.145903] PGD 0 P4D 0
>>>>>> [  248.148443] Oops: 0000 [#1] PREEMPT SMP NOPTI
>>>>>> [  248.152800] CPU: 49 PID: 9461 Comm: io_uring Kdump: loaded Not
>>>>>> tainted 5.16.0-rc3+ #2
>>>>>> [  248.160629] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
>>>>>> 1.3.8 08/31/2021
>>>>>> [  248.168279] RIP: 0010:raid10_end_read_request+0x74/0x140 [raid10]
>>>>>> [  248.174373] Code: 48 60 48 8b 58 58 48 c1 e2 05 49 03 55 08 48 89 4a
>>>>>> 10 40 84 f6 75 48 f0 41 80 4c 24 18 01 4c 89 e7 e8 e0 b8 ff ff 49 8b 4d
>>>>>> 00 <48> 8b 83 b8 00 00 00 f0 ff 8b f0 00 00 00 0f 94 c2 a8 01 74 04 84
>>>>>> [  248.193120] RSP: 0018:ffffb1c38d598ce8 EFLAGS: 00010086
>>>>>> [  248.198344] RAX: ffff8e5da2a1a100 RBX: 0000000000000000 RCX:
>>>>>> ffff8e5d89747000
>>>>>> [  248.205479] RDX: 000000008040003a RSI: 0000000080400039 RDI:
>>>>>> ffff8e1e00044900
>>>>>> [  248.212611] RBP: ffffb1c38d598d30 R08: 0000000000000000 R09:
>>>>>> 0000000000000001
>>>>>> [  248.219744] R10: ffff8e5da2a1ae00 R11: 000000411bab9000 R12:
>>>>>> ffff8e5da2a1ae00
>>>>>> [  248.226877] R13: ffff8e5d8973fc00 R14: 0000000000000000 R15:
>>>>>> 0000000000001000
>>>>>> [  248.234009] FS:  00007fc26b07d700(0000) GS:ffff8e9c6e600000(0000)
>>>>>> knlGS:0000000000000000
>>>>>> [  248.242096] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>>>> [  248.247843] CR2: 00000000000000b8 CR3: 00000040b25d4005 CR4:
>>>>>> 0000000000770ee0
>>>>>> [  248.254973] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>> 0000000000000000
>>>>>> [  248.262107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>> 0000000000000400
>>>>>> [  248.269240] PKRU: 55555554
>>>>>> [  248.271953] Call Trace:
>>>>>> [  248.274406]  <IRQ>
>>>>>> [  248.276425]  bio_endio+0xf6/0x170
>>>>>> [  248.279743]  blk_update_request+0x12d/0x470
>>>>>> [  248.283931]  ? sbitmap_queue_clear_batch+0xc7/0x110
>>>>>> [  248.288809]  blk_mq_end_request_batch+0x76/0x490
>>>>>> [  248.293429]  ? dma_direct_unmap_sg+0xdd/0x1a0
>>>>>> [  248.297786]  ? smp_call_function_single_async+0x46/0x70
>>>>>> [  248.303015]  ? mempool_kfree+0xe/0x10
>>>>>> [  248.306680]  ? mempool_kfree+0xe/0x10
>>>>>> [  248.310345]  nvme_pci_complete_batch+0x26/0xb0
>>>>>> [  248.314792]  nvme_irq+0x298/0x2f0
>>>>>> [  248.318110]  ? nvme_unmap_data+0xf0/0xf0
>>>>>> [  248.322038]  __handle_irq_event_percpu+0x3f/0x190
>>>>>> [  248.326744]  handle_irq_event_percpu+0x33/0x80
>>>>>> [  248.331190]  handle_irq_event+0x39/0x60
>>>>>> [  248.335028]  handle_edge_irq+0xbe/0x1e0
>>>>>> [  248.338869]  __common_interrupt+0x6b/0x110
>>>>>> [  248.342967]  common_interrupt+0xbd/0xe0
>>>>>> [  248.346808]  </IRQ>
>>>>>> [  248.348912]  <TASK>
>>>>>> [  248.351018]  asm_common_interrupt+0x1e/0x40
>>>>>> [  248.355206] RIP: 0010:_raw_spin_unlock_irqrestore+0x1e/0x37
>>>>>> [  248.360780] Code: 02 5d c3 0f 1f 44 00 00 5d c3 66 90 0f 1f 44 00 00
>>>>>> 55 48 89 e5 c6 07 00 0f 1f 40 00 f7 c6 00 02 00 00 74 01 fb bf 01 00 00
>>>>>> 00 <e8> ed 8e 5b ff 65 8b 05 66 7e 52 78 85 c0 74 02 5d c3 0f 1f 44 00
>>>>>>
>>>>>> [  248.379525] RSP: 0018:ffffb1c3a429b958 EFLAGS: 00000206
>>>>>> [  248.384749] RAX: 0000000000000001 RBX: ffff8e5d8973fd08 RCX:
>>>>>> ffff8e5d8973fd10
>>>>>> [  248.391884] RDX: 0000000000000001 RSI: 0000000000000246 RDI:
>>>>>> 0000000000000001
>>>>>> [  248.399017] RBP: ffffb1c3a429b958 R08: 0000000000000000 R09:
>>>>>> ffffb1c3a429b970
>>>>>> [  248.406148] R10: 0000000000000c00 R11: 0000000000000001 R12:
>>>>>> 0000000000000001
>>>>>> [  248.413280] R13: 0000000000000246 R14: 0000000000000000 R15:
>>>>>> 0000000000000003
>>>>>> [  248.420415]  __wake_up_common_lock+0x8a/0xc0
>>>>>> [  248.424686]  __wake_up+0x13/0x20
>>>>>> [  248.427919]  raid10_make_request+0x101/0x170 [raid10]
>>>>>> [  248.432971]  md_handle_request+0x179/0x1e0
>>>>>> [  248.437071]  ? submit_bio_checks+0x1f6/0x5a0
>>>>>> [  248.441345]  md_submit_bio+0x6d/0xa0
>>>>>> [  248.444924]  __submit_bio+0x94/0x140
>>>>>> [  248.448504]  submit_bio_noacct+0xe1/0x2a0
>>>>>> [  248.452515]  submit_bio+0x48/0x120
>>>>>> [  248.455923]  blkdev_direct_IO+0x220/0x540
>>>>>> [  248.459935]  ? __fsnotify_parent+0xff/0x330
>>>>>> [  248.464121]  ? __fsnotify_parent+0x10f/0x330
>>>>>> [  248.468393]  ? common_interrupt+0x73/0xe0
>>>>>> [  248.472408]  generic_file_read_iter+0xa5/0x160
>>>>>> [  248.476852]  blkdev_read_iter+0x38/0x70
>>>>>> [  248.480693]  io_read+0x119/0x420
>>>>>> [  248.483923]  ? sbitmap_queue_clear_batch+0xc7/0x110
>>>>>> [  248.488805]  ? blk_mq_end_request_batch+0x378/0x490
>>>>>> [  248.493684]  io_issue_sqe+0x7ec/0x19c0
>>>>>> [  248.497436]  ? io_req_prep+0x6a9/0xe60
>>>>>> [  248.501190]  io_submit_sqes+0x2a0/0x9f0
>>>>>> [  248.505030]  ? __fget_files+0x6a/0x90
>>>>>> [  248.508693]  __x64_sys_io_uring_enter+0x1da/0x8c0
>>>>>> [  248.513401]  do_syscall_64+0x38/0x90
>>>>>> [  248.516979]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>>>> [  248.522033] RIP: 0033:0x7fc26b19b89d
>>>>>> [  248.525611] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
>>>>>> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
>>>>>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>>>>>> [  248.544360] RSP: 002b:00007fc26b07ce98 EFLAGS: 00000246 ORIG_RAX:
>>>>>> 00000000000001aa
>>>>>> [  248.551925] RAX: ffffffffffffffda RBX: 00007fc26b3f2fc0 RCX:
>>>>>> 00007fc26b19b89d
>>>>>> [  248.559058] RDX: 0000000000000020 RSI: 0000000000000020 RDI:
>>>>>> 0000000000000004
>>>>>> [  248.566189] RBP: 0000000000000020 R08: 0000000000000000 R09:
>>>>>> 0000000000000000
>>>>>> [  248.573322] R10: 0000000000000001 R11: 0000000000000246 R12:
>>>>>> 00005623a4b7a2a0
>>>>>> [  248.580456] R13: 0000000000000020 R14: 0000000000000020 R15:
>>>>>> 0000000000000020
>>>>>> [  248.587591]  </TASK>
>>>>> Do you have:
>>>>>
>>>>> commit 75feae73a28020e492fbad2323245455ef69d687
>>>>> Author: Pavel Begunkov <asml.silence@gmail.com>
>>>>> Date:   Tue Dec 7 20:16:36 2021 +0000
>>>>>
>>>>>        block: fix single bio async DIO error handling
>>>>>
>>>>> in your tree?
>>>>>
>>>> Nope. I will get it in and test. Thanks!
>>> Might be worth re-running with KASAN enabled in your config to see if
>>> that triggers anything.
>>>
>> Got this:
>> [  739.336669] CPU: 63 PID: 10373 Comm: io_uring Kdump: loaded Not
>> tainted 5.16.0-rc3+ #8
>> [  739.344583] Hardware name: Dell Inc. PowerEdge R650xs/0PPTY2, BIOS
>> 1.3.8 08/31/2021
>> [  739.352236] Call Trace:
>> [  739.354687]  <IRQ>
>> [  739.356705]  dump_stack_lvl+0x38/0x49
>> [  739.360381]  print_address_description.constprop.0+0x28/0x150
>> [  739.366136]  ? 0xffffffffc046f3df
>> [  739.369455]  kasan_report.cold+0x82/0xdb
>> [  739.373383]  ? 0xffffffffc046f3df
>> [  739.376700]  __asan_load8+0x69/0x90
>> [  739.380194]  0xffffffffc046f3df
>> [  739.383339]  ? 0xffffffffc046f340
>> [  739.386659]  ? blkcg_iolatency_done_bio+0x26/0x390
>> [  739.391461]  ? __rcu_read_unlock+0x5b/0x270
>> [  739.395655]  ? kmem_cache_alloc+0x143/0x460
>> [  739.399841]  ? mempool_alloc_slab+0x17/0x20
>> [  739.404027]  ? bio_uninit+0x6c/0xf0
>> [  739.407522]  bio_endio+0x27f/0x2a0
>> [  739.410926]  blk_update_request+0x1e8/0x750
>> [  739.415112]  blk_mq_end_request_batch+0x10b/0x9b0
>> [  739.419818]  ? blk_mq_end_request+0x460/0x460
>> [  739.424179]  ? kfree+0xa0/0x400
>> [  739.427322]  ? mempool_kfree+0xe/0x10
>> [  739.430989]  ? generic_file_write_iter+0xf0/0xf0
>> [  739.435609]  ? dma_unmap_page_attrs+0x15f/0x2c0
>> [  739.440144]  nvme_pci_complete_batch+0x34/0x160
>> [  739.444684]  ? blk_mq_complete_request_remote+0x1ca/0x2d0
>> [  739.450084]  nvme_irq+0x5fa/0x630
>> [  739.453404]  ? nvme_timeout+0x370/0x370
>> [  739.457242]  ? nvme_unmap_data+0x1e0/0x1e0
>> [  739.461340]  ? __kasan_check_write+0x14/0x20
>> [  739.465614]  ? credit_entropy_bits.constprop.0+0x76/0x190
>> [  739.471015]  ? nvme_timeout+0x370/0x370
>> [  739.474853]  __handle_irq_event_percpu+0x69/0x260
>> [  739.479568]  handle_irq_event_percpu+0x70/0xf0
>> [  739.484015]  ? __handle_irq_event_percpu+0x260/0x260
>> [  739.488981]  ? __kasan_check_write+0x14/0x20
>> [  739.493261]  ? _raw_spin_lock+0x88/0xe0
>> [  739.497109]  ? _raw_spin_lock_irqsave+0xf0/0xf0
>> [  739.501643]  handle_irq_event+0x5a/0x90
>> [  739.505480]  handle_edge_irq+0x148/0x320
>> [  739.509407]  __common_interrupt+0x75/0x130
>> [  739.513514]  common_interrupt+0xae/0xd0
>> [  739.517354]  </IRQ>
>> [  739.519460]  <TASK>
>> [  739.521567]  asm_common_interrupt+0x1e/0x40
>> [  739.525753] RIP: 0010:__asan_store8+0x37/0x90
>> [  739.530111] Code: 4f 48 b8 ff ff ff ff ff 7f ff ff 48 39 c7 76 40 48
>> 8d 47 07 48 89 c2 83 e2 07 48 83 fa 07 75 18 48 ba 00 00 00 00 00 fc ff
>> df <48> c1 e8 03 0f b6 04 10 84 c0 75 2b 5d c3 48 be 00 00 00 00 00 fc
>> [  739.548865] RSP: 0018:ffffc900307c7100 EFLAGS: 00000246
>> [  739.554092] RAX: ffffc900307c7237 RBX: ffffc900307c7780 RCX:
>> ffffffff82c9b461
>> [  739.561227] RDX: dffffc0000000000 RSI: ffffc900307c7790 RDI:
>> ffffc900307c7230
>> [  739.568358] RBP: ffffc900307c7100 R08: ffffffff82c9b202 R09:
>> ffff8882d96c0000
>> [  739.575490] R10: ffffc900307c7257 R11: fffff520060f8e4a R12:
>> 0000000082c9b201
>> [  739.582624] R13: 0000000000000000 R14: ffffc900307c7238 R15:
>> ffffc900307c71e8
>> [  739.589760]  ? update_stack_state+0x22/0x2c0
>> [  739.594038]  ? update_stack_state+0x281/0x2c0
>> [  739.598398]  update_stack_state+0x281/0x2c0
>> [  739.602585]  unwind_next_frame.part.0+0xe0/0x360
>> [  739.607204]  ? bio_alloc_bioset+0x223/0x2f0
>> [  739.611389]  ? create_prof_cpu_mask+0x30/0x30
>> [  739.615749]  ? mempool_alloc_slab+0x17/0x20
>> [  739.619935]  unwind_next_frame+0x23/0x30
>> [  739.623860]  arch_stack_walk+0x88/0xf0
>> [  739.627613]  ? bio_alloc_bioset+0x223/0x2f0
>> [  739.631800]  stack_trace_save+0x94/0xc0
>> [  739.635640]  ? filter_irq_stacks+0x70/0x70
>> [  739.639738]  ? blk_mq_put_tag+0x80/0x80
>> [  739.643576]  ? _raw_spin_unlock_irqrestore+0x23/0x40
>> [  739.648545]  ? __wake_up_common_lock+0xfd/0x150
>> [  739.653084]  kasan_save_stack+0x26/0x60
>> [  739.656924]  ? kasan_save_stack+0x26/0x60
>> [  739.660938]  ? __kasan_slab_alloc+0x6d/0x90
>> [  739.665123]  ? kmem_cache_alloc+0x143/0x460
>> [  739.669308]  ? mempool_alloc_slab+0x17/0x20
>> [  739.673494]  ? mempool_alloc+0xef/0x280
>> [  739.677333]  ? bio_alloc_bioset+0x223/0x2f0
>> [  739.681519]  ? blk_mq_rq_ctx_init.isra.0+0x28a/0x3c0
>> [  739.686489]  ? __blk_mq_alloc_requests+0x655/0x680
>> [  739.691288]  ? blkcg_iolatency_throttle+0x5d/0x760
>> [  739.696081]  ? bio_to_wbt_flags+0x47/0xf0
>> [  739.700093]  ? update_io_ticks+0x5e/0xd0
>> [  739.704021]  ? preempt_count_sub+0x18/0xc0
>> [  739.708120]  ? __kasan_check_read+0x11/0x20
>> [  739.712305]  ? blk_mq_submit_bio+0x740/0xce0
>> [  739.716580]  ? blk_mq_try_issue_list_directly+0x1b0/0x1b0
>> [  739.721987]  ? kasan_poison+0x3c/0x50
>> [  739.725650]  ? kasan_unpoison+0x28/0x50
>> [  739.729491]  __kasan_slab_alloc+0x6d/0x90
>> [  739.733503]  kmem_cache_alloc+0x143/0x460
>> [  739.737518]  mempool_alloc_slab+0x17/0x20
>> [  739.741529]  mempool_alloc+0xef/0x280
>> [  739.745194]  ? mempool_free+0x170/0x170
>> [  739.749035]  ? mempool_destroy+0x30/0x30
>> [  739.752961]  ? __fsnotify_update_child_dentry_flags.part.0+0x170/0x170
>> [  739.759498]  bio_alloc_bioset+0x223/0x2f0
>> [  739.763517]  ? __this_cpu_preempt_check+0x13/0x20
>> [  739.768223]  ? bvec_alloc+0xd0/0xd0
>> [  739.771715]  ? __fsnotify_parent+0x1ed/0x590
>> [  739.775988]  ? do_direct_IO+0x150/0x1880
>> [  739.779916]  ? submit_bio+0xb0/0x220
>> [  739.783496]  bio_alloc_kiocb+0x185/0x1c0
>> [  739.787430]  blkdev_direct_IO+0x114/0x400
>> [  739.791441]  generic_file_read_iter+0x152/0x250
>> [  739.795974]  blkdev_read_iter+0x84/0xd0
>> [  739.799815]  io_read+0x1ec/0x770
>> [  739.803056]  ? __rcu_read_unlock+0x5b/0x270
>> [  739.807240]  ? io_setup_async_rw+0x270/0x270
>> [  739.811515]  ? __sbq_wake_up+0x2d/0x1b0
>> [  739.815352]  ? __rcu_read_unlock+0x5b/0x270
>> [  739.819537]  ? sbitmap_queue_clear+0xc9/0xe0
>> [  739.823813]  ? blk_queue_exit+0x35/0x90
>> [  739.827653]  ? __blk_mq_free_request+0x111/0x160
>> [  739.832280]  io_issue_sqe+0xcac/0x27f0
>> [  739.836031]  ? blk_mq_free_plug_rqs+0x3f/0x50
>> [  739.840394]  ? io_poll_add.isra.0+0x290/0x290
>> [  739.844760]  ? io_req_prep+0xcc2/0x1bb0
>> [  739.848598]  ? io_submit_sqes+0x43b/0x1260
>> [  739.852701]  io_submit_sqes+0x5e5/0x1260
>> [  739.856634]  ? io_do_iopoll+0x561/0x720
>> [  739.860474]  ? io_wq_submit_work+0x230/0x230
>> [  739.864746]  ? __kasan_check_write+0x14/0x20
>> [  739.869016]  ? mutex_lock+0x8f/0xe0
>> [  739.872510]  ? __mutex_lock_slowpath+0x20/0x20
>> [  739.876956]  ? __rcu_read_unlock+0x5b/0x270
>> [  739.881144]  __x64_sys_io_uring_enter+0x367/0xef0
>> [  739.885859]  ? io_submit_sqes+0x1260/0x1260
>> [  739.890044]  ? __this_cpu_preempt_check+0x13/0x20
>> [  739.894749]  ? xfd_validate_state+0x3c/0xd0
>> [  739.898936]  ? __schedule+0x5be/0x10c0
>> [  739.902687]  ? restore_fpregs_from_fpstate+0xa2/0x170
>> [  739.907741]  ? kernel_fpu_begin_mask+0x170/0x170
>> [  739.912362]  ? debug_smp_processor_id+0x17/0x20
>> [  739.916903]  ? debug_smp_processor_id+0x17/0x20
>> [  739.921434]  ? fpregs_assert_state_consistent+0x5f/0x70
>> [  739.926662]  ? exit_to_user_mode_prepare+0x4b/0x1e0
>> [  739.931549]  do_syscall_64+0x38/0x90
>> [  739.935129]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>> [  739.940182] RIP: 0033:0x7f7345d7d89d
>> [  739.943759] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa
>> 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
>> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>> [  739.962513] RSP: 002b:00007f7345c5ee98 EFLAGS: 00000246 ORIG_RAX:
>> 00000000000001aa
>> [  739.970081] RAX: ffffffffffffffda RBX: 00007f7345fd2fc0 RCX:
>> 00007f7345d7d89d
>> [  739.977216] RDX: 0000000000000000 RSI: 0000000000000020 RDI:
>> 0000000000000004
>> [  739.984349] RBP: 0000000000000020 R08: 0000000000000000 R09:
>> 0000000000000000
>> [  739.991481] R10: 0000000000000000 R11: 0000000000000246 R12:
>> 000055be1d11e2a0
>> [  739.998612] R13: 0000000000000020 R14: 0000000000000000 R15:
>> 0000000000000020
>> [  740.005749]  </TASK>
>> [  740.007945]
>> [  740.009444] Allocated by task 10373:
>> [  740.013084]
>> [  740.014583] Freed by task 10373:
>> [  740.017879]
>> [  740.019378] The buggy address belongs to the object at ffff88c1be016e00
>> [  740.019378]  which belongs to the cache kmalloc-256 of size 256
>> [  740.031886] The buggy address is located 40 bytes inside of
>> [  740.031886]  256-byte region [ffff88c1be016e00, ffff88c1be016f00)
>> [  740.043534] The buggy address belongs to the page:
>> [  740.048345]
>> [  740.049840] Memory state around the buggy address:
>> [  740.054636]  ffff88c1be016d00: fc fc fc fc fc fc fc fc fc fc fc fc fc
>> fc fc fc
>> [  740.061854]  ffff88c1be016d80: fc fc fc fc fc fc fc fc fc fc fc fc fc
>> fc fc fc
>> [  740.069074] >ffff88c1be016e00: fa fb fb fb fb fb fb fb fb fb fb fb fb
>> fb fb fb
>> [  740.076294]                                   ^
>> [  740.080827]  ffff88c1be016e80: fb fb fb fb fb fb fb fb fb fb fb fb fb
>> fb fb fb
>> [  740.088045]  ffff88c1be016f00: fc fc fc fc fc fc fc fc fc fc fc fc fc
>> fc fc fc
>> [  740.095265]
>> ==================================================================
>> [  740.102497] kernel BUG at mm/slub.c:379!
>> [  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
>>
> What's the exact command line that triggers this? I am not able to
> trigger it with
> either fio or t/io_uring.
>
> Song
I only had 1 nvme so was creating 4 partitions on it and creating a 
raid10 and doing:

mdadm -C /dev/md10 -l 10 -n 4 /dev/nvme4n1p1 /dev/nvme4n1p2 
/dev/nvme4n1p3 /dev/nvme4n1p4
./t/io_uring /dev/md10 -d 256 -p 0 -a 0 -r 100

on top of commit: c14704e1cb556 (md-next branch) + "md: add support for 
REQ_NOWAIT" patch
Also, applied the commit (75feae73a28) Jens pointed earlier today.


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-16 20:37                                                       ` Vishal Verma
@ 2021-12-16 23:50                                                         ` Song Liu
       [not found]                                                           ` <bd90d6e6-adb4-2696-3110-fad0b1ee00dc@digitalocean.com>
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-16 23:50 UTC (permalink / raw)
  To: Vishal Verma; +Cc: Jens Axboe, linux-raid, rgoldwyn

On Thu, Dec 16, 2021 at 12:38 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
[...]

> >> [  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
> >>
> > What's the exact command line that triggers this? I am not able to
> > trigger it with
> > either fio or t/io_uring.
> >
> > Song
> I only had 1 nvme so was creating 4 partitions on it and creating a
> raid10 and doing:
>
> mdadm -C /dev/md10 -l 10 -n 4 /dev/nvme4n1p1 /dev/nvme4n1p2
> /dev/nvme4n1p3 /dev/nvme4n1p4
> ./t/io_uring /dev/md10-d 256 -p 0 -a 0 -r 100
>
> on top of commit: c14704e1cb556 (md-next branch) + "md: add support for
> REQ_NOWAIT" patch
> Also, applied the commit (75feae73a28) Jens pointed earlier today.
>

I am able to trigger the following error. I will look into it.

Thanks,
Song

[ 1583.149004] ==================================================================
[ 1583.150100] BUG: KASAN: use-after-free in raid10_end_read_request+0x91/0x310
[ 1583.151042] Read of size 8 at addr ffff888160a1c928 by task io_uring/1165
[ 1583.152016]
[ 1583.152247] CPU: 0 PID: 1165 Comm: io_uring Not tainted 5.16.0-rc3+ #660
[ 1583.153159] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.13.0-2.module_el8.4.0+547+a85d02ba 04/01/2014
[ 1583.154572] Call Trace:
[ 1583.155005]  <IRQ>
[ 1583.155338]  dump_stack_lvl+0x44/0x57
[ 1583.155950]  print_address_description.constprop.8.cold.17+0x12/0x339
[ 1583.156969]  ? raid10_end_read_request+0x91/0x310
[ 1583.157578]  ? raid10_end_read_request+0x91/0x310
[ 1583.158272]  kasan_report.cold.18+0x83/0xdf
[ 1583.158889]  ? raid10_end_read_request+0x91/0x310
[ 1583.159554]  raid10_end_read_request+0x91/0x310
[ 1583.160201]  ? raid10_resize+0x270/0x270
[ 1583.160724]  ? bio_uninit+0xc7/0x1e0
[ 1583.161274]  blk_update_request+0x21f/0x810
[ 1583.161893]  blk_mq_end_request_batch+0x11c/0xa70
[ 1583.162497]  ? blk_mq_end_request+0x460/0x460
[ 1583.163204]  ? nvme_complete_batch_req+0x12/0x30
[ 1583.163888]  nvme_irq+0x6ad/0x6f0
[ 1583.164354]  ? io_queue_count_set+0xe0/0xe0
[ 1583.164980]  ? nvme_unmap_data+0x1e0/0x1e0
[ 1583.165504]  ? rcu_read_lock_bh_held+0xb0/0xb0
[ 1583.166149]  ? io_queue_count_set+0xe0/0xe0
[ 1583.166721]  __handle_irq_event_percpu+0x79/0x440
[ 1583.167446]  handle_irq_event_percpu+0x6f/0xe0
[ 1583.168101]  ? __handle_irq_event_percpu+0x440/0x440
[ 1583.168734]  ? lock_contended+0x6e0/0x6e0
[ 1583.169349]  ? do_raw_spin_unlock+0xa2/0x130
[ 1583.169961]  handle_irq_event+0x54/0x90
[ 1583.170442]  handle_edge_irq+0x121/0x300
[ 1583.171012]  __common_interrupt+0x7d/0x170
[ 1583.171538]  common_interrupt+0xa0/0xc0
[ 1583.172103]  </IRQ>
[ 1583.172389]  <TASK>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
       [not found]                                                           ` <bd90d6e6-adb4-2696-3110-fad0b1ee00dc@digitalocean.com>
@ 2021-12-21  8:13                                                             ` Song Liu
  2021-12-21 15:29                                                               ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-21  8:13 UTC (permalink / raw)
  To: Vishal Verma; +Cc: Jens Axboe, linux-raid, rgoldwyn

On Mon, Dec 20, 2021 at 6:22 AM Vishal Verma <vverma@digitalocean.com> wrote:
>
>
> On 12/16/21 4:50 PM, Song Liu wrote:
>
> On Thu, Dec 16, 2021 at 12:38 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> [...]
>
> [  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
>
> What's the exact command line that triggers this? I am not able to
> trigger it with
> either fio or t/io_uring.
>
> Song
>
> I only had 1 nvme so was creating 4 partitions on it and creating a
> raid10 and doing:
>
> mdadm -C /dev/md10 -l 10 -n 4 /dev/nvme4n1p1 /dev/nvme4n1p2
> /dev/nvme4n1p3 /dev/nvme4n1p4
> ./t/io_uring /dev/md10-d 256 -p 0 -a 0 -r 100
>
> on top of commit: c14704e1cb556 (md-next branch) + "md: add support for
> REQ_NOWAIT" patch
> Also, applied the commit (75feae73a28) Jens pointed earlier today.
>
> I am able to trigger the following error. I will look into it.
>
> Thanks,
> Song
>
> [ 1583.149004] ==================================================================
> [ 1583.150100] BUG: KASAN: use-after-free in raid10_end_read_request+0x91/0x310
> [ 1583.151042] Read of size 8 at addr ffff888160a1c928 by task io_uring/1165
> [ 1583.152016]
> [ 1583.152247] CPU: 0 PID: 1165 Comm: io_uring Not tainted 5.16.0-rc3+ #660
> [ 1583.153159] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS 1.13.0-2.module_el8.4.0+547+a85d02ba 04/01/2014
> [ 1583.154572] Call Trace:
> [ 1583.155005]  <IRQ>
> [ 1583.155338]  dump_stack_lvl+0x44/0x57
> [ 1583.155950]  print_address_description.constprop.8.cold.17+0x12/0x339
> [ 1583.156969]  ? raid10_end_read_request+0x91/0x310
> [ 1583.157578]  ? raid10_end_read_request+0x91/0x310
> [ 1583.158272]  kasan_report.cold.18+0x83/0xdf
> [ 1583.158889]  ? raid10_end_read_request+0x91/0x310
> [ 1583.159554]  raid10_end_read_request+0x91/0x310
> [ 1583.160201]  ? raid10_resize+0x270/0x270
> [ 1583.160724]  ? bio_uninit+0xc7/0x1e0
> [ 1583.161274]  blk_update_request+0x21f/0x810
> [ 1583.161893]  blk_mq_end_request_batch+0x11c/0xa70
> [ 1583.162497]  ? blk_mq_end_request+0x460/0x460
> [ 1583.163204]  ? nvme_complete_batch_req+0x12/0x30
> [ 1583.163888]  nvme_irq+0x6ad/0x6f0
> [ 1583.164354]  ? io_queue_count_set+0xe0/0xe0
> [ 1583.164980]  ? nvme_unmap_data+0x1e0/0x1e0
> [ 1583.165504]  ? rcu_read_lock_bh_held+0xb0/0xb0
> [ 1583.166149]  ? io_queue_count_set+0xe0/0xe0
> [ 1583.166721]  __handle_irq_event_percpu+0x79/0x440
> [ 1583.167446]  handle_irq_event_percpu+0x6f/0xe0
> [ 1583.168101]  ? __handle_irq_event_percpu+0x440/0x440
> [ 1583.168734]  ? lock_contended+0x6e0/0x6e0
> [ 1583.169349]  ? do_raw_spin_unlock+0xa2/0x130
> [ 1583.169961]  handle_irq_event+0x54/0x90
> [ 1583.170442]  handle_edge_irq+0x121/0x300
> [ 1583.171012]  __common_interrupt+0x7d/0x170
> [ 1583.171538]  common_interrupt+0xa0/0xc0
> [ 1583.172103]  </IRQ>
> [ 1583.172389]  <TASK>
>
> When running t/io_uring on a raid1 array, I get following:
>
> [  189.863726] RIP: 0010:__kmalloc+0xfa/0x430
> [  189.867825] Code: 05 4b 9a 35 43 48 8b 50 08 48 83 78 10 00 4c 8b 20 0f 84 fa 02 00 00 4d 85 e4 0f 84 f1 02 00 00 41 8b 47 28 49 8b 3f 4c 01 e0 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
> [  189.886573] RSP: 0018:ffffaf09e28b7828 EFLAGS: 00010286
> [  189.891799] RAX: a0fa1099d2b0fff3 RBX: 0000000000092900 RCX: 0000000000000000
> [  189.898930] RDX: 00000002ba79600b RSI: 0000000000092900 RDI: 00000000000340e0
> [  189.906062] RBP: ffffaf09e28b7860 R08: ffff90fb8b6ea560 R09: ffff90fba7205f60
> [  189.913195] R10: ffffaf09e28b7c18 R11: 0000000000000000 R12: a0fa1099d2b0ffb3
> [  189.920329] R13: 0000000000000000 R14: ffffffffc074c277 R15: ffff90bc00044700
> [  189.927461] FS:  00007fd6209d7700(0000) GS:ffff913a6e140000(0000) knlGS:0000000000000000
> [  189.935549] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  189.941295] CR2: 00007f16998bebf0 CR3: 00000040be512005 CR4: 0000000000770ee0
> [  189.948426] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  189.955560] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  189.962691] PKRU: 55555554
> [  189.965403] Call Trace:
> [  189.967857]  <TASK>
> [  189.969966]  0xffffffffc074c277
> [  189.973110]  mempool_alloc+0x61/0x180
> [  189.976777]  ? bio_associate_blkg_from_css+0xf5/0x2c0
> [  189.981829]  ? __bio_clone_fast+0xa9/0xf0
> [  189.985842]  ? __sbitmap_get_word+0x36/0x80
> [  189.990027]  0xffffffffc074ac50
> [  189.993174]  ? __sbitmap_queue_get+0x9/0x10
> [  189.997359]  ? blk_mq_get_tag+0x241/0x270
> [  190.001373]  ? ktime_get+0x3b/0xa0
> [  190.004776]  ? blk_mq_rq_ctx_init.isra.0+0x1a5/0x1c0
> [  190.009743]  0xffffffffc074efb3
> [  190.012891]  md_handle_request+0x134/0x1b0
> [  190.016989]  ? ktime_get+0x3b/0xa0
> [  190.020395]  md_submit_bio+0x6d/0xa0
> [  190.023976]  __submit_bio+0x94/0x140
> [  190.027555]  submit_bio_noacct+0xe1/0x2a0
> [  190.031566]  submit_bio+0x48/0x120
> [  190.034972]  blkdev_direct_IO+0x19b/0x540
> [  190.038987]  ? __fsnotify_parent+0xff/0x330
> [  190.043172]  ? __fsnotify_parent+0x10f/0x330
> [  190.047445]  generic_file_read_iter+0xa5/0x160
> [  190.051889]  blkdev_read_iter+0x38/0x70
> [  190.055731]  io_read+0x119/0x420
> [  190.058963]  ? blk_queue_exit+0x23/0x50
> [  190.062801]  ? __blk_mq_free_request+0x86/0xc0
> [  190.067247]  io_issue_sqe+0x7ec/0x19c0
> [  190.071002]  ? io_req_prep+0x6a9/0xe60
> [  190.074754]  io_submit_sqes+0x2a0/0x9f0
> [  190.078594]  ? __fget_files+0x6a/0x90
> [  190.082259]  __x64_sys_io_uring_enter+0x1da/0x8c0
> [  190.086965]  ? debug_smp_processor_id+0x17/0x20
> [  190.091498]  ? fpregs_assert_state_consistent+0x23/0x50
> [  190.096723]  ? exit_to_user_mode_prepare+0x4b/0x1e0
> [  190.101602]  do_syscall_64+0x38/0x90
> [  190.105182]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  190.110236] RIP: 0033:0x7fd620af589d
> [  190.113815] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
> [  190.132563] RSP: 002b:00007fd6209d6e98 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
> [  190.140126] RAX: ffffffffffffffda RBX: 00007fd620d4bfc0 RCX: 00007fd620af589d
> [  190.147261] RDX: 0000000000000000 RSI: 0000000000000020 RDI: 0000000000000004
> [  190.154391] RBP: 0000000000000020 R08: 0000000000000000 R09: 0000000000000000
> [  190.161524] R10: 0000000000000000 R11: 0000000000000246 R12: 0000561889c472a0
> [  190.168657] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000020
> [  190.175793]  </TASK>
>
> It seems this issue is getting triggered with the following commit:
>
> commit 5b13bc8a3fd519d86e5b1a0b1d1b996cace62f3f
> Author: Christoph Hellwig <hch@lst.de>
> Date:   Wed Nov 24 07:28:56 2021 +0100
>
>     blk-mq: cleanup request allocation

Good finding. I am not able to repro these issues after reverting this commit.

Vishal, how does it work in your tests?

Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-21  8:13                                                             ` Song Liu
@ 2021-12-21 15:29                                                               ` Vishal Verma
  2021-12-21 15:59                                                                 ` Jens Axboe
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-21 15:29 UTC (permalink / raw)
  To: Song Liu; +Cc: Jens Axboe, linux-raid, rgoldwyn


On 12/21/21 1:13 AM, Song Liu wrote:
> On Mon, Dec 20, 2021 at 6:22 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>
>> On 12/16/21 4:50 PM, Song Liu wrote:
>>
>> On Thu, Dec 16, 2021 at 12:38 PM Vishal Verma <vverma@digitalocean.com> wrote:
>>
>> [...]
>>
>> [  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
>>
>> What's the exact command line that triggers this? I am not able to
>> trigger it with
>> either fio or t/io_uring.
>>
>> Song
>>
>> I only had 1 nvme so was creating 4 partitions on it and creating a
>> raid10 and doing:
>>
>> mdadm -C /dev/md10 -l 10 -n 4 /dev/nvme4n1p1 /dev/nvme4n1p2
>> /dev/nvme4n1p3 /dev/nvme4n1p4
>> ./t/io_uring /dev/md10 -d 256 -p 0 -a 0 -r 100
>>
>> on top of commit: c14704e1cb556 (md-next branch) + "md: add support for
>> REQ_NOWAIT" patch
>> Also, applied the commit (75feae73a28) Jens pointed earlier today.
>>
>> I am able to trigger the following error. I will look into it.
>>
>> Thanks,
>> Song
>>
>> [ 1583.149004] ==================================================================
>> [ 1583.150100] BUG: KASAN: use-after-free in raid10_end_read_request+0x91/0x310
>> [ 1583.151042] Read of size 8 at addr ffff888160a1c928 by task io_uring/1165
>> [ 1583.152016]
>> [ 1583.152247] CPU: 0 PID: 1165 Comm: io_uring Not tainted 5.16.0-rc3+ #660
>> [ 1583.153159] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>> BIOS 1.13.0-2.module_el8.4.0+547+a85d02ba 04/01/2014
>> [ 1583.154572] Call Trace:
>> [ 1583.155005]  <IRQ>
>> [ 1583.155338]  dump_stack_lvl+0x44/0x57
>> [ 1583.155950]  print_address_description.constprop.8.cold.17+0x12/0x339
>> [ 1583.156969]  ? raid10_end_read_request+0x91/0x310
>> [ 1583.157578]  ? raid10_end_read_request+0x91/0x310
>> [ 1583.158272]  kasan_report.cold.18+0x83/0xdf
>> [ 1583.158889]  ? raid10_end_read_request+0x91/0x310
>> [ 1583.159554]  raid10_end_read_request+0x91/0x310
>> [ 1583.160201]  ? raid10_resize+0x270/0x270
>> [ 1583.160724]  ? bio_uninit+0xc7/0x1e0
>> [ 1583.161274]  blk_update_request+0x21f/0x810
>> [ 1583.161893]  blk_mq_end_request_batch+0x11c/0xa70
>> [ 1583.162497]  ? blk_mq_end_request+0x460/0x460
>> [ 1583.163204]  ? nvme_complete_batch_req+0x12/0x30
>> [ 1583.163888]  nvme_irq+0x6ad/0x6f0
>> [ 1583.164354]  ? io_queue_count_set+0xe0/0xe0
>> [ 1583.164980]  ? nvme_unmap_data+0x1e0/0x1e0
>> [ 1583.165504]  ? rcu_read_lock_bh_held+0xb0/0xb0
>> [ 1583.166149]  ? io_queue_count_set+0xe0/0xe0
>> [ 1583.166721]  __handle_irq_event_percpu+0x79/0x440
>> [ 1583.167446]  handle_irq_event_percpu+0x6f/0xe0
>> [ 1583.168101]  ? __handle_irq_event_percpu+0x440/0x440
>> [ 1583.168734]  ? lock_contended+0x6e0/0x6e0
>> [ 1583.169349]  ? do_raw_spin_unlock+0xa2/0x130
>> [ 1583.169961]  handle_irq_event+0x54/0x90
>> [ 1583.170442]  handle_edge_irq+0x121/0x300
>> [ 1583.171012]  __common_interrupt+0x7d/0x170
>> [ 1583.171538]  common_interrupt+0xa0/0xc0
>> [ 1583.172103]  </IRQ>
>> [ 1583.172389]  <TASK>
>>
>> When running t/io_uring on a raid1 array, I get following:
>>
>> [  189.863726] RIP: 0010:__kmalloc+0xfa/0x430
>> [  189.867825] Code: 05 4b 9a 35 43 48 8b 50 08 48 83 78 10 00 4c 8b 20 0f 84 fa 02 00 00 4d 85 e4 0f 84 f1 02 00 00 41 8b 47 28 49 8b 3f 4c 01 e0 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
>> [  189.886573] RSP: 0018:ffffaf09e28b7828 EFLAGS: 00010286
>> [  189.891799] RAX: a0fa1099d2b0fff3 RBX: 0000000000092900 RCX: 0000000000000000
>> [  189.898930] RDX: 00000002ba79600b RSI: 0000000000092900 RDI: 00000000000340e0
>> [  189.906062] RBP: ffffaf09e28b7860 R08: ffff90fb8b6ea560 R09: ffff90fba7205f60
>> [  189.913195] R10: ffffaf09e28b7c18 R11: 0000000000000000 R12: a0fa1099d2b0ffb3
>> [  189.920329] R13: 0000000000000000 R14: ffffffffc074c277 R15: ffff90bc00044700
>> [  189.927461] FS:  00007fd6209d7700(0000) GS:ffff913a6e140000(0000) knlGS:0000000000000000
>> [  189.935549] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  189.941295] CR2: 00007f16998bebf0 CR3: 00000040be512005 CR4: 0000000000770ee0
>> [  189.948426] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> [  189.955560] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> [  189.962691] PKRU: 55555554
>> [  189.965403] Call Trace:
>> [  189.967857]  <TASK>
>> [  189.969966]  0xffffffffc074c277
>> [  189.973110]  mempool_alloc+0x61/0x180
>> [  189.976777]  ? bio_associate_blkg_from_css+0xf5/0x2c0
>> [  189.981829]  ? __bio_clone_fast+0xa9/0xf0
>> [  189.985842]  ? __sbitmap_get_word+0x36/0x80
>> [  189.990027]  0xffffffffc074ac50
>> [  189.993174]  ? __sbitmap_queue_get+0x9/0x10
>> [  189.997359]  ? blk_mq_get_tag+0x241/0x270
>> [  190.001373]  ? ktime_get+0x3b/0xa0
>> [  190.004776]  ? blk_mq_rq_ctx_init.isra.0+0x1a5/0x1c0
>> [  190.009743]  0xffffffffc074efb3
>> [  190.012891]  md_handle_request+0x134/0x1b0
>> [  190.016989]  ? ktime_get+0x3b/0xa0
>> [  190.020395]  md_submit_bio+0x6d/0xa0
>> [  190.023976]  __submit_bio+0x94/0x140
>> [  190.027555]  submit_bio_noacct+0xe1/0x2a0
>> [  190.031566]  submit_bio+0x48/0x120
>> [  190.034972]  blkdev_direct_IO+0x19b/0x540
>> [  190.038987]  ? __fsnotify_parent+0xff/0x330
>> [  190.043172]  ? __fsnotify_parent+0x10f/0x330
>> [  190.047445]  generic_file_read_iter+0xa5/0x160
>> [  190.051889]  blkdev_read_iter+0x38/0x70
>> [  190.055731]  io_read+0x119/0x420
>> [  190.058963]  ? blk_queue_exit+0x23/0x50
>> [  190.062801]  ? __blk_mq_free_request+0x86/0xc0
>> [  190.067247]  io_issue_sqe+0x7ec/0x19c0
>> [  190.071002]  ? io_req_prep+0x6a9/0xe60
>> [  190.074754]  io_submit_sqes+0x2a0/0x9f0
>> [  190.078594]  ? __fget_files+0x6a/0x90
>> [  190.082259]  __x64_sys_io_uring_enter+0x1da/0x8c0
>> [  190.086965]  ? debug_smp_processor_id+0x17/0x20
>> [  190.091498]  ? fpregs_assert_state_consistent+0x23/0x50
>> [  190.096723]  ? exit_to_user_mode_prepare+0x4b/0x1e0
>> [  190.101602]  do_syscall_64+0x38/0x90
>> [  190.105182]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>> [  190.110236] RIP: 0033:0x7fd620af589d
>> [  190.113815] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>> [  190.132563] RSP: 002b:00007fd6209d6e98 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
>> [  190.140126] RAX: ffffffffffffffda RBX: 00007fd620d4bfc0 RCX: 00007fd620af589d
>> [  190.147261] RDX: 0000000000000000 RSI: 0000000000000020 RDI: 0000000000000004
>> [  190.154391] RBP: 0000000000000020 R08: 0000000000000000 R09: 0000000000000000
>> [  190.161524] R10: 0000000000000000 R11: 0000000000000246 R12: 0000561889c472a0
>> [  190.168657] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000020
>> [  190.175793]  </TASK>
>>
>> It seems this issue is getting triggered with the following commit:
>>
>> commit 5b13bc8a3fd519d86e5b1a0b1d1b996cace62f3f
>> Author: Christoph Hellwig <hch@lst.de>
>> Date:   Wed Nov 24 07:28:56 2021 +0100
>>
>>      blk-mq: cleanup request allocation
> Good finding. I am not able to repro these issues after reverting this commit.
>
> Vishal, how does it work in your tests?
Same. I haven't seen any issue in my tests (io_uring, aio) after 
reverting this commit.
Should I go ahead and send v6 patchset after incorporating your prev 
feedback?
>
> Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-21 15:29                                                               ` Vishal Verma
@ 2021-12-21 15:59                                                                 ` Jens Axboe
  2021-12-21 16:26                                                                   ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Jens Axboe @ 2021-12-21 15:59 UTC (permalink / raw)
  To: Vishal Verma, Song Liu; +Cc: linux-raid, rgoldwyn

On 12/21/21 8:29 AM, Vishal Verma wrote:
> 
> On 12/21/21 1:13 AM, Song Liu wrote:
>> On Mon, Dec 20, 2021 at 6:22 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>>
>>> On 12/16/21 4:50 PM, Song Liu wrote:
>>>
>>> On Thu, Dec 16, 2021 at 12:38 PM Vishal Verma <vverma@digitalocean.com> wrote:
>>>
>>> [...]
>>>
>>> [  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
>>>
>>> What's the exact command line that triggers this? I am not able to
>>> trigger it with
>>> either fio or t/io_uring.
>>>
>>> Song
>>>
>>> I only had 1 nvme so was creating 4 partitions on it and creating a
>>> raid10 and doing:
>>>
>>> mdadm -C /dev/md10 -l 10 -n 4 /dev/nvme4n1p1 /dev/nvme4n1p2
>>> /dev/nvme4n1p3 /dev/nvme4n1p4
>>> ./t/io_uring /dev/md10 -d 256 -p 0 -a 0 -r 100
>>>
>>> on top of commit: c14704e1cb556 (md-next branch) + "md: add support for
>>> REQ_NOWAIT" patch
>>> Also, applied the commit (75feae73a28) Jens pointed earlier today.
>>>
>>> I am able to trigger the following error. I will look into it.
>>>
>>> Thanks,
>>> Song
>>>
>>> [ 1583.149004] ==================================================================
>>> [ 1583.150100] BUG: KASAN: use-after-free in raid10_end_read_request+0x91/0x310
>>> [ 1583.151042] Read of size 8 at addr ffff888160a1c928 by task io_uring/1165
>>> [ 1583.152016]
>>> [ 1583.152247] CPU: 0 PID: 1165 Comm: io_uring Not tainted 5.16.0-rc3+ #660
>>> [ 1583.153159] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>>> BIOS 1.13.0-2.module_el8.4.0+547+a85d02ba 04/01/2014
>>> [ 1583.154572] Call Trace:
>>> [ 1583.155005]  <IRQ>
>>> [ 1583.155338]  dump_stack_lvl+0x44/0x57
>>> [ 1583.155950]  print_address_description.constprop.8.cold.17+0x12/0x339
>>> [ 1583.156969]  ? raid10_end_read_request+0x91/0x310
>>> [ 1583.157578]  ? raid10_end_read_request+0x91/0x310
>>> [ 1583.158272]  kasan_report.cold.18+0x83/0xdf
>>> [ 1583.158889]  ? raid10_end_read_request+0x91/0x310
>>> [ 1583.159554]  raid10_end_read_request+0x91/0x310
>>> [ 1583.160201]  ? raid10_resize+0x270/0x270
>>> [ 1583.160724]  ? bio_uninit+0xc7/0x1e0
>>> [ 1583.161274]  blk_update_request+0x21f/0x810
>>> [ 1583.161893]  blk_mq_end_request_batch+0x11c/0xa70
>>> [ 1583.162497]  ? blk_mq_end_request+0x460/0x460
>>> [ 1583.163204]  ? nvme_complete_batch_req+0x12/0x30
>>> [ 1583.163888]  nvme_irq+0x6ad/0x6f0
>>> [ 1583.164354]  ? io_queue_count_set+0xe0/0xe0
>>> [ 1583.164980]  ? nvme_unmap_data+0x1e0/0x1e0
>>> [ 1583.165504]  ? rcu_read_lock_bh_held+0xb0/0xb0
>>> [ 1583.166149]  ? io_queue_count_set+0xe0/0xe0
>>> [ 1583.166721]  __handle_irq_event_percpu+0x79/0x440
>>> [ 1583.167446]  handle_irq_event_percpu+0x6f/0xe0
>>> [ 1583.168101]  ? __handle_irq_event_percpu+0x440/0x440
>>> [ 1583.168734]  ? lock_contended+0x6e0/0x6e0
>>> [ 1583.169349]  ? do_raw_spin_unlock+0xa2/0x130
>>> [ 1583.169961]  handle_irq_event+0x54/0x90
>>> [ 1583.170442]  handle_edge_irq+0x121/0x300
>>> [ 1583.171012]  __common_interrupt+0x7d/0x170
>>> [ 1583.171538]  common_interrupt+0xa0/0xc0
>>> [ 1583.172103]  </IRQ>
>>> [ 1583.172389]  <TASK>
>>>
>>> When running t/io_uring on a raid1 array, I get following:
>>>
>>> [  189.863726] RIP: 0010:__kmalloc+0xfa/0x430
>>> [  189.867825] Code: 05 4b 9a 35 43 48 8b 50 08 48 83 78 10 00 4c 8b 20 0f 84 fa 02 00 00 4d 85 e4 0f 84 f1 02 00 00 41 8b 47 28 49 8b 3f 4c 01 e0 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
>>> [  189.886573] RSP: 0018:ffffaf09e28b7828 EFLAGS: 00010286
>>> [  189.891799] RAX: a0fa1099d2b0fff3 RBX: 0000000000092900 RCX: 0000000000000000
>>> [  189.898930] RDX: 00000002ba79600b RSI: 0000000000092900 RDI: 00000000000340e0
>>> [  189.906062] RBP: ffffaf09e28b7860 R08: ffff90fb8b6ea560 R09: ffff90fba7205f60
>>> [  189.913195] R10: ffffaf09e28b7c18 R11: 0000000000000000 R12: a0fa1099d2b0ffb3
>>> [  189.920329] R13: 0000000000000000 R14: ffffffffc074c277 R15: ffff90bc00044700
>>> [  189.927461] FS:  00007fd6209d7700(0000) GS:ffff913a6e140000(0000) knlGS:0000000000000000
>>> [  189.935549] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [  189.941295] CR2: 00007f16998bebf0 CR3: 00000040be512005 CR4: 0000000000770ee0
>>> [  189.948426] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>> [  189.955560] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>> [  189.962691] PKRU: 55555554
>>> [  189.965403] Call Trace:
>>> [  189.967857]  <TASK>
>>> [  189.969966]  0xffffffffc074c277
>>> [  189.973110]  mempool_alloc+0x61/0x180
>>> [  189.976777]  ? bio_associate_blkg_from_css+0xf5/0x2c0
>>> [  189.981829]  ? __bio_clone_fast+0xa9/0xf0
>>> [  189.985842]  ? __sbitmap_get_word+0x36/0x80
>>> [  189.990027]  0xffffffffc074ac50
>>> [  189.993174]  ? __sbitmap_queue_get+0x9/0x10
>>> [  189.997359]  ? blk_mq_get_tag+0x241/0x270
>>> [  190.001373]  ? ktime_get+0x3b/0xa0
>>> [  190.004776]  ? blk_mq_rq_ctx_init.isra.0+0x1a5/0x1c0
>>> [  190.009743]  0xffffffffc074efb3
>>> [  190.012891]  md_handle_request+0x134/0x1b0
>>> [  190.016989]  ? ktime_get+0x3b/0xa0
>>> [  190.020395]  md_submit_bio+0x6d/0xa0
>>> [  190.023976]  __submit_bio+0x94/0x140
>>> [  190.027555]  submit_bio_noacct+0xe1/0x2a0
>>> [  190.031566]  submit_bio+0x48/0x120
>>> [  190.034972]  blkdev_direct_IO+0x19b/0x540
>>> [  190.038987]  ? __fsnotify_parent+0xff/0x330
>>> [  190.043172]  ? __fsnotify_parent+0x10f/0x330
>>> [  190.047445]  generic_file_read_iter+0xa5/0x160
>>> [  190.051889]  blkdev_read_iter+0x38/0x70
>>> [  190.055731]  io_read+0x119/0x420
>>> [  190.058963]  ? blk_queue_exit+0x23/0x50
>>> [  190.062801]  ? __blk_mq_free_request+0x86/0xc0
>>> [  190.067247]  io_issue_sqe+0x7ec/0x19c0
>>> [  190.071002]  ? io_req_prep+0x6a9/0xe60
>>> [  190.074754]  io_submit_sqes+0x2a0/0x9f0
>>> [  190.078594]  ? __fget_files+0x6a/0x90
>>> [  190.082259]  __x64_sys_io_uring_enter+0x1da/0x8c0
>>> [  190.086965]  ? debug_smp_processor_id+0x17/0x20
>>> [  190.091498]  ? fpregs_assert_state_consistent+0x23/0x50
>>> [  190.096723]  ? exit_to_user_mode_prepare+0x4b/0x1e0
>>> [  190.101602]  do_syscall_64+0x38/0x90
>>> [  190.105182]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>>> [  190.110236] RIP: 0033:0x7fd620af589d
>>> [  190.113815] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>>> [  190.132563] RSP: 002b:00007fd6209d6e98 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
>>> [  190.140126] RAX: ffffffffffffffda RBX: 00007fd620d4bfc0 RCX: 00007fd620af589d
>>> [  190.147261] RDX: 0000000000000000 RSI: 0000000000000020 RDI: 0000000000000004
>>> [  190.154391] RBP: 0000000000000020 R08: 0000000000000000 R09: 0000000000000000
>>> [  190.161524] R10: 0000000000000000 R11: 0000000000000246 R12: 0000561889c472a0
>>> [  190.168657] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000020
>>> [  190.175793]  </TASK>
>>>
>>> It seems this issue is getting triggered with the following commit:
>>>
>>> commit 5b13bc8a3fd519d86e5b1a0b1d1b996cace62f3f
>>> Author: Christoph Hellwig <hch@lst.de>
>>> Date:   Wed Nov 24 07:28:56 2021 +0100
>>>
>>>      blk-mq: cleanup request allocation
>> Good finding. I am not able to repro these issues after reverting this commit.
>>
>> Vishal, how does it work in your tests?
> Same. I haven't seen any issue in my tests (io_uring, aio) after 
> reverting this commit.
> Should I go ahead and send v6 patchset after incorporating your prev 
> feedback?

Do you have this one:

commit a08ed9aae8a3d2321ef378d6581cc87a3fb75b44
Author: Jens Axboe <axboe@kernel.dk>
Date:   Thu Dec 2 12:43:46 2021 -0700

    block: fix double bio queue when merging in cached request path

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v5 3/4] md: raid10 add nowait support
  2021-12-21 15:59                                                                 ` Jens Axboe
@ 2021-12-21 16:26                                                                   ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-21 16:26 UTC (permalink / raw)
  To: Jens Axboe, Song Liu; +Cc: linux-raid, rgoldwyn


On 12/21/21 8:59 AM, Jens Axboe wrote:
> On 12/21/21 8:29 AM, Vishal Verma wrote:
>> On 12/21/21 1:13 AM, Song Liu wrote:
>>> On Mon, Dec 20, 2021 at 6:22 AM Vishal Verma <vverma@digitalocean.com> wrote:
>>>> On 12/16/21 4:50 PM, Song Liu wrote:
>>>>
>>>> On Thu, Dec 16, 2021 at 12:38 PM Vishal Verma <vverma@digitalocean.com> wrote:
>>>>
>>>> [...]
>>>>
>>>> [  740.106431] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
>>>>
>>>> What's the exact command line that triggers this? I am not able to
>>>> trigger it with
>>>> either fio or t/io_uring.
>>>>
>>>> Song
>>>>
>>>> I only had 1 nvme so was creating 4 partitions on it and creating a
>>>> raid10 and doing:
>>>>
>>>> mdadm -C /dev/md10 -l 10 -n 4 /dev/nvme4n1p1 /dev/nvme4n1p2
>>>> /dev/nvme4n1p3 /dev/nvme4n1p4
>>>> ./t/io_uring /dev/md10 -d 256 -p 0 -a 0 -r 100
>>>>
>>>> on top of commit: c14704e1cb556 (md-next branch) + "md: add support for
>>>> REQ_NOWAIT" patch
>>>> Also, applied the commit (75feae73a28) Jens pointed earlier today.
>>>>
>>>> I am able to trigger the following error. I will look into it.
>>>>
>>>> Thanks,
>>>> Song
>>>>
>>>> [ 1583.149004] ==================================================================
>>>> [ 1583.150100] BUG: KASAN: use-after-free in raid10_end_read_request+0x91/0x310
>>>> [ 1583.151042] Read of size 8 at addr ffff888160a1c928 by task io_uring/1165
>>>> [ 1583.152016]
>>>> [ 1583.152247] CPU: 0 PID: 1165 Comm: io_uring Not tainted 5.16.0-rc3+ #660
>>>> [ 1583.153159] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>>>> BIOS 1.13.0-2.module_el8.4.0+547+a85d02ba 04/01/2014
>>>> [ 1583.154572] Call Trace:
>>>> [ 1583.155005]  <IRQ>
>>>> [ 1583.155338]  dump_stack_lvl+0x44/0x57
>>>> [ 1583.155950]  print_address_description.constprop.8.cold.17+0x12/0x339
>>>> [ 1583.156969]  ? raid10_end_read_request+0x91/0x310
>>>> [ 1583.157578]  ? raid10_end_read_request+0x91/0x310
>>>> [ 1583.158272]  kasan_report.cold.18+0x83/0xdf
>>>> [ 1583.158889]  ? raid10_end_read_request+0x91/0x310
>>>> [ 1583.159554]  raid10_end_read_request+0x91/0x310
>>>> [ 1583.160201]  ? raid10_resize+0x270/0x270
>>>> [ 1583.160724]  ? bio_uninit+0xc7/0x1e0
>>>> [ 1583.161274]  blk_update_request+0x21f/0x810
>>>> [ 1583.161893]  blk_mq_end_request_batch+0x11c/0xa70
>>>> [ 1583.162497]  ? blk_mq_end_request+0x460/0x460
>>>> [ 1583.163204]  ? nvme_complete_batch_req+0x12/0x30
>>>> [ 1583.163888]  nvme_irq+0x6ad/0x6f0
>>>> [ 1583.164354]  ? io_queue_count_set+0xe0/0xe0
>>>> [ 1583.164980]  ? nvme_unmap_data+0x1e0/0x1e0
>>>> [ 1583.165504]  ? rcu_read_lock_bh_held+0xb0/0xb0
>>>> [ 1583.166149]  ? io_queue_count_set+0xe0/0xe0
>>>> [ 1583.166721]  __handle_irq_event_percpu+0x79/0x440
>>>> [ 1583.167446]  handle_irq_event_percpu+0x6f/0xe0
>>>> [ 1583.168101]  ? __handle_irq_event_percpu+0x440/0x440
>>>> [ 1583.168734]  ? lock_contended+0x6e0/0x6e0
>>>> [ 1583.169349]  ? do_raw_spin_unlock+0xa2/0x130
>>>> [ 1583.169961]  handle_irq_event+0x54/0x90
>>>> [ 1583.170442]  handle_edge_irq+0x121/0x300
>>>> [ 1583.171012]  __common_interrupt+0x7d/0x170
>>>> [ 1583.171538]  common_interrupt+0xa0/0xc0
>>>> [ 1583.172103]  </IRQ>
>>>> [ 1583.172389]  <TASK>
>>>>
>>>> When running t/io_uring on a raid1 array, I get following:
>>>>
>>>> [  189.863726] RIP: 0010:__kmalloc+0xfa/0x430
>>>> [  189.867825] Code: 05 4b 9a 35 43 48 8b 50 08 48 83 78 10 00 4c 8b 20 0f 84 fa 02 00 00 4d 85 e4 0f 84 f1 02 00 00 41 8b 47 28 49 8b 3f 4c 01 e0 <48> 8b 18 48 89 c1 49 33 9f b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
>>>> [  189.886573] RSP: 0018:ffffaf09e28b7828 EFLAGS: 00010286
>>>> [  189.891799] RAX: a0fa1099d2b0fff3 RBX: 0000000000092900 RCX: 0000000000000000
>>>> [  189.898930] RDX: 00000002ba79600b RSI: 0000000000092900 RDI: 00000000000340e0
>>>> [  189.906062] RBP: ffffaf09e28b7860 R08: ffff90fb8b6ea560 R09: ffff90fba7205f60
>>>> [  189.913195] R10: ffffaf09e28b7c18 R11: 0000000000000000 R12: a0fa1099d2b0ffb3
>>>> [  189.920329] R13: 0000000000000000 R14: ffffffffc074c277 R15: ffff90bc00044700
>>>> [  189.927461] FS:  00007fd6209d7700(0000) GS:ffff913a6e140000(0000) knlGS:0000000000000000
>>>> [  189.935549] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [  189.941295] CR2: 00007f16998bebf0 CR3: 00000040be512005 CR4: 0000000000770ee0
>>>> [  189.948426] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>> [  189.955560] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>> [  189.962691] PKRU: 55555554
>>>> [  189.965403] Call Trace:
>>>> [  189.967857]  <TASK>
>>>> [  189.969966]  0xffffffffc074c277
>>>> [  189.973110]  mempool_alloc+0x61/0x180
>>>> [  189.976777]  ? bio_associate_blkg_from_css+0xf5/0x2c0
>>>> [  189.981829]  ? __bio_clone_fast+0xa9/0xf0
>>>> [  189.985842]  ? __sbitmap_get_word+0x36/0x80
>>>> [  189.990027]  0xffffffffc074ac50
>>>> [  189.993174]  ? __sbitmap_queue_get+0x9/0x10
>>>> [  189.997359]  ? blk_mq_get_tag+0x241/0x270
>>>> [  190.001373]  ? ktime_get+0x3b/0xa0
>>>> [  190.004776]  ? blk_mq_rq_ctx_init.isra.0+0x1a5/0x1c0
>>>> [  190.009743]  0xffffffffc074efb3
>>>> [  190.012891]  md_handle_request+0x134/0x1b0
>>>> [  190.016989]  ? ktime_get+0x3b/0xa0
>>>> [  190.020395]  md_submit_bio+0x6d/0xa0
>>>> [  190.023976]  __submit_bio+0x94/0x140
>>>> [  190.027555]  submit_bio_noacct+0xe1/0x2a0
>>>> [  190.031566]  submit_bio+0x48/0x120
>>>> [  190.034972]  blkdev_direct_IO+0x19b/0x540
>>>> [  190.038987]  ? __fsnotify_parent+0xff/0x330
>>>> [  190.043172]  ? __fsnotify_parent+0x10f/0x330
>>>> [  190.047445]  generic_file_read_iter+0xa5/0x160
>>>> [  190.051889]  blkdev_read_iter+0x38/0x70
>>>> [  190.055731]  io_read+0x119/0x420
>>>> [  190.058963]  ? blk_queue_exit+0x23/0x50
>>>> [  190.062801]  ? __blk_mq_free_request+0x86/0xc0
>>>> [  190.067247]  io_issue_sqe+0x7ec/0x19c0
>>>> [  190.071002]  ? io_req_prep+0x6a9/0xe60
>>>> [  190.074754]  io_submit_sqes+0x2a0/0x9f0
>>>> [  190.078594]  ? __fget_files+0x6a/0x90
>>>> [  190.082259]  __x64_sys_io_uring_enter+0x1da/0x8c0
>>>> [  190.086965]  ? debug_smp_processor_id+0x17/0x20
>>>> [  190.091498]  ? fpregs_assert_state_consistent+0x23/0x50
>>>> [  190.096723]  ? exit_to_user_mode_prepare+0x4b/0x1e0
>>>> [  190.101602]  do_syscall_64+0x38/0x90
>>>> [  190.105182]  entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>> [  190.110236] RIP: 0033:0x7fd620af589d
>>>> [  190.113815] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48
>>>> [  190.132563] RSP: 002b:00007fd6209d6e98 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
>>>> [  190.140126] RAX: ffffffffffffffda RBX: 00007fd620d4bfc0 RCX: 00007fd620af589d
>>>> [  190.147261] RDX: 0000000000000000 RSI: 0000000000000020 RDI: 0000000000000004
>>>> [  190.154391] RBP: 0000000000000020 R08: 0000000000000000 R09: 0000000000000000
>>>> [  190.161524] R10: 0000000000000000 R11: 0000000000000246 R12: 0000561889c472a0
>>>> [  190.168657] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000020
>>>> [  190.175793]  </TASK>
>>>>
>>>> It seems this issue is getting triggered with the following commit:
>>>>
>>>> commit 5b13bc8a3fd519d86e5b1a0b1d1b996cace62f3f
>>>> Author: Christoph Hellwig <hch@lst.de>
>>>> Date:   Wed Nov 24 07:28:56 2021 +0100
>>>>
>>>>       blk-mq: cleanup request allocation
>>> Good finding. I am not able to repro these issues after reverting this commit.
>>>
>>> Vishal, how does it work in your tests?
>> Same. I haven't seen any issue in my tests (io_uring, aio) after
>> reverting this commit.
>> Should I go ahead and send v6 patchset after incorporating your prev
>> feedback?
> Do you have this one:
>
> commit a08ed9aae8a3d2321ef378d6581cc87a3fb75b44
> Author: Jens Axboe <axboe@kernel.dk>
> Date:   Thu Dec 2 12:43:46 2021 -0700
>
>      block: fix double bio queue when merging in cached request path
>
Nope, md-next branch didn't have that commit. I manually applied it. 
Tests seem to run fine now.
Thank you :)



^ permalink raw reply	[flat|nested] 86+ messages in thread

* [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2021-12-15 22:20                                         ` Vishal Verma
@ 2021-12-21 20:06                                           ` Vishal Verma
  2021-12-21 20:06                                             ` [PATCH v6 2/4] md: raid1 add nowait support Vishal Verma
                                                               ` (6 more replies)
  0 siblings, 7 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-21 20:06 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
for checking whether a given bdev supports handling of REQ_NOWAIT or not.
Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
it for linear target") added support for REQ_NOWAIT for dm. This uses
a similar approach to incorporate REQ_NOWAIT for md based bios.

This patch was tested using t/io_uring tool within FIO. A nvme drive
was partitioned into 2 partitions and a simple raid 0 configuration
/dev/md0 was created.

md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
      937423872 blocks super 1.2 512k chunks

Before patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38396   38396 pts/2    00:00:00 io_uring
  38396   38397 pts/2    00:00:15 io_uring
  38396   38398 pts/2    00:00:13 iou-wrk-38397

We can see iou-wrk-38397 io worker thread created which gets created
when io_uring sees that the underlying device (/dev/md0 in this case)
doesn't support nowait.

After patch:

$ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100

Running top while the above runs:

$ ps -eL | grep $(pidof io_uring)

  38341   38341 pts/2    00:10:22 io_uring
  38341   38342 pts/2    00:10:37 io_uring

After running this patch, we don't see any io worker thread
being created which indicated that io_uring saw that the
underlying device does support nowait. This is the exact behaviour
noticed on a dm device which also supports nowait.

For all the other raid personalities except raid0, we would need
to train pieces which involves make_request fn in order for them
to correctly handle REQ_NOWAIT.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/md.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5111ed966947..ccd296aa9641 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -418,6 +418,11 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
 	rcu_read_lock();
 	if (is_suspended(mddev, bio)) {
 		DEFINE_WAIT(__wait);
+		/* Bail out if REQ_NOWAIT is set for the bio */
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
@@ -5792,6 +5797,7 @@ int md_run(struct mddev *mddev)
 	int err;
 	struct md_rdev *rdev;
 	struct md_personality *pers;
+	bool nowait = true;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -5862,8 +5868,13 @@ int md_run(struct mddev *mddev)
 			}
 		}
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
+		nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev));
 	}
 
+	/* Set the NOWAIT flags if all underlying devices support it */
+	if (nowait)
+		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
+
 	if (!bioset_initialized(&mddev->bio_set)) {
 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 		if (err)
@@ -7007,6 +7018,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	if (!mddev->thread)
 		md_update_sb(mddev, 1);
+	/*
+	 * If the new disk does not support REQ_NOWAIT,
+	 * disable on the whole MD.
+	 */
+	if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) {
+		pr_info("%s: Disabling nowait because %s does not support nowait\n",
+			mdname(mddev), bdevname(rdev->bdev, b));
+		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
+	}
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [PATCH v6 2/4] md: raid1 add nowait support
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
@ 2021-12-21 20:06                                             ` Vishal Verma
  2021-12-21 20:06                                             ` [PATCH v6 3/4] md: raid10 " Vishal Verma
                                                               ` (5 subsequent siblings)
  6 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-21 20:06 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

This adds nowait support to the RAID1 driver. It makes RAID1 driver
return with EAGAIN for situations where it could wait for eg:

  - Waiting for the barrier,
  - Too many pending I/Os to be queued.

wait_barrier() fn is modified to return bool to support error for
wait barriers. It returns true in case of wait or if wait is not
required and returns false if wait was required but not performed
to support nowait.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid1.c | 83 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7dc8026cf6ee..e488671bb563 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -929,8 +929,9 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 	wake_up(&conf->wait_barrier);
 }
 
-static void _wait_barrier(struct r1conf *conf, int idx)
+static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
 {
+	bool ret = true;
 	/*
 	 * We need to increase conf->nr_pending[idx] very early here,
 	 * then raise_barrier() can be blocked when it waits for
@@ -961,7 +962,7 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	if (!READ_ONCE(conf->array_frozen) &&
 	    !atomic_read(&conf->barrier[idx]))
-		return;
+		return ret;
 
 	/*
 	 * After holding conf->resync_lock, conf->nr_pending[idx]
@@ -979,18 +980,29 @@ static void _wait_barrier(struct r1conf *conf, int idx)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for the barrier in same barrier unit bucket to drop. */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen &&
-			     !atomic_read(&conf->barrier[idx]),
-			    conf->resync_lock);
-	atomic_inc(&conf->nr_pending[idx]);
+
+	/* Return false when nowait flag is set */
+	if (nowait)
+		ret = false;
+	else {
+		wait_event_lock_irq(conf->wait_barrier,
+				!conf->array_frozen &&
+				!atomic_read(&conf->barrier[idx]),
+				conf->resync_lock);
+	}
+
+	/* Only increment nr_pending when we wait */
+	if (ret)
+		atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
+	bool ret = true;
 
 	/*
 	 * Very similar to _wait_barrier(). The difference is, for read
@@ -1002,7 +1014,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	atomic_inc(&conf->nr_pending[idx]);
 
 	if (!READ_ONCE(conf->array_frozen))
-		return;
+		return ret;
 
 	spin_lock_irq(&conf->resync_lock);
 	atomic_inc(&conf->nr_waiting[idx]);
@@ -1013,19 +1025,30 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	 */
 	wake_up(&conf->wait_barrier);
 	/* Wait for array to be unfrozen */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen,
-			    conf->resync_lock);
-	atomic_inc(&conf->nr_pending[idx]);
+
+	/* Return false when nowait flag is set */
+	if (nowait)
+		/* Return false when nowait flag is set */
+		ret = false;
+	else {
+		wait_event_lock_irq(conf->wait_barrier,
+				!conf->array_frozen,
+				conf->resync_lock);
+	}
+
+	/* Only increment nr_pending when we wait */
+	if (ret)
+		atomic_inc(&conf->nr_pending[idx]);
 	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
-static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
+static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 {
 	int idx = sector_to_idx(sector_nr);
 
-	_wait_barrier(conf, idx);
+	return _wait_barrier(conf, idx, nowait);
 }
 
 static void _allow_barrier(struct r1conf *conf, int idx)
@@ -1236,7 +1259,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	 * Still need barrier for READ in case that whole
 	 * array is frozen.
 	 */
-	wait_read_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	if (!r1_bio)
 		r1_bio = alloc_r1bio(mddev, bio);
@@ -1336,6 +1363,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
 
 		DEFINE_WAIT(w);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		for (;;) {
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_IDLE);
@@ -1353,17 +1384,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	wait_barrier(conf, bio->bi_iter.bi_sector);
+	if (!wait_barrier(conf, bio->bi_iter.bi_sector,
+				bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return;
+	}
 
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait queued");
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
+
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
@@ -1458,9 +1498,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
 		allow_barrier(conf, bio->bi_iter.bi_sector);
+
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf, bio->bi_iter.bi_sector);
+		wait_barrier(conf, bio->bi_iter.bi_sector, false);
 		goto retry_write;
 	}
 
@@ -1687,7 +1732,7 @@ static void close_sync(struct r1conf *conf)
 	int idx;
 
 	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
-		_wait_barrier(conf, idx);
+		_wait_barrier(conf, idx, false);
 		_allow_barrier(conf, idx);
 	}
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [PATCH v6 3/4] md: raid10 add nowait support
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
  2021-12-21 20:06                                             ` [PATCH v6 2/4] md: raid1 add nowait support Vishal Verma
@ 2021-12-21 20:06                                             ` Vishal Verma
  2021-12-22 23:58                                               ` Song Liu
  2021-12-23  1:47                                               ` Song Liu
  2021-12-21 20:06                                             ` [PATCH v6 4/4] md: raid456 " Vishal Verma
                                                               ` (4 subsequent siblings)
  6 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-21 20:06 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

This adds nowait support to the RAID10 driver. Very similar to
the raid1 driver changes. It makes the RAID10 driver return EAGAIN
in situations where it would otherwise wait, e.g.:

  - Waiting for the barrier,
  - Too many pending I/Os to be queued,
  - Reshape operation,
  - Discard operation.

The wait_barrier() and regular_request_wait() functions are modified to
return bool to support bailing out of wait barriers. They return true
after a wait, or if no wait was required, and return false if a wait
was required but not performed, in support of nowait.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid10.c | 90 +++++++++++++++++++++++++++++++--------------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index dde98f65bd04..7ceae00e863e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -952,8 +952,9 @@ static void lower_barrier(struct r10conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r10conf *conf)
+static bool wait_barrier(struct r10conf *conf, bool nowait)
 {
+	bool ret = true;
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
 		struct bio_list *bio_list = current->bio_list;
@@ -968,26 +969,33 @@ static void wait_barrier(struct r10conf *conf)
 		 * count down.
 		 */
 		raid10_log(conf->mddev, "wait barrier");
-		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (atomic_read(&conf->nr_pending) &&
-				     bio_list &&
-				     (!bio_list_empty(&bio_list[0]) ||
-				      !bio_list_empty(&bio_list[1]))) ||
-				     /* move on if recovery thread is
-				      * blocked by us
-				      */
-				     (conf->mddev->thread->tsk == current &&
-				      test_bit(MD_RECOVERY_RUNNING,
-					       &conf->mddev->recovery) &&
-				      conf->nr_queued > 0),
-				    conf->resync_lock);
+		/* Return false when nowait flag is set */
+		if (nowait)
+			ret = false;
+		else
+			wait_event_lock_irq(conf->wait_barrier,
+					    !conf->barrier ||
+					    (atomic_read(&conf->nr_pending) &&
+					     bio_list &&
+					     (!bio_list_empty(&bio_list[0]) ||
+					      !bio_list_empty(&bio_list[1]))) ||
+					     /* move on if recovery thread is
+					      * blocked by us
+					      */
+					     (conf->mddev->thread->tsk == current &&
+					      test_bit(MD_RECOVERY_RUNNING,
+						       &conf->mddev->recovery) &&
+					      conf->nr_queued > 0),
+					    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
 			wake_up(&conf->wait_barrier);
 	}
-	atomic_inc(&conf->nr_pending);
+	/* Only increment nr_pending when we wait */
+	if (ret)
+		atomic_inc(&conf->nr_pending);
 	spin_unlock_irq(&conf->resync_lock);
+	return ret;
 }
 
 static void allow_barrier(struct r10conf *conf)
@@ -1098,21 +1106,30 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
  * currently.
  * 2. If IO spans the reshape position.  Need to wait for reshape to pass.
  */
-static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
+static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
 				 struct bio *bio, sector_t sectors)
 {
-	wait_barrier(conf);
+	/* Bail out if REQ_NOWAIT is set for the bio */
+	if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
+		bio_wouldblock_error(bio);
+		return false;
+	}
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
 	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
 		raid10_log(conf->mddev, "wait reshape");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return false;
+		}
 		allow_barrier(conf);
 		wait_event(conf->wait_barrier,
 			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
 			   conf->reshape_progress >= bio->bi_iter.bi_sector +
 			   sectors);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
+	return true;
 }
 
 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
@@ -1179,7 +1196,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		bio = split;
 		r10_bio->master_bio = bio;
 		r10_bio->sectors = max_sectors;
@@ -1338,7 +1355,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
 		raid10_log(conf->mddev, "%s wait rdev %d blocked",
 				__func__, blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		goto retry_wait;
 	}
 }
@@ -1356,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 					    bio->bi_iter.bi_sector,
 					    bio_end_sector(bio)))) {
 		DEFINE_WAIT(w);
+		/* Bail out if REQ_NOWAIT is set for the bio */
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		for (;;) {
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_IDLE);
@@ -1381,6 +1403,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 			      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		raid10_log(conf->mddev, "wait reshape metadata");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 
@@ -1390,6 +1416,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid10_log(mddev, "wait queued");
+		if (bio->bi_opf & REQ_NOWAIT) {
+			bio_wouldblock_error(bio);
+			return;
+		}
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
@@ -1482,7 +1512,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		bio_chain(split, bio);
 		allow_barrier(conf);
 		submit_bio_noacct(bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		bio = split;
 		r10_bio->master_bio = bio;
 	}
@@ -1607,7 +1637,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		return -EAGAIN;
 
-	wait_barrier(conf);
+	if (bio->bi_opf & REQ_NOWAIT) {
+		bio_wouldblock_error(bio);
+		return 0;
+	}
+	wait_barrier(conf, false);
 
 	/*
 	 * Check reshape again to avoid reshape happens after checking
@@ -1649,7 +1683,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		allow_barrier(conf);
 		/* Resend the fist split part */
 		submit_bio_noacct(split);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 	div_u64_rem(bio_end, stripe_size, &remainder);
 	if (remainder) {
@@ -1660,7 +1694,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		/* Resend the second split part */
 		submit_bio_noacct(bio);
 		bio = split;
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 	}
 
 	bio_start = bio->bi_iter.bi_sector;
@@ -1816,7 +1850,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		end_disk_offset += geo->stride;
 		atomic_inc(&first_r10bio->remaining);
 		raid_end_discard_bio(r10_bio);
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		goto retry_discard;
 	}
 
@@ -2011,7 +2045,7 @@ static void print_conf(struct r10conf *conf)
 
 static void close_sync(struct r10conf *conf)
 {
-	wait_barrier(conf);
+	wait_barrier(conf, false);
 	allow_barrier(conf);
 
 	mempool_exit(&conf->r10buf_pool);
@@ -4819,7 +4853,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	if (need_flush ||
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
 		/* Need to update reshape_position in metadata */
-		wait_barrier(conf);
+		wait_barrier(conf, false);
 		mddev->reshape_position = conf->reshape_progress;
 		if (mddev->reshape_backwards)
 			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* [PATCH v6 4/4] md: raid456 add nowait support
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
  2021-12-21 20:06                                             ` [PATCH v6 2/4] md: raid1 add nowait support Vishal Verma
  2021-12-21 20:06                                             ` [PATCH v6 3/4] md: raid10 " Vishal Verma
@ 2021-12-21 20:06                                             ` Vishal Verma
  2021-12-21 22:02                                               ` John Stoffel
  2021-12-25  2:14                                               ` Song Liu
  2021-12-22 16:06                                             ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Jens Axboe
                                                               ` (3 subsequent siblings)
  6 siblings, 2 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-21 20:06 UTC (permalink / raw)
  To: song, linux-raid; +Cc: axboe, rgoldwyn, Vishal Verma

Returns EAGAIN in case the raid456 driver would block
waiting for situations like:

  - Reshape operation,
  - Discard operation.

Signed-off-by: Vishal Verma <vverma@digitalocean.com>
---
 drivers/md/raid5.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9c1a5877cf9f..d9647c384820 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5715,6 +5715,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
 		if (test_bit(STRIPE_SYNCING, &sh->state)) {
 			raid5_release_stripe(sh);
+			/* Bail out if REQ_NOWAIT is set */
+			if (bi->bi_opf & REQ_NOWAIT) {
+				bio_wouldblock_error(bi);
+				return;
+			}
 			schedule();
 			goto again;
 		}
@@ -5727,6 +5732,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				set_bit(R5_Overlap, &sh->dev[d].flags);
 				spin_unlock_irq(&sh->stripe_lock);
 				raid5_release_stripe(sh);
+				/* Bail out if REQ_NOWAIT is set */
+				if (bi->bi_opf & REQ_NOWAIT) {
+					bio_wouldblock_error(bi);
+					return;
+				}
 				schedule();
 				goto again;
 			}
@@ -5820,6 +5830,16 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	bi->bi_next = NULL;
 
 	md_account_bio(mddev, &bi);
+	/* Bail out if REQ_NOWAIT is set */
+	if ((bi->bi_opf & REQ_NOWAIT) &&
+	    (conf->reshape_progress != MaxSector) &&
+	    (mddev->reshape_backwards
+	    ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
+	    : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
+		bio_wouldblock_error(bi);
+		return true;
+	}
+
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
 		int previous;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 4/4] md: raid456 add nowait support
  2021-12-21 20:06                                             ` [PATCH v6 4/4] md: raid456 " Vishal Verma
@ 2021-12-21 22:02                                               ` John Stoffel
  2021-12-25  2:14                                               ` Song Liu
  1 sibling, 0 replies; 86+ messages in thread
From: John Stoffel @ 2021-12-21 22:02 UTC (permalink / raw)
  To: Vishal Verma; +Cc: song, linux-raid, axboe, rgoldwyn

>>>>> "Vishal" == Vishal Verma <vverma@digitalocean.com> writes:

Vishal> Returns EAGAIN in case the raid456 driver would block
Vishal> waiting for situations like:

Vishal>   - Reshape operation,
Vishal>   - Discard operation.

Vishal> Signed-off-by: Vishal Verma <vverma@digitalocean.com>

Are there any performance implications with this patch set?  I didn't
see any discussion in the patch set (v6) and I was just wondering what
this buys us?  Your patch 1/4 talks about using fio as a test, but
there's no mention of whether it's now faster or slower.

John

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
                                                               ` (2 preceding siblings ...)
  2021-12-21 20:06                                             ` [PATCH v6 4/4] md: raid456 " Vishal Verma
@ 2021-12-22 16:06                                             ` Jens Axboe
  2021-12-23  1:22                                             ` Song Liu
                                                               ` (2 subsequent siblings)
  6 siblings, 0 replies; 86+ messages in thread
From: Jens Axboe @ 2021-12-22 16:06 UTC (permalink / raw)
  To: Vishal Verma, song, linux-raid; +Cc: rgoldwyn

On 12/21/21 1:06 PM, Vishal Verma wrote:
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
> 
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
> 
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>       937423872 blocks super 1.2 512k chunks
> 
> Before patch:
> 
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
> 
> Running top while the above runs:
> 
> $ ps -eL | grep $(pidof io_uring)
> 
>   38396   38396 pts/2    00:00:00 io_uring
>   38396   38397 pts/2    00:00:15 io_uring
>   38396   38398 pts/2    00:00:13 iou-wrk-38397
> 
> We can see iou-wrk-38397 io worker thread created which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
> 
> After patch:
> 
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
> 
> Running top while the above runs:
> 
> $ ps -eL | grep $(pidof io_uring)
> 
>   38341   38341 pts/2    00:10:22 io_uring
>   38341   38342 pts/2    00:10:37 io_uring
> 
> After running this patch, we don't see any io worker thread
> being created which indicated that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device which also supports nowait.
> 
> For all the other raid personalities except raid0, we would need
> to train pieces which involves make_request fn in order for them
> to correctly handle REQ_NOWAIT.

1-4 look fine to me now:

Reviewed-by: Jens Axboe <axboe@kernel.dk>

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 3/4] md: raid10 add nowait support
  2021-12-21 20:06                                             ` [PATCH v6 3/4] md: raid10 " Vishal Verma
@ 2021-12-22 23:58                                               ` Song Liu
  2021-12-23  1:47                                               ` Song Liu
  1 sibling, 0 replies; 86+ messages in thread
From: Song Liu @ 2021-12-22 23:58 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> This adds nowait support to the RAID10 driver. Very similar to
> raid1 driver changes. It makes RAID10 driver return with EAGAIN
> for situations where it could wait for eg:
>
>   - Waiting for the barrier,
>   - Too many pending I/Os to be queued,
>   - Reshape operation,
>   - Discard operation.
>
> wait_barrier() and regular_request_wait() fn are modified to return bool
> to support error for wait barriers. They returns true in case of wait
> or if wait is not required and returns false if wait was required
> but not performed to support nowait.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/raid10.c | 90 +++++++++++++++++++++++++++++++--------------
>  1 file changed, 62 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index dde98f65bd04..7ceae00e863e 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -952,8 +952,9 @@ static void lower_barrier(struct r10conf *conf)
>         wake_up(&conf->wait_barrier);
>  }
>
> -static void wait_barrier(struct r10conf *conf)
> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>  {
> +       bool ret = true;
>         spin_lock_irq(&conf->resync_lock);
>         if (conf->barrier) {
>                 struct bio_list *bio_list = current->bio_list;
> @@ -968,26 +969,33 @@ static void wait_barrier(struct r10conf *conf)
>                  * count down.
>                  */
>                 raid10_log(conf->mddev, "wait barrier");
> -               wait_event_lock_irq(conf->wait_barrier,
> -                                   !conf->barrier ||
> -                                   (atomic_read(&conf->nr_pending) &&
> -                                    bio_list &&
> -                                    (!bio_list_empty(&bio_list[0]) ||
> -                                     !bio_list_empty(&bio_list[1]))) ||
> -                                    /* move on if recovery thread is
> -                                     * blocked by us
> -                                     */
> -                                    (conf->mddev->thread->tsk == current &&
> -                                     test_bit(MD_RECOVERY_RUNNING,
> -                                              &conf->mddev->recovery) &&
> -                                     conf->nr_queued > 0),
> -                                   conf->resync_lock);
> +               /* Return false when nowait flag is set */
> +               if (nowait)
> +                       ret = false;
> +               else
> +                       wait_event_lock_irq(conf->wait_barrier,
> +                                           !conf->barrier ||
> +                                           (atomic_read(&conf->nr_pending) &&
> +                                            bio_list &&
> +                                            (!bio_list_empty(&bio_list[0]) ||
> +                                             !bio_list_empty(&bio_list[1]))) ||
> +                                            /* move on if recovery thread is
> +                                             * blocked by us
> +                                             */
> +                                            (conf->mddev->thread->tsk == current &&
> +                                             test_bit(MD_RECOVERY_RUNNING,
> +                                                      &conf->mddev->recovery) &&
> +                                             conf->nr_queued > 0),
> +                                           conf->resync_lock);
>                 conf->nr_waiting--;
>                 if (!conf->nr_waiting)
>                         wake_up(&conf->wait_barrier);
>         }
> -       atomic_inc(&conf->nr_pending);
> +       /* Only increment nr_pending when we wait */
> +       if (ret)
> +               atomic_inc(&conf->nr_pending);
>         spin_unlock_irq(&conf->resync_lock);
> +       return ret;
>  }
>
>  static void allow_barrier(struct r10conf *conf)
> @@ -1098,21 +1106,30 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
>   * currently.
>   * 2. If IO spans the reshape position.  Need to wait for reshape to pass.
>   */
> -static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
> +static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
>                                  struct bio *bio, sector_t sectors)

This doesn't sound right: regular_request_wait() is called in two
places. But we are
not checking the return value in either of them.

Song
[...]

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
                                                               ` (3 preceding siblings ...)
  2021-12-22 16:06                                             ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Jens Axboe
@ 2021-12-23  1:22                                             ` Song Liu
  2021-12-23  2:57                                             ` Song Liu
  2021-12-23  8:36                                             ` Christoph Hellwig
  6 siblings, 0 replies; 86+ messages in thread
From: Song Liu @ 2021-12-23  1:22 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
>
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>       937423872 blocks super 1.2 512k chunks
>
> Before patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38396   38396 pts/2    00:00:00 io_uring
>   38396   38397 pts/2    00:00:15 io_uring
>   38396   38398 pts/2    00:00:13 iou-wrk-38397
>
> We can see iou-wrk-38397 io worker thread created which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
>
> After patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38341   38341 pts/2    00:10:22 io_uring
>   38341   38342 pts/2    00:10:37 io_uring
>
> After running this patch, we don't see any io worker thread
> being created which indicated that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device which also supports nowait.
>
> For all the other raid personalities except raid0, we would need
> to train pieces which involves make_request fn in order for them
> to correctly handle REQ_NOWAIT.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/md.c | 20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 5111ed966947..ccd296aa9641 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -418,6 +418,11 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
>         rcu_read_lock();
>         if (is_suspended(mddev, bio)) {
>                 DEFINE_WAIT(__wait);
> +               /* Bail out if REQ_NOWAIT is set for the bio */
> +               if (bio->bi_opf & REQ_NOWAIT) {

We need rcu_read_unlock() here.

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 3/4] md: raid10 add nowait support
  2021-12-21 20:06                                             ` [PATCH v6 3/4] md: raid10 " Vishal Verma
  2021-12-22 23:58                                               ` Song Liu
@ 2021-12-23  1:47                                               ` Song Liu
  1 sibling, 0 replies; 86+ messages in thread
From: Song Liu @ 2021-12-23  1:47 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> This adds nowait support to the RAID10 driver. Very similar to
> raid1 driver changes. It makes RAID10 driver return with EAGAIN
> for situations where it could wait for eg:
>
>   - Waiting for the barrier,
>   - Too many pending I/Os to be queued,
>   - Reshape operation,
>   - Discard operation.
>
> The wait_barrier() and regular_request_wait() functions are modified to
> return bool to support erroring out of wait barriers. They return true
> if the wait completed (or no wait was required), and return false if a
> wait was required but not performed, to support nowait.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> ---
>  drivers/md/raid10.c | 90 +++++++++++++++++++++++++++++++--------------
>  1 file changed, 62 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index dde98f65bd04..7ceae00e863e 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -952,8 +952,9 @@ static void lower_barrier(struct r10conf *conf)
>         wake_up(&conf->wait_barrier);
>  }
>
> -static void wait_barrier(struct r10conf *conf)
> +static bool wait_barrier(struct r10conf *conf, bool nowait)
>  {
> +       bool ret = true;
>         spin_lock_irq(&conf->resync_lock);
>         if (conf->barrier) {
>                 struct bio_list *bio_list = current->bio_list;
> @@ -968,26 +969,33 @@ static void wait_barrier(struct r10conf *conf)
>                  * count down.
>                  */
>                 raid10_log(conf->mddev, "wait barrier");
> -               wait_event_lock_irq(conf->wait_barrier,
> -                                   !conf->barrier ||
> -                                   (atomic_read(&conf->nr_pending) &&
> -                                    bio_list &&
> -                                    (!bio_list_empty(&bio_list[0]) ||
> -                                     !bio_list_empty(&bio_list[1]))) ||
> -                                    /* move on if recovery thread is
> -                                     * blocked by us
> -                                     */
> -                                    (conf->mddev->thread->tsk == current &&
> -                                     test_bit(MD_RECOVERY_RUNNING,
> -                                              &conf->mddev->recovery) &&
> -                                     conf->nr_queued > 0),
> -                                   conf->resync_lock);
> +               /* Return false when nowait flag is set */
> +               if (nowait)
> +                       ret = false;
> +               else
> +                       wait_event_lock_irq(conf->wait_barrier,
> +                                           !conf->barrier ||
> +                                           (atomic_read(&conf->nr_pending) &&
> +                                            bio_list &&
> +                                            (!bio_list_empty(&bio_list[0]) ||
> +                                             !bio_list_empty(&bio_list[1]))) ||
> +                                            /* move on if recovery thread is
> +                                             * blocked by us
> +                                             */
> +                                            (conf->mddev->thread->tsk == current &&
> +                                             test_bit(MD_RECOVERY_RUNNING,
> +                                                      &conf->mddev->recovery) &&
> +                                             conf->nr_queued > 0),
> +                                           conf->resync_lock);
>                 conf->nr_waiting--;
>                 if (!conf->nr_waiting)
>                         wake_up(&conf->wait_barrier);
>         }
> -       atomic_inc(&conf->nr_pending);
> +       /* Only increment nr_pending when we wait */
> +       if (ret)
> +               atomic_inc(&conf->nr_pending);
>         spin_unlock_irq(&conf->resync_lock);
> +       return ret;
>  }
>
>  static void allow_barrier(struct r10conf *conf)
> @@ -1098,21 +1106,30 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
>   * currently.
>   * 2. If IO spans the reshape position.  Need to wait for reshape to pass.
>   */
> -static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
> +static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
>                                  struct bio *bio, sector_t sectors)
>  {
> -       wait_barrier(conf);
> +       /* Bail out if REQ_NOWAIT is set for the bio */
> +       if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
> +               bio_wouldblock_error(bio);
> +               return false;
> +       }
>         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>             bio->bi_iter.bi_sector < conf->reshape_progress &&
>             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
>                 raid10_log(conf->mddev, "wait reshape");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return false;
> +               }
>                 allow_barrier(conf);
>                 wait_event(conf->wait_barrier,
>                            conf->reshape_progress <= bio->bi_iter.bi_sector ||
>                            conf->reshape_progress >= bio->bi_iter.bi_sector +
>                            sectors);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
> +       return true;
>  }
>
>  static void raid10_read_request(struct mddev *mddev, struct bio *bio,
> @@ -1179,7 +1196,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>                 bio_chain(split, bio);
>                 allow_barrier(conf);
>                 submit_bio_noacct(bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 bio = split;
>                 r10_bio->master_bio = bio;
>                 r10_bio->sectors = max_sectors;
> @@ -1338,7 +1355,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
>                 raid10_log(conf->mddev, "%s wait rdev %d blocked",
>                                 __func__, blocked_rdev->raid_disk);
>                 md_wait_for_blocked_rdev(blocked_rdev, mddev);

I think we need more handling here.

> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 goto retry_wait;
>         }
>  }
> @@ -1356,6 +1373,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                                             bio->bi_iter.bi_sector,
>                                             bio_end_sector(bio)))) {
>                 DEFINE_WAIT(w);
> +               /* Bail out if REQ_NOWAIT is set for the bio */
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 for (;;) {
>                         prepare_to_wait(&conf->wait_barrier,
>                                         &w, TASK_IDLE);
> @@ -1381,6 +1403,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                               BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
>                 md_wakeup_thread(mddev->thread);
>                 raid10_log(conf->mddev, "wait reshape metadata");
> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 wait_event(mddev->sb_wait,
>                            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>
> @@ -1390,6 +1416,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>         if (conf->pending_count >= max_queued_requests) {
>                 md_wakeup_thread(mddev->thread);
>                 raid10_log(mddev, "wait queued");

We need the check before logging "wait queued".

> +               if (bio->bi_opf & REQ_NOWAIT) {
> +                       bio_wouldblock_error(bio);
> +                       return;
> +               }
>                 wait_event(conf->wait_barrier,
>                            conf->pending_count < max_queued_requests);
>         }
> @@ -1482,7 +1512,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                 bio_chain(split, bio);
>                 allow_barrier(conf);
>                 submit_bio_noacct(bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 bio = split;
>                 r10_bio->master_bio = bio;
>         }
> @@ -1607,7 +1637,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
>                 return -EAGAIN;
>
> -       wait_barrier(conf);
> +       if (bio->bi_opf & REQ_NOWAIT) {
> +               bio_wouldblock_error(bio);
> +               return 0;

Shall we return -EAGAIN here?

> +       }
> +       wait_barrier(conf, false);
>
>         /*
>          * Check reshape again to avoid reshape happens after checking
> @@ -1649,7 +1683,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 allow_barrier(conf);
>                 /* Resend the fist split part */
>                 submit_bio_noacct(split);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>         div_u64_rem(bio_end, stripe_size, &remainder);
>         if (remainder) {
> @@ -1660,7 +1694,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 /* Resend the second split part */
>                 submit_bio_noacct(bio);
>                 bio = split;
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>         }
>
>         bio_start = bio->bi_iter.bi_sector;
> @@ -1816,7 +1850,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>                 end_disk_offset += geo->stride;
>                 atomic_inc(&first_r10bio->remaining);
>                 raid_end_discard_bio(r10_bio);
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 goto retry_discard;
>         }
>
> @@ -2011,7 +2045,7 @@ static void print_conf(struct r10conf *conf)
>
>  static void close_sync(struct r10conf *conf)
>  {
> -       wait_barrier(conf);
> +       wait_barrier(conf, false);
>         allow_barrier(conf);
>
>         mempool_exit(&conf->r10buf_pool);
> @@ -4819,7 +4853,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>         if (need_flush ||
>             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
>                 /* Need to update reshape_position in metadata */
> -               wait_barrier(conf);
> +               wait_barrier(conf, false);
>                 mddev->reshape_position = conf->reshape_progress;
>                 if (mddev->reshape_backwards)
>                         mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
                                                               ` (4 preceding siblings ...)
  2021-12-23  1:22                                             ` Song Liu
@ 2021-12-23  2:57                                             ` Song Liu
  2021-12-23  3:08                                               ` Vishal Verma
  2022-01-02  0:11                                               ` Song Liu
  2021-12-23  8:36                                             ` Christoph Hellwig
  6 siblings, 2 replies; 86+ messages in thread
From: Song Liu @ 2021-12-23  2:57 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> it for linear target") added support for REQ_NOWAIT for dm. This uses
> a similar approach to incorporate REQ_NOWAIT for md based bios.
>
> This patch was tested using t/io_uring tool within FIO. A nvme drive
> was partitioned into 2 partitions and a simple raid 0 configuration
> /dev/md0 was created.
>
> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>       937423872 blocks super 1.2 512k chunks
>
> Before patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38396   38396 pts/2    00:00:00 io_uring
>   38396   38397 pts/2    00:00:15 io_uring
>   38396   38398 pts/2    00:00:13 iou-wrk-38397
>
> We can see the iou-wrk-38397 io worker thread, which gets created
> when io_uring sees that the underlying device (/dev/md0 in this case)
> doesn't support nowait.
>
> After patch:
>
> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>
> Running top while the above runs:
>
> $ ps -eL | grep $(pidof io_uring)
>
>   38341   38341 pts/2    00:10:22 io_uring
>   38341   38342 pts/2    00:10:37 io_uring
>
> After running this patch, we don't see any io worker thread
> being created, which indicates that io_uring saw that the
> underlying device does support nowait. This is the exact behaviour
> noticed on a dm device, which also supports nowait.
>
> For all the other raid personalities except raid0, we would need
> to modify the pieces which involve the make_request fn in order for
> them to correctly handle REQ_NOWAIT.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>

I have made some changes and applied the set to md-next. However,
I think we don't yet have enough test coverage. Please continue testing
the code and send fixes on top of it. Based on the test results, we will
see whether we can ship it in the next merge window.

Note, md-next branch doesn't have [1], so we need to cherry-pick it
for testing.

Thanks,
Song

[1] a08ed9aae8a3 ("block: fix double bio queue when merging in cached
request path")

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2021-12-23  2:57                                             ` Song Liu
@ 2021-12-23  3:08                                               ` Vishal Verma
  2022-01-02  0:11                                               ` Song Liu
  1 sibling, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-23  3:08 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/22/21 7:57 PM, Song Liu wrote:
> On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
>> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
>> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
>> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
>> it for linear target") added support for REQ_NOWAIT for dm. This uses
>> a similar approach to incorporate REQ_NOWAIT for md based bios.
>>
>> This patch was tested using t/io_uring tool within FIO. A nvme drive
>> was partitioned into 2 partitions and a simple raid 0 configuration
>> /dev/md0 was created.
>>
>> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>>        937423872 blocks super 1.2 512k chunks
>>
>> Before patch:
>>
>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>
>> Running top while the above runs:
>>
>> $ ps -eL | grep $(pidof io_uring)
>>
>>    38396   38396 pts/2    00:00:00 io_uring
>>    38396   38397 pts/2    00:00:15 io_uring
>>    38396   38398 pts/2    00:00:13 iou-wrk-38397
>>
>> We can see the iou-wrk-38397 io worker thread, which gets created
>> when io_uring sees that the underlying device (/dev/md0 in this case)
>> doesn't support nowait.
>>
>> After patch:
>>
>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>
>> Running top while the above runs:
>>
>> $ ps -eL | grep $(pidof io_uring)
>>
>>    38341   38341 pts/2    00:10:22 io_uring
>>    38341   38342 pts/2    00:10:37 io_uring
>>
>> After running this patch, we don't see any io worker thread
>> being created, which indicates that io_uring saw that the
>> underlying device does support nowait. This is the exact behaviour
>> noticed on a dm device, which also supports nowait.
>>
>> For all the other raid personalities except raid0, we would need
>> to modify the pieces which involve the make_request fn in order for
>> them to correctly handle REQ_NOWAIT.
>>
>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
> I have made some changes and applied the set to md-next. However,
> I think we don't yet have enough test coverage. Please continue testing
> the code and send fixes on top of it. Based on the test results, we will
> see whether we can ship it in the next merge window.
>
> Note, md-next branch doesn't have [1], so we need to cherry-pick it
> for testing.
>
> Thanks,
> Song
>
> [1] a08ed9aae8a3 ("block: fix double bio queue when merging in cached
> request path")
Great, and I agree will continue testing this.

Just saw you already addressing some silly things
I missed in v6. Sorry about that.

Thank you!

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
                                                               ` (5 preceding siblings ...)
  2021-12-23  2:57                                             ` Song Liu
@ 2021-12-23  8:36                                             ` Christoph Hellwig
  6 siblings, 0 replies; 86+ messages in thread
From: Christoph Hellwig @ 2021-12-23  8:36 UTC (permalink / raw)
  To: Vishal Verma; +Cc: song, linux-raid, axboe, rgoldwyn

Please post the new series in a new thread.

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 4/4] md: raid456 add nowait support
  2021-12-21 20:06                                             ` [PATCH v6 4/4] md: raid456 " Vishal Verma
  2021-12-21 22:02                                               ` John Stoffel
@ 2021-12-25  2:14                                               ` Song Liu
       [not found]                                                 ` <aadc6d52-bc6e-527a-3b9c-0be225f9b727@digitalocean.com>
  1 sibling, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-25  2:14 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
> Returns EAGAIN in case the raid456 driver would block
> waiting for situations like:
>
>   - Reshape operation,
>   - Discard operation.
>
> Signed-off-by: Vishal Verma <vverma@digitalocean.com>

I think we will need the following fix for raid456:

============================ 8< ============================

diff --git i/drivers/md/raid5.c w/drivers/md/raid5.c
index 6ab22f29dacd..55d372ce3300 100644
--- i/drivers/md/raid5.c
+++ w/drivers/md/raid5.c
@@ -5717,6 +5717,7 @@ static void make_discard_request(struct mddev
*mddev, struct bio *bi)
                        raid5_release_stripe(sh);
                        /* Bail out if REQ_NOWAIT is set */
                        if (bi->bi_opf & REQ_NOWAIT) {
+                               finish_wait(&conf->wait_for_overlap, &w);
                                bio_wouldblock_error(bi);
                                return;
                        }
@@ -5734,6 +5735,7 @@ static void make_discard_request(struct mddev
*mddev, struct bio *bi)
                                raid5_release_stripe(sh);
                                /* Bail out if REQ_NOWAIT is set */
                                if (bi->bi_opf & REQ_NOWAIT) {
+
finish_wait(&conf->wait_for_overlap, &w);
                                        bio_wouldblock_error(bi);
                                        return;
                                }
@@ -5829,7 +5831,6 @@ static bool raid5_make_request(struct mddev
*mddev, struct bio * bi)
        last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;

-       md_account_bio(mddev, &bi);
        /* Bail out if REQ_NOWAIT is set */
        if ((bi->bi_opf & REQ_NOWAIT) &&
            (conf->reshape_progress != MaxSector) &&
@@ -5837,9 +5838,11 @@ static bool raid5_make_request(struct mddev
*mddev, struct bio * bi)
            ? (logical_sector > conf->reshape_progress &&
logical_sector <= conf->reshape_safe)
            : (logical_sector >= conf->reshape_safe && logical_sector
< conf->reshape_progress))) {
                bio_wouldblock_error(bi);
+               if (rw == WRITE)
+                       md_write_end(mddev);
                return true;
        }
-
+       md_account_bio(mddev, &bi);
        prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
        for (; logical_sector < last_sector; logical_sector +=
RAID5_STRIPE_SECTORS(conf)) {
                int previous;

============================ 8< ============================

Vishal, please try to trigger all these conditions (including raid1,
raid10) and make sure
they work properly.

For example, I triggered raid5 reshape and used something like the
following to make
sure the logic is triggered:

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 55d372ce3300..e79de48a0027 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5840,6 +5840,11 @@ static bool raid5_make_request(struct mddev
*mddev, struct bio * bi)
                bio_wouldblock_error(bi);
                if (rw == WRITE)
                        md_write_end(mddev);
+               {
+                       static int count = 0;
+                       if (count++ < 10)
+                               pr_info("%s REQ_NOWAIT return\n", __func__);
+               }
                return true;
        }
        md_account_bio(mddev, &bi);

Thanks,
Song

^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 4/4] md: raid456 add nowait support
       [not found]                                                 ` <aadc6d52-bc6e-527a-3b9c-0be225f9b727@digitalocean.com>
@ 2021-12-25 22:13                                                   ` Vishal Verma
  2021-12-26  0:07                                                     ` Song Liu
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-25 22:13 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/25/21 12:28 AM, Vishal Verma wrote:
>
>
> On 12/24/21 7:14 PM, Song Liu wrote:
>> On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma<vverma@digitalocean.com>  wrote:
>>> Returns EAGAIN in case the raid456 driver would block
>>> waiting for situations like:
>>>
>>>    - Reshape operation,
>>>    - Discard operation.
>>>
>>> Signed-off-by: Vishal Verma<vverma@digitalocean.com>
>> I think we will need the following fix for raid456:
> Ack
>> ============================ 8< ============================
>>
>> diff --git i/drivers/md/raid5.c w/drivers/md/raid5.c
>> index 6ab22f29dacd..55d372ce3300 100644
>> --- i/drivers/md/raid5.c
>> +++ w/drivers/md/raid5.c
>> @@ -5717,6 +5717,7 @@ static void make_discard_request(struct mddev
>> *mddev, struct bio *bi)
>>                          raid5_release_stripe(sh);
>>                          /* Bail out if REQ_NOWAIT is set */
>>                          if (bi->bi_opf & REQ_NOWAIT) {
>> +                               finish_wait(&conf->wait_for_overlap, &w);
>>                                  bio_wouldblock_error(bi);
>>                                  return;
>>                          }
>> @@ -5734,6 +5735,7 @@ static void make_discard_request(struct mddev
>> *mddev, struct bio *bi)
>>                                  raid5_release_stripe(sh);
>>                                  /* Bail out if REQ_NOWAIT is set */
>>                                  if (bi->bi_opf & REQ_NOWAIT) {
>> +
>> finish_wait(&conf->wait_for_overlap, &w);
>>                                          bio_wouldblock_error(bi);
>>                                          return;
>>                                  }
>> @@ -5829,7 +5831,6 @@ static bool raid5_make_request(struct mddev
>> *mddev, struct bio * bi)
>>          last_sector = bio_end_sector(bi);
>>          bi->bi_next = NULL;
>>
>> -       md_account_bio(mddev, &bi);
>>          /* Bail out if REQ_NOWAIT is set */
>>          if ((bi->bi_opf & REQ_NOWAIT) &&
>>              (conf->reshape_progress != MaxSector) &&
>> @@ -5837,9 +5838,11 @@ static bool raid5_make_request(struct mddev
>> *mddev, struct bio * bi)
>>              ? (logical_sector > conf->reshape_progress &&
>> logical_sector <= conf->reshape_safe)
>>              : (logical_sector >= conf->reshape_safe && logical_sector
>> < conf->reshape_progress))) {
>>                  bio_wouldblock_error(bi);
>> +               if (rw == WRITE)
>> +                       md_write_end(mddev);
>>                  return true;
>>          }
>> -
>> +       md_account_bio(mddev, &bi);
>>          prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
>>          for (; logical_sector < last_sector; logical_sector +=
>> RAID5_STRIPE_SECTORS(conf)) {
>>                  int previous;
>>
>> ============================ 8< ============================
>>
>> Vishal, please try to trigger all these conditions (including raid1,
>> raid10) and make sure
>> they work properly.
>>
>> For example, I triggered raid5 reshape and used something like the
>> following to make
>> sure the logic is triggered:
>>
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 55d372ce3300..e79de48a0027 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -5840,6 +5840,11 @@ static bool raid5_make_request(struct mddev
>> *mddev, struct bio * bi)
>>                  bio_wouldblock_error(bi);
>>                  if (rw == WRITE)
>>                          md_write_end(mddev);
>> +               {
>> +                       static int count = 0;
>> +                       if (count++ < 10)
>> +                               pr_info("%s REQ_NOWAIT return\n", __func__);
>> +               }
>>                  return true;
>>          }
>>          md_account_bio(mddev, &bi);
>>
>> Thanks,
>> Song
>>
> Sure, will try this and verify for raid1/10.
I am running into an issue during raid10 reshape. I can see the nowait 
code getting triggered during reshape, but it seems like the reshape 
operation was stuck as soon as I issued write IO using FIO to the array 
during reshape.
FIO also seem stuck i.e no IO went through...


^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 4/4] md: raid456 add nowait support
  2021-12-25 22:13                                                   ` Vishal Verma
@ 2021-12-26  0:07                                                     ` Song Liu
  2021-12-26  4:02                                                       ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Song Liu @ 2021-12-26  0:07 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Sat, Dec 25, 2021 at 2:13 PM Vishal Verma <vverma@digitalocean.com> wrote:
>
>
> On 12/25/21 12:28 AM, Vishal Verma wrote:
> >
> >
> > On 12/24/21 7:14 PM, Song Liu wrote:
> >> On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma<vverma@digitalocean.com>  wrote:
> >>> Returns EAGAIN in case the raid456 driver would block
> >>> waiting for situations like:
> >>>
> >>>    - Reshape operation,
> >>>    - Discard operation.
> >>>
> >>> Signed-off-by: Vishal Verma<vverma@digitalocean.com>
> >> I think we will need the following fix for raid456:
> > Ack
> >> ============================ 8< ============================
> >>
> >> diff --git i/drivers/md/raid5.c w/drivers/md/raid5.c
> >> index 6ab22f29dacd..55d372ce3300 100644
> >> --- i/drivers/md/raid5.c
> >> +++ w/drivers/md/raid5.c
> >> @@ -5717,6 +5717,7 @@ static void make_discard_request(struct mddev
> >> *mddev, struct bio *bi)
> >>                          raid5_release_stripe(sh);
> >>                          /* Bail out if REQ_NOWAIT is set */
> >>                          if (bi->bi_opf & REQ_NOWAIT) {
> >> +                               finish_wait(&conf->wait_for_overlap, &w);
> >>                                  bio_wouldblock_error(bi);
> >>                                  return;
> >>                          }
> >> @@ -5734,6 +5735,7 @@ static void make_discard_request(struct mddev
> >> *mddev, struct bio *bi)
> >>                                  raid5_release_stripe(sh);
> >>                                  /* Bail out if REQ_NOWAIT is set */
> >>                                  if (bi->bi_opf & REQ_NOWAIT) {
> >> +
> >> finish_wait(&conf->wait_for_overlap, &w);
> >>                                          bio_wouldblock_error(bi);
> >>                                          return;
> >>                                  }
> >> @@ -5829,7 +5831,6 @@ static bool raid5_make_request(struct mddev
> >> *mddev, struct bio * bi)
> >>          last_sector = bio_end_sector(bi);
> >>          bi->bi_next = NULL;
> >>
> >> -       md_account_bio(mddev, &bi);
> >>          /* Bail out if REQ_NOWAIT is set */
> >>          if ((bi->bi_opf & REQ_NOWAIT) &&
> >>              (conf->reshape_progress != MaxSector) &&
> >> @@ -5837,9 +5838,11 @@ static bool raid5_make_request(struct mddev
> >> *mddev, struct bio * bi)
> >>              ? (logical_sector > conf->reshape_progress &&
> >> logical_sector <= conf->reshape_safe)
> >>              : (logical_sector >= conf->reshape_safe && logical_sector
> >> < conf->reshape_progress))) {
> >>                  bio_wouldblock_error(bi);
> >> +               if (rw == WRITE)
> >> +                       md_write_end(mddev);
> >>                  return true;
> >>          }
> >> -
> >> +       md_account_bio(mddev, &bi);
> >>          prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
> >>          for (; logical_sector < last_sector; logical_sector +=
> >> RAID5_STRIPE_SECTORS(conf)) {
> >>                  int previous;
> >>
> >> ============================ 8< ============================
> >>
> >> Vishal, please try to trigger all these conditions (including raid1,
> >> raid10) and make sure
> >> they work properly.
> >>
> >> For example, I triggered raid5 reshape and used something like the
> >> following to make
> >> sure the logic is triggered:
> >>
> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> >> index 55d372ce3300..e79de48a0027 100644
> >> --- a/drivers/md/raid5.c
> >> +++ b/drivers/md/raid5.c
> >> @@ -5840,6 +5840,11 @@ static bool raid5_make_request(struct mddev
> >> *mddev, struct bio * bi)
> >>                  bio_wouldblock_error(bi);
> >>                  if (rw == WRITE)
> >>                          md_write_end(mddev);
> >> +               {
> >> +                       static int count = 0;
> >> +                       if (count++ < 10)
> >> +                               pr_info("%s REQ_NOWAIT return\n", __func__);
> >> +               }
> >>                  return true;
> >>          }
> >>          md_account_bio(mddev, &bi);
> >>
> >> Thanks,
> >> Song
> >>
> > Sure, will try this and verify for raid1/10.

Please also try test raid5 with discard. I haven't tested those two
conditions yet.

> I am running into an issue during raid10 reshape. I can see the nowait
> code getting triggered during reshape, but it seems like the reshape
> operation was stuck as soon as I issued write IO using FIO to the array
> during reshape.
> FIO also seem stuck i.e no IO went through...

Maybe the following could fix it?

Thanks,
Song

diff --git i/drivers/md/raid10.c w/drivers/md/raid10.c
index e2c524d50ec0..291eceaeb26c 100644
--- i/drivers/md/raid10.c
+++ w/drivers/md/raid10.c
@@ -1402,14 +1402,14 @@ static void raid10_write_request(struct mddev
*mddev, struct bio *bio,
             : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
                bio->bi_iter.bi_sector < conf->reshape_progress))) {
                /* Need to update reshape_position in metadata */
-               mddev->reshape_position = conf->reshape_progress;
-               set_mask_bits(&mddev->sb_flags, 0,
-                             BIT(MD_SB_CHANGE_DEVS) |
BIT(MD_SB_CHANGE_PENDING));
-               md_wakeup_thread(mddev->thread);
                if (bio->bi_opf & REQ_NOWAIT) {
                        bio_wouldblock_error(bio);
                        return;
                }
+               mddev->reshape_position = conf->reshape_progress;
+               set_mask_bits(&mddev->sb_flags, 0,
+                             BIT(MD_SB_CHANGE_DEVS) |
BIT(MD_SB_CHANGE_PENDING));
+               md_wakeup_thread(mddev->thread);
                raid10_log(conf->mddev, "wait reshape metadata");
                wait_event(mddev->sb_wait,
                           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));

^ permalink raw reply related	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 4/4] md: raid456 add nowait support
  2021-12-26  0:07                                                     ` Song Liu
@ 2021-12-26  4:02                                                       ` Vishal Verma
  2021-12-26 21:20                                                         ` Vishal Verma
  0 siblings, 1 reply; 86+ messages in thread
From: Vishal Verma @ 2021-12-26  4:02 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/25/21 5:07 PM, Song Liu wrote:
> On Sat, Dec 25, 2021 at 2:13 PM Vishal Verma <vverma@digitalocean.com> wrote:
>>
>> On 12/25/21 12:28 AM, Vishal Verma wrote:
>>>
>>> On 12/24/21 7:14 PM, Song Liu wrote:
>>>> On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma<vverma@digitalocean.com>  wrote:
>>>>> Returns EAGAIN in case the raid456 driver would block
>>>>> waiting for situations like:
>>>>>
>>>>>     - Reshape operation,
>>>>>     - Discard operation.
>>>>>
>>>>> Signed-off-by: Vishal Verma<vverma@digitalocean.com>
>>>> I think we will need the following fix for raid456:
>>> Ack
>>>> ============================ 8< ============================
>>>>
>>>> diff --git i/drivers/md/raid5.c w/drivers/md/raid5.c
>>>> index 6ab22f29dacd..55d372ce3300 100644
>>>> --- i/drivers/md/raid5.c
>>>> +++ w/drivers/md/raid5.c
>>>> @@ -5717,6 +5717,7 @@ static void make_discard_request(struct mddev
>>>> *mddev, struct bio *bi)
>>>>                           raid5_release_stripe(sh);
>>>>                           /* Bail out if REQ_NOWAIT is set */
>>>>                           if (bi->bi_opf & REQ_NOWAIT) {
>>>> +                               finish_wait(&conf->wait_for_overlap, &w);
>>>>                                   bio_wouldblock_error(bi);
>>>>                                   return;
>>>>                           }
>>>> @@ -5734,6 +5735,7 @@ static void make_discard_request(struct mddev
>>>> *mddev, struct bio *bi)
>>>>                                   raid5_release_stripe(sh);
>>>>                                   /* Bail out if REQ_NOWAIT is set */
>>>>                                   if (bi->bi_opf & REQ_NOWAIT) {
>>>> +
>>>> finish_wait(&conf->wait_for_overlap, &w);
>>>>                                           bio_wouldblock_error(bi);
>>>>                                           return;
>>>>                                   }
>>>> @@ -5829,7 +5831,6 @@ static bool raid5_make_request(struct mddev
>>>> *mddev, struct bio * bi)
>>>>           last_sector = bio_end_sector(bi);
>>>>           bi->bi_next = NULL;
>>>>
>>>> -       md_account_bio(mddev, &bi);
>>>>           /* Bail out if REQ_NOWAIT is set */
>>>>           if ((bi->bi_opf & REQ_NOWAIT) &&
>>>>               (conf->reshape_progress != MaxSector) &&
>>>> @@ -5837,9 +5838,11 @@ static bool raid5_make_request(struct mddev
>>>> *mddev, struct bio * bi)
>>>>               ? (logical_sector > conf->reshape_progress &&
>>>> logical_sector <= conf->reshape_safe)
>>>>               : (logical_sector >= conf->reshape_safe && logical_sector
>>>> < conf->reshape_progress))) {
>>>>                   bio_wouldblock_error(bi);
>>>> +               if (rw == WRITE)
>>>> +                       md_write_end(mddev);
>>>>                   return true;
>>>>           }
>>>> -
>>>> +       md_account_bio(mddev, &bi);
>>>>           prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
>>>>           for (; logical_sector < last_sector; logical_sector +=
>>>> RAID5_STRIPE_SECTORS(conf)) {
>>>>                   int previous;
>>>>
>>>> ============================ 8< ============================
>>>>
>>>> Vishal, please try to trigger all these conditions (including raid1,
>>>> raid10) and make sure
>>>> they work properly.
>>>>
>>>> For example, I triggered raid5 reshape and used something like the
>>>> following to make
>>>> sure the logic is triggered:
>>>>
>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>> index 55d372ce3300..e79de48a0027 100644
>>>> --- a/drivers/md/raid5.c
>>>> +++ b/drivers/md/raid5.c
>>>> @@ -5840,6 +5840,11 @@ static bool raid5_make_request(struct mddev
>>>> *mddev, struct bio * bi)
>>>>                   bio_wouldblock_error(bi);
>>>>                   if (rw == WRITE)
>>>>                           md_write_end(mddev);
>>>> +               {
>>>> +                       static int count = 0;
>>>> +                       if (count++ < 10)
>>>> +                               pr_info("%s REQ_NOWAIT return\n", __func__);
>>>> +               }
>>>>                   return true;
>>>>           }
>>>>           md_account_bio(mddev, &bi);
>>>>
>>>> Thanks,
>>>> Song
>>>>
>>> Sure, will try this and verify for raid1/10.
> Please also try test raid5 with discard. I haven't tested those two
> conditions yet.
Ack.
>
>> I am running into an issue during raid10 reshape. I can see the nowait
>> code getting triggered during reshape, but it seems like the reshape
>> operation was stuck as soon as I issued write IO using FIO to the array
>> during reshape.
>> FIO also seem stuck i.e no IO went through...
> Maybe the following could fix it?
>
> Thanks,
> Song
Hmm no luck, still the same issue.
> diff --git i/drivers/md/raid10.c w/drivers/md/raid10.c
> index e2c524d50ec0..291eceaeb26c 100644
> --- i/drivers/md/raid10.c
> +++ w/drivers/md/raid10.c
> @@ -1402,14 +1402,14 @@ static void raid10_write_request(struct mddev
> *mddev, struct bio *bio,
>               : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
>                  bio->bi_iter.bi_sector < conf->reshape_progress))) {
>                  /* Need to update reshape_position in metadata */
> -               mddev->reshape_position = conf->reshape_progress;
> -               set_mask_bits(&mddev->sb_flags, 0,
> -                             BIT(MD_SB_CHANGE_DEVS) |
> BIT(MD_SB_CHANGE_PENDING));
> -               md_wakeup_thread(mddev->thread);
>                  if (bio->bi_opf & REQ_NOWAIT) {
>                          bio_wouldblock_error(bio);
>                          return;
>                  }
> +               mddev->reshape_position = conf->reshape_progress;
> +               set_mask_bits(&mddev->sb_flags, 0,
> +                             BIT(MD_SB_CHANGE_DEVS) |
> BIT(MD_SB_CHANGE_PENDING));
> +               md_wakeup_thread(mddev->thread);
>                  raid10_log(conf->mddev, "wait reshape metadata");
>                  wait_event(mddev->sb_wait,
>                             !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 4/4] md: raid456 add nowait support
  2021-12-26  4:02                                                       ` Vishal Verma
@ 2021-12-26 21:20                                                         ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2021-12-26 21:20 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 12/25/21 9:02 PM, Vishal Verma wrote:
>
> On 12/25/21 5:07 PM, Song Liu wrote:
>> On Sat, Dec 25, 2021 at 2:13 PM Vishal Verma 
>> <vverma@digitalocean.com> wrote:
>>>
>>> On 12/25/21 12:28 AM, Vishal Verma wrote:
>>>>
>>>> On 12/24/21 7:14 PM, Song Liu wrote:
>>>>> On Tue, Dec 21, 2021 at 12:06 PM Vishal 
>>>>> Verma<vverma@digitalocean.com>  wrote:
>>>>>> Returns EAGAIN in case the raid456 driver would block
>>>>>> waiting for situations like:
>>>>>>
>>>>>>     - Reshape operation,
>>>>>>     - Discard operation.
>>>>>>
>>>>>> Signed-off-by: Vishal Verma<vverma@digitalocean.com>
>>>>> I think we will need the following fix for raid456:
>>>> Ack
>>>>> ============================ 8< ============================
>>>>>
>>>>> diff --git i/drivers/md/raid5.c w/drivers/md/raid5.c
>>>>> index 6ab22f29dacd..55d372ce3300 100644
>>>>> --- i/drivers/md/raid5.c
>>>>> +++ w/drivers/md/raid5.c
>>>>> @@ -5717,6 +5717,7 @@ static void make_discard_request(struct mddev
>>>>> *mddev, struct bio *bi)
>>>>>                           raid5_release_stripe(sh);
>>>>>                           /* Bail out if REQ_NOWAIT is set */
>>>>>                           if (bi->bi_opf & REQ_NOWAIT) {
>>>>> + finish_wait(&conf->wait_for_overlap, &w);
>>>>> bio_wouldblock_error(bi);
>>>>>                                   return;
>>>>>                           }
>>>>> @@ -5734,6 +5735,7 @@ static void make_discard_request(struct mddev
>>>>> *mddev, struct bio *bi)
>>>>> raid5_release_stripe(sh);
>>>>>                                   /* Bail out if REQ_NOWAIT is set */
>>>>>                                   if (bi->bi_opf & REQ_NOWAIT) {
>>>>> +
>>>>> finish_wait(&conf->wait_for_overlap, &w);
>>>>> bio_wouldblock_error(bi);
>>>>>                                           return;
>>>>>                                   }
>>>>> @@ -5829,7 +5831,6 @@ static bool raid5_make_request(struct mddev
>>>>> *mddev, struct bio * bi)
>>>>>           last_sector = bio_end_sector(bi);
>>>>>           bi->bi_next = NULL;
>>>>>
>>>>> -       md_account_bio(mddev, &bi);
>>>>>           /* Bail out if REQ_NOWAIT is set */
>>>>>           if ((bi->bi_opf & REQ_NOWAIT) &&
>>>>>               (conf->reshape_progress != MaxSector) &&
>>>>> @@ -5837,9 +5838,11 @@ static bool raid5_make_request(struct mddev
>>>>> *mddev, struct bio * bi)
>>>>>               ? (logical_sector > conf->reshape_progress &&
>>>>> logical_sector <= conf->reshape_safe)
>>>>>               : (logical_sector >= conf->reshape_safe && 
>>>>> logical_sector
>>>>> < conf->reshape_progress))) {
>>>>>                   bio_wouldblock_error(bi);
>>>>> +               if (rw == WRITE)
>>>>> +                       md_write_end(mddev);
>>>>>                   return true;
>>>>>           }
>>>>> -
>>>>> +       md_account_bio(mddev, &bi);
>>>>>           prepare_to_wait(&conf->wait_for_overlap, &w, 
>>>>> TASK_UNINTERRUPTIBLE);
>>>>>           for (; logical_sector < last_sector; logical_sector +=
>>>>> RAID5_STRIPE_SECTORS(conf)) {
>>>>>                   int previous;
>>>>>
>>>>> ============================ 8< ============================
>>>>>
>>>>> Vishal, please try to trigger all these conditions (including raid1,
>>>>> raid10) and make sure
>>>>> they work properly.
>>>>>
>>>>> For example, I triggered raid5 reshape and used something like the
>>>>> following to make
>>>>> sure the logic is triggered:
>>>>>
>>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>>> index 55d372ce3300..e79de48a0027 100644
>>>>> --- a/drivers/md/raid5.c
>>>>> +++ b/drivers/md/raid5.c
>>>>> @@ -5840,6 +5840,11 @@ static bool raid5_make_request(struct mddev
>>>>> *mddev, struct bio * bi)
>>>>>                   bio_wouldblock_error(bi);
>>>>>                   if (rw == WRITE)
>>>>>                           md_write_end(mddev);
>>>>> +               {
>>>>> +                       static int count = 0;
>>>>> +                       if (count++ < 10)
>>>>> +                               pr_info("%s REQ_NOWAIT return\n", 
>>>>> __func__);
>>>>> +               }
>>>>>                   return true;
>>>>>           }
>>>>>           md_account_bio(mddev, &bi);
>>>>>
>>>>> Thanks,
>>>>> Song
>>>>>
>>>> Sure, will try this and verify for raid1/10.
>> Please also try test raid5 with discard. I haven't tested those two
>> conditions yet.
> Ack.

Do you have a suggestion on how to test this? For example, should I use fstrim
or something similar
to issue a discard op to the raid5 array?

>>
>>> I am running into an issue during raid10 reshape. I can see the nowait
>>> code getting triggered during reshape, but it seems like the reshape
>>> operation was stuck as soon as I issued write IO using FIO to the array
>>> during reshape.
>>> FIO also seem stuck i.e no IO went through...
>> Maybe the following could fix it?
>>
>> Thanks,
>> Song
> Hmm no luck, still the same issue.
It seems both the iou-wrk thread and the md5_reshape thread are hung during reshape...

[  247.889279] task:iou-wrk-9013    state:D stack:    0 pid: 9088 ppid:  
8869 flags:0x00004000
[  247.889282] Call Trace:
[  247.889284]  <TASK>
[  247.889286]  __schedule+0x2d5/0x9b0
[  247.889292]  ? preempt_count_add+0x74/0xc0
[  247.889295]  schedule+0x58/0xd0
[  247.889298]  wait_barrier+0x1ad/0x270 [raid10]
[  247.889301]  ? wait_woken+0x60/0x60
[  247.889304]  regular_request_wait+0x42/0x1e0 [raid10]
[  247.889306]  ? default_wake_function+0x1a/0x30
[  247.889308]  ? autoremove_wake_function+0x12/0x40
[  247.889310]  raid10_write_request+0x85/0x670 [raid10]
[  247.889312]  ? r10bio_pool_alloc+0x26/0x30 [raid10]
[  247.889314]  ? md_write_start+0xa7/0x270
[  247.889318]  raid10_make_request+0xe8/0x170 [raid10]
[  247.889320]  md_handle_request+0x13d/0x1d0
[  247.889322]  ? submit_bio_checks+0x1f6/0x5a0
[  247.889325]  md_submit_bio+0x6d/0xa0
[  247.889326]  __submit_bio+0x94/0x140
[  247.889327]  submit_bio_noacct+0xe1/0x2a0
[  247.889329]  submit_bio+0x48/0x120
[  247.889330]  blkdev_direct_IO+0x19b/0x540
[  247.889332]  ? hctx_unlock+0x17/0x40
[  247.889335]  ? blk_mq_request_issue_directly+0x57/0x80
[  247.889338]  generic_file_direct_write+0x9f/0x190
[  247.889342]  __generic_file_write_iter+0x9d/0x1c0
[  247.889345]  blkdev_write_iter+0xe7/0x160
[  247.889347]  io_write+0x153/0x300
[  247.889350]  ? __this_cpu_preempt_check+0x13/0x20
[  247.889352]  ? __perf_event_task_sched_in+0x81/0x230
[  247.889355]  ? debug_smp_processor_id+0x17/0x20
[  247.889356]  ? __perf_event_task_sched_out+0x77/0x510
[  247.889359]  io_issue_sqe+0x387/0x19c0
[  247.889361]  ? _raw_spin_lock_irqsave+0x1d/0x50
[  247.889363]  ? lock_timer_base+0x72/0xa0
[  247.889367]  io_wq_submit_work+0x67/0x170
[  247.889369]  io_worker_handle_work+0x2b0/0x500
[  247.889372]  io_wqe_worker+0x1ca/0x360
[  247.889374]  ? _raw_spin_unlock+0x1a/0x30
[  247.889376]  ? preempt_count_add+0x74/0xc0
[  247.889377]  ? io_workqueue_create+0x60/0x60
[  247.889380]  ret_from_fork+0x1f/0x30

[  247.908367] task:md5_reshape     state:D stack:    0 pid: 9087 
ppid:     2 flags:0x00004000
[  247.908369] Call Trace:
[  247.908370]  <TASK>
[  247.908371]  __schedule+0x2d5/0x9b0
[  247.908373]  schedule+0x58/0xd0
[  247.908375]  raise_barrier+0xb7/0x170 [raid10]
[  247.908377]  ? wait_woken+0x60/0x60
[  247.908378]  reshape_request+0x1b9/0x920 [raid10]
[  247.908380]  ? __this_cpu_preempt_check+0x13/0x20
[  247.908382]  ? __perf_event_task_sched_in+0x81/0x230
[  247.908384]  raid10_sync_request+0x1073/0x1640 [raid10]
[  247.908386]  ? _raw_spin_unlock+0x1a/0x30
[  247.908388]  ? __switch_to+0x12e/0x430
[  247.908390]  ? __schedule+0x2dd/0x9b0
[  247.908392]  ? blk_flush_plug+0xeb/0x120
[  247.908393]  ? preempt_count_add+0x74/0xc0
[  247.908394]  ? _raw_spin_lock_irqsave+0x1d/0x50
[  247.908396]  md_do_sync.cold+0x3fa/0x97f
[  247.908399]  ? wait_woken+0x60/0x60
[  247.908401]  md_thread+0xae/0x170
[  247.908402]  ? preempt_count_add+0x74/0xc0
[  247.908403]  ? _raw_spin_lock_irqsave+0x1d/0x50
[  247.908405]  kthread+0x177/0x1a0
[  247.908407]  ? md_start_sync+0x60/0x60
[  247.908408]  ? set_kthread_struct+0x40/0x40
[  247.908410]  ret_from_fork+0x1f/0x30
[  247.908412]  </TASK>

>> diff --git i/drivers/md/raid10.c w/drivers/md/raid10.c
>> index e2c524d50ec0..291eceaeb26c 100644
>> --- i/drivers/md/raid10.c
>> +++ w/drivers/md/raid10.c
>> @@ -1402,14 +1402,14 @@ static void raid10_write_request(struct mddev
>> *mddev, struct bio *bio,
>>               : (bio->bi_iter.bi_sector + sectors > 
>> conf->reshape_safe &&
>>                  bio->bi_iter.bi_sector < conf->reshape_progress))) {
>>                  /* Need to update reshape_position in metadata */
>> -               mddev->reshape_position = conf->reshape_progress;
>> -               set_mask_bits(&mddev->sb_flags, 0,
>> -                             BIT(MD_SB_CHANGE_DEVS) |
>> BIT(MD_SB_CHANGE_PENDING));
>> -               md_wakeup_thread(mddev->thread);
>>                  if (bio->bi_opf & REQ_NOWAIT) {
>>                          bio_wouldblock_error(bio);
>>                          return;
>>                  }
>> +               mddev->reshape_position = conf->reshape_progress;
>> +               set_mask_bits(&mddev->sb_flags, 0,
>> +                             BIT(MD_SB_CHANGE_DEVS) |
>> BIT(MD_SB_CHANGE_PENDING));
>> +               md_wakeup_thread(mddev->thread);
>>                  raid10_log(conf->mddev, "wait reshape metadata");
>>                  wait_event(mddev->sb_wait,
>>                             !test_bit(MD_SB_CHANGE_PENDING, 
>> &mddev->sb_flags));

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2021-12-23  2:57                                             ` Song Liu
  2021-12-23  3:08                                               ` Vishal Verma
@ 2022-01-02  0:11                                               ` Song Liu
  2022-01-02  2:08                                                 ` Vishal Verma
  1 sibling, 1 reply; 86+ messages in thread
From: Song Liu @ 2022-01-02  0:11 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-raid, Jens Axboe, rgoldwyn

On Wed, Dec 22, 2021 at 6:57 PM Song Liu <song@kernel.org> wrote:
>
> On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
> >
> > commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
> > for checking whether a given bdev supports handling of REQ_NOWAIT or not.
> > Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
> > it for linear target") added support for REQ_NOWAIT for dm. This uses
> > a similar approach to incorporate REQ_NOWAIT for md based bios.
> >
> > This patch was tested using t/io_uring tool within FIO. A nvme drive
> > was partitioned into 2 partitions and a simple raid 0 configuration
> > /dev/md0 was created.
> >
> > md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
> >       937423872 blocks super 1.2 512k chunks
> >
> > Before patch:
> >
> > $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
> >
> > Running top while the above runs:
> >
> > $ ps -eL | grep $(pidof io_uring)
> >
> >   38396   38396 pts/2    00:00:00 io_uring
> >   38396   38397 pts/2    00:00:15 io_uring
> >   38396   38398 pts/2    00:00:13 iou-wrk-38397
> >
> > We can see iou-wrk-38397 io worker thread created which gets created
> > when io_uring sees that the underlying device (/dev/md0 in this case)
> > doesn't support nowait.
> >
> > After patch:
> >
> > $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
> >
> > Running top while the above runs:
> >
> > $ ps -eL | grep $(pidof io_uring)
> >
> >   38341   38341 pts/2    00:10:22 io_uring
> >   38341   38342 pts/2    00:10:37 io_uring
> >
> > After running this patch, we don't see any io worker thread
> > being created which indicated that io_uring saw that the
> > underlying device does support nowait. This is the exact behaviour
> > noticed on a dm device which also supports nowait.
> >
> > For all the other raid personalities except raid0, we would need
> > to train pieces which involves make_request fn in order for them
> > to correctly handle REQ_NOWAIT.
> >
> > Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>
> I have made some changes and applied the set to md-next. However,
> I think we don't yet have enough test coverage. Please continue testing
> the code and send fixes on top of it. Based on the test results, we will
> see whether we can ship it in the next merge window.
>
> Note, md-next branch doesn't have [1], so we need to cherry-pick it
> for testing.

I went through all these changes again and tested many (but not all)
cases. The latest version is available in md-next branch.

Vishal, please run tests on this version and send fixes if anything
is broken.

Thanks,
Song

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH v6 1/4] md: add support for REQ_NOWAIT
  2022-01-02  0:11                                               ` Song Liu
@ 2022-01-02  2:08                                                 ` Vishal Verma
  0 siblings, 0 replies; 86+ messages in thread
From: Vishal Verma @ 2022-01-02  2:08 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, Jens Axboe, rgoldwyn


On 1/1/22 5:11 PM, Song Liu wrote:
> On Wed, Dec 22, 2021 at 6:57 PM Song Liu <song@kernel.org> wrote:
>> On Tue, Dec 21, 2021 at 12:06 PM Vishal Verma <vverma@digitalocean.com> wrote:
>>> commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support
>>> for checking whether a given bdev supports handling of REQ_NOWAIT or not.
>>> Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable
>>> it for linear target") added support for REQ_NOWAIT for dm. This uses
>>> a similar approach to incorporate REQ_NOWAIT for md based bios.
>>>
>>> This patch was tested using t/io_uring tool within FIO. A nvme drive
>>> was partitioned into 2 partitions and a simple raid 0 configuration
>>> /dev/md0 was created.
>>>
>>> md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0]
>>>        937423872 blocks super 1.2 512k chunks
>>>
>>> Before patch:
>>>
>>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>>
>>> Running top while the above runs:
>>>
>>> $ ps -eL | grep $(pidof io_uring)
>>>
>>>    38396   38396 pts/2    00:00:00 io_uring
>>>    38396   38397 pts/2    00:00:15 io_uring
>>>    38396   38398 pts/2    00:00:13 iou-wrk-38397
>>>
>>> We can see iou-wrk-38397 io worker thread created which gets created
>>> when io_uring sees that the underlying device (/dev/md0 in this case)
>>> doesn't support nowait.
>>>
>>> After patch:
>>>
>>> $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100
>>>
>>> Running top while the above runs:
>>>
>>> $ ps -eL | grep $(pidof io_uring)
>>>
>>>    38341   38341 pts/2    00:10:22 io_uring
>>>    38341   38342 pts/2    00:10:37 io_uring
>>>
>>> After running this patch, we don't see any io worker thread
>>> being created which indicated that io_uring saw that the
>>> underlying device does support nowait. This is the exact behaviour
>>> noticed on a dm device which also supports nowait.
>>>
>>> For all the other raid personalities except raid0, we would need
>>> to train pieces which involves make_request fn in order for them
>>> to correctly handle REQ_NOWAIT.
>>>
>>> Signed-off-by: Vishal Verma <vverma@digitalocean.com>
>> I have made some changes and applied the set to md-next. However,
>> I think we don't yet have enough test coverage. Please continue testing
>> the code and send fixes on top of it. Based on the test results, we will
>> see whether we can ship it in the next merge window.
>>
>> Note, md-next branch doesn't have [1], so we need to cherry-pick it
>> for testing.
> I went through all these changes again and tested many (but not all)
> cases. The latest version is available in md-next branch.
>
> Vishal, please run tests on this version and send fixes if anything
> is broken.
>
> Thanks,
> Song
Thanks Song. This latest version looks good!
And yes, I will report back if I notice any issues.

^ permalink raw reply	[flat|nested] 86+ messages in thread

end of thread, other threads:[~2022-01-02  2:08 UTC | newest]

Thread overview: 86+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-01 21:51 [PATCH] md: add support for REQ_NOWAIT Vishal Verma
2021-11-02  3:41 ` Li Feng
2021-11-02  5:01 ` Song Liu
2021-11-02 14:40   ` [PATCH v2] " Vishal Verma
2021-11-02 15:31     ` Jens Axboe
2021-11-02 18:35     ` Song Liu
2021-11-04  4:51       ` [PATCH v3 2/2] md: raid1 add nowait support Vishal Verma
2021-11-04  4:51         ` [PATCH v3 1/2] md: add support for REQ_NOWAIT Vishal Verma
2021-11-06 15:38           ` Guoqing Jiang
2021-11-07  0:16             ` Vishal Verma
2021-11-08 22:17           ` Song Liu
2021-11-08 22:36             ` Vishal Verma
2021-11-06 15:24         ` [PATCH v3 2/2] md: raid1 add nowait support Guoqing Jiang
2021-11-07  0:18           ` Vishal Verma
2021-11-08 22:32         ` Song Liu
2021-11-08 22:39           ` Vishal Verma
2021-11-09 20:59             ` Vishal Verma
2021-11-10 17:02               ` Song Liu
2021-11-10 17:04                 ` Vishal Verma
2021-11-10 18:14           ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Vishal Verma
2021-11-10 18:14             ` [RFC PATCH v4 2/4] md: raid1 add nowait support Vishal Verma
2021-11-10 18:14             ` [RFC PATCH v4 3/4] md: raid10 " Vishal Verma
2021-12-14  0:32               ` Song Liu
2021-12-14 15:27                 ` Vishal Verma
2021-11-10 18:14             ` [RFC PATCH v4 4/4] md: raid456 " Vishal Verma
2021-11-11 21:42               ` Song Liu
     [not found]                 ` <f8c2a2bc-a885-8254-2b39-fc0c969ac70d@digitalocean.com>
2021-11-19  4:07                   ` Song Liu
2021-11-19  4:20                     ` Vishal Verma
2021-12-09 16:53                     ` Vishal Verma
2021-12-09 16:59                       ` Song Liu
2021-12-09 17:01                         ` Vishal Verma
2021-12-10  2:16               ` Song Liu
2021-12-10  7:18                 ` Song Liu
2021-12-10 18:26                 ` Vishal Verma
2021-12-13  5:56                   ` Song Liu
2021-12-13 22:43                     ` Vishal Verma
2021-12-13 23:35                       ` Jens Axboe
     [not found]                         ` <78d5f029-791e-6d3f-4871-263ec6b5c09b@digitalocean.com>
2021-12-14  1:11                           ` Song Liu
2021-12-14  1:12                             ` Vishal Verma
2021-12-14 15:30                               ` Vishal Verma
2021-12-14 17:08                                 ` Song Liu
2021-12-14 18:09                                   ` Vishal Verma
2021-12-15  6:09                                   ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Vishal Verma
2021-12-15  6:09                                     ` [PATCH v5 2/4] md: raid1 add nowait support Vishal Verma
2021-12-15 20:33                                       ` Song Liu
2021-12-15 22:20                                         ` Vishal Verma
2021-12-21 20:06                                           ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Vishal Verma
2021-12-21 20:06                                             ` [PATCH v6 2/4] md: raid1 add nowait support Vishal Verma
2021-12-21 20:06                                             ` [PATCH v6 3/4] md: raid10 " Vishal Verma
2021-12-22 23:58                                               ` Song Liu
2021-12-23  1:47                                               ` Song Liu
2021-12-21 20:06                                             ` [PATCH v6 4/4] md: raid456 " Vishal Verma
2021-12-21 22:02                                               ` John Stoffel
2021-12-25  2:14                                               ` Song Liu
     [not found]                                                 ` <aadc6d52-bc6e-527a-3b9c-0be225f9b727@digitalocean.com>
2021-12-25 22:13                                                   ` Vishal Verma
2021-12-26  0:07                                                     ` Song Liu
2021-12-26  4:02                                                       ` Vishal Verma
2021-12-26 21:20                                                         ` Vishal Verma
2021-12-22 16:06                                             ` [PATCH v6 1/4] md: add support for REQ_NOWAIT Jens Axboe
2021-12-23  1:22                                             ` Song Liu
2021-12-23  2:57                                             ` Song Liu
2021-12-23  3:08                                               ` Vishal Verma
2022-01-02  0:11                                               ` Song Liu
2022-01-02  2:08                                                 ` Vishal Verma
2021-12-23  8:36                                             ` Christoph Hellwig
2021-12-15  6:09                                     ` [PATCH v5 3/4] md: raid10 add nowait support Vishal Verma
2021-12-15 20:42                                       ` Song Liu
2021-12-15 22:20                                         ` Vishal Verma
2021-12-16  0:30                                           ` Vishal Verma
2021-12-16 16:40                                             ` Vishal Verma
2021-12-16 16:42                                             ` Jens Axboe
2021-12-16 16:45                                               ` Vishal Verma
2021-12-16 18:49                                                 ` Jens Axboe
2021-12-16 19:40                                                   ` Vishal Verma
2021-12-16 20:18                                                     ` Song Liu
2021-12-16 20:37                                                       ` Vishal Verma
2021-12-16 23:50                                                         ` Song Liu
     [not found]                                                           ` <bd90d6e6-adb4-2696-3110-fad0b1ee00dc@digitalocean.com>
2021-12-21  8:13                                                             ` Song Liu
2021-12-21 15:29                                                               ` Vishal Verma
2021-12-21 15:59                                                                 ` Jens Axboe
2021-12-21 16:26                                                                   ` Vishal Verma
2021-12-16 18:14                                               ` Vishal Verma
2021-12-15  6:09                                     ` [PATCH v5 4/4] md: raid456 " Vishal Verma
2021-12-15 20:02                                     ` [PATCH v5 1/4] md: add support for REQ_NOWAIT Song Liu
2021-12-14  0:36                       ` [RFC PATCH v4 4/4] md: raid456 add nowait support Song Liu
2021-12-13 23:50             ` [RFC PATCH v4 1/4] md: add support for REQ_NOWAIT Song Liu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.