stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] md/raid10: fix deadlock when handle read error and running data-check at same time
@ 2023-04-07  6:04 linminggui
  0 siblings, 0 replies; only message in thread
From: linminggui @ 2023-04-07  6:04 UTC (permalink / raw)
  To: stable; +Cc: shli, linminggui

When running data-check and encountering a normal IO error, raid10d
handles the error; one resync IO is added into conf->retry_list waiting
for raid10d to handle it, so the barrier will not drop to zero and the
normal IO (the read-error retry) will be stuck in wait_barrier in
raid10_read_request. After this, the resync thread will be stuck in
raise_barrier, and other processes will be stuck in wait_barrier.
Ignore the barrier for the read-error retry in raid10_read_request to
avoid the deadlock. For kernel linux-4.19.y.

processA      md0_raid10          md0_resync               processB
-------------------------------------------------------------------------
        |         |                     |                      |
read io error     |                     |                      |
        |   handle_read_error     raise_barrier                |
        |         |               (nr_pending=1,barrier=1)     |
                  |                     |                 wait_barrier
                  |                     |       (nr_waiting=1,barrier=1)
           allow_barrier                |                      |
          (nr_pending=0)                |                      |
                  |                     |                      
                  |                conf->retry_list
                  |                     |
                  |                     |
            wait_barrier
          (nr_waiting=2,barrier=1)

[ 1452.065519] INFO: task md0_raid10:381 blocked for more than 120 seconds.
[ 1452.065852]       Tainted: G           OE K   4.19.280 #2
[ 1452.066018] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1452.066189] md0_raid10      D    0   381      2 0x80000000
[ 1452.066191] Call Trace:
[ 1452.066197]  __schedule+0x3f8/0x8b0
[ 1452.066199]  schedule+0x36/0x80
[ 1452.066201]  wait_barrier+0x150/0x1b0
[ 1452.066203]  ? wait_woken+0x80/0x80
[ 1452.066205]  raid10_read_request+0xa8/0x510
[ 1452.066206]  handle_read_error+0xa9/0x220
[ 1452.066207]  ? pick_next_task_fair+0x15d/0x610
[ 1452.066208]  raid10d+0xa01/0x1510
[ 1452.066210]  ? schedule+0x36/0x80
[ 1452.066211]  md_thread+0x133/0x180
[ 1452.066212]  ? md_thread+0x133/0x180
[ 1452.066213]  ? wait_woken+0x80/0x80
[ 1452.066214]  kthread+0x105/0x140

Signed-off-by: linminggui <linminggui1@bigo.sg>
---
 drivers/md/raid10.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9f9cd2f..9f00400 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1137,6 +1137,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	int slot = r10_bio->read_slot;
 	struct md_rdev *err_rdev = NULL;
 	gfp_t gfp = GFP_NOIO;
+	bool error_retry = false;
 
 	if (slot >= 0 && r10_bio->devs[slot].rdev) {
 		/*
@@ -1153,6 +1154,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		 */
 		gfp = GFP_NOIO | __GFP_HIGH;
 
+		error_retry = true;
+		atomic_inc(&conf->nr_pending);
+
 		rcu_read_lock();
 		disk = r10_bio->devs[slot].devnum;
 		err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
@@ -1169,8 +1173,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
+	 * Ignore barrier if this is an error retry.
 	 */
-	wait_barrier(conf);
+	if (!error_retry)
+		wait_barrier(conf);
 
 	sectors = r10_bio->sectors;
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -1181,12 +1187,14 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		 * pass
 		 */
 		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
+		if (!error_retry)
+			allow_barrier(conf);
 		wait_event(conf->wait_barrier,
 			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
 			   conf->reshape_progress >= bio->bi_iter.bi_sector +
 			   sectors);
-		wait_barrier(conf);
+		if (!error_retry)
+			wait_barrier(conf);
 	}
 
 	rdev = read_balance(conf, r10_bio, &max_sectors);
@@ -1208,9 +1216,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		struct bio *split = bio_split(bio, max_sectors,
 					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
-		allow_barrier(conf);
+		if (!error_retry)
+			allow_barrier(conf);
 		generic_make_request(bio);
-		wait_barrier(conf);
+		if (!error_retry)
+			wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
 		r10_bio->sectors = max_sectors;
-- 
2.7.4


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2023-04-07  6:05 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-07  6:04 [PATCH] md/raid10: fix deadlock when handle read error and running data-check at same time linminggui

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).