All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V1] raid5: Correct some failed-stripe because the badsector.
@ 2013-01-14  8:44 majianpeng
  2013-02-07  2:05 ` NeilBrown
  0 siblings, 1 reply; 3+ messages in thread
From: majianpeng @ 2013-01-14  8:44 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, 边, 雨, 周, 麒

At present for failed-stripes,the write/read operations will only return error.But there
are some failed-stripes which can correct by write operation.

For example:
1:Create a raid5 by four disks missing/sdb/sdc/sdd.
2:If readed  data from sdb and met error,so the stripe is failed.Because
the stripe was degraded,there is no chance to correct the read-error.
3:If later write-stripe contains sdb which overwrite.The stripe has a
chance to correct.
4:If step3 successed, add a spare disk to raid,the raid can be active
from degraded.

There are two methods to compute raid5 parity,rcw and rmw.
In this situation,it only used rcw.
There are some conditions for correcting failed-stripe
A: in a stripe there are only one or two baddisks(maybe faulty/removed/read-error/write-error).
B: in a stripe for baddisks exclude parity disks,they must overwrite.
C:After writing data,for read-error disks it must reread to check.


Tested-by: Qi Zhou <qi.g.zhou@gmail.com>
Signed-off-by: Yu Bian <ycbzzjlbyby@gmail.com>
Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
---
 drivers/md/raid5.c |  133 +++++++++++++++++++++++++++++++++++++++++++++-------
 drivers/md/raid5.h |    9 ++++
 2 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 19d77a0..fd8fe58 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -224,6 +224,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
+		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 			list_add_tail(&sh->lru, &conf->inactive_list);
 			wake_up(&conf->wait_for_stripe);
@@ -1800,6 +1801,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			clear_bit(R5_FailedReread, &sh->dev[i].flags);
 		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 
@@ -1855,6 +1857,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else {
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			clear_bit(R5_FailedReread, &sh->dev[i].flags);
 			if (!(set_bad
 			      && test_bit(In_sync, &rdev->flags)
 			      && rdev_set_badblocks(
@@ -1916,6 +1919,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		if (!uptodate) {
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
+			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
 			if (!test_and_set_bit(WantReplacement, &rdev->flags))
 				set_bit(MD_RECOVERY_NEEDED,
 					&rdev->mddev->recovery);
@@ -2523,6 +2527,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (bi)
 			bitmap_end = 1;
 
+		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
+			s->failed_overwrite--;
+		clear_bit(R5_BadParity, &sh->dev[i].flags);
+
 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 			wake_up(&conf->wait_for_overlap);
 
@@ -2838,6 +2846,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
 			 conf->max_degraded, (unsigned long long)recovery_cp,
 			 (unsigned long long)sh->sector);
+	} else if (s->failed > conf->max_degraded) {
+		/*For write failed-stripe, only use rcw */
+		rcw = 1;
+		rmw = 2;
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
@@ -2900,8 +2912,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
-				if (!test_bit(R5_Insync, &dev->flags))
+				if (!test_bit(R5_Insync, &dev->flags)) {
+				 /* Only for all data-disks !R5_Insync */
+					if (s->failed > conf->max_degraded &&
+						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
+						set_bit(STRIPE_DELAYED, &sh->state);
 					continue; /* it's a failed drive */
+				}
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 					pr_debug("Read_old block "
@@ -3347,9 +3364,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			}
 		}
 		clear_bit(R5_Insync, &dev->flags);
-		if (!rdev)
-			/* Not in-sync */;
-		else if (is_bad) {
+		if (!rdev) {
+			s->noworked++;
+			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
+				test_bit(R5_FailedOverwrite, &dev->flags)) {
+				s->failed_overwrite++;
+				set_bit(R5_FailedOverwrite, &dev->flags);
+			}
+		} else if (is_bad) {
 			/* also not in-sync */
 			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
 			    test_bit(R5_UPTODATE, &dev->flags)) {
@@ -3359,6 +3381,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 				set_bit(R5_Insync, &dev->flags);
 				set_bit(R5_ReadError, &dev->flags);
 			}
+			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
+				test_bit(R5_FailedOverwrite, &dev->flags))) {
+				s->failed_overwrite++;
+				set_bit(R5_FailedOverwrite, &dev->flags);
+			} else if (i == sh->pd_idx || i == sh->qd_idx)
+				set_bit(R5_BadParity, &dev->flags);
 		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
 		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
@@ -3410,14 +3439,36 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (test_bit(R5_ReadError, &dev->flags))
-			clear_bit(R5_Insync, &dev->flags);
+		if (test_bit(R5_ReadError, &dev->flags)) {
+			/*
+			 * For failed-stripe which contained badsectors,after writing
+			 * it should reread to check.
+			 */
+			struct md_rdev *rdev2 = rcu_dereference(conf->disks[i].rdev);
+			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
+				test_bit(R5_UPTODATE, &dev->flags) &&
+				(test_bit(R5_FailedOverwrite, &dev->flags) ||
+				 test_bit(R5_BadParity, &dev->flags) ||
+				 test_bit(R5_FailedReread, &dev->flags))) {
+				set_bit(STRIPE_FAILED_REREAD, &sh->state);
+				set_bit(R5_FailedReread, &dev->flags);
+				clear_bit(R5_FailedOverwrite, &dev->flags);
+				clear_bit(R5_BadParity, &dev->flags);
+			} else
+				clear_bit(R5_Insync, &dev->flags);
+		}
+
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s->failed < 2)
 				s->failed_num[s->failed] = i;
 			s->failed++;
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
+			if (i == sh->pd_idx)
+				s->p_failed = 1;
+			else if (i == sh->qd_idx)
+				s->q_failed = 1;
+
 		}
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
@@ -3492,19 +3543,46 @@ static void handle_stripe(struct stripe_head *sh)
 	}
 
 	pr_debug("locked=%d uptodate=%d to_read=%d"
-	       " to_write=%d failed=%d failed_num=%d,%d\n",
+		" to_write=%d failed=%d failed_num=%d,%d"
+		" failed_overwrite=%d p_failed=%d q_failed=%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
-	       s.failed_num[0], s.failed_num[1]);
+	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
+	       s.p_failed, s.q_failed);
+
 	/* check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
 	 */
 	if (s.failed > conf->max_degraded) {
-		sh->check_state = 0;
-		sh->reconstruct_state = 0;
-		if (s.to_read+s.to_write+s.written)
-			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-		if (s.syncing + s.replacing)
-			handle_failed_sync(conf, sh, &s);
+		/*
+		 *Disks which removed or faulty must less or equal max_degraded
+		 */
+		 if (s.noworked <= conf->max_degraded &&
+			((conf->level < 6 &&
+			(s.failed - s.failed_overwrite <= s.p_failed)) ||
+			 (conf->level == 6 &&
+			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
+			set_bit(STRIPE_FAILED_WRITE, &sh->state);
+			pr_debug("stripe(%llu) failed, try to recorrect\n",
+				(unsigned long long)sh->sector);
+		} else if ((s.noworked <= conf->max_degraded &&
+			s.failed_overwrite > 0 &&
+			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
+			pr_debug("stripe(%lld) delay for failed-write\n",
+			(unsigned long long)sh->sector);
+		 else {
+			sh->check_state = 0;
+			sh->reconstruct_state = 0;
+			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
+			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
+			if (s.to_read+s.to_write+s.written) {
+				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+				pr_debug("handle_failed_stripe:%lld\n",
+					(unsigned long long)sh->sector);
+			}
+			if (s.syncing + s.replacing)
+				 handle_failed_sync(conf, sh, &s);
+		}
+
 	}
 
 	/* Now we check to see if any write operations have recently
@@ -3525,6 +3603,7 @@ static void handle_stripe(struct stripe_head *sh)
 		BUG_ON(sh->qd_idx >= 0 &&
 		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
 		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
+		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -3620,10 +3699,30 @@ static void handle_stripe(struct stripe_head *sh)
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
 
-	/* If the failed drives are just a ReadError, then we might need
-	 * to progress the repair/check process
+	/*
+	 * If stripe failed because badsector and
+	 * can write to correct the badsector
+	 * we used readerr routine which rewrite and reread
 	 */
-	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_FailedReread, &dev->flags)
+				&& !test_bit(R5_LOCKED, &dev->flags)
+				&& test_bit(R5_UPTODATE, &dev->flags)) {
+				/* it just wrote and sucessed,so not
+				 * necessary to rewrite,only set flag
+				 */
+				set_bit(R5_ReWrite, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				set_bit(R5_LOCKED, &dev->flags);
+				s.locked++;
+				}
+		}
+	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+		/* If the failed drives are just a ReadError, then we might need
+		 * to progress the repair/check process
+		 */
 		for (i = 0; i < s.failed; i++) {
 			struct r5dev *dev = &sh->dev[s.failed_num[i]];
 			if (test_bit(R5_ReadError, &dev->flags)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a..72fd934 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -251,6 +251,10 @@ struct stripe_head_state {
 	 */
 	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
+	/* Overwrite disks which faulty or removed or contained badsector */
+	int failed_overwrite;
+	int noworked; /*disk faulty or removed*/
+
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num[2];
 	int p_failed, q_failed;
@@ -299,6 +303,9 @@ enum r5dev_flags {
 			 * data in, and now is a good time to write it out.
 			 */
 	R5_Discard,	/* Discard the stripe */
+	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
+	R5_FailedReread,
+	R5_BadParity,
 };
 
 /*
@@ -323,6 +330,8 @@ enum {
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_FAILED_WRITE,
+	STRIPE_FAILED_REREAD,
 };
 
 /*
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH V1] raid5: Correct some failed-stripe because the badsector.
  2013-01-14  8:44 [PATCH V1] raid5: Correct some failed-stripe because the badsector majianpeng
@ 2013-02-07  2:05 ` NeilBrown
  2013-02-19  6:54   ` majianpeng
  0 siblings, 1 reply; 3+ messages in thread
From: NeilBrown @ 2013-02-07  2:05 UTC (permalink / raw)
  To: majianpeng; +Cc: linux-raid, 边, 雨, 周, 麒

[-- Attachment #1: Type: text/plain, Size: 13653 bytes --]

On Mon, 14 Jan 2013 16:44:49 +0800 majianpeng <majianpeng@gmail.com> wrote:

> At present for failed-stripes,the write/read operations will only return error.But there
> are some failed-stripes which can correct by write operation.
> 
> For example:
> 1:Create a raid5 by four disks missing/sdb/sdc/sdd.
> 2:If readed  data from sdb and met error,so the stripe is failed.Because
> the stripe was degraded,there is no chance to correct the read-error.
> 3:If later write-stripe contains sdb which overwrite.The stripe has a
> chance to correct.
> 4:If step3 successed, add a spare disk to raid,the raid can be active
> from degraded.
> 
> There are two methods to compute raid5 parity,rcw and rmw.
> In this situation,it only used rcw.
> There are some conditions for correcting failed-stripe
> A: in a stripe there are only one or two baddisks(maybe faulty/removed/read-error/write-error).
> B: in a stripe for baddisks exclude parity disks,they must overwrite.
> C:After writing data,for read-error disks it must reread to check.
> 
> 
> Tested-by: Qi Zhou <qi.g.zhou@gmail.com>
> Signed-off-by: Yu Bian <ycbzzjlbyby@gmail.com>
> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
> ---
>  drivers/md/raid5.c |  133 +++++++++++++++++++++++++++++++++++++++++++++-------
>  drivers/md/raid5.h |    9 ++++
>  2 files changed, 125 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 19d77a0..fd8fe58 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -224,6 +224,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
>  			    < IO_THRESHOLD)
>  				md_wakeup_thread(conf->mddev->thread);
>  		atomic_dec(&conf->active_stripes);
> +		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>  		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
>  			list_add_tail(&sh->lru, &conf->inactive_list);
>  			wake_up(&conf->wait_for_stripe);
> @@ -1800,6 +1801,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>  			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>  		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
>  			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
>  
> @@ -1855,6 +1857,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>  		else {
>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>  			if (!(set_bad
>  			      && test_bit(In_sync, &rdev->flags)
>  			      && rdev_set_badblocks(
> @@ -1916,6 +1919,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
>  		if (!uptodate) {
>  			set_bit(WriteErrorSeen, &rdev->flags);
>  			set_bit(R5_WriteError, &sh->dev[i].flags);
> +			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
>  			if (!test_and_set_bit(WantReplacement, &rdev->flags))
>  				set_bit(MD_RECOVERY_NEEDED,
>  					&rdev->mddev->recovery);
> @@ -2523,6 +2527,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>  		if (bi)
>  			bitmap_end = 1;
>  
> +		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
> +			s->failed_overwrite--;
> +		clear_bit(R5_BadParity, &sh->dev[i].flags);
> +
>  		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
>  			wake_up(&conf->wait_for_overlap);
>  
> @@ -2838,6 +2846,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
>  			 conf->max_degraded, (unsigned long long)recovery_cp,
>  			 (unsigned long long)sh->sector);
> +	} else if (s->failed > conf->max_degraded) {
> +		/*For write failed-stripe, only use rcw */
> +		rcw = 1;
> +		rmw = 2;
>  	} else for (i = disks; i--; ) {
>  		/* would I have to read this buffer for read_modify_write */
>  		struct r5dev *dev = &sh->dev[i];
> @@ -2900,8 +2912,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  			    !(test_bit(R5_UPTODATE, &dev->flags) ||
>  			      test_bit(R5_Wantcompute, &dev->flags))) {
>  				rcw++;
> -				if (!test_bit(R5_Insync, &dev->flags))
> +				if (!test_bit(R5_Insync, &dev->flags)) {
> +				 /* Only for all data-disks !R5_Insync */
> +					if (s->failed > conf->max_degraded &&
> +						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
> +						set_bit(STRIPE_DELAYED, &sh->state);
>  					continue; /* it's a failed drive */
> +				}
>  				if (
>  				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
>  					pr_debug("Read_old block "
> @@ -3347,9 +3364,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  			}
>  		}
>  		clear_bit(R5_Insync, &dev->flags);
> -		if (!rdev)
> -			/* Not in-sync */;
> -		else if (is_bad) {
> +		if (!rdev) {
> +			s->noworked++;
> +			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
> +				test_bit(R5_FailedOverwrite, &dev->flags)) {
> +				s->failed_overwrite++;
> +				set_bit(R5_FailedOverwrite, &dev->flags);
> +			}
> +		} else if (is_bad) {
>  			/* also not in-sync */
>  			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>  			    test_bit(R5_UPTODATE, &dev->flags)) {
> @@ -3359,6 +3381,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  				set_bit(R5_Insync, &dev->flags);
>  				set_bit(R5_ReadError, &dev->flags);
>  			}
> +			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
> +				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
> +				test_bit(R5_FailedOverwrite, &dev->flags))) {
> +				s->failed_overwrite++;
> +				set_bit(R5_FailedOverwrite, &dev->flags);
> +			} else if (i == sh->pd_idx || i == sh->qd_idx)
> +				set_bit(R5_BadParity, &dev->flags);
>  		} else if (test_bit(In_sync, &rdev->flags))
>  			set_bit(R5_Insync, &dev->flags);
>  		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
> @@ -3410,14 +3439,36 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  			clear_bit(R5_ReadError, &dev->flags);
>  			clear_bit(R5_ReWrite, &dev->flags);
>  		}
> -		if (test_bit(R5_ReadError, &dev->flags))
> -			clear_bit(R5_Insync, &dev->flags);
> +		if (test_bit(R5_ReadError, &dev->flags)) {
> +			/*
> +			 * For failed-stripe which contained badsectors,after writing
> +			 * it should reread to check.
> +			 */
> +			struct md_rdev *rdev2 = rcu_dereference(conf->disks[i].rdev);
> +			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
> +				test_bit(R5_UPTODATE, &dev->flags) &&
> +				(test_bit(R5_FailedOverwrite, &dev->flags) ||
> +				 test_bit(R5_BadParity, &dev->flags) ||
> +				 test_bit(R5_FailedReread, &dev->flags))) {
> +				set_bit(STRIPE_FAILED_REREAD, &sh->state);
> +				set_bit(R5_FailedReread, &dev->flags);
> +				clear_bit(R5_FailedOverwrite, &dev->flags);
> +				clear_bit(R5_BadParity, &dev->flags);
> +			} else
> +				clear_bit(R5_Insync, &dev->flags);
> +		}
> +
>  		if (!test_bit(R5_Insync, &dev->flags)) {
>  			if (s->failed < 2)
>  				s->failed_num[s->failed] = i;
>  			s->failed++;
>  			if (rdev && !test_bit(Faulty, &rdev->flags))
>  				do_recovery = 1;
> +			if (i == sh->pd_idx)
> +				s->p_failed = 1;
> +			else if (i == sh->qd_idx)
> +				s->q_failed = 1;
> +
>  		}
>  	}
>  	if (test_bit(STRIPE_SYNCING, &sh->state)) {
> @@ -3492,19 +3543,46 @@ static void handle_stripe(struct stripe_head *sh)
>  	}
>  
>  	pr_debug("locked=%d uptodate=%d to_read=%d"
> -	       " to_write=%d failed=%d failed_num=%d,%d\n",
> +		" to_write=%d failed=%d failed_num=%d,%d"
> +		" failed_overwrite=%d p_failed=%d q_failed=%d\n",
>  	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
> -	       s.failed_num[0], s.failed_num[1]);
> +	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
> +	       s.p_failed, s.q_failed);
> +
>  	/* check if the array has lost more than max_degraded devices and,
>  	 * if so, some requests might need to be failed.
>  	 */
>  	if (s.failed > conf->max_degraded) {
> -		sh->check_state = 0;
> -		sh->reconstruct_state = 0;
> -		if (s.to_read+s.to_write+s.written)
> -			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
> -		if (s.syncing + s.replacing)
> -			handle_failed_sync(conf, sh, &s);
> +		/*
> +		 *Disks which removed or faulty must less or equal max_degraded
> +		 */
> +		 if (s.noworked <= conf->max_degraded &&
> +			((conf->level < 6 &&
> +			(s.failed - s.failed_overwrite <= s.p_failed)) ||
> +			 (conf->level == 6 &&
> +			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
> +			set_bit(STRIPE_FAILED_WRITE, &sh->state);
> +			pr_debug("stripe(%llu) failed, try to recorrect\n",
> +				(unsigned long long)sh->sector);
> +		} else if ((s.noworked <= conf->max_degraded &&
> +			s.failed_overwrite > 0 &&
> +			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
> +			pr_debug("stripe(%lld) delay for failed-write\n",
> +			(unsigned long long)sh->sector);
> +		 else {
> +			sh->check_state = 0;
> +			sh->reconstruct_state = 0;
> +			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
> +			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
> +			if (s.to_read+s.to_write+s.written) {
> +				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
> +				pr_debug("handle_failed_stripe:%lld\n",
> +					(unsigned long long)sh->sector);
> +			}
> +			if (s.syncing + s.replacing)
> +				 handle_failed_sync(conf, sh, &s);
> +		}
> +
>  	}
>  
>  	/* Now we check to see if any write operations have recently
> @@ -3525,6 +3603,7 @@ static void handle_stripe(struct stripe_head *sh)
>  		BUG_ON(sh->qd_idx >= 0 &&
>  		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
>  		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
> +		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
>  			if (test_bit(R5_LOCKED, &dev->flags) &&
> @@ -3620,10 +3699,30 @@ static void handle_stripe(struct stripe_head *sh)
>  		clear_bit(STRIPE_SYNCING, &sh->state);
>  	}
>  
> -	/* If the failed drives are just a ReadError, then we might need
> -	 * to progress the repair/check process
> +	/*
> +	 * If stripe failed because badsector and
> +	 * can write to correct the badsector
> +	 * we used readerr routine which rewrite and reread
>  	 */
> -	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
> +	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
> +		for (i = disks; i--; ) {
> +			struct r5dev *dev = &sh->dev[i];
> +			if (test_bit(R5_FailedReread, &dev->flags)
> +				&& !test_bit(R5_LOCKED, &dev->flags)
> +				&& test_bit(R5_UPTODATE, &dev->flags)) {
> +				/* it just wrote and sucessed,so not
> +				 * necessary to rewrite,only set flag
> +				 */
> +				set_bit(R5_ReWrite, &dev->flags);
> +				set_bit(R5_Wantread, &dev->flags);
> +				set_bit(R5_LOCKED, &dev->flags);
> +				s.locked++;
> +				}
> +		}
> +	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
> +		/* If the failed drives are just a ReadError, then we might need
> +		 * to progress the repair/check process
> +		 */
>  		for (i = 0; i < s.failed; i++) {
>  			struct r5dev *dev = &sh->dev[s.failed_num[i]];
>  			if (test_bit(R5_ReadError, &dev->flags)
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 18b2c4a..72fd934 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -251,6 +251,10 @@ struct stripe_head_state {
>  	 */
>  	int syncing, expanding, expanded, replacing;
>  	int locked, uptodate, to_read, to_write, failed, written;
> +	/* Overwrite disks which faulty or removed or contained badsector */
> +	int failed_overwrite;
> +	int noworked; /*disk faulty or removed*/
> +
>  	int to_fill, compute, req_compute, non_overwrite;
>  	int failed_num[2];
>  	int p_failed, q_failed;
> @@ -299,6 +303,9 @@ enum r5dev_flags {
>  			 * data in, and now is a good time to write it out.
>  			 */
>  	R5_Discard,	/* Discard the stripe */
> +	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
> +	R5_FailedReread,
> +	R5_BadParity,
>  };
>  
>  /*
> @@ -323,6 +330,8 @@ enum {
>  	STRIPE_COMPUTE_RUN,
>  	STRIPE_OPS_REQ_PENDING,
>  	STRIPE_ON_UNPLUG_LIST,
> +	STRIPE_FAILED_WRITE,
> +	STRIPE_FAILED_REREAD,
>  };
>  
>  /*


Thanks for resending with the irrelevant bits removed.

Unfortunately I'm having a hard time following the logic, which means that if
I apply the patch it will make the code (even more) unmaintainable.

What I need is :
 - a clear description of what each flag really means
 - a clear description of what each new field in stripe_head_state means.
 - a clear description of the sequencing of operations - when and why it
   decided to take a new path to fix things, how it might decide that it
   cannot after all and fail, etc.

I find some constructs rather odd.  e.g.

+		if (!rdev) {
+			s->noworked++;
+			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
+			    test_bit(R5_FailedOverwrite, &dev->flags)) {
+				s->failed_overwrite++;
+				set_bit(R5_FailedOverwrite, &dev->flags);
+			}

I don't follow why the "test_bit(xxx)" is in the if clause.  when would it be
set, but ->towrite and R5_OVERWRITE aren't set?

Maybe I'll try again another week, but for the moment, I'm just not making
any headway in understanding you code - sorry.

NeilBrown


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Re: [PATCH V1] raid5: Correct some failed-stripe because the badsector.
  2013-02-07  2:05 ` NeilBrown
@ 2013-02-19  6:54   ` majianpeng
  0 siblings, 0 replies; 3+ messages in thread
From: majianpeng @ 2013-02-19  6:54 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, 边, 雨, 周, 麒

>On Mon, 14 Jan 2013 16:44:49 +0800 majianpeng <majianpeng@gmail.com> wrote:
>
>> At present for failed-stripes,the write/read operations will only return error.But there
>> are some failed-stripes which can correct by write operation.
>> 
>> For example:
>> 1:Create a raid5 by four disks missing/sdb/sdc/sdd.
>> 2:If readed  data from sdb and met error,so the stripe is failed.Because
>> the stripe was degraded,there is no chance to correct the read-error.
>> 3:If later write-stripe contains sdb which overwrite.The stripe has a
>> chance to correct.
>> 4:If step3 successed, add a spare disk to raid,the raid can be active
>> from degraded.
>> 
>> There are two methods to compute raid5 parity,rcw and rmw.
>> In this situation,it only used rcw.
>> There are some conditions for correcting failed-stripe
>> A: in a stripe there are only one or two baddisks(maybe faulty/removed/read-error/write-error).
>> B: in a stripe for baddisks exclude parity disks,they must overwrite.
>> C:After writing data,for read-error disks it must reread to check.
>> 
>> 
>> Tested-by: Qi Zhou <qi.g.zhou@gmail.com>
>> Signed-off-by: Yu Bian <ycbzzjlbyby@gmail.com>
>> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
>> ---
>>  drivers/md/raid5.c |  133 +++++++++++++++++++++++++++++++++++++++++++++-------
>>  drivers/md/raid5.h |    9 ++++
>>  2 files changed, 125 insertions(+), 17 deletions(-)
>> 
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 19d77a0..fd8fe58 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -224,6 +224,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
>>  			    < IO_THRESHOLD)
>>  				md_wakeup_thread(conf->mddev->thread);
>>  		atomic_dec(&conf->active_stripes);
>> +		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>>  		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
>>  			list_add_tail(&sh->lru, &conf->inactive_list);
>>  			wake_up(&conf->wait_for_stripe);
>> @@ -1800,6 +1801,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>>  			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
>>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>>  		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
>>  			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
>>  
>> @@ -1855,6 +1857,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>>  		else {
>>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>>  			if (!(set_bad
>>  			      && test_bit(In_sync, &rdev->flags)
>>  			      && rdev_set_badblocks(
>> @@ -1916,6 +1919,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
>>  		if (!uptodate) {
>>  			set_bit(WriteErrorSeen, &rdev->flags);
>>  			set_bit(R5_WriteError, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
>>  			if (!test_and_set_bit(WantReplacement, &rdev->flags))
>>  				set_bit(MD_RECOVERY_NEEDED,
>>  					&rdev->mddev->recovery);
>> @@ -2523,6 +2527,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>>  		if (bi)
>>  			bitmap_end = 1;
>>  
>> +		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
>> +			s->failed_overwrite--;
>> +		clear_bit(R5_BadParity, &sh->dev[i].flags);
>> +
>>  		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
>>  			wake_up(&conf->wait_for_overlap);
>>  
>> @@ -2838,6 +2846,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>>  		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
>>  			 conf->max_degraded, (unsigned long long)recovery_cp,
>>  			 (unsigned long long)sh->sector);
>> +	} else if (s->failed > conf->max_degraded) {
>> +		/*For write failed-stripe, only use rcw */
>> +		rcw = 1;
>> +		rmw = 2;
>>  	} else for (i = disks; i--; ) {
>>  		/* would I have to read this buffer for read_modify_write */
>>  		struct r5dev *dev = &sh->dev[i];
>> @@ -2900,8 +2912,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>>  			    !(test_bit(R5_UPTODATE, &dev->flags) ||
>>  			      test_bit(R5_Wantcompute, &dev->flags))) {
>>  				rcw++;
>> -				if (!test_bit(R5_Insync, &dev->flags))
>> +				if (!test_bit(R5_Insync, &dev->flags)) {
>> +				 /* Only for all data-disks !R5_Insync */
>> +					if (s->failed > conf->max_degraded &&
>> +						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
>> +						set_bit(STRIPE_DELAYED, &sh->state);
>>  					continue; /* it's a failed drive */
>> +				}
>>  				if (
>>  				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
>>  					pr_debug("Read_old block "
>> @@ -3347,9 +3364,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  			}
>>  		}
>>  		clear_bit(R5_Insync, &dev->flags);
>> -		if (!rdev)
>> -			/* Not in-sync */;
>> -		else if (is_bad) {
>> +		if (!rdev) {
>> +			s->noworked++;
>> +			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>> +				test_bit(R5_FailedOverwrite, &dev->flags)) {
>> +				s->failed_overwrite++;
>> +				set_bit(R5_FailedOverwrite, &dev->flags);
>> +			}
>> +		} else if (is_bad) {
>>  			/* also not in-sync */
>>  			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>>  			    test_bit(R5_UPTODATE, &dev->flags)) {
>> @@ -3359,6 +3381,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  				set_bit(R5_Insync, &dev->flags);
>>  				set_bit(R5_ReadError, &dev->flags);
>>  			}
>> +			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>> +				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>> +				test_bit(R5_FailedOverwrite, &dev->flags))) {
>> +				s->failed_overwrite++;
>> +				set_bit(R5_FailedOverwrite, &dev->flags);
>> +			} else if (i == sh->pd_idx || i == sh->qd_idx)
>> +				set_bit(R5_BadParity, &dev->flags);
>>  		} else if (test_bit(In_sync, &rdev->flags))
>>  			set_bit(R5_Insync, &dev->flags);
>>  		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
>> @@ -3410,14 +3439,36 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  			clear_bit(R5_ReadError, &dev->flags);
>>  			clear_bit(R5_ReWrite, &dev->flags);
>>  		}
>> -		if (test_bit(R5_ReadError, &dev->flags))
>> -			clear_bit(R5_Insync, &dev->flags);
>> +		if (test_bit(R5_ReadError, &dev->flags)) {
>> +			/*
>> +			 * For failed-stripe which contained badsectors,after writing
>> +			 * it should reread to check.
>> +			 */
>> +			struct md_rdev *rdev2 = rcu_dereference(conf->disks[i].rdev);
>> +			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
>> +				test_bit(R5_UPTODATE, &dev->flags) &&
>> +				(test_bit(R5_FailedOverwrite, &dev->flags) ||
>> +				 test_bit(R5_BadParity, &dev->flags) ||
>> +				 test_bit(R5_FailedReread, &dev->flags))) {
>> +				set_bit(STRIPE_FAILED_REREAD, &sh->state);
>> +				set_bit(R5_FailedReread, &dev->flags);
>> +				clear_bit(R5_FailedOverwrite, &dev->flags);
>> +				clear_bit(R5_BadParity, &dev->flags);
>> +			} else
>> +				clear_bit(R5_Insync, &dev->flags);
>> +		}
>> +
>>  		if (!test_bit(R5_Insync, &dev->flags)) {
>>  			if (s->failed < 2)
>>  				s->failed_num[s->failed] = i;
>>  			s->failed++;
>>  			if (rdev && !test_bit(Faulty, &rdev->flags))
>>  				do_recovery = 1;
>> +			if (i == sh->pd_idx)
>> +				s->p_failed = 1;
>> +			else if (i == sh->qd_idx)
>> +				s->q_failed = 1;
>> +
>>  		}
>>  	}
>>  	if (test_bit(STRIPE_SYNCING, &sh->state)) {
>> @@ -3492,19 +3543,46 @@ static void handle_stripe(struct stripe_head *sh)
>>  	}
>>  
>>  	pr_debug("locked=%d uptodate=%d to_read=%d"
>> -	       " to_write=%d failed=%d failed_num=%d,%d\n",
>> +		" to_write=%d failed=%d failed_num=%d,%d"
>> +		" failed_overwrite=%d p_failed=%d q_failed=%d\n",
>>  	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
>> -	       s.failed_num[0], s.failed_num[1]);
>> +	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
>> +	       s.p_failed, s.q_failed);
>> +
>>  	/* check if the array has lost more than max_degraded devices and,
>>  	 * if so, some requests might need to be failed.
>>  	 */
>>  	if (s.failed > conf->max_degraded) {
>> -		sh->check_state = 0;
>> -		sh->reconstruct_state = 0;
>> -		if (s.to_read+s.to_write+s.written)
>> -			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>> -		if (s.syncing + s.replacing)
>> -			handle_failed_sync(conf, sh, &s);
>> +		/*
>> +		 *Disks which removed or faulty must less or equal max_degraded
>> +		 */
>> +		 if (s.noworked <= conf->max_degraded &&
>> +			((conf->level < 6 &&
>> +			(s.failed - s.failed_overwrite <= s.p_failed)) ||
>> +			 (conf->level == 6 &&
>> +			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
>> +			set_bit(STRIPE_FAILED_WRITE, &sh->state);
>> +			pr_debug("stripe(%llu) failed, try to recorrect\n",
>> +				(unsigned long long)sh->sector);
>> +		} else if ((s.noworked <= conf->max_degraded &&
>> +			s.failed_overwrite > 0 &&
>> +			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
>> +			pr_debug("stripe(%lld) delay for failed-write\n",
>> +			(unsigned long long)sh->sector);
>> +		 else {
>> +			sh->check_state = 0;
>> +			sh->reconstruct_state = 0;
>> +			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>> +			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>> +			if (s.to_read+s.to_write+s.written) {
>> +				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>> +				pr_debug("handle_failed_stripe:%lld\n",
>> +					(unsigned long long)sh->sector);
>> +			}
>> +			if (s.syncing + s.replacing)
>> +				 handle_failed_sync(conf, sh, &s);
>> +		}
>> +
>>  	}
>>  
>>  	/* Now we check to see if any write operations have recently
>> @@ -3525,6 +3603,7 @@ static void handle_stripe(struct stripe_head *sh)
>>  		BUG_ON(sh->qd_idx >= 0 &&
>>  		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
>>  		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
>> +		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>>  		for (i = disks; i--; ) {
>>  			struct r5dev *dev = &sh->dev[i];
>>  			if (test_bit(R5_LOCKED, &dev->flags) &&
>> @@ -3620,10 +3699,30 @@ static void handle_stripe(struct stripe_head *sh)
>>  		clear_bit(STRIPE_SYNCING, &sh->state);
>>  	}
>>  
>> -	/* If the failed drives are just a ReadError, then we might need
>> -	 * to progress the repair/check process
>> +	/*
>> +	 * If stripe failed because badsector and
>> +	 * can write to correct the badsector
>> +	 * we used readerr routine which rewrite and reread
>>  	 */
>> -	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
>> +	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
>> +		for (i = disks; i--; ) {
>> +			struct r5dev *dev = &sh->dev[i];
>> +			if (test_bit(R5_FailedReread, &dev->flags)
>> +				&& !test_bit(R5_LOCKED, &dev->flags)
>> +				&& test_bit(R5_UPTODATE, &dev->flags)) {
>> +				/* it just wrote and sucessed,so not
>> +				 * necessary to rewrite,only set flag
>> +				 */
>> +				set_bit(R5_ReWrite, &dev->flags);
>> +				set_bit(R5_Wantread, &dev->flags);
>> +				set_bit(R5_LOCKED, &dev->flags);
>> +				s.locked++;
>> +				}
>> +		}
>> +	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
>> +		/* If the failed drives are just a ReadError, then we might need
>> +		 * to progress the repair/check process
>> +		 */
>>  		for (i = 0; i < s.failed; i++) {
>>  			struct r5dev *dev = &sh->dev[s.failed_num[i]];
>>  			if (test_bit(R5_ReadError, &dev->flags)
>> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
>> index 18b2c4a..72fd934 100644
>> --- a/drivers/md/raid5.h
>> +++ b/drivers/md/raid5.h
>> @@ -251,6 +251,10 @@ struct stripe_head_state {
>>  	 */
>>  	int syncing, expanding, expanded, replacing;
>>  	int locked, uptodate, to_read, to_write, failed, written;
>> +	/* Overwrite disks which faulty or removed or contained badsector */
>> +	int failed_overwrite;
>> +	int noworked; /*disk faulty or removed*/
>> +
>>  	int to_fill, compute, req_compute, non_overwrite;
>>  	int failed_num[2];
>>  	int p_failed, q_failed;
>> @@ -299,6 +303,9 @@ enum r5dev_flags {
>>  			 * data in, and now is a good time to write it out.
>>  			 */
>>  	R5_Discard,	/* Discard the stripe */
>> +	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
>> +	R5_FailedReread,
>> +	R5_BadParity,
>>  };
>>  
>>  /*
>> @@ -323,6 +330,8 @@ enum {
>>  	STRIPE_COMPUTE_RUN,
>>  	STRIPE_OPS_REQ_PENDING,
>>  	STRIPE_ON_UNPLUG_LIST,
>> +	STRIPE_FAILED_WRITE,
>> +	STRIPE_FAILED_REREAD,
>>  };
>>  
>>  /*
>
>
>Thanks for resending with the irrelevant bits removed.
>
>Unfortunately I'm having a hard time following the logic, which means that if
>I apply the patch it will make the code (even more) unmaintainable.
>
Sorry for delay in replying.
>What I need is :
> - a clear description of what each flag really means
R5_FailedOverwrite:There are two conditions which can used this flag
a)disk which removed or had Faulty flag;  b)disk had badsectors on this stripe.
For those disks it can't read the data.So only overwrite can had a change to do continue.

R5_FailedReread: If disk had badsector and this stripe is failed,if it can do write success.Then it should to reread to check.
etc: raid5 four disks.A/B had badsector.
A B C D(p)
Without this patch, the write must faild.But if A/B are overwrite, so it read C to computer parity. And write A/B/D.
After the write, it should to reread A/B to check.

R5_BadParity: This flag only for parity disk when it had badsectors.When reread the code, i think this flag can remove.
>test_bit(R5_BadParity, &dev->flags)
it can be replaced by (i == sh->pd_idx || i == sh->qd_idx).

> - a clear description of what each new field in stripe_head_state means.
STRIPE_FAILED_WRITE: This flag means this operation is write on a failed stripe.
STRIPE_FAILED_REREAD:After the stripe-failed-write operation, it should reread the disks which had badsector to check the data.

> - a clear description of the sequencing of operations - when and why it
>   decided to take a new path to fix things, how it might decide that it
>   cannot after all and fail, etc.
For failed-stripe-write operation, there are at most two steps. One write,the other is reread if disks had badsectors.
For failed-stripe,there are three cases after analysing stripe state.
a): it can do write; b):now it can't,but it had a chance to do write.so it wait; c)it failed immediately.
So the case a) is the most important.
>        if (s.failed > conf->max_degraded) {
	1:First stripe was failed
>                /*   
>                 * Disks which removed or faulty must less or equal max_degraded
>                 */
>                if (s.noworked <= conf->max_degraded && 
  2:disks which removed or faulty must less or equal max_degraded. The aim of writing failed-stripe is to reuse this stripe.So if this condition was false, it makes no sense.
	This condition is also useful the case b.
>                        ((conf->level < 6 && 
>                         (s.failed - s.failed_overwrite <= s.p_failed)) || 
  3:For raid5,all failed disks which removed or had faulty flag or had badsectos must be overwrite,excepted parity disk. Only this condition is true, it can do rcw write.
>                         (conf->level == 6 && 
>                          (s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) { 
4: For raid6, the difference is that it had two parity disks p and q.
>                        set_bit(STRIPE_FAILED_WRITE, &sh->state);
>                        pr_debug("stripe(%llu) failed, try to recorrect\n",
>                                (unsigned long long)sh->sector);
>                } else if ((s.noworked <= conf->max_degraded &&
>                        s.failed_overwrite > 0 && 
>                        !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
>                        pr_debug("stripe(%lld) delay for failed-write\n",
>                                        (unsigned long long)sh->sector);
	This is for case b which waitting for chance to do.
>			else {
>                        sh->check_state = 0; 
>                        sh->reconstruct_state = 0; 
>                        clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>                        clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>                        if (s.to_read+s.to_write+s.written) {
>                                handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>                                pr_debug("handle_failed_stripe:%lld\n",
>                                        (unsigned long long)sh->sector);
>                        }    
>                        if (s.syncing + s.replacing)
>                                handle_failed_sync(conf, sh, &s); 
>                }
	There, it only failed.
>
>I find some constructs rather odd.  e.g.
>
>+		if (!rdev) {
>+			s->noworked++;
>+			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>+			    test_bit(R5_FailedOverwrite, &dev->flags)) {
>+				s->failed_overwrite++;
>+				set_bit(R5_FailedOverwrite, &dev->flags);
>+			}
>
>I don't follow why the "test_bit(xxx)" is in the if clause.  when would it be
>set, but ->towrite and R5_OVERWRITE aren't set?
A stripe can be called multiple times by func handle_stripe.
The first it check the condition "if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags))"
But after the successfuley write, ->towrite will be set null.So if there is no test_bit(xxx), it will be exec handle_failed_stripe/sync.
There are many places like those.How to different the first-check and successfuly write-check? 
Specifically, if there are some badsectors, it must to  reread.

>
>Maybe I'll try again another week, but for the moment, I'm just not making
>any headway in understanding you code - sorry.
>
>NeilBrown
>
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2013-02-19  6:54 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-01-14  8:44 [PATCH V1] raid5: Correct some failed-stripe because the badsector majianpeng
2013-02-07  2:05 ` NeilBrown
2013-02-19  6:54   ` majianpeng

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.