* [PATCH] DM-RAID: Fix RAID10's check for sufficient redundancy
@ 2013-01-10  2:00 Jonathan Brassow
  2013-01-10  2:02 ` Jonathan Brassow
  2013-01-23  3:42 ` [PATCH - for v3.7] " Jonathan Brassow
  0 siblings, 2 replies; 7+ messages in thread
From: Jonathan Brassow @ 2013-01-10  2:00 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, agk, jbrassow

DM RAID:  Fix RAID10's check for sufficient redundancy

Before attempting to activate a RAID array, it is checked for sufficient
redundancy.  That is, we make sure that there are not too many failed
devices - or devices specified for rebuild - to undermine our ability to
activate the array.  The current code performs this check twice - once to
ensure there are not too many devices specified for rebuild by the user
('validate_rebuild_devices') and again after possibly experiencing a failure
to read the superblock ('analyse_superblocks').  Neither of these checks is
sufficient.  The first check is done properly but with insufficient
information about the possible failure state of the devices to make a good
determination of whether the array can be activated.  The second check is
simply done wrong in the case of RAID10 because it doesn't account for the
independence of the stripes (i.e. mirror sets).  The solution is to use the
properly written check ('validate_rebuild_devices'), but perform the check
after the superblocks have been read and we know which devices have failed.
This gives us one check instead of two and performs it in a location where
it can be done right.
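
To make the RAID10 difference concrete, here is a minimal user-space sketch
of the two counting strategies (the helper names, the failed[] array and the
4-device example are illustrative assumptions, not dm-raid code; the per-set
loop mirrors the "near"-layout walk performed by the reworked check):

#include <stdio.h>

/* Old, flat check: allow at most copies - 1 failures in the whole array. */
static int flat_check(const int *failed, int raid_disks, int copies)
{
	int i, cnt = 0;

	for (i = 0; i < raid_disks; i++)
		cnt += !!failed[i];
	return cnt <= copies - 1;		/* nonzero = may activate */
}

/* New, per-mirror-set check: walk the devices in groups of 'copies'
 * ("near" places consecutive devices in the same mirror set) and refuse
 * only if a single set has lost all of its copies. */
static int per_set_check(const int *failed, int raid_disks, int copies)
{
	int i, d, per_group = 0;

	for (i = 0; i < raid_disks * copies; i++) {
		d = i % raid_disks;
		if (failed[d] && ++per_group >= copies)
			return 0;		/* whole mirror set gone */
		if (!((i + 1) % copies))
			per_group = 0;		/* next mirror set */
	}
	return 1;
}

int main(void)
{
	/* 2 copies, 4 devices: mirror sets {0,1} and {2,3}; one device has
	 * failed in each set, so every stripe still has a surviving copy. */
	int failed[4] = { 0, 1, 0, 1 };

	printf("flat check:    %s\n", flat_check(failed, 4, 2) ? "activate" : "refuse");
	printf("per-set check: %s\n", per_set_check(failed, 4, 2) ? "activate" : "refuse");
	return 0;
}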

Only RAID10 was affected, in the following ways:
- the code did not properly catch the condition where a user specified
  a device for rebuild that already had a failed device in the same mirror
  set.  (This condition would, however, be caught at a deeper level in MD.)
- the code triggered a false positive and denied activation when devices in
  independent mirror sets had failed - counting the failures as though they
  were all in the same set.

The most likely place this error was introduced (or where this fix should
have been included) is commit 4ec1e369, first introduced in v3.7-rc1.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>

Index: linux-upstream/drivers/md/dm-raid.c
===================================================================
--- linux-upstream.orig/drivers/md/dm-raid.c
+++ linux-upstream/drivers/md/dm-raid.c
@@ -369,25 +369,23 @@ static int validate_region_size(struct r
 }
 
 /*
- * validate_rebuild_devices
+ * validate_raid_redundancy
  * @rs
  *
- * Determine if the devices specified for rebuild can result in a valid
- * usable array that is capable of rebuilding the given devices.
+ * Determine if there are enough devices in the array that haven't
+ * failed (or are being rebuilt) to form a usable array.
  *
  * Returns: 0 on success, -EINVAL on failure.
  */
-static int validate_rebuild_devices(struct raid_set *rs)
+static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
 	unsigned rebuilds_per_group, copies, d;
 	unsigned group_size, last_group_start;
 
-	if (!(rs->print_flags & DMPF_REBUILD))
-		return 0;
-
 	for (i = 0; i < rs->md.raid_disks; i++)
-		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
+		    !rs->dev[i].rdev.sb_page)
 			rebuild_cnt++;
 
 	switch (rs->raid_type->level) {
@@ -424,7 +422,8 @@ static int validate_rebuild_devices(stru
 		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
 			for (i = 0; i < rs->md.raid_disks * copies; i++) {
 				d = i % rs->md.raid_disks;
-				if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
+				if ((!rs->dev[d].rdev.sb_page ||
+				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
 				    (++rebuilds_per_group >= copies))
 					goto too_many;
 				if (!((i + 1) % copies))
@@ -451,22 +450,20 @@ static int validate_rebuild_devices(stru
 		for (i = 0; i < rs->md.raid_disks; i++) {
 			if (!(i % copies) && !(i > last_group_start))
 				rebuilds_per_group = 0;
-			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+			if ((!rs->dev[i].rdev.sb_page ||
+			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 			    (++rebuilds_per_group >= copies))
 					goto too_many;
 		}
 		break;
 	default:
-		DMERR("The rebuild parameter is not supported for %s",
-		      rs->raid_type->name);
-		rs->ti->error = "Rebuild not supported for this RAID type";
-		return -EINVAL;
+		if (rebuild_cnt)
+			return -EINVAL;
 	}
 
 	return 0;
 
 too_many:
-	rs->ti->error = "Too many rebuild devices specified";
 	return -EINVAL;
 }
 
@@ -728,9 +725,6 @@ static int parse_raid_params(struct raid
 	}
 	rs->md.dev_sectors = sectors_per_dev;
 
-	if (validate_rebuild_devices(rs))
-		return -EINVAL;
-
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -1072,28 +1066,10 @@ static int super_validate(struct mddev *
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
 	int ret;
-	unsigned redundancy = 0;
 	struct raid_dev *dev;
 	struct md_rdev *rdev, *tmp, *freshest;
 	struct mddev *mddev = &rs->md;
 
-	switch (rs->raid_type->level) {
-	case 1:
-		redundancy = rs->md.raid_disks - 1;
-		break;
-	case 4:
-	case 5:
-	case 6:
-		redundancy = rs->raid_type->parity_devs;
-		break;
-	case 10:
-		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
-		break;
-	default:
-		ti->error = "Unknown RAID type";
-		return -EINVAL;
-	}
-
 	freshest = NULL;
 	rdev_for_each_safe(rdev, tmp, mddev) {
 		/*
@@ -1122,44 +1098,43 @@ static int analyse_superblocks(struct dm
 			break;
 		default:
 			dev = container_of(rdev, struct raid_dev, rdev);
-			if (redundancy--) {
-				if (dev->meta_dev)
-					dm_put_device(ti, dev->meta_dev);
-
-				dev->meta_dev = NULL;
-				rdev->meta_bdev = NULL;
+			if (dev->meta_dev)
+				dm_put_device(ti, dev->meta_dev);
 
-				if (rdev->sb_page)
-					put_page(rdev->sb_page);
+			dev->meta_dev = NULL;
+			rdev->meta_bdev = NULL;
 
-				rdev->sb_page = NULL;
+			if (rdev->sb_page)
+				put_page(rdev->sb_page);
 
-				rdev->sb_loaded = 0;
+			rdev->sb_page = NULL;
 
-				/*
-				 * We might be able to salvage the data device
-				 * even though the meta device has failed.  For
-				 * now, we behave as though '- -' had been
-				 * set for this device in the table.
-				 */
-				if (dev->data_dev)
-					dm_put_device(ti, dev->data_dev);
+			rdev->sb_loaded = 0;
 
-				dev->data_dev = NULL;
-				rdev->bdev = NULL;
+			/*
+			 * We might be able to salvage the data device
+			 * even though the meta device has failed.  For
+			 * now, we behave as though '- -' had been
+			 * set for this device in the table.
+			 */
+			if (dev->data_dev)
+				dm_put_device(ti, dev->data_dev);
 
-				list_del(&rdev->same_set);
+			dev->data_dev = NULL;
+			rdev->bdev = NULL;
 
-				continue;
-			}
-			ti->error = "Failed to load superblock";
-			return ret;
+			list_del(&rdev->same_set);
 		}
 	}
 
 	if (!freshest)
 		return 0;
 
+	if (validate_raid_redundancy(rs)) {
+		rs->ti->error = "Insufficient redundancy to activate array";
+		return -EINVAL;
+	}
+
 	/*
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
===================================================================
--- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
+++ linux-upstream/Documentation/device-mapper/dm-raid.txt
@@ -171,3 +171,4 @@ Version History
 1.3.1	Allow device replacement/rebuild for RAID 10
 1.4.0	Non-functional change.  Removes arg from mapping function.
 1.4.1   Add RAID10 "far" and "offset" algorithm support.
+1.4.2   Fix/improve redundancy checking for RAID10




* Re: [PATCH] DM-RAID: Fix RAID10's check for sufficient redundancy
  2013-01-10  2:00 [PATCH] DM-RAID: Fix RAID10's check for sufficient redundancy Jonathan Brassow
@ 2013-01-10  2:02 ` Jonathan Brassow
  2013-01-10  6:03   ` NeilBrown
  2013-01-23  3:42 ` [PATCH - for v3.7] " Jonathan Brassow
  1 sibling, 1 reply; 7+ messages in thread
From: Jonathan Brassow @ 2013-01-10  2:02 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, agk, jbrassow


On Wed, 2013-01-09 at 20:00 -0600, Jonathan Brassow wrote:
> DM RAID:  Fix RAID10's check for sufficient redundancy
> 
> Before attempting to activate a RAID array, it is checked for sufficient
> redundancy.  That is, we make sure that there are not too many failed
> devices - or devices specified for rebuild - to undermine our ability to
> activate the array.  The current code performs this check twice - once to
> ensure there were not too many devices specified for rebuild by the user
> ('validate_rebuild_devices') and again after possibly experiencing a failure
> to read the superblock ('analyse_superblocks').  Neither of these checks are
> sufficient.  The first check is done properly but with insufficient
> information about the possible failure state of the devices to make a good
> determination if the array can be activated.  The second check is simply
> done wrong in the case of RAID10 because it doesn't account for the
> independence of the stripes (i.e. mirror sets).  The solution is to use the
> properly written check ('validate_rebuild_devices'), but perform the check
> after the superblocks have been read and we know which devices have failed.
> This gives us one check instead of two and performs it in a location where
> it can be done right.
> 
> Only RAID10 was affected and it was affected in the following ways:
> - the code did not properly catch the condition where a user specified
>   a device for rebuild that already had a failed device in the same mirror
>   set.  (This condition would, however, be caught at a deeper level in MD.)
> - the code triggers a false positive and denies activation when devices in
>   independent mirror sets have failed - counting the failures as though they
>   were all in the same set.
> 
> The most likely place this error was introduced (or this patch should have
> been included) is in commit 4ec1e369 - first introduced in v3.7-rc1.
> 
> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>

Neil,

This patch should apply cleanly on top of recent changes, but will not
apply cleanly on an older kernel (like 3.7) due to version number changes and
the introduction of the new 'far' and 'offset' RAID10 algorithms.  If you
think this fix should be pushed back into 3.7 rather than just being applied
on top of the latest code, I will make a patch for 3.7 - although I'm not
certain how I'd handle the version number conflict.

Thanks,
 brassow




* Re: [PATCH] DM-RAID: Fix RAID10's check for sufficient redundancy
  2013-01-10  2:02 ` Jonathan Brassow
@ 2013-01-10  6:03   ` NeilBrown
  2013-01-10 15:02     ` Brassow Jonathan
  0 siblings, 1 reply; 7+ messages in thread
From: NeilBrown @ 2013-01-10  6:03 UTC (permalink / raw)
  To: Jonathan Brassow; +Cc: linux-raid, agk


On Wed, 09 Jan 2013 20:02:32 -0600 Jonathan Brassow <jbrassow@redhat.com>
wrote:

> 
> On Wed, 2013-01-09 at 20:00 -0600, Jonathan Brassow wrote:
> > DM RAID:  Fix RAID10's check for sufficient redundancy
> > 
> > Before attempting to activate a RAID array, it is checked for sufficient
> > redundancy.  That is, we make sure that there are not too many failed
> > devices - or devices specified for rebuild - to undermine our ability to
> > activate the array.  The current code performs this check twice - once to
> > ensure there were not too many devices specified for rebuild by the user
> > ('validate_rebuild_devices') and again after possibly experiencing a failure
> > to read the superblock ('analyse_superblocks').  Neither of these checks are
> > sufficient.  The first check is done properly but with insufficient
> > information about the possible failure state of the devices to make a good
> > determination if the array can be activated.  The second check is simply
> > done wrong in the case of RAID10 because it doesn't account for the
> > independence of the stripes (i.e. mirror sets).  The solution is to use the
> > properly written check ('validate_rebuild_devices'), but perform the check
> > after the superblocks have been read and we know which devices have failed.
> > This gives us one check instead of two and performs it in a location where
> > it can be done right.
> > 
> > Only RAID10 was affected and it was affected in the following ways:
> > - the code did not properly catch the condition where a user specified
> >   a device for rebuild that already had a failed device in the same mirror
> >   set.  (This condition would, however, be caught at a deeper level in MD.)
> > - the code triggers a false positive and denies activation when devices in
> >   independent mirror sets have failed - counting the failures as though they
> >   were all in the same set.
> > 
> > The most likely place this error was introduced (or this patch should have
> > been included) is in commit 4ec1e369 - first introduced in v3.7-rc1.
> > 
> > Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
> 
> Neil,
> 
> This patch should apply cleanly on top of recent changes, but will not
> apply cleanly on an older kernel (like 3.7) due to version # changes and
> introduction of the new 'far' and 'offset' RAID10 algorithms.  If you
> think this fix should be pushed back into 3.7 rather than just applying
> on the latest code, I will make a patch for 3.7 - although I'm not
> certain how I'd handle the version number conflict.

Thanks Jon, I've applied that patch.

I can't say if it should be back-ported.  What is the worst-case scenario if
something goes wrong?  Does the current user-space code contain appropriate
tests itself too?

I don't see how version numbers will be a concern - they are just comments in
a 'Documentation' file, aren't they?  If necessary, add extra text to explain
any possible confusion.

NeilBrown



* Re: [PATCH] DM-RAID: Fix RAID10's check for sufficient redundancy
  2013-01-10  6:03   ` NeilBrown
@ 2013-01-10 15:02     ` Brassow Jonathan
  2013-01-22  5:52       ` NeilBrown
  0 siblings, 1 reply; 7+ messages in thread
From: Brassow Jonathan @ 2013-01-10 15:02 UTC (permalink / raw)
  To: NeilBrown
  Cc: linux-raid@vger.kernel.org Raid, Alasdair Kergon, Jonathan Brassow


On Jan 10, 2013, at 12:03 AM, NeilBrown wrote:

> On Wed, 09 Jan 2013 20:02:32 -0600 Jonathan Brassow <jbrassow@redhat.com>
> wrote:
> 
>> 
>> On Wed, 2013-01-09 at 20:00 -0600, Jonathan Brassow wrote:
>>> DM RAID:  Fix RAID10's check for sufficient redundancy
>>> 
>>> Before attempting to activate a RAID array, it is checked for sufficient
>>> redundancy.  That is, we make sure that there are not too many failed
>>> devices - or devices specified for rebuild - to undermine our ability to
>>> activate the array.  The current code performs this check twice - once to
>>> ensure there were not too many devices specified for rebuild by the user
>>> ('validate_rebuild_devices') and again after possibly experiencing a failure
>>> to read the superblock ('analyse_superblocks').  Neither of these checks are
>>> sufficient.  The first check is done properly but with insufficient
>>> information about the possible failure state of the devices to make a good
>>> determination if the array can be activated.  The second check is simply
>>> done wrong in the case of RAID10 because it doesn't account for the
>>> independence of the stripes (i.e. mirror sets).  The solution is to use the
>>> properly written check ('validate_rebuild_devices'), but perform the check
>>> after the superblocks have been read and we know which devices have failed.
>>> This gives us one check instead of two and performs it in a location where
>>> it can be done right.
>>> 
>>> Only RAID10 was affected and it was affected in the following ways:
>>> - the code did not properly catch the condition where a user specified
>>>  a device for rebuild that already had a failed device in the same mirror
>>>  set.  (This condition would, however, be caught at a deeper level in MD.)
>>> - the code triggers a false positive and denies activation when devices in
>>>  independent mirror sets have failed - counting the failures as though they
>>>  were all in the same set.
>>> 
>>> The most likely place this error was introduced (or this patch should have
>>> been included) is in commit 4ec1e369 - first introduced in v3.7-rc1.
>>> 
>>> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
>> 
>> Neil,
>> 
>> This patch should apply cleanly on top of recent changes, but will not
>> apply cleanly on an older kernel (like 3.7) due to version # changes and
>> introduction of the new 'far' and 'offset' RAID10 algorithms.  If you
>> think this fix should be pushed back into 3.7 rather than just applying
>> on the latest code, I will make a patch for 3.7 - although I'm not
>> certain how I'd handle the version number conflict.
> 
> Thanks Jon, I've applied that patch.
> 
> I can't say if it should be back-ported.  What is the worst-case scenario if
> something goes wrong?  Do the current user-space code contain appropriate
> tests itself too?
> 
> I don't see how version numbers will be a concern - they are just comments in
> a 'Documentation' file aren't they?   If necessary, add extra text to explain
> any possible confusion.

The first condition mentioned above isn't that big of a problem since MD will
catch the error deeper down.  The second problem can be pretty bad, but is
somewhat limited.  The array will run as you would expect while it is active.
It is only when you attempt to re-activate or repair the array that the
problem would occur.  In that case, it would only activate if there are
'copies - 1' or fewer failed devices present.  This doesn't count any spares
that may have been introduced.  So, if you had a 2-stripe, 2-copy array
(i.e. 'AA BB') and 2 of the devices failed ('A- B-'), you would not be able to
reactivate the array unless you replaced at least one of the devices with a
spare (e.g. 'A- BS').  There is no corruption or anything, but losing more
than one device and not having any spares would leave you in a pickle.
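
As a rough illustration of that scenario (a simplified model rather than the
driver code - the device indices and failure patterns below are assumptions
for the 'AA BB' layout described above):

#include <stdio.h>

int main(void)
{
	/* 'AA BB': two mirror sets of two copies each, devices 0-3.  The old
	 * check refused activation whenever more than copies - 1 = 1 device
	 * was missing, no matter which set the failures were in; the fixed
	 * check refuses only when one set has lost both of its copies. */
	const char *pattern[] = { "AA BB", "A- BB", "A- B-", "-- BB" };
	const int failed[][4] = {
		{ 0, 0, 0, 0 },		/* nothing failed              */
		{ 0, 1, 0, 0 },		/* one copy of set A failed    */
		{ 0, 1, 0, 1 },		/* one copy failed in each set */
		{ 1, 1, 0, 0 },		/* set A completely gone       */
	};
	const int copies = 2, raid_disks = 4;

	for (int p = 0; p < 4; p++) {
		int total = 0, worst_set = 0;

		for (int s = 0; s < raid_disks; s += copies) {
			int in_set = 0;

			for (int c = 0; c < copies; c++)
				in_set += failed[p][s + c];
			if (in_set > worst_set)
				worst_set = in_set;
			total += in_set;
		}
		printf("%s  old check: %-9s  fixed check: %s\n", pattern[p],
		       total <= copies - 1 ? "activates" : "refuses",
		       worst_set < copies  ? "activates" : "refuses");
	}
	return 0;
}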

The version number is tricky because we use it to determine what features a
target has.  Testing of a particular feature keys off this number rather than
the kernel version.  I will have unit tests to make sure this bug is fixed and
does not resurface, but they will only run if the version is 1.4.2.  I can't
assign 1.4.2 to the 3.7 kernel because that also implies that the 'far' and
'offset' algorithms are there (introduced in 1.4.1).  Tests for those
algorithms would then fail because the feature isn't actually there.  What I
could do is choose 1.3.2 (one more than the 3.7 version number, 1.3.1).  This
would signal the fix but avoid implying Mikulas' changes or the new
algorithms.  Then I would simply test for
'(version == 1.3.2) || (version > 1.4.1)' for this particular unit test.  For
the average bloke, things would just work the way they expect.
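
A sketch of that gating logic (the helper name and calling convention are
assumptions for a test harness, not part of dm-raid itself):

#include <stdio.h>

/* Run the RAID10 redundancy test only on dm-raid target versions known to
 * carry this fix: exactly 1.3.2 (the 3.7-stable backport) or anything
 * newer than 1.4.1. */
static int has_raid10_redundancy_fix(unsigned major, unsigned minor,
				     unsigned patch)
{
	if (major == 1 && minor == 3 && patch == 2)
		return 1;				/* 3.7.y backport */
	if (major > 1)
		return 1;
	return minor > 4 || (minor == 4 && patch > 1);	/* > 1.4.1 */
}

int main(void)
{
	printf("1.3.1: %d  1.3.2: %d  1.4.1: %d  1.4.2: %d\n",
	       has_raid10_redundancy_fix(1, 3, 1),
	       has_raid10_redundancy_fix(1, 3, 2),
	       has_raid10_redundancy_fix(1, 4, 1),
	       has_raid10_redundancy_fix(1, 4, 2));
	return 0;
}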

 brassow



* Re: [PATCH] DM-RAID: Fix RAID10's check for sufficient redundancy
  2013-01-10 15:02     ` Brassow Jonathan
@ 2013-01-22  5:52       ` NeilBrown
  0 siblings, 0 replies; 7+ messages in thread
From: NeilBrown @ 2013-01-22  5:52 UTC (permalink / raw)
  To: Brassow Jonathan; +Cc: linux-raid@vger.kernel.org Raid, Alasdair Kergon


On Thu, 10 Jan 2013 09:02:36 -0600 Brassow Jonathan <jbrassow@redhat.com>
wrote:

> 
> On Jan 10, 2013, at 12:03 AM, NeilBrown wrote:
> 
> > On Wed, 09 Jan 2013 20:02:32 -0600 Jonathan Brassow <jbrassow@redhat.com>
> > wrote:
> > 
> >> 
> >> On Wed, 2013-01-09 at 20:00 -0600, Jonathan Brassow wrote:
> >>> DM RAID:  Fix RAID10's check for sufficient redundancy
> >>> 
> >>> Before attempting to activate a RAID array, it is checked for sufficient
> >>> redundancy.  That is, we make sure that there are not too many failed
> >>> devices - or devices specified for rebuild - to undermine our ability to
> >>> activate the array.  The current code performs this check twice - once to
> >>> ensure there were not too many devices specified for rebuild by the user
> >>> ('validate_rebuild_devices') and again after possibly experiencing a failure
> >>> to read the superblock ('analyse_superblocks').  Neither of these checks are
> >>> sufficient.  The first check is done properly but with insufficient
> >>> information about the possible failure state of the devices to make a good
> >>> determination if the array can be activated.  The second check is simply
> >>> done wrong in the case of RAID10 because it doesn't account for the
> >>> independence of the stripes (i.e. mirror sets).  The solution is to use the
> >>> properly written check ('validate_rebuild_devices'), but perform the check
> >>> after the superblocks have been read and we know which devices have failed.
> >>> This gives us one check instead of two and performs it in a location where
> >>> it can be done right.
> >>> 
> >>> Only RAID10 was affected and it was affected in the following ways:
> >>> - the code did not properly catch the condition where a user specified
> >>>  a device for rebuild that already had a failed device in the same mirror
> >>>  set.  (This condition would, however, be caught at a deeper level in MD.)
> >>> - the code triggers a false positive and denies activation when devices in
> >>>  independent mirror sets have failed - counting the failures as though they
> >>>  were all in the same set.
> >>> 
> >>> The most likely place this error was introduced (or this patch should have
> >>> been included) is in commit 4ec1e369 - first introduced in v3.7-rc1.
> >>> 
> >>> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
> >> 
> >> Neil,
> >> 
> >> This patch should apply cleanly on top of recent changes, but will not
> >> apply cleanly on an older kernel (like 3.7) due to version # changes and
> >> introduction of the new 'far' and 'offset' RAID10 algorithms.  If you
> >> think this fix should be pushed back into 3.7 rather than just applying
> >> on the latest code, I will make a patch for 3.7 - although I'm not
> >> certain how I'd handle the version number conflict.
> > 
> > Thanks Jon, I've applied that patch.
> > 
> > I can't say if it should be back-ported.  What is the worst-case scenario if
> > something goes wrong?  Do the current user-space code contain appropriate
> > tests itself too?
> > 
> > I don't see how version numbers will be a concern - they are just comments in
> > a 'Documentation' file aren't they?   If necessary, add extra text to explain
> > any possible confusion.
> 
> The first condition mentioned above isn't that big of a problem since MD will catch the error deeper down.  The second problem can be pretty bad, but is somewhat limited.  The array will run as you would expect while it is active.  It is only when you attempt to re-activate or repair the array that the problem would occur.  In that case, it would only activate if there are 'copies - 1' or less failed devices present.  This doesn't count any spares that may have been introduced.  So, if you had a 2-stripe, 2-copy array (i.e. 'AA BB') and 2 of the devices failed ('A- B-'), you would not be able to reactivate the array unless you replaced at least one of the devices with a spare (e.g. 'A- BS').  There is no corruption or anything, but loosing more than one device and not having any spares would leave you in a pickle.

Yes, sounds like that is worth backporting.  Care to write a patch?

> 
> The version number is tricky because we use it to determine what features a target has.  Testing of a particular feature keys off of this number rather than the kernel version.  I will have unit tests to make sure this bug is fixed and does not resurface, but it will only run if version = 1.4.2.  I can't assign the 3.7 kernel with 1.4.2 because that also implies that the 'far' and 'offset' algorithms are there (introduced in 1.4.1).  Tests for those algorithms would then fail because the feature isn't actually there.  What I could do is choose 1.3.2 (one more than the 3.7 version number, 1.3.1).  This would signal the fix but avoid implying Mikulas' changes or the new algorithms.  Then I would simply test for '(version == 1.3.2) || (version > 1.4.1)' for this particular unit test.  For the average bloke, things would just work the way they expect.

Why not test for (version >= 1.3.2) ?? Or doesn't version comparison work
that way?

NeilBrown


> 
>  brassow




* [PATCH - for v3.7] DM-RAID: Fix RAID10's check for sufficient redundancy
  2013-01-10  2:00 [PATCH] DM-RAID: Fix RAID10's check for sufficient redundancy Jonathan Brassow
  2013-01-10  2:02 ` Jonathan Brassow
@ 2013-01-23  3:42 ` Jonathan Brassow
  2013-01-24  1:17   ` NeilBrown
  1 sibling, 1 reply; 7+ messages in thread
From: Jonathan Brassow @ 2013-01-23  3:42 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, agk, jbrassow

Neil,

 Here is the RAID10 redundancy check fix backported for linux-3.7

Thanks,
 brassow


DM RAID:  Fix RAID10's check for sufficient redundancy

Before attempting to activate a RAID array, it is checked for sufficient
redundancy.  That is, we make sure that there are not too many failed
devices - or devices specified for rebuild - to undermine our ability to
activate the array.  The current code performs this check twice - once to
ensure there are not too many devices specified for rebuild by the user
('validate_rebuild_devices') and again after possibly experiencing a failure
to read the superblock ('analyse_superblocks').  Neither of these checks is
sufficient.  The first check is done properly but with insufficient
information about the possible failure state of the devices to make a good
determination of whether the array can be activated.  The second check is
simply done wrong in the case of RAID10 because it doesn't account for the
independence of the stripes (i.e. mirror sets).  The solution is to use the
properly written check ('validate_rebuild_devices'), but perform the check
after the superblocks have been read and we know which devices have failed.
This gives us one check instead of two and performs it in a location where
it can be done right.

Only RAID10 was affected, in the following ways:
- the code did not properly catch the condition where a user specified
  a device for rebuild that already had a failed device in the same mirror
  set.  (This condition would, however, be caught at a deeper level in MD.)
- the code triggered a false positive and denied activation when devices in
  independent mirror sets had failed - counting the failures as though they
  were all in the same set.

The most likely place this error was introduced (or where this fix should
have been included) is commit 4ec1e369, first introduced in v3.7-rc1.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
===================================================================
--- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
+++ linux-upstream/Documentation/device-mapper/dm-raid.txt
@@ -141,3 +141,4 @@ Version History
 1.2.0	Handle creation of arrays that contain failed devices.
 1.3.0	Added support for RAID 10
 1.3.1	Allow device replacement/rebuild for RAID 10
+1.3.2   Fix/improve redundancy checking for RAID10
Index: linux-upstream/drivers/md/dm-raid.c
===================================================================
--- linux-upstream.orig/drivers/md/dm-raid.c
+++ linux-upstream/drivers/md/dm-raid.c
@@ -338,24 +338,22 @@ static int validate_region_size(struct r
 }
 
 /*
- * validate_rebuild_devices
+ * validate_raid_redundancy
  * @rs
  *
- * Determine if the devices specified for rebuild can result in a valid
- * usable array that is capable of rebuilding the given devices.
+ * Determine if there are enough devices in the array that haven't
+ * failed (or are being rebuilt) to form a usable array.
  *
  * Returns: 0 on success, -EINVAL on failure.
  */
-static int validate_rebuild_devices(struct raid_set *rs)
+static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
 	unsigned rebuilds_per_group, copies, d;
 
-	if (!(rs->print_flags & DMPF_REBUILD))
-		return 0;
-
 	for (i = 0; i < rs->md.raid_disks; i++)
-		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
+		    !rs->dev[i].rdev.sb_page)
 			rebuild_cnt++;
 
 	switch (rs->raid_type->level) {
@@ -391,27 +389,24 @@ static int validate_rebuild_devices(stru
 		 *          A    A    B    B    C
 		 *          C    D    D    E    E
 		 */
-		rebuilds_per_group = 0;
 		for (i = 0; i < rs->md.raid_disks * copies; i++) {
+			if (!(i % copies))
+				rebuilds_per_group = 0;
 			d = i % rs->md.raid_disks;
-			if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
+			if ((!rs->dev[d].rdev.sb_page ||
+			     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
 			    (++rebuilds_per_group >= copies))
 				goto too_many;
-			if (!((i + 1) % copies))
-				rebuilds_per_group = 0;
 		}
 		break;
 	default:
-		DMERR("The rebuild parameter is not supported for %s",
-		      rs->raid_type->name);
-		rs->ti->error = "Rebuild not supported for this RAID type";
-		return -EINVAL;
+		if (rebuild_cnt)
+			return -EINVAL;
 	}
 
 	return 0;
 
 too_many:
-	rs->ti->error = "Too many rebuild devices specified";
 	return -EINVAL;
 }
 
@@ -662,9 +657,6 @@ static int parse_raid_params(struct raid
 	}
 	rs->md.dev_sectors = sectors_per_dev;
 
-	if (validate_rebuild_devices(rs))
-		return -EINVAL;
-
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -993,28 +985,10 @@ static int super_validate(struct mddev *
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
 	int ret;
-	unsigned redundancy = 0;
 	struct raid_dev *dev;
 	struct md_rdev *rdev, *tmp, *freshest;
 	struct mddev *mddev = &rs->md;
 
-	switch (rs->raid_type->level) {
-	case 1:
-		redundancy = rs->md.raid_disks - 1;
-		break;
-	case 4:
-	case 5:
-	case 6:
-		redundancy = rs->raid_type->parity_devs;
-		break;
-	case 10:
-		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
-		break;
-	default:
-		ti->error = "Unknown RAID type";
-		return -EINVAL;
-	}
-
 	freshest = NULL;
 	rdev_for_each_safe(rdev, tmp, mddev) {
 		/*
@@ -1043,44 +1017,43 @@ static int analyse_superblocks(struct dm
 			break;
 		default:
 			dev = container_of(rdev, struct raid_dev, rdev);
-			if (redundancy--) {
-				if (dev->meta_dev)
-					dm_put_device(ti, dev->meta_dev);
+			if (dev->meta_dev)
+				dm_put_device(ti, dev->meta_dev);
 
-				dev->meta_dev = NULL;
-				rdev->meta_bdev = NULL;
+			dev->meta_dev = NULL;
+			rdev->meta_bdev = NULL;
 
-				if (rdev->sb_page)
-					put_page(rdev->sb_page);
+			if (rdev->sb_page)
+				put_page(rdev->sb_page);
 
-				rdev->sb_page = NULL;
+			rdev->sb_page = NULL;
 
-				rdev->sb_loaded = 0;
+			rdev->sb_loaded = 0;
 
-				/*
-				 * We might be able to salvage the data device
-				 * even though the meta device has failed.  For
-				 * now, we behave as though '- -' had been
-				 * set for this device in the table.
-				 */
-				if (dev->data_dev)
-					dm_put_device(ti, dev->data_dev);
-
-				dev->data_dev = NULL;
-				rdev->bdev = NULL;
+			/*
+			 * We might be able to salvage the data device
+			 * even though the meta device has failed.  For
+			 * now, we behave as though '- -' had been
+			 * set for this device in the table.
+			 */
+			if (dev->data_dev)
+				dm_put_device(ti, dev->data_dev);
 
-				list_del(&rdev->same_set);
+			dev->data_dev = NULL;
+			rdev->bdev = NULL;
 
-				continue;
-			}
-			ti->error = "Failed to load superblock";
-			return ret;
+			list_del(&rdev->same_set);
 		}
 	}
 
 	if (!freshest)
 		return 0;
 
+	if (validate_raid_redundancy(rs)) {
+		rs->ti->error = "Insufficient redundancy to activate array";
+		return -EINVAL;
+	}
+
 	/*
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
@@ -1430,7 +1403,7 @@ static void raid_resume(struct dm_target
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 3, 1},
+	.version = {1, 3, 2},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,




* Re: [PATCH - for v3.7] DM-RAID: Fix RAID10's check for sufficient redundancy
  2013-01-23  3:42 ` [PATCH - for v3.7] " Jonathan Brassow
@ 2013-01-24  1:17   ` NeilBrown
  0 siblings, 0 replies; 7+ messages in thread
From: NeilBrown @ 2013-01-24  1:17 UTC (permalink / raw)
  To: Jonathan Brassow; +Cc: linux-raid, agk


On Tue, 22 Jan 2013 21:42:18 -0600 Jonathan Brassow <jbrassow@redhat.com>
wrote:

> Neil,
> 
>  Here is the RAID10 redundancy check fix backported for linux-3.7

Thanks.

So this goes into 3.7.y and bumps the dm-raid version to 1.3.2

It should also go into 3.8-rc and bump the dm-raid version there to 1.4.1

Then "Add support for MD's RAID10 "far" and "offset" algorithms" which is due
to go into 3.9-rc should bump the dm-raid version to 1.4.2.

Is that right?

I've re-arranged the patches in my for-next to match this - please check I
didn't mess anything up.  Then I'll send them off.

Thanks,
NeilBrown


> 
> Thanks,
>  brassow
> 
> 
> DM RAID:  Fix RAID10's check for sufficient redundancy
> 
> Before attempting to activate a RAID array, it is checked for sufficient
> redundancy.  That is, we make sure that there are not too many failed
> devices - or devices specified for rebuild - to undermine our ability to
> activate the array.  The current code performs this check twice - once to
> ensure there were not too many devices specified for rebuild by the user
> ('validate_rebuild_devices') and again after possibly experiencing a failure
> to read the superblock ('analyse_superblocks').  Neither of these checks are
> sufficient.  The first check is done properly but with insufficient
> information about the possible failure state of the devices to make a good
> determination if the array can be activated.  The second check is simply
> done wrong in the case of RAID10 because it doesn't account for the
> independence of the stripes (i.e. mirror sets).  The solution is to use the
> properly written check ('validate_rebuild_devices'), but perform the check
> after the superblocks have been read and we know which devices have failed.
> This gives us one check instead of two and performs it in a location where
> it can be done right.
> 
> Only RAID10 was affected and it was affected in the following ways:
> - the code did not properly catch the condition where a user specified
>   a device for rebuild that already had a failed device in the same mirror
>   set.  (This condition would, however, be caught at a deeper level in MD.)
> - the code triggers a false positive and denies activation when devices in
>   independent mirror sets have failed - counting the failures as though they
>   were all in the same set.
> 
> The most likely place this error was introduced (or this patch should have
> been included) is in commit 4ec1e369 - first introduced in v3.7-rc1.
> 
> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
> Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
> ===================================================================
> --- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
> +++ linux-upstream/Documentation/device-mapper/dm-raid.txt
> @@ -141,3 +141,4 @@ Version History
>  1.2.0	Handle creation of arrays that contain failed devices.
>  1.3.0	Added support for RAID 10
>  1.3.1	Allow device replacement/rebuild for RAID 10
> +1.3.2   Fix/improve redundancy checking for RAID10
> Index: linux-upstream/drivers/md/dm-raid.c
> ===================================================================
> --- linux-upstream.orig/drivers/md/dm-raid.c
> +++ linux-upstream/drivers/md/dm-raid.c
> @@ -338,24 +338,22 @@ static int validate_region_size(struct r
>  }
>  
>  /*
> - * validate_rebuild_devices
> + * validate_raid_redundancy
>   * @rs
>   *
> - * Determine if the devices specified for rebuild can result in a valid
> - * usable array that is capable of rebuilding the given devices.
> + * Determine if there are enough devices in the array that haven't
> + * failed (or are being rebuilt) to form a usable array.
>   *
>   * Returns: 0 on success, -EINVAL on failure.
>   */
> -static int validate_rebuild_devices(struct raid_set *rs)
> +static int validate_raid_redundancy(struct raid_set *rs)
>  {
>  	unsigned i, rebuild_cnt = 0;
>  	unsigned rebuilds_per_group, copies, d;
>  
> -	if (!(rs->print_flags & DMPF_REBUILD))
> -		return 0;
> -
>  	for (i = 0; i < rs->md.raid_disks; i++)
> -		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
> +		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
> +		    !rs->dev[i].rdev.sb_page)
>  			rebuild_cnt++;
>  
>  	switch (rs->raid_type->level) {
> @@ -391,27 +389,24 @@ static int validate_rebuild_devices(stru
>  		 *          A    A    B    B    C
>  		 *          C    D    D    E    E
>  		 */
> -		rebuilds_per_group = 0;
>  		for (i = 0; i < rs->md.raid_disks * copies; i++) {
> +			if (!(i % copies))
> +				rebuilds_per_group = 0;
>  			d = i % rs->md.raid_disks;
> -			if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
> +			if ((!rs->dev[d].rdev.sb_page ||
> +			     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
>  			    (++rebuilds_per_group >= copies))
>  				goto too_many;
> -			if (!((i + 1) % copies))
> -				rebuilds_per_group = 0;
>  		}
>  		break;
>  	default:
> -		DMERR("The rebuild parameter is not supported for %s",
> -		      rs->raid_type->name);
> -		rs->ti->error = "Rebuild not supported for this RAID type";
> -		return -EINVAL;
> +		if (rebuild_cnt)
> +			return -EINVAL;
>  	}
>  
>  	return 0;
>  
>  too_many:
> -	rs->ti->error = "Too many rebuild devices specified";
>  	return -EINVAL;
>  }
>  
> @@ -662,9 +657,6 @@ static int parse_raid_params(struct raid
>  	}
>  	rs->md.dev_sectors = sectors_per_dev;
>  
> -	if (validate_rebuild_devices(rs))
> -		return -EINVAL;
> -
>  	/* Assume there are no metadata devices until the drives are parsed */
>  	rs->md.persistent = 0;
>  	rs->md.external = 1;
> @@ -993,28 +985,10 @@ static int super_validate(struct mddev *
>  static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
>  {
>  	int ret;
> -	unsigned redundancy = 0;
>  	struct raid_dev *dev;
>  	struct md_rdev *rdev, *tmp, *freshest;
>  	struct mddev *mddev = &rs->md;
>  
> -	switch (rs->raid_type->level) {
> -	case 1:
> -		redundancy = rs->md.raid_disks - 1;
> -		break;
> -	case 4:
> -	case 5:
> -	case 6:
> -		redundancy = rs->raid_type->parity_devs;
> -		break;
> -	case 10:
> -		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
> -		break;
> -	default:
> -		ti->error = "Unknown RAID type";
> -		return -EINVAL;
> -	}
> -
>  	freshest = NULL;
>  	rdev_for_each_safe(rdev, tmp, mddev) {
>  		/*
> @@ -1043,44 +1017,43 @@ static int analyse_superblocks(struct dm
>  			break;
>  		default:
>  			dev = container_of(rdev, struct raid_dev, rdev);
> -			if (redundancy--) {
> -				if (dev->meta_dev)
> -					dm_put_device(ti, dev->meta_dev);
> +			if (dev->meta_dev)
> +				dm_put_device(ti, dev->meta_dev);
>  
> -				dev->meta_dev = NULL;
> -				rdev->meta_bdev = NULL;
> +			dev->meta_dev = NULL;
> +			rdev->meta_bdev = NULL;
>  
> -				if (rdev->sb_page)
> -					put_page(rdev->sb_page);
> +			if (rdev->sb_page)
> +				put_page(rdev->sb_page);
>  
> -				rdev->sb_page = NULL;
> +			rdev->sb_page = NULL;
>  
> -				rdev->sb_loaded = 0;
> +			rdev->sb_loaded = 0;
>  
> -				/*
> -				 * We might be able to salvage the data device
> -				 * even though the meta device has failed.  For
> -				 * now, we behave as though '- -' had been
> -				 * set for this device in the table.
> -				 */
> -				if (dev->data_dev)
> -					dm_put_device(ti, dev->data_dev);
> -
> -				dev->data_dev = NULL;
> -				rdev->bdev = NULL;
> +			/*
> +			 * We might be able to salvage the data device
> +			 * even though the meta device has failed.  For
> +			 * now, we behave as though '- -' had been
> +			 * set for this device in the table.
> +			 */
> +			if (dev->data_dev)
> +				dm_put_device(ti, dev->data_dev);
>  
> -				list_del(&rdev->same_set);
> +			dev->data_dev = NULL;
> +			rdev->bdev = NULL;
>  
> -				continue;
> -			}
> -			ti->error = "Failed to load superblock";
> -			return ret;
> +			list_del(&rdev->same_set);
>  		}
>  	}
>  
>  	if (!freshest)
>  		return 0;
>  
> +	if (validate_raid_redundancy(rs)) {
> +		rs->ti->error = "Insufficient redundancy to activate array";
> +		return -EINVAL;
> +	}
> +
>  	/*
>  	 * Validation of the freshest device provides the source of
>  	 * validation for the remaining devices.
> @@ -1430,7 +1403,7 @@ static void raid_resume(struct dm_target
>  
>  static struct target_type raid_target = {
>  	.name = "raid",
> -	.version = {1, 3, 1},
> +	.version = {1, 3, 2},
>  	.module = THIS_MODULE,
>  	.ctr = raid_ctr,
>  	.dtr = raid_dtr,
> 


