From: Jonathan Brassow <jbrassow@redhat.com>
To: dm-devel@redhat.com, linux-raid@vger.kernel.org
Cc: agk@redhat.com, neilb@suse.de
Subject: [PATCH v2] DM RAID: Add support for MD RAID10
Date: Wed, 11 Jul 2012 20:36:41 -0500
Message-ID: <1342057001.22214.6.camel@f16>

Neil,

I've changed the tunables to the way we discussed.  If it becomes
necessary to allow simultaneous near and far copies, then I will
likely add 'raid10_near|far|offset_copies' to complement the existing
'raid10_copies' arg.  Like you, I doubt they will be necessary though.
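
For reference, a table using the new arguments might look like this
(a sketch only - the device numbers and target length are made up):

  0 1953125000 raid \
	raid10 5 64 raid10_copies 2 raid10_format near \
	4 - 8:16 - 8:32 - 8:48 - 8:64

That is a 4-device array with 64-sector chunks and 2 'near' copies;
each device then holds (1953125000 * 2) / 4 = 976562500 sectors, which
is the divisibility check the constructor performs.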

I have yet to add the code that allows new devices to replace old/failed
devices (i.e. handles the 'rebuild' parameter).  That will be a future
patch.

 brassow

dm raid: add md raid10 support

Support the MD RAID10 personality through dm-raid.c

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>

Index: linux-upstream/drivers/md/dm-raid.c
===================================================================
--- linux-upstream.orig/drivers/md/dm-raid.c
+++ linux-upstream/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@
 #include "md.h"
 #include "raid1.h"
 #include "raid5.h"
+#include "raid10.h"
 #include "bitmap.h"
 
 #include <linux/device-mapper.h>
@@ -52,7 +53,10 @@ struct raid_dev {
 #define DMPF_MAX_RECOVERY_RATE 0x20
 #define DMPF_MAX_WRITE_BEHIND  0x40
 #define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0X100
+#define DMPF_REGION_SIZE       0x100
+#define DMPF_RAID10_COPIES     0x200
+#define DMPF_RAID10_FORMAT     0x400
+
 struct raid_set {
 	struct dm_target *ti;
 
@@ -76,6 +80,7 @@ static struct raid_type {
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
+	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, -1 /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
 	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
 	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -86,6 +91,41 @@ static struct raid_type {
 	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
 };
 
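+/*
+ * The MD raid10 layout is a packed word: bits 0-7 hold the number
+ * of 'near' copies, bits 8-15 the number of 'far' copies, and bit
+ * 16 flags the 'offset' variant (see the helpers below).
+ */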
+static char *raid10_md_layout_to_format(int layout)
+{
+	if (layout & 0x10000)
+		return "offset";
+
+	if ((layout & 0xFF) > 1)
+		return "near";
+
+	return "far";
+}
+
+static unsigned raid10_md_layout_to_copies(int layout)
+{
+	if ((layout & 0xFF) > 1)
+		return layout & 0xFF;
+	return (layout >> 8) & 0xFF;
+}
+
+static int raid10_format_to_md_layout(char *format, unsigned copies)
+{
+	unsigned n = 1, f = 1;
+
+	if (!strcmp("near", format))
+		n = copies;
+	else
+		f = copies;
+
+	return (!strcmp("offset", format) << 16) | (f << 8) | n;
+}
+
 static struct raid_type *get_raid_type(char *name)
 {
 	int i;
@@ -339,10 +374,16 @@ static int validate_region_size(struct r
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [region_size <sectors>]           Defines granularity of bitmap
+ *
+ * RAID10-only options:
+ *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
+ *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
+	char *raid10_format = "near";
+	unsigned raid10_copies = 2;
 	unsigned i, rebuild_cnt = 0;
 	unsigned long value, region_size = 0;
 	sector_t sectors_per_dev = rs->ti->len;
@@ -416,11 +457,30 @@ static int parse_raid_params(struct raid
 		}
 
 		key = argv[i++];
+
+		/* Parameters that take a string value are checked here. */
+		if (!strcasecmp(key, "raid10_format")) {
+			if (rs->raid_type->level != 10) {
+				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
+				return -EINVAL;
+			}
+			if (strcmp("near", argv[i]) &&
+			    strcmp("far", argv[i]) &&
+			    strcmp("offset", argv[i])) {
+				rs->ti->error = "Invalid 'raid10_format' value given";
+				return -EINVAL;
+			}
+			raid10_format = argv[i];
+			rs->print_flags |= DMPF_RAID10_FORMAT;
+			continue;
+		}
+
 		if (strict_strtoul(argv[i], 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
 
+		/* Parameters that take a numeric value are checked here */
 		if (!strcasecmp(key, "rebuild")) {
 			rebuild_cnt++;
 			rs->ti->error = NULL;
@@ -436,6 +496,7 @@ static int parse_raid_params(struct raid
 				if (rebuild_cnt > rs->raid_type->parity_devs)
 					rs->ti->error = "Too many rebuild devices specified for given RAID type";
 				break;
+			case 10:
 			default:
 				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
 				rs->ti->error = "Rebuild not supported for this RAID type";
@@ -493,7 +554,8 @@ static int parse_raid_params(struct raid
 			 */
 			value /= 2;
 
-			if (rs->raid_type->level < 5) {
+			if ((rs->raid_type->level != 5) &&
+			    (rs->raid_type->level != 6)) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 			}
@@ -518,6 +579,14 @@ static int parse_raid_params(struct raid
 		} else if (!strcasecmp(key, "region_size")) {
 			rs->print_flags |= DMPF_REGION_SIZE;
 			region_size = value;
+		} else if (!strcasecmp(key, "raid10_copies") &&
+			   (rs->raid_type->level == 10)) {
+			if ((value < 2) || (value > 0xFF)) {
+				rs->ti->error = "Bad value for 'raid10_copies'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_COPIES;
+			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -536,9 +605,25 @@ static int parse_raid_params(struct raid
 	if (dm_set_target_max_io_len(rs->ti, max_io_len))
 		return -EINVAL;
 
-	if ((rs->raid_type->level > 1) &&
-	    sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+	if (rs->raid_type->level == 10) {
+		if (raid10_copies > rs->md.raid_disks) {
+			rs->ti->error = "Not enough devices to satisfy specification";
+			return -EINVAL;
+		}
+		/* (Len * #mirrors) / #devices */
+		sectors_per_dev = rs->ti->len * raid10_copies;
+		if (sector_div(sectors_per_dev, rs->md.raid_disks)) {
+			rs->ti->error = "Target length not divisible by number of data devices";
+			return -EINVAL;
+		}
+		rs->md.layout = raid10_format_to_md_layout(raid10_format,
+							   raid10_copies);
+		rs->md.new_layout = rs->md.layout;
+	} else if ((rs->raid_type->level > 1) &&
+		   sector_div(sectors_per_dev,
+			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
+
 		return -EINVAL;
 	}
 	rs->md.dev_sectors = sectors_per_dev;
@@ -564,6 +649,9 @@ static int raid_is_congested(struct dm_t
 	if (rs->raid_type->level == 1)
 		return md_raid1_congested(&rs->md, bits);
 
+	if (rs->raid_type->level == 10)
+		return md_raid10_congested(&rs->md, bits);
+
 	return md_raid5_congested(&rs->md, bits);
 }
 
@@ -882,6 +970,9 @@ static int analyse_superblocks(struct dm
 	case 6:
 		redundancy = rs->raid_type->parity_devs;
 		break;
+	case 10:
+		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
+		break;
 	default:
 		ti->error = "Unknown RAID type";
 		return -EINVAL;
@@ -1201,6 +1292,14 @@ static int raid_status(struct dm_target
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
+		if (rs->print_flags & DMPF_RAID10_COPIES)
+			DMEMIT(" raid10_copies %u",
+			       raid10_md_layout_to_copies(rs->md.layout));
+
+		if (rs->print_flags & DMPF_RAID10_FORMAT)
+			DMEMIT(" raid10_format %s",
+			       raid10_md_layout_to_format(rs->md.layout));
+
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
 			if (rs->dev[i].meta_dev)
@@ -1275,7 +1374,7 @@ static void raid_resume(struct dm_target
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1302,6 +1401,8 @@ module_init(dm_raid_init);
 module_exit(dm_raid_exit);
 
 MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid1");
+MODULE_ALIAS("dm-raid10");
 MODULE_ALIAS("dm-raid4");
 MODULE_ALIAS("dm-raid5");
 MODULE_ALIAS("dm-raid6");
Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
===================================================================
--- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
+++ linux-upstream/Documentation/device-mapper/dm-raid.txt
@@ -27,6 +27,11 @@ The target is named "raid" and it accept
 		- rotating parity N (right-to-left) with data restart
   raid6_nc	RAID6 N continue
 		- rotating parity N (right-to-left) with data continuation
+  raid10        Various RAID10 inspired algorithms chosen by additional params
+		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
+		- RAID1E: Integrated Adjacent Stripe Mirroring
+		- RAID1E: Integrated Offset Stripe Mirroring
+		- and other similar RAID10 variants
 
   Reference: Chapter 4 of
   http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
@@ -59,6 +64,59 @@ The target is named "raid" and it accept
 		logical size of the array.  The bitmap records the device
 		synchronisation state for each region.
 
+        [raid10_copies   <# copies>]
+        [raid10_format   <near|far|offset>]
+		These two options are used to alter the default layout of
+		a RAID10 configuration.  The number of copies can be
+		specified, but the default is 2.  There are three
+		variations to how the copies are laid down - the default
+		is "near".  Near copies are what most people think of with
+		respect to mirroring.  If these options are left unspecified,
+		or 'raid10_copies 2' and/or 'raid10_format near' are given,
+		then the layouts for 2, 3 and 4 devices are:
+		2 drives         3 drives          4 drives
+		--------         ----------        --------------
+		A1  A1           A1  A1  A2        A1  A1  A2  A2
+		A2  A2           A2  A3  A3        A3  A3  A4  A4
+		A3  A3           A4  A4  A5        A5  A5  A6  A6
+		A4  A4           A5  A6  A6        A7  A7  A8  A8
+		..  ..           ..  ..  ..        ..  ..  ..  ..
+		The 2-device layout is equivalent to 2-way RAID1.  The
+		4-device layout is what a traditional RAID10 would look
+		like.  The 3-device layout is what might be called 'RAID1E -
+		Integrated Adjacent Stripe Mirroring'.
+
+		If 'raid10_copies 2' and 'raid10_format far' are given,
+		then the layouts for 2, 3 and 4 devices are:
+		2 drives             3 drives             4 drives
+		--------             --------------       --------------------
+		A1  A2               A1   A2   A3         A1   A2   A3   A4
+		A3  A4               A4   A5   A6         A5   A6   A7   A8
+		A5  A6               A7   A8   A9         A9   A10  A11  A12
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+		A2  A1               A3   A1   A2         A4   A1   A2   A3
+		A4  A3               A6   A4   A5         A8   A5   A6   A7
+		A6  A5               A9   A7   A8         A12  A9   A10  A11
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+
+		If 'raid10_copies 2' and 'raid10_format offset' are given,
+		then the layouts for 2, 3 and 4 devices are:
+		2 drives       3 drives           4 drives
+		--------       ------------       -----------------
+		A1  A2         A1  A2  A3         A1  A2  A3  A4
+		A2  A1         A3  A1  A2         A4  A1  A2  A3
+		A3  A4         A4  A5  A6         A5  A6  A7  A8
+		A4  A3         A6  A4  A5         A8  A5  A6  A7
+		A5  A6         A7  A8  A9         A9  A10 A11 A12
+		A6  A5         A9  A7  A8         A12 A9  A10 A11
+		..  ..         ..  ..  ..         ..  ..  ..  ..
+		Here we see layouts closely akin to 'RAID1E - Integrated
+		Offset Stripe Mirroring'.
+
+		Thanks to the Wikipedia 'Non-standard RAID levels' article
+		for the layout figures:
+		http://en.wikipedia.org/wiki/Non-standard_RAID_levels
+
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the

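P.S. As a quick sanity check of the layout packing above, here is a
small userspace sketch (not part of the patch) that mirrors
raid10_format_to_md_layout() and raid10_md_layout_to_copies() and
round-trips every supported copy count:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Mirrors the kernel helpers: 'near' copies in bits 0-7,
 * 'far' copies in bits 8-15, 'offset' flag in bit 16. */
static int format_to_layout(const char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	if (!strcmp(format, "near"))
		n = copies;
	else
		f = copies;

	return (!strcmp(format, "offset") << 16) | (f << 8) | n;
}

static unsigned layout_to_copies(int layout)
{
	if ((layout & 0xFF) > 1)
		return layout & 0xFF;
	return (layout >> 8) & 0xFF;
}

int main(void)
{
	const char *formats[] = { "near", "far", "offset" };
	unsigned i, c;

	/* Every format/copies pair must decode back to the same count. */
	for (i = 0; i < 3; i++)
		for (c = 2; c <= 0xFF; c++)
			assert(layout_to_copies(format_to_layout(formats[i], c)) == c);

	printf("near/2 0x%x, far/2 0x%x, offset/2 0x%x\n",
	       format_to_layout("near", 2), format_to_layout("far", 2),
	       format_to_layout("offset", 2));
	return 0;
}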

