All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 7 of 9] MD:  new sb type
@ 2011-05-24  3:07 Jonathan Brassow
  2011-05-25  4:16 ` NeilBrown
  0 siblings, 1 reply; 4+ messages in thread
From: Jonathan Brassow @ 2011-05-24  3:07 UTC (permalink / raw)
  To: linux-raid

Patch name: md-new-sb-type.patch

A new MD superblock that is device-mapper specific.

The new superblock is not read or written from userspace and is not exported.
It contains information to track resync, recovery, and reshaping progress.  It
also maintains information on the health of the devices in the array.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>

Index: linux-2.6/drivers/md/md.c
===================================================================
--- linux-2.6.orig/drivers/md/md.c
+++ linux-2.6/drivers/md/md.c
@@ -1731,6 +1731,305 @@ super_1_rdev_size_change(mdk_rdev_t *rde
 	return num_sectors;
 }
 
+/*
+ * This structure is never used by userspace.  It is only ever
+ * used in these particular super block accessing functions.
+ * Therefore, we don't put it in any .h file.
+ *
+ * It makes sense to define a new magic number here.  This way,
+ * no userspace application will confuse the device as a device
+ * that is accessible through MD operations.  Devices with this
+ * superblock should only ever be accessed via device-mapper.
+ */
+#define MD_DM_SB_MAGIC 0x426E6F4A
+struct mdp_superblock_2 {
+	__le32 magic;
+	__le32 flags; /* Used to indicate possible future changes */
+
+	__le64 events; /* super_2_load prefers the larger value */
+
+	/*
+	 * The following offset variables are used to indicate:
+	 *  reshape_offset:  If the RAID level or layout of an array is
+	 *		     being updated, this offset keeps track of the
+	 *		     progress.
+	 *  disk_recovery_offset:  If drives are being repaired/replaced on
+	 *			   an individual basis, this offset tracks
+	 *			   that progress.  This might happen when a
+	 *			   drive fails and is replaced.
+	 *  array_resync_offset:  When the array is constructed for the first
+	 *			  time, all the devices must be made coherent.
+	 *			  This offset tracks that progress.
+	 */
+	__le64 reshape_offset;
+	__le64 disk_recovery_offset;
+	__le64 array_resync_offset;
+
+	/*
+	 * The following variable pairs reflect things
+	 * that can change during an array reshape.
+	 */
+	__le32 level;
+	__le32 new_level;
+
+	__le32 layout;
+	__le32 new_layout;
+
+	__le32 stripe_sectors;
+	__le32 new_stripe_sectors;
+
+	__le32 num_devices;    /* Number of devs in RAID, Max = 64 */
+	__le32 new_num_devices;
+
+	__le64 failed_devices; /* bitmap of devs, used to indicate a failure */
+	__u8 pad[432];         /* Round out the struct to 512 bytes */
+};
+
+/*
+ * Rebuild rdev's in-memory superblock from current mddev/rdev state, keeping
+ * the failed-device bits already stored and adding members flagged Faulty.
+ */
+static void super_2_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdk_rdev_t *r, *t;
+	uint64_t failed_devices;
+	struct mdp_superblock_2 *sb;
+
+	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
+	failed_devices = le64_to_cpu(sb->failed_devices); /* __le64 field */
+
+	rdev_for_each(r, t, mddev)
+		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
+			failed_devices |= (1ULL << r->raid_disk); /* 64-bit mask */
+
+	memset(sb, 0, sizeof(*sb));
+
+	sb->magic  = cpu_to_le32(MD_DM_SB_MAGIC);
+	sb->flags  = cpu_to_le32(0); /* No flags yet */
+
+	sb->events = cpu_to_le64(mddev->events);
+
+	sb->reshape_offset = cpu_to_le64(mddev->reshape_position);
+	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+
+	sb->level = cpu_to_le32(mddev->level);
+	sb->layout = cpu_to_le32(mddev->layout);
+	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+	sb->num_devices = cpu_to_le32(mddev->raid_disks);
+
+	/* new_* fields stay zero (from the memset) unless a reshape is active */
+	if (mddev->reshape_position != MaxSector) {
+		sb->new_level = cpu_to_le32(mddev->new_level);
+		sb->new_layout = cpu_to_le32(mddev->new_layout);
+		sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
+		sb->new_num_devices = cpu_to_le32(mddev->delta_disks);
+	}
+
+	sb->failed_devices = cpu_to_le64(failed_devices);
+}
+
+/*
+ * super_2_load
+ *
+ * Reads the superblock at the start of rdev.  If the magic does not match,
+ * a fresh superblock is built in memory, rdev is flagged FirstUse and a
+ * write-out is requested.  Given two, larger 'events' wins (tie: refdev).
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_2_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+	int r;
+	uint64_t ev1, ev2;
+	struct mdp_superblock_2 *sb;
+	struct mdp_superblock_2 *refsb;
+
+	if (sizeof(*sb) & (sizeof(*sb) - 1)) { /* not a power of two */
+		printk(KERN_ERR "Programmer error: Bad sized superblock (%zu)\n",
+		       sizeof(*sb));
+		return -EIO;
+	}
+
+	rdev->sb_start = 0;
+	rdev->sb_size  = sizeof(*sb);
+	r = read_disk_sb(rdev, rdev->sb_size);
+	if (r)
+		return r;
+
+	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
+	if (sb->magic != cpu_to_le32(MD_DM_SB_MAGIC)) {
+		super_2_sync(rdev->mddev, rdev);
+
+		set_bit(FirstUse, &rdev->flags);
+
+		/* Force new superblocks to disk */
+		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+		/* Any superblock is better than none, choose that if given */
+		return refdev ? 0 : 1;
+	}
+
+	if (!refdev)
+		return 1;
+
+	ev1 = le64_to_cpu(sb->events);
+	refsb = (struct mdp_superblock_2 *)page_address(refdev->sb_page);
+	ev2 = le64_to_cpu(refsb->events);
+
+	return (ev1 > ev2) ? 1 : 0;
+}
+
+/*
+ * Array-wide sanity checks run once, from the first super_2_validate call.
+ * Seeds mddev->events, vets FirstUse/In_sync combinations and marks devices
+ * recorded as failed.  Return: 0 on success, -EINVAL on a bad configuration.
+ */
+static int super_2_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	uint64_t ev1;
+	uint64_t failed_devices; /* must hold the full 64-bit on-disk bitmap */
+	struct mdp_superblock_2 *sb, *sb2;
+	uint32_t new_devs = 0, rebuilds = 0;
+	mdk_rdev_t *r, *t;
+
+	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
+	ev1 = le64_to_cpu(sb->events);
+	failed_devices = le64_to_cpu(sb->failed_devices);
+
+	mddev->events = ev1 ? ev1 : 1;
+
+	/* Reshaping is not currently allowed */
+	if ((le32_to_cpu(sb->level) != mddev->level) ||
+	    (le32_to_cpu(sb->layout) != mddev->layout) ||
+	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) ||
+	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+		printk(KERN_ERR "md: %s: Reshaping arrays not yet supported.\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
+
+	if (!test_and_clear_bit(MD_SYNC_STATE_FORCED, &mddev->flags))
+		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+
+	/*
+	 * During load, we set FirstUse if a new superblock was written.
+	 * There are two reasons we might not have a superblock:
+	 * 1) The array is brand new - in which case, all of the
+	 *    devices must have their In_sync bit set.  Also,
+	 *    recovery_cp must be 0, unless forced.
+	 * 2) This is a new device being added to an old array
+	 *    and the new device needs to be rebuilt - in which
+	 *    case the In_sync bit will /not/ be set and
+	 *    recovery_cp must be MaxSector.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!test_bit(In_sync, &r->flags)) {
+			if (!test_bit(FirstUse, &r->flags))
+				printk(KERN_ERR "md: %s: Superblock area of "
+				       "rebuild device %d should have been "
+				       "cleared.\n", mdname(mddev),
+				       r->raid_disk);
+			set_bit(FirstUse, &r->flags);
+			rebuilds++;
+		} else if (test_bit(FirstUse, &r->flags))
+			new_devs++;
+	}
+
+	if (!rebuilds) {
+		if (new_devs == mddev->raid_disks) {
+			printk(KERN_INFO "md: %s: Superblocks created for new array\n", mdname(mddev));
+		} else if (new_devs) {
+			printk(KERN_ERR "md: %s: New device injected "
+			       "into existing array without 'rebuild' "
+			       "parameter specified\n", mdname(mddev));
+			return -EINVAL;
+		}
+	} else if (new_devs) {
+		printk(KERN_ERR "md: %s: 'rebuild' devices cannot be "
+		       "injected into an array with other "
+		       "first-time devices\n", mdname(mddev));
+		return -EINVAL;
+	} else if (mddev->recovery_cp != MaxSector) {
+		printk(KERN_ERR "md: %s: 'rebuild' specified while "
+		       "array is not in-sync\n", mdname(mddev));
+		return -EINVAL;
+	}
+
+	/*
+	 * Now we set the Faulty bit for those devices that are
+	 * recorded in the superblock as failed.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!r->sb_page)
+			continue;
+		sb2 = (struct mdp_superblock_2 *)page_address(r->sb_page);
+		sb2->failed_devices = 0;
+
+		if ((r->raid_disk >= 0) &&
+		    (failed_devices & (1ULL << r->raid_disk))) {
+			if (test_bit(FirstUse, &r->flags)) {
+				char b[BDEVNAME_SIZE];
+				printk(KERN_INFO
+				       "md: %s: Starting complete rebuild of "
+				       "previously failed device, %s\n",
+				       mdname(mddev), bdevname(r->bdev, b));
+			} else {
+				set_bit(Faulty, &r->flags);
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Per-device validation; the first call also runs the array-wide checks. */
+static int super_2_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_2 *sb =
+		(struct mdp_superblock_2 *)page_address(rdev->sb_page);
+
+	/* events is still zero only before super_2_init_validation has run */
+	if (!mddev->events)
+		if (super_2_init_validation(mddev, rdev))
+			return -EINVAL;
+
+	rdev->mddev->bitmap_info.offset = 0; /* disable bitmap creation */
+	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+
+	/* Only a device not flagged FirstUse takes its offset from the sb */
+	if (!test_bit(FirstUse, &rdev->flags)) {
+		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
+		if (rdev->recovery_offset != MaxSector)
+			clear_bit(In_sync, &rdev->flags);
+	}
+
+	/* Devices flagged Faulty are reset for recovery from offset 0 */
+	if (test_bit(Faulty, &rdev->flags)) {
+		clear_bit(Faulty, &rdev->flags);
+		clear_bit(In_sync, &rdev->flags);
+		rdev->recovery_offset = 0;
+		printk(KERN_INFO "md: %s: Dev #%d previously marked as failed\n",
+		       mdname(mddev), rdev->raid_disk);
+	}
+
+	clear_bit(FirstUse, &rdev->flags);
+	return 0;
+}
+
+/*
+ * Always rejects the request: logs an error and returns 0.
+ */
+static unsigned long long
+super_2_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+	const char *name = mdname(rdev->mddev);
+
+	/* device-mapper tables are the only valid way to resize these arrays */
+	printk(KERN_ERR "md: %s: Invalid device size change request.\n", name);
+	return 0;
+}
+
 static struct super_type super_types[] = {
 	[0] = {
 		.name	= "0.90.0",
@@ -1748,6 +2047,14 @@ static struct super_type super_types[] =
 		.sync_super	    = super_1_sync,
 		.rdev_size_change   = super_1_rdev_size_change,
 	},
+	[2] = {
+		.name	= "dm",
+		.owner	= THIS_MODULE,
+		.load_super	    = super_2_load,
+		.validate_super	    = super_2_validate,
+		.sync_super	    = super_2_sync,
+		.rdev_size_change   = super_2_rdev_size_change,
+	},
 };
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
Index: linux-2.6/drivers/md/md.h
===================================================================
--- linux-2.6.orig/drivers/md/md.h
+++ linux-2.6/drivers/md/md.h
@@ -77,6 +77,8 @@ struct mdk_rdev_s
 #define Blocked		8		/* An error occurred on an externally
 					 * managed array, don't allow writes
 					 * until it is cleared */
+#define FirstUse        9               /* Used by device-mapper interface when
+					 * initializing first-time devices. */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
@@ -124,6 +126,7 @@ struct mddev_s
 #define MD_CHANGE_DEVS	0	/* Some device status has changed */
 #define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
+#define MD_SYNC_STATE_FORCED 3  /* recovery_cp is set and must be honored */
 
 	int				suspended;
 	atomic_t			active_io;

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 7 of 9] MD:  new sb type
  2011-05-24  3:07 [PATCH 7 of 9] MD: new sb type Jonathan Brassow
@ 2011-05-25  4:16 ` NeilBrown
  2011-05-25 14:40   ` Jonathan Brassow
  0 siblings, 1 reply; 4+ messages in thread
From: NeilBrown @ 2011-05-25  4:16 UTC (permalink / raw)
  To: Jonathan Brassow; +Cc: linux-raid

On Mon, 23 May 2011 22:07:04 -0500 Jonathan Brassow <jbrassow@f14.redhat.com>
wrote:

> Patch name: md-new-sb-type.patch
> 
> A new MD superblock that is device-mapper specific.
> 
> The new superblock is not read or written from userspace and is not exported.
> It contains information to track resync, recovery, and reshaping progress.  It
> also maintains information on the health of the devices in the array.
> 
> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
> 
> Index: linux-2.6/drivers/md/md.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/md.c
> +++ linux-2.6/drivers/md/md.c
> @@ -1731,6 +1731,305 @@ super_1_rdev_size_change(mdk_rdev_t *rde
>  	return num_sectors;
>  }
>  
> +/*
> + * This structure is never used by userspace.  It is only ever
> + * used in these particular super block accessing functions.
> + * Therefore, we don't put it in any .h file.
> + *
> + * It makes sense to define a new magic number here.  This way,
> + * no userspace application will confuse the device as a device
> + * that is accessible through MD operations.  Devices with this
> + * superblock should only ever be accessed via device-mapper.
> + */
> +#define MD_DM_SB_MAGIC 0x426E6F4A
> +struct mdp_superblock_2 {
> +	__le32 magic;
> +	__le32 flags; /* Used to indicate possible future changes */
> +
> +	__le64 events;
> +
> +	/*
> +	 * The following offset variables are used to indicate:
> +	 *  reshape_offset:  If the RAID level or layout of an array is
> +	 *		     being updated, this offset keeps track of the
> +	 *		     progress.
> +	 *  disk_recovery_offset:  If drives are being repaired/replaced on
> +	 *			   an individual basis, this offset tracks
> +	 *			   that progress.  This might happen when a
> +	 *			   drive fails and is replaced.
> +	 *  array_resync_offset:  When the array is constructed for the first
> +	 *			  time, all the devices must be made coherent.
> +	 *			  This offset tracks that progress.
> +	 */
> +	__le64 reshape_offset;
> +	__le64 disk_recovery_offset;
> +	__le64 array_resync_offset;
> +
> +	/*
> +	 * The following variable pairs reflect things
> +	 * that can changed during an array reshape.
> +	 */
> +	__le32 level;
> +	__le32 new_level;
> +
> +	__le32 layout;
> +	__le32 new_layout;
> +
> +	__le32 stripe_sectors;
> +	__le32 new_stripe_sectors;
> +
> +	__le32 num_devices;    /* Number of devs in RAID, Max = 64 */
> +	__le32 new_num_devices;

Presumably the dm table knows all this info as well and it is just here for
error checking - yes?


> +
> +	__le64 failed_devices; /* bitmap of devs, used to indicate a failure */
> +	__u8 pad[432];         /* Round out the struct to 512 bytes */
> +};
> +
> +static void super_2_sync(mddev_t *mddev, mdk_rdev_t *rdev)
> +{
> +	mdk_rdev_t *r, *t;
> +	uint64_t failed_devices;
> +	struct mdp_superblock_2 *sb;
> +
> +	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> +	failed_devices = le32_to_cpu(sb->failed_devices);

failed_devices is 64 bit, so you want le64_to_cpu

> +
> +	rdev_for_each(r, t, mddev)
> +		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
> +			failed_devices |= (1 << r->raid_disk);

And this should be (1ULL << ....)  so that it doesn't overflow.


> +
> +	memset(sb, 0, sizeof(*sb));
> +
> +	sb->magic  = cpu_to_le32(MD_DM_SB_MAGIC);
> +	sb->flags  = cpu_to_le32(0); /* No flags yet */
> +
> +	sb->events = cpu_to_le64(mddev->events);
> +
> +	sb->reshape_offset = cpu_to_le64(mddev->reshape_position);
> +	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
> +	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
> +
> +	sb->level = cpu_to_le32(mddev->level);
> +	sb->layout = cpu_to_le32(mddev->layout);
> +	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
> +	sb->num_devices = cpu_to_le32(mddev->raid_disks);
> +
> +	if (mddev->reshape_position != MaxSector) {
> +		sb->new_level = cpu_to_le32(mddev->new_level);
> +		sb->new_layout = cpu_to_le32(mddev->new_layout);
> +		sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
> +		sb->new_num_devices = cpu_to_le32(mddev->delta_disks);
> +	} else {
> +		sb->new_level = 0;
> +		sb->new_layout = 0;
> +		sb->new_stripe_sectors = 0;
> +		sb->new_num_devices = 0;
> +	}

As these values are meaningless when reshape_position is MaxSector, and as
the structure has already been zeroed, setting them to zero again looks wrong.


> +
> +	sb->failed_devices = cpu_to_le32(failed_devices);

Again, cpu_to_le64


I haven't thought through the FirstUse and STATE_FORCED flags yet.  When I
have I might have more to say - or I might not.

Thanks,
NeilBrown





> +}
> +
> +/*
> + * super_2_load
> + *
> + * This function creates a superblock if one is not found on the device
> + * and will indicate the more appropriate device whose superblock should
> + * be used, if given two.
> + *
> + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
> + */
> +static int super_2_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
> +{
> +	int r;
> +	uint64_t ev1, ev2;
> +	struct mdp_superblock_2 *sb;
> +	struct mdp_superblock_2 *refsb;
> +
> +	if (sizeof(*sb) & (sizeof(*sb) - 1)) {
> +		printk(KERN_ERR "Programmer error: Bad sized superblock (%lu)\n",
> +		       sizeof(*sb));
> +		return -EIO;
> +	}
> +
> +	rdev->sb_start = 0;
> +	rdev->sb_size  = sizeof(*sb);
> +	r = read_disk_sb(rdev, rdev->sb_size);
> +	if (r)
> +		return r;
> +
> +	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> +	if (sb->magic != cpu_to_le32(MD_DM_SB_MAGIC)) {
> +		super_2_sync(rdev->mddev, rdev);
> +
> +		set_bit(FirstUse, &rdev->flags);
> +
> +		/* Force new superblocks to disk */
> +		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
> +
> +		/* Any superblock is better than none, choose that if given */
> +		return refdev ? 0 : 1;
> +	}
> +
> +	if (!refdev)
> +		return 1;
> +
> +	ev1 = le64_to_cpu(sb->events);
> +	refsb = (struct mdp_superblock_2 *)page_address(refdev->sb_page);
> +	ev2 = le64_to_cpu(refsb->events);
> +
> +	return (ev1 > ev2) ? 1 : 0;
> +}
> +
> +static int super_2_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
> +{
> +	uint64_t ev1;
> +	uint32_t failed_devices;
> +	struct mdp_superblock_2 *sb;
> +	uint32_t new_devs = 0;
> +	uint32_t rebuilds = 0;
> +	mdk_rdev_t *r, *t;
> +	struct mdp_superblock_2 *sb2;
> +
> +	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> +	ev1 = le64_to_cpu(sb->events);
> +	failed_devices = le32_to_cpu(sb->failed_devices);
> +
> +	mddev->events = ev1 ? ev1 : 1;
> +
> +	/* Reshaping is not currently allowed */
> +	if ((le32_to_cpu(sb->level) != mddev->level) ||
> +	    (le32_to_cpu(sb->layout) != mddev->layout) ||
> +	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) ||
> +	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
> +		printk(KERN_ERR
> +		       "md: %s: Reshaping arrays not yet supported.\n",
> +		       mdname(mddev));
> +		return -EINVAL;
> +	}
> +
> +	if (!test_and_clear_bit(MD_SYNC_STATE_FORCED, &mddev->flags))
> +		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
> +
> +	/*
> +	 * During load, we set FirstUse if a new superblock was written.
> +	 * There are two reasons we might not have a superblock:
> +	 * 1) The array is brand new - in which case, all of the
> +	 *    devices must have their In_sync bit set.  Also,
> +	 *    recovery_cp must be 0, unless forced.
> +	 * 2) This is a new device being added to an old array
> +	 *    and the new device needs to be rebuilt - in which
> +	 *    case the In_sync bit will /not/ be set and
> +	 *    recovery_cp must be MaxSector.
> +	 */
> +	rdev_for_each(r, t, mddev) {
> +		if (!test_bit(In_sync, &r->flags)) {
> +			if (!test_bit(FirstUse, &r->flags))
> +				printk(KERN_ERR "md: %s: Superblock area of "
> +				       "rebuild device %d should have been "
> +				       "cleared.\n", mdname(mddev),
> +				       r->raid_disk);
> +			set_bit(FirstUse, &r->flags);
> +			rebuilds++;
> +		} else if (test_bit(FirstUse, &r->flags))
> +			new_devs++;
> +	}
> +
> +	if (!rebuilds) {
> +		if (new_devs == mddev->raid_disks) {
> +			printk(KERN_INFO "md: %s: Superblocks created for new array\n", mdname(mddev));
> +		} else if (new_devs) {
> +			printk(KERN_ERR "md: %s: New device injected "
> +			       "into existing array without 'rebuild' "
> +			       "parameter specified\n", mdname(mddev));
> +			return -EINVAL;
> +		}
> +	} else if (new_devs) {
> +		printk(KERN_ERR "md: %s: 'rebuild' devices cannot be "
> +		       "injected into an array with other "
> +		       "first-time devices\n", mdname(mddev));
> +		return -EINVAL;
> +	} else if (mddev->recovery_cp != MaxSector) {
> +		printk(KERN_ERR "md: %s: 'rebuild' specified while "
> +		       "array is not in-sync\n",
> +		       mdname(mddev));
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Now we set the Faulty bit for those devices that are
> +	 * recorded in the superblock as failed.
> +	 */
> +	rdev_for_each(r, t, mddev) {
> +		if (!r->sb_page)
> +			continue;
> +		sb2 = (struct mdp_superblock_2 *)
> +			page_address(r->sb_page);
> +		sb2->failed_devices = 0;
> +
> +		if ((r->raid_disk >= 0) &&
> +		    (failed_devices & (1 << r->raid_disk))) {
> +			if (test_bit(FirstUse, &r->flags)) {
> +				char b[BDEVNAME_SIZE];
> +				printk(KERN_INFO
> +				       "md: %s: Starting complete rebuild of "
> +				       "previously failed device, %s\n",
> +				       mdname(mddev), bdevname(rdev->bdev, b));
> +			} else {
> +				set_bit(Faulty, &r->flags);
> +			}
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int super_2_validate(mddev_t *mddev, mdk_rdev_t *rdev)
> +{
> +	struct mdp_superblock_2 *sb;
> +
> +	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> +
> +	/*
> +	 * mddev->events is set during the first call to super_2_validate,
> +	 * so we use that knowledge to kick off some global sanity checks
> +	 * on the first call.
> +	 */
> +	if (!mddev->events && super_2_init_validation(mddev, rdev))
> +		return -EINVAL;
> +
> +	rdev->mddev->bitmap_info.offset = 0; /* disable bitmap creation */
> +	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
> +	if (!test_bit(FirstUse, &rdev->flags)) {
> +		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
> +		if (rdev->recovery_offset != MaxSector)
> +			clear_bit(In_sync, &rdev->flags);
> +	}
> +
> +	if (test_bit(Faulty, &rdev->flags)) {
> +		clear_bit(Faulty, &rdev->flags);
> +		clear_bit(In_sync, &rdev->flags);
> +		rdev->recovery_offset = 0;
> +		printk(KERN_INFO "md: %s: Dev #%d previously marked as failed\n",
> +		       mdname(mddev), rdev->raid_disk);
> +	}
> +
> +	clear_bit(FirstUse, &rdev->flags);
> +	return 0;
> +}
> +
> +static unsigned long long
> +super_2_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
> +{
> +	/*
> +	 * Arrays built through device-mapper must use device-mapper
> +	 * tables to change the size.  A call to this function is
> +	 * invalid for this array.
> +	 */
> +	printk(KERN_ERR "md: %s: Invalid device size change request.\n",
> +	       mdname(rdev->mddev));
> +	return 0;
> +}
> +
>  static struct super_type super_types[] = {
>  	[0] = {
>  		.name	= "0.90.0",
> @@ -1748,6 +2047,14 @@ static struct super_type super_types[] =
>  		.sync_super	    = super_1_sync,
>  		.rdev_size_change   = super_1_rdev_size_change,
>  	},
> +	[2] = {
> +		.name	= "dm",
> +		.owner	= THIS_MODULE,
> +		.load_super	    = super_2_load,
> +		.validate_super	    = super_2_validate,
> +		.sync_super	    = super_2_sync,
> +		.rdev_size_change   = super_2_rdev_size_change,
> +	},
>  };
>  
>  static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
> Index: linux-2.6/drivers/md/md.h
> ===================================================================
> --- linux-2.6.orig/drivers/md/md.h
> +++ linux-2.6/drivers/md/md.h
> @@ -77,6 +77,8 @@ struct mdk_rdev_s
>  #define Blocked		8		/* An error occurred on an externally
>  					 * managed array, don't allow writes
>  					 * until it is cleared */
> +#define FirstUse        9               /* Used by device-mapper interface when
> +					 * initializing first-time devices. */
>  	wait_queue_head_t blocked_wait;
>  
>  	int desc_nr;			/* descriptor index in the superblock */
> @@ -124,6 +126,7 @@ struct mddev_s
>  #define MD_CHANGE_DEVS	0	/* Some device status has changed */
>  #define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
>  #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
> +#define MD_SYNC_STATE_FORCED 3  /* recovery_cp is set and must be honored */
>  
>  	int				suspended;
>  	atomic_t			active_io;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 7 of 9] MD:  new sb type
  2011-05-25  4:16 ` NeilBrown
@ 2011-05-25 14:40   ` Jonathan Brassow
  2011-05-26  0:34     ` NeilBrown
  0 siblings, 1 reply; 4+ messages in thread
From: Jonathan Brassow @ 2011-05-25 14:40 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid


On May 24, 2011, at 11:16 PM, NeilBrown wrote:

>> +	__le32 level;
>> +	__le32 new_level;
>> +
>> +	__le32 layout;
>> +	__le32 new_layout;
>> +
>> +	__le32 stripe_sectors;
>> +	__le32 new_stripe_sectors;
>> +
>> +	__le32 num_devices;    /* Number of devs in RAID, Max = 64 */
>> +	__le32 new_num_devices;
> 
> Presumably the dm table knows all this info as well and it is just here for
> error checking - yes?

Error checking, yes... but a little bit more.

If we keep this information in the superblock (instead of LVM metadata), then the act of reshaping is done like this:
1) initial RAID device created - LVM (or some other DM utilizing manager) records necessary info to re-instantiate this device.
2) A reshape (if LVM, 'lvconvert') is issued.  LVM will record only the information about the new RAID layout.  It will submit only this information to device-mapper (and on to MD) for further activations.
The information in the superblock is then used to say, "ok, I see you passed in RAID6 information, but I was RAID5, so I must convert (or continue to convert)."  Any new conversions while a conversion is in process are denied.  So, it is relevant only so far as reshaping is concerned.

If we keep this information in the LVM metadata, the process becomes far more messy:
1) initial RAID device created
2) A reshape is issued.  This time LVM must record all the information for both states - requiring additions to the LVM metadata layout, which in turn require new code for parsing and assembling this information as well as the code for turning it into something device-mapper can understand.
3) The reshape issued to device-mapper would require a new constructor, destructor, and status functions in dm-raid.c specifically for reshape scenarios.
4) An event would have to be raised when the reshape is complete in order to inform LVM to complete the reshape action (i.e. remove the intermediate metadata and write the final metadata describing the end product). This requires new plug-ins for the 'dmeventd' daemon, and of course, new LVM code to be able to convert from the hybrid device to the final device.  And what happens if the daemon is hung, segfaulting, or just not there?  The final change-over doesn't complete.  If this is followed by a machine reboot, LVM will have no way of knowing whether the process was never begun or whether it was finished - reshape_offset doesn't help in this case.
This is a truly horrible way to go...

I've cleaned up the other items you mentioned.  Of course, depending on which way we go regarding analyze_sbs, the bulk of these super functions may be pushed into dm-raid.c.

 brassow


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 7 of 9] MD:  new sb type
  2011-05-25 14:40   ` Jonathan Brassow
@ 2011-05-26  0:34     ` NeilBrown
  0 siblings, 0 replies; 4+ messages in thread
From: NeilBrown @ 2011-05-26  0:34 UTC (permalink / raw)
  To: Jonathan Brassow; +Cc: linux-raid

On Wed, 25 May 2011 09:40:06 -0500 Jonathan Brassow <jbrassow@redhat.com>
wrote:

> 
> On May 24, 2011, at 11:16 PM, NeilBrown wrote:
> 
> >> +	__le32 level;
> >> +	__le32 new_level;
> >> +
> >> +	__le32 layout;
> >> +	__le32 new_layout;
> >> +
> >> +	__le32 stripe_sectors;
> >> +	__le32 new_stripe_sectors;
> >> +
> >> +	__le32 num_devices;    /* Number of devs in RAID, Max = 64 */
> >> +	__le32 new_num_devices;
> > 
> > Presumably the dm table knows all this info as well and it is just here for
> > error checking - yes?
> 
> Error checking, yes... but a little bit more.
> 
> If we keep this information in the superblock (instead of LVM metadata), then the act of reshaping is done like this:
> 1) initial RAID device created - LVM (or some other DM utilizing manager) records necessary info to re-instantiate this device.
> 2) A reshape (if LVM, 'lvconvert') is issued.  LVM will record only the information about the new RAID layout.  It will submit only this information to device-mapper (and on to MD) for further activations.
> The information in the superblock is then used to say, "ok, I see you passed in RAID6 information, but I was RAID5, so I must convert (or continue to convert)."  Any new conversions while a conversion is in process are denied.  So, it is relevant only so far as reshaping is concerned.

OK, that makes sense.
You could argue that the 'new' info is redundant because that is what is
provided in the dm table, but it probably makes sense to keep it.

Thanks,

NeilBrown


> 
> If we keep this information in the LVM metadata, the process becomes far more messy:
> 1) initial RAID device created
> 2) A reshape is issued.  This time LVM must record all the information for both states - requiring additions to the LVM metadata layout, which in turn require new code for parsing and assembling this information as well as the code for turning it into something device-mapper can understand.
> 3) The reshape issued to device-mapper would require a new constructor, destructor, and status functions in dm-raid.c specifically for reshape scenarios.
> 4) An event would have to be raised when the reshape is complete in order to inform LVM to complete the reshape action (i.e. remove the intermediate metadata and write the final metadata describing the end product). This requires new plug-ins for the 'dmeventd' daemon, and of course, new LVM code to be able to convert from the hybrid device to the final device.  And what happens if the daemon is hung, segfaulting, or just not there?  The final change-over doesn't complete.  If this is followed by a machine reboot, LVM will have no way of knowing whether the process was never begun or whether it was finished - reshape_offset doesn't help in this case.
> This is a truly horrible way to go...
> 
> I've cleaned up the other items you mentioned.  Of course, depending on which way we go regarding analyze_sbs, the bulk of these super functions may be pushed into dm-raid.c.
> 
>  brassow


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2011-05-26  0:34 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-24  3:07 [PATCH 7 of 9] MD: new sb type Jonathan Brassow
2011-05-25  4:16 ` NeilBrown
2011-05-25 14:40   ` Jonathan Brassow
2011-05-26  0:34     ` NeilBrown

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.