* [PATCH v3 0/8] dm-raid (raid456) target
@ 2010-12-21  2:37 Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 1/8] md/bitmap: revert dm-dirty-log preparation Mike Snitzer
                   ` (8 more replies)
  0 siblings, 9 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

I reviewed all patches and adjusted patch headers in preparation for
inclusion in Alasdair's tree (upstream target being v2.6.38).  The 2
md/bitmap changes can obviously go through Neil's MD tree if that is
preferred.

- added Copyright to the top of dm-raid.c
- fixed drivers/md/Kconfig so that DM_RAID will select MD_RAID456
- removed unused 'conf' variable in raid_status()
- few misc style and whitespace changes

Neil,
Hopefully you're comfortable with the authorship of most of these
patches still being attributed to you (despite Jon's changes to your
original patches).

Please advise, thanks!
Mike

Jonathan Brassow (2):
  md/bitmap: revert dm-dirty-log preparation
  md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer

NeilBrown (6):
  md/bitmap: use DIV_ROUND_UP in bitmap_init_from_disk
  dm raid: skeleton raid456 target support
  dm: introduce target callbacks and congestion fn
  dm: unplug callback
  dm raid: add iterate_devices and io_hints functions
  dm raid: add suspend and resume functions

 drivers/md/Kconfig            |   23 ++
 drivers/md/Makefile           |    1 +
 drivers/md/bitmap.c           |  139 +++------
 drivers/md/bitmap.h           |    5 -
 drivers/md/dm-raid.c          |  664 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-table.c         |   19 ++
 drivers/md/md.h               |    5 -
 drivers/md/raid5.c            |    2 +-
 include/linux/device-mapper.h |   13 +
 9 files changed, 766 insertions(+), 105 deletions(-)
 create mode 100644 drivers/md/dm-raid.c

-- 
1.7.2.3



* [PATCH v3 1/8] md/bitmap: revert dm-dirty-log preparation
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 2/8] md/bitmap: use DIV_ROUND_UP in bitmap_init_from_disk Mike Snitzer
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: Jonathan Brassow <jbrassow@redhat.com>

Revert commit e384e58549a2e9a83071ad80280c1a9053cfd84c
  md/bitmap: prepare for storing write-intent-bitmap via dm-dirty-log.

as MD should not need to use DM's dirty log.

The next patch will restore the DIV_ROUND_UP cleanups from the above commit,
which are desirable and unrelated to adding DM dirty log support to MD.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/bitmap.c |  143 +++++++++++++++++----------------------------------
 drivers/md/bitmap.h |    5 --
 drivers/md/md.h     |    5 --
 3 files changed, 47 insertions(+), 106 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a1ffe3..2eb51cd 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@
 #include "md.h"
 #include "bitmap.h"
 
-#include <linux/dm-dirty-log.h>
 /* debug macros */
 
 #define DEBUG 0
@@ -695,8 +694,6 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
 static inline struct page *filemap_get_page(struct bitmap *bitmap,
 					unsigned long chunk)
 {
-	if (bitmap->filemap == NULL)
-		return NULL;
 	if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
 		return NULL;
 	return bitmap->filemap[file_page_index(bitmap, chunk)
@@ -796,28 +793,19 @@ enum bitmap_page_attr {
 static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	if (page)
-		__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		__set_bit(attr, &bitmap->logattrs);
+	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	if (page)
-		__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		__clear_bit(attr, &bitmap->logattrs);
+	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
 					   enum bitmap_page_attr attr)
 {
-	if (page)
-		return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		return test_bit(attr, &bitmap->logattrs);
+	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 /*
@@ -830,30 +818,27 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
 static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 {
 	unsigned long bit;
-	struct page *page = NULL;
+	struct page *page;
 	void *kaddr;
 	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
 
-	if (!bitmap->filemap) {
-		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
-		if (log)
-			log->type->mark_region(log, chunk);
-	} else {
+	if (!bitmap->filemap)
+		return;
 
-		page = filemap_get_page(bitmap, chunk);
-		if (!page)
-			return;
-		bit = file_page_offset(bitmap, chunk);
+	page = filemap_get_page(bitmap, chunk);
+	if (!page)
+		return;
+	bit = file_page_offset(bitmap, chunk);
+
+	/* set the bit */
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (bitmap->flags & BITMAP_HOSTENDIAN)
+		set_bit(bit, kaddr);
+	else
+		ext2_set_bit(bit, kaddr);
+	kunmap_atomic(kaddr, KM_USER0);
+	PRINTK("set file bit %lu page %lu\n", bit, page->index);
 
-		/* set the bit */
-		kaddr = kmap_atomic(page, KM_USER0);
-		if (bitmap->flags & BITMAP_HOSTENDIAN)
-			set_bit(bit, kaddr);
-		else
-			ext2_set_bit(bit, kaddr);
-		kunmap_atomic(kaddr, KM_USER0);
-		PRINTK("set file bit %lu page %lu\n", bit, page->index);
-	}
 	/* record page number so it gets flushed to disk when unplug occurs */
 	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 }
@@ -870,16 +855,6 @@ void bitmap_unplug(struct bitmap *bitmap)
 
 	if (!bitmap)
 		return;
-	if (!bitmap->filemap) {
-		/* Must be using a dirty_log */
-		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
-		dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
-		need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
-		if (dirty || need_write)
-			if (log->type->flush(log))
-				bitmap->flags |= BITMAP_WRITE_ERROR;
-		goto out;
-	}
 
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
@@ -908,7 +883,6 @@ void bitmap_unplug(struct bitmap *bitmap)
 		else
 			md_super_wait(bitmap->mddev);
 	}
-out:
 	if (bitmap->flags & BITMAP_WRITE_ERROR)
 		bitmap_file_kick(bitmap);
 }
@@ -951,11 +925,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		printk(KERN_INFO "%s: bitmap file is out of date, doing full "
 			"recovery\n", bmname(bitmap));
 
-	bytes = DIV_ROUND_UP(bitmap->chunks, 8);
+	bytes = (chunks + 7) / 8;
 	if (!bitmap->mddev->bitmap_info.external)
 		bytes += sizeof(bitmap_super_t);
 
-	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+	num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 
 	if (file && i_size_read(file->f_mapping->host) < bytes) {
 		printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
@@ -1117,7 +1091,6 @@ void bitmap_daemon_work(mddev_t *mddev)
 	struct page *page = NULL, *lastpage = NULL;
 	sector_t blocks;
 	void *paddr;
-	struct dm_dirty_log *log = mddev->bitmap_info.log;
 
 	/* Use a mutex to guard daemon_work against
 	 * bitmap_destroy.
@@ -1142,12 +1115,11 @@ void bitmap_daemon_work(mddev_t *mddev)
 	spin_lock_irqsave(&bitmap->lock, flags);
 	for (j = 0; j < bitmap->chunks; j++) {
 		bitmap_counter_t *bmc;
-		if (!bitmap->filemap) {
-			if (!log)
-				/* error or shutdown */
-				break;
-		} else
-			page = filemap_get_page(bitmap, j);
+		if (!bitmap->filemap)
+			/* error or shutdown */
+			break;
+
+		page = filemap_get_page(bitmap, j);
 
 		if (page != lastpage) {
 			/* skip this page unless it's marked as needing cleaning */
@@ -1216,17 +1188,14 @@ void bitmap_daemon_work(mddev_t *mddev)
 						  -1);
 
 				/* clear the bit */
-				if (page) {
-					paddr = kmap_atomic(page, KM_USER0);
-					if (bitmap->flags & BITMAP_HOSTENDIAN)
-						clear_bit(file_page_offset(bitmap, j),
-							  paddr);
-					else
-						ext2_clear_bit(file_page_offset(bitmap, j),
-							       paddr);
-					kunmap_atomic(paddr, KM_USER0);
-				} else
-					log->type->clear_region(log, j);
+				paddr = kmap_atomic(page, KM_USER0);
+				if (bitmap->flags & BITMAP_HOSTENDIAN)
+					clear_bit(file_page_offset(bitmap, j),
+						  paddr);
+				else
+					ext2_clear_bit(file_page_offset(bitmap, j),
+						       paddr);
+				kunmap_atomic(paddr, KM_USER0);
 			}
 		} else
 			j |= PAGE_COUNTER_MASK;
@@ -1234,16 +1203,12 @@ void bitmap_daemon_work(mddev_t *mddev)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	/* now sync the final page */
-	if (lastpage != NULL || log != NULL) {
+	if (lastpage != NULL) {
 		spin_lock_irqsave(&bitmap->lock, flags);
 		if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
-			if (lastpage)
-				write_page(bitmap, lastpage, 0);
-			else
-				if (log->type->flush(log))
-					bitmap->flags |= BITMAP_WRITE_ERROR;
+			write_page(bitmap, lastpage, 0);
 		} else {
 			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1408,9 +1373,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		(*bmc)--;
 		if (*bmc <= 2)
 			set_page_attr(bitmap,
-				      filemap_get_page(
-					      bitmap,
-					      offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+				      filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
 				      BITMAP_PAGE_CLEAN);
 
 		spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1685,13 +1648,10 @@ int bitmap_create(mddev_t *mddev)
 
 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
-	if (!file
-	    && !mddev->bitmap_info.offset
-	    && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
+	if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
 		return 0;
 
 	BUG_ON(file && mddev->bitmap_info.offset);
-	BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);
 
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
@@ -1778,6 +1738,7 @@ int bitmap_create(mddev_t *mddev)
 int bitmap_load(mddev_t *mddev)
 {
 	int err = 0;
+	sector_t start = 0;
 	sector_t sector = 0;
 	struct bitmap *bitmap = mddev->bitmap;
 
@@ -1796,24 +1757,14 @@ int bitmap_load(mddev_t *mddev)
 	}
 	bitmap_close_sync(bitmap);
 
-	if (mddev->bitmap_info.log) {
-		unsigned long i;
-		struct dm_dirty_log *log = mddev->bitmap_info.log;
-		for (i = 0; i < bitmap->chunks; i++)
-			if (!log->type->in_sync(log, i, 1))
-				bitmap_set_memory_bits(bitmap,
-						       (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
-						       1);
-	} else {
-		sector_t start = 0;
-		if (mddev->degraded == 0
-		    || bitmap->events_cleared == mddev->events)
-			/* no need to keep dirty bits to optimise a
-			 * re-add of a missing device */
-			start = mddev->recovery_cp;
-
-		err = bitmap_init_from_disk(bitmap, start);
-	}
+	if (mddev->degraded == 0
+	    || bitmap->events_cleared == mddev->events)
+		/* no need to keep dirty bits to optimise a
+		 * re-add of a missing device */
+		start = mddev->recovery_cp;
+
+	err = bitmap_init_from_disk(bitmap, start);
+
 	if (err)
 		goto out;
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 931a7a7..19f5dc9 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -222,10 +222,6 @@ struct bitmap {
 	unsigned long file_pages; /* number of pages in the file */
 	int last_page_size; /* bytes in the last page */
 
-	unsigned long logattrs; /* used when filemap_attr doesn't exist
-				 * because we are working with a dirty_log
-				 */
-
 	unsigned long flags;
 
 	int allclean;
@@ -247,7 +243,6 @@ struct bitmap {
 	wait_queue_head_t behind_wait;
 
 	struct sysfs_dirent *sysfs_can_clear;
-
 };
 
 /* the bitmap API */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d05bab5..3616a73 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -313,11 +313,6 @@ struct mddev_s
 							 * hot-adding a bitmap.  It should
 							 * eventually be settable by sysfs.
 							 */
-		/* When md is serving under dm, it might use a
-		 * dirty_log to store the bits.
-		 */
-		struct dm_dirty_log *log;
-
 		struct mutex		mutex;
 		unsigned long		chunksize;
 		unsigned long		daemon_sleep; /* how many jiffies between updates? */
-- 
1.7.2.3



* [PATCH v3 2/8] md/bitmap: use DIV_ROUND_UP in bitmap_init_from_disk
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 1/8] md/bitmap: revert dm-dirty-log preparation Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 3/8] md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer Mike Snitzer
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: NeilBrown <neilb@suse.de>

Keep the DIV_ROUND_UP changes from commit e384e58549a2e9a83071ad80280c1a9053cfd84c
  ("md/bitmap: prepare for storing write-intent-bitmap via dm-dirty-log"),
which the previous patch reverted along with the rest of that commit.
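
For reference, the kernel macro is defined in include/linux/kernel.h as

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

so the open-coded expressions being replaced are exactly equivalent; the
macro simply states the rounding intent directly.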

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/bitmap.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2eb51cd..f3f5edc 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -925,11 +925,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		printk(KERN_INFO "%s: bitmap file is out of date, doing full "
 			"recovery\n", bmname(bitmap));
 
-	bytes = (chunks + 7) / 8;
+	bytes = DIV_ROUND_UP(bitmap->chunks, 8);
 	if (!bitmap->mddev->bitmap_info.external)
 		bytes += sizeof(bitmap_super_t);
 
-	num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
 
 	if (file && i_size_read(file->f_mapping->host) < bytes) {
 		printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
-- 
1.7.2.3



* [PATCH v3 3/8] md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 1/8] md/bitmap: revert dm-dirty-log preparation Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 2/8] md/bitmap: use DIV_ROUND_UP in bitmap_init_from_disk Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 4/8] dm raid: skeleton raid456 target support Mike Snitzer
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: Jonathan Brassow <jbrassow@redhat.com>

With the module parameter 'start_dirty_degraded' set,
raid5_spare_active() could call sysfs_notify_dirent() with a NULL
argument (rdev->sysfs_state) when a rebuild finished.  Use
sysfs_notify_dirent_safe() to avoid the NULL pointer dereference.
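
For reference, sysfs_notify_dirent_safe() is the NULL-tolerant wrapper in
drivers/md/md.h, roughly:

	static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
	{
		if (sd)
			sysfs_notify_dirent(sd);
	}

so the notification is simply skipped when the sysfs dirent has not been
created.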

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/raid5.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc574f3..3219fd9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5339,7 +5339,7 @@ static int raid5_spare_active(mddev_t *mddev)
 		    && !test_bit(Faulty, &tmp->rdev->flags)
 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
-			sysfs_notify_dirent(tmp->rdev->sysfs_state);
+			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
 		}
 	}
 	spin_lock_irqsave(&conf->device_lock, flags);
-- 
1.7.2.3



* [PATCH v3 4/8] dm raid: skeleton raid456 target support
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
                   ` (2 preceding siblings ...)
  2010-12-21  2:37 ` [PATCH v3 3/8] md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 5/8] dm: introduce target callbacks and congestion callback Mike Snitzer
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: NeilBrown <neilb@suse.de>

This patch is the skeleton for the DM target that will be the bridge
from DM to MD (initially RAID456 and later RAID1).  It provides a way
to access the MD RAID456 drivers through device-mapper interfaces.
Additional changes are still needed to provide a fully functional DM
target.

As with all device-mapper targets, the nominal public interfaces are the
constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
and STATUSTYPE_TABLE).  The CTR table looks like the following:

1: <s> <l> raid \
2:	<raid_type> <#raid_params> <raid_params> \
3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>

Line 1 contains the standard first three arguments to any device-mapper
target - the start, length, and target type fields.  The target type in
this case is "raid".

Line 2 contains the arguments that define the particular raid
type/personality/level, the required arguments for that raid type, and
any optional arguments.  Possible raid types include: raid4, raid5_la,
raid5_ra, raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (Again,
raid1 is planned for the future.)  The list of required and optional parameters
is the same for all the current raid types.  The required parameters are
positional, while the optional parameters are given as key/value pairs.
The possible parameters are as follows:
 <chunk_size>		Chunk size in sectors.
 [[no]sync]		Force/Prevent RAID initialization
 [rebuild <idx>]	Rebuild the drive indicated by the index
 [daemon_sleep <ms>]	Time between bitmap daemon work to clear bits
 [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 [max_write_behind <value>]		See '-write-behind=' (man mdadm)
 [stripe_cache <sectors>]		Stripe cache size for higher RAIDs

Line 3 contains the list of devices that compose the array in
metadata/data device pairs.  If the metadata is stored separately, a '-'
is given for the metadata device position.  If a drive has failed or is
missing at creation time, a '-' can be given for both the metadata and
data drives for a given position.

Examples:
# RAID4 - 4 data drives, 1 parity
# No metadata devices specified to hold superblock/bitmap info
# Chunk size of 1MiB
# (Lines separated for easy reading)
0 1960893648 raid \
	raid4 1 2048 \
	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81

# RAID4 - 4 data drives, 1 parity (no metadata devices)
# Chunk size of 1MiB, force RAID initialization,
#	min recovery rate at 20 kiB/sec/disk
0 1960893648 raid \
	raid4 4 2048 min_recovery_rate 20 sync \
	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81

Performing a 'dmsetup table' should display the CTR table used to
construct the mapping (with possible reordering of optional
parameters).

Performing a 'dmsetup status' will yield information on the state and
health of the array.  The output is as follows:
1: <s> <l> raid \
2:	<raid_type> <#devices> <1 health char for each dev> <resync_ratio>

Line 1 is standard DM output.  Line 2 is best shown by example:
	0 1960893648 raid raid4 5 AAAAA 2/490221568
Here we can see that the RAID type is raid4 and that there are 5
devices, all of which are 'A'live; the array's resync/recovery is
2/490221568 complete.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/Kconfig   |   23 ++
 drivers/md/Makefile  |    1 +
 drivers/md/dm-raid.c |  585 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 609 insertions(+), 0 deletions(-)
 create mode 100644 drivers/md/dm-raid.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e..386501f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,29 @@ config DM_MIRROR
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
+config DM_RAID
+       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       select MD_RAID456
+       ---help---
+	 A dm target that supports RAID4, RAID5 and RAID6 mappings
+
+	 A RAID-5 set of N drives with a capacity of C MB per drive provides
+	 the capacity of C * (N - 1) MB, and protects against a failure
+	 of a single drive. For a given sector (row) number, (N - 1) drives
+	 contain data sectors, and one drive contains the parity protection.
+	 For a RAID-4 set, the parity blocks are present on a single drive,
+	 while a RAID-5 set distributes the parity across the drives in one
+	 of the available parity distribution methods.
+
+	 A RAID-6 set of N drives with a capacity of C MB per drive
+	 provides the capacity of C * (N - 2) MB, and protects
+	 against a failure of any two drives. For a given sector
+	 (row) number, (N - 2) drives contain data sectors, and two
+	 drives contain two independent redundancy syndromes.  Like
+	 RAID-5, RAID-6 distributes the syndromes across the drives
+	 in one of the available parity distribution methods.
+
 config DM_LOG_USERSPACE
 	tristate "Mirror userspace logging (EXPERIMENTAL)"
 	depends on DM_MIRROR && EXPERIMENTAL && NET
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac4..d013860 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_DM_RAID)	+= dm-raid.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 0000000..edcccd8
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,585 @@
+/*
+ * Copyright (C) 2010 Neil Brown
+ * Copyright (C) 2010 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/slab.h>
+
+#include "md.h"
+#include "raid5.h"
+#include "dm.h"
+#include "bitmap.h"
+
+#define DM_MSG_PREFIX "raid"
+
+/*
+ * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
+ * make it so the flag doesn't set anything.
+ */
+#ifndef MD_SYNC_STATE_FORCED
+#define MD_SYNC_STATE_FORCED 0
+#endif
+
+struct raid_dev {
+	/*
+	 * Two DM devices, one to hold metadata and one to hold the
+	 * actual data/parity.  The reason for this is to not confuse
+	 * ti->len and give more flexibility in altering size and
+	 * characteristics.
+	 *
+	 * While it is possible for this device to be associated
+	 * with a different physical device than the data_dev, it
+	 * is intended for it to be the same.
+	 *    |--------- Physical Device ---------|
+	 *    |- meta_dev -|------ data_dev ------|
+	 */
+	struct dm_dev *meta_dev;
+	struct dm_dev *data_dev;
+	struct mdk_rdev_s rdev;
+};
+
+/*
+ * Flags for rs->print_flags field.
+ */
+#define DMPF_DAEMON_SLEEP      0x1
+#define DMPF_MAX_WRITE_BEHIND  0x2
+#define DMPF_SYNC              0x4
+#define DMPF_NOSYNC            0x8
+#define DMPF_STRIPE_CACHE      0x10
+#define DMPF_MIN_RECOVERY_RATE 0x20
+#define DMPF_MAX_RECOVERY_RATE 0x40
+
+struct raid_set {
+	struct dm_target *ti;
+
+	uint64_t print_flags;
+
+	struct mddev_s md;
+	struct raid_type *raid_type;
+
+	struct raid_dev dev[0];
+};
+
+/* Supported raid types and properties. */
+static struct raid_type {
+	const char *name;		/* RAID algorithm. */
+	const char *descr;		/* Descriptor text for logging. */
+	const unsigned parity_devs;	/* # of parity devices. */
+	const unsigned minimal_devs;	/* minimal # of devices in set. */
+	const unsigned level;		/* RAID level. */
+	const unsigned algorithm;	/* RAID algorithm. */
+} raid_types[] = {
+	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
+	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid5_ls", "RAID5 (left symmetric)",		1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs", "RAID5 (right symmetric)",		1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid6_zr", "RAID6 (zero restart)",		2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
+	{"raid6_nr", "RAID6 (N restart)",		2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+static struct raid_type *get_raid_type(char *name)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
+		if (strcmp(raid_types[i].name, name) == 0)
+			return &raid_types[i];
+	return NULL;
+}
+
+static struct raid_set *
+context_alloc(struct dm_target *ti, struct raid_type *raid_type, long raid_devs)
+{
+	int i;
+	struct raid_set *rs;
+	sector_t sectors_per_dev;
+
+	sectors_per_dev = ti->len;
+	if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+		ti->error = "Target length not divisible by number of data devices";
+		return ERR_PTR(-EINVAL);
+	}
+
+	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
+	if (!rs) {
+		ti->error = "Cannot allocate raid context";
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mddev_init(&rs->md);
+
+	rs->ti = ti;
+	rs->raid_type = raid_type;
+	rs->md.raid_disks = raid_devs;
+	rs->md.level = raid_type->level;
+	rs->md.new_level = rs->md.level;
+	rs->md.dev_sectors = sectors_per_dev;
+	rs->md.layout = raid_type->algorithm;
+	rs->md.new_layout = rs->md.layout;
+	rs->md.delta_disks = 0;
+	rs->md.recovery_cp = 0;
+
+	for (i = 0; i < raid_devs; i++)
+		md_rdev_init(&rs->dev[i].rdev);
+
+	/*
+	 * Remaining items to be initialized by further RAID params:
+	 *  rs->md.persistent
+	 *  rs->md.external
+	 *  rs->md.chunk_sectors
+	 *  rs->md.new_chunk_sectors
+	 */
+
+	return rs;
+}
+
+static void context_free(struct raid_set *rs)
+{
+	int i;
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		if (rs->dev[i].data_dev)
+			dm_put_device(rs->ti, rs->dev[i].data_dev);
+	}
+	kfree(rs);
+}
+
+/*
+ * For every device we have two words
+ *  <meta_dev>: meta device name or '-' if missing
+ *  <data_dev>: data device name or '-' if missing
+ *
+ * This code parses those words.
+ */
+static int dev_parms(struct raid_set *rs, char **argv)
+{
+	int i;
+	int rebuild = 0;
+	int metadata_available = 0;
+
+	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+		int err = 0;
+
+		rs->dev[i].rdev.raid_disk = i;
+
+		rs->dev[i].meta_dev = NULL;
+		rs->dev[i].data_dev = NULL;
+
+		/*
+		 * There are no offsets, since there is a separate device
+		 * for data and metadata.
+		 */
+		rs->dev[i].rdev.data_offset = 0;
+		rs->dev[i].rdev.mddev = &rs->md;
+
+		if (strcmp(argv[0], "-") != 0) {
+			rs->ti->error = "Metadata devices not supported";
+			return -EINVAL;
+		}
+
+		if (strcmp(argv[1], "-") == 0) {
+			rs->ti->error = "Drive designated for rebuild not specified";
+			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+			    (rs->dev[i].rdev.recovery_offset == 0))
+				return -EINVAL;
+
+			continue;
+		}
+
+		err = dm_get_device(rs->ti, argv[1],
+				    dm_table_get_mode(rs->ti->table),
+				    &rs->dev[i].data_dev);
+		rs->ti->error = "RAID device lookup failure";
+		if (err)
+			return err;
+
+		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
+		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+			rebuild++;
+	}
+
+	if (metadata_available) {
+		rs->md.external = 0;
+		rs->md.persistent = 1;
+		rs->md.major_version = 2;
+	} else if (rebuild && !rs->md.recovery_cp) {
+		/*
+		 * Without metadata, we will not be able to tell if the array
+		 * is in-sync or not - we must assume it is not.  Therefore,
+		 * it is impossible to rebuild a drive.
+		 *
+		 * Even if there is metadata, the on-disk information may
+		 * indicate that the array is not in-sync and it will then
+		 * fail at that time.
+		 *
+		 * User could specify 'nosync' option if desperate.
+		 */
+		DMERR("Unable to rebuild drive while array is not in-sync");
+		rs->ti->error = "RAID device lookup failure";
+		return -EINVAL;
+	}
+
+	rs->ti->error = NULL;
+	return 0;
+}
+
+/*
+ * Possible arguments are...
+ * RAID456:
+ *	<chunk_size> [optional_args]
+ *
+ * Optional args include:
+ *	[[no]sync]		Force or prevent recovery of the entire array
+ *	[rebuild <idx>]		Rebuild the drive indicated by the index
+ *	[daemon_sleep <ms>]	Time between bitmap daemon work to clear bits
+ *	[min_recovery_rate <kB/sec/disk>]
+ *	[max_recovery_rate <kB/sec/disk>]
+ *	[max_write_behind <sectors>]
+ *	[stripe_cache <sectors>]
+ */
+static int parse_raid_params(struct raid_set *rs, char **argv,
+			     unsigned long num_raid_params)
+{
+	int i, rebuild_cnt = 0;
+	unsigned long value;
+	char *key;
+	char *reason = NULL;
+	struct mddev_s *mddev = &rs->md;
+
+	/*
+	 * First, parse the in-order required arguments
+	 */
+	rs->ti->error = "Bad chunk size";
+	if ((strict_strtoul(argv[0], 10, &value) < 0) ||
+	    !is_power_of_2(value) || (value < 8))
+		return -EINVAL;
+	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
+	argv++;
+	num_raid_params--;
+
+	/*
+	 * Second, parse the unordered optional arguments
+	 */
+	for (i = 0; i < rs->md.raid_disks; i++)
+		set_bit(In_sync, &rs->dev[i].rdev.flags);
+
+	for (i = 0; i < num_raid_params; i++) {
+		if (!strcmp(argv[i], "nosync")) {
+			rs->md.recovery_cp = MaxSector;
+			rs->print_flags |= DMPF_NOSYNC;
+			rs->md.flags |= MD_SYNC_STATE_FORCED;
+			continue;
+		}
+		if (!strcmp(argv[i], "sync")) {
+			rs->md.recovery_cp = 0;
+			rs->print_flags |= DMPF_SYNC;
+			rs->md.flags |= MD_SYNC_STATE_FORCED;
+			continue;
+		}
+
+		/* The rest of the optional arguments come in key/value pairs */
+		if ((i + 1) >= num_raid_params) {
+			reason = "Wrong number of raid parameters given";
+			goto out;
+		}
+		key = argv[i];
+		i++;
+		if (strict_strtoul(argv[i], 10, &value) < 0) {
+			reason = "Bad numerical argument given in raid params";
+			goto out;
+		}
+
+		if (!strcmp(key, "rebuild")) {
+			if (++rebuild_cnt > rs->raid_type->parity_devs)
+				reason = "Too many rebuild drives given";
+			else if ((value < 0) || (value > rs->md.raid_disks))
+				reason = "Invalid rebuild index given";
+			else {
+				clear_bit(In_sync, &rs->dev[value].rdev.flags);
+				rs->dev[value].rdev.recovery_offset = 0;
+			}
+		} else if (!strcmp(key, "max_write_behind")) {
+			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+
+			/*
+			 * In device-mapper, we specify things in sectors, but
+			 * MD records this value in kB
+			 */
+			value /= 2;
+			if (value > COUNTER_MAX)
+				reason = "Max write-behind limit out of range";
+			else
+				mddev->bitmap_info.max_write_behind = value;
+		} else if (!strcmp(key, "daemon_sleep")) {
+			rs->print_flags |= DMPF_DAEMON_SLEEP;
+			if ((value < 1) || (value > MAX_SCHEDULE_TIMEOUT))
+				reason = "daemon sleep period out of range";
+			else
+				mddev->bitmap_info.daemon_sleep = value;
+		} else if (!strcmp(key, "stripe_cache")) {
+			rs->print_flags |= DMPF_STRIPE_CACHE;
+
+			/*
+			 * In device-mapper, we specify things in sectors, but
+			 * MD records this value in kB
+			 */
+			value /= 2;
+			if (rs->raid_type->level < 5)
+				reason = "Inappropriate argument, stripe_cache";
+			else if (raid5_set_cache_size(&rs->md, (int)value))
+				reason = "Bad stripe_cache size";
+		} else if (!strcmp(key, "min_recovery_rate")) {
+			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+			rs->md.sync_speed_min = (int)value;
+		} else if (!strcmp(key, "max_recovery_rate")) {
+			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+			rs->md.sync_speed_max = (int)value;
+			rs->md.flags |= MD_SYNC_STATE_FORCED;
+		} else {
+			DMERR("Unable to parse RAID parameter, %s", key);
+			reason = "Unable to parse RAID parameters";
+		}
+		if (reason)
+			goto out;
+	}
+
+	/* Assume there are no metadata devices until the drives are parsed */
+	rs->md.persistent = 0;
+	rs->md.external = 1;
+
+out:
+	rs->ti->error = reason;
+	return (reason) ? -EINVAL : 0;
+}
+
+static void do_table_event(struct work_struct *ws)
+{
+	struct raid_set *rs = container_of(ws, struct raid_set,
+					   md.event_work);
+	dm_table_event(rs->ti->table);
+}
+
+/*
+ * Construct a RAID4/5/6 mapping:
+ * Args:
+ *	<raid_type> <#raid_params> <raid_params>		\
+ *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
+ *
+ * ** metadata devices are not supported yet, use '-' instead **
+ *
+ * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
+ * details on possible <raid_params>.
+ */
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	char *err = NULL;
+	int errnum = -EINVAL;
+	struct raid_type *rt;
+	unsigned long num_raid_params, num_raid_devs;
+	struct raid_set *rs = NULL;
+
+	/* Must have at least <raid_type> <#raid_params> */
+	err = "Too few arguments";
+	if (argc < 2)
+		goto err;
+
+	/* raid type */
+	err = "Cannot find raid_type";
+	rt = get_raid_type(argv[0]);
+	if (!rt)
+		goto err;
+	argc--; argv++;
+
+	/* number of RAID parameters */
+	err = "Cannot understand number of RAID parameters";
+	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0)
+		goto err;
+	argc--; argv++;
+
+	/* Skip over RAID params for now and find out # of devices */
+	err = "Arguments do not agree with counts given";
+	if (num_raid_params + 1 > argc)
+		goto err;
+
+	err = "Bad number of raid devices";
+	if (strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0)
+		goto err;
+
+	rs = context_alloc(ti, rt, num_raid_devs);
+	if (IS_ERR(rs))
+		return PTR_ERR(rs);
+
+	errnum = parse_raid_params(rs, argv, num_raid_params);
+	if (errnum) {
+		err = ti->error;
+		goto err;
+	}
+	errnum = -EINVAL;
+
+	argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+	argv += num_raid_params + 1;
+
+	err = "Supplied RAID devices does not match the count given";
+	if (argc != (num_raid_devs * 2))
+		goto err;
+
+	errnum = dev_parms(rs, argv);
+	if (errnum) {
+		err = ti->error;
+		goto err;
+	}
+
+	INIT_WORK(&rs->md.event_work, do_table_event);
+	ti->split_io = rs->md.chunk_sectors;
+	ti->private = rs;
+
+	mutex_lock(&rs->md.reconfig_mutex);
+	err = "Fail to run raid array";
+	errnum = md_run(&rs->md);
+	rs->md.in_sync = 0; /* Assume already marked dirty */
+	mutex_unlock(&rs->md.reconfig_mutex);
+
+	if (!errnum)
+		return 0;
+
+err:
+	if (rs)
+		context_free(rs);
+	ti->error = err;
+	return errnum;
+}
+
+static void raid_dtr(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	md_stop(&rs->md);
+	context_free(rs);
+}
+
+static int raid_map(struct dm_target *ti, struct bio *bio,
+		    union map_info *map_context)
+{
+	struct raid_set *rs = ti->private;
+	mddev_t *mddev = &rs->md;
+
+	mddev->pers->make_request(mddev, bio);
+	return DM_MAPIO_SUBMITTED;
+}
+
+static int raid_status(struct dm_target *ti, status_type_t type,
+		       char *result, unsigned maxlen)
+{
+	struct raid_set *rs = ti->private;
+	int sz = 0;
+	int raid_param_cnt = 1; /* at least 1 for chunksize */
+	int i;
+	sector_t sync;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
+				DMEMIT("D");
+			else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
+				DMEMIT("A");
+			else
+				DMEMIT("a");
+		}
+		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+			sync = rs->md.curr_resync_completed;
+		else
+			sync = rs->md.recovery_cp;
+		if (sync > rs->md.resync_max_sectors)
+			sync = rs->md.resync_max_sectors;
+		DMEMIT(" %llu/%llu",
+		       (unsigned long long) sync,
+		       (unsigned long long) rs->md.resync_max_sectors);
+
+		break;
+	case STATUSTYPE_TABLE:
+		/* The string you would use to construct this array */
+		for (i = 0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].data_dev &&
+			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
+				raid_param_cnt++; /* for rebuilds */
+
+		raid_param_cnt += (hweight64(rs->print_flags) * 2);
+		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+			raid_param_cnt--;
+
+		DMEMIT("%s %d %u", rs->raid_type->name,
+		       raid_param_cnt, rs->md.chunk_sectors);
+
+		for (i = 0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].data_dev &&
+			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
+				DMEMIT(" rebuild=%u", i);
+
+		if (rs->print_flags & DMPF_DAEMON_SLEEP)
+			DMEMIT(" daemon_sleep %lu",
+			       rs->md.bitmap_info.daemon_sleep);
+		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+			DMEMIT(" max_write_behind %lu",
+			       rs->md.bitmap_info.max_write_behind);
+		if (rs->print_flags & DMPF_STRIPE_CACHE)
+			DMEMIT(" stripe_cache <unknown>");
+		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
+		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
+		if ((rs->print_flags & DMPF_SYNC) &&
+		    (rs->md.recovery_cp == MaxSector))
+			DMEMIT(" sync");
+		if (rs->print_flags & DMPF_NOSYNC)
+			DMEMIT(" nosync");
+
+		DMEMIT(" %d", rs->md.raid_disks);
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			DMEMIT(" -"); /* metadata device */
+
+			if (rs->dev[i].data_dev)
+				DMEMIT(" %s", rs->dev[i].data_dev->name);
+			else
+				DMEMIT(" -");
+		}
+		break;
+	}
+	return 0;
+}
+
+static struct target_type raid_target = {
+	.name = "raid",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = raid_ctr,
+	.dtr = raid_dtr,
+	.map = raid_map,
+	.status = raid_status,
+};
+
+static int __init dm_raid_init(void)
+{
+	int r = dm_register_target(&raid_target);
+
+	return r;
+}
+
+static void __exit dm_raid_exit(void)
+{
+	dm_unregister_target(&raid_target);
+}
+
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
-- 
1.7.2.3



* [PATCH v3 5/8] dm: introduce target callbacks and congestion callback
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
                   ` (3 preceding siblings ...)
  2010-12-21  2:37 ` [PATCH v3 4/8] dm raid: skeleton raid456 target support Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 6/8] dm: per-target unplug callback support Mike Snitzer
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: NeilBrown <neilb@suse.de>

DM currently implements congestion checking by checking for congestion
in each component device.  For raid456 we also need to check whether the
stripe cache is congested.

Add per-target congestion checker callback support and establish the
raid_is_congested() callback for dm-raid.

Extending the target_callbacks structure with additional callback
functions allows for establishing multiple callbacks per-target (a
callback is also needed for unplug).
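
To illustrate the new hook, a minimal sketch follows; "example_set" and
"example_is_congested" are made-up names used only for illustration (the
real user in this series is dm-raid, see the diff below).  A target
embeds the callback structure in its private context and registers it
from its constructor:

	struct example_set {
		struct dm_target *ti;
		struct target_callbacks callbacks;
		/* ... other target-private state ... */
	};

	static int example_is_congested(void *congested_data, int bdi_bits)
	{
		struct target_callbacks *cb = congested_data;
		struct example_set *es = container_of(cb, struct example_set,
						      callbacks);

		/* report congestion of the target's internal resources */
		return 0;
	}

	/* in the target's .ctr, once ti->table is known: */
	es->callbacks.congested_fn = example_is_congested;
	dm_table_add_callbacks(ti->table, &es->callbacks);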

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c          |   18 ++++++++++++++++--
 drivers/md/dm-table.c         |   15 +++++++++++++++
 include/linux/device-mapper.h |   12 ++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index edcccd8..962f88f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -58,6 +58,7 @@ struct raid_set {
 
 	struct mddev_s md;
 	struct raid_type *raid_type;
+	struct target_callbacks callbacks;
 
 	struct raid_dev dev[0];
 };
@@ -363,6 +364,13 @@ static void do_table_event(struct work_struct *ws)
 	dm_table_event(rs->ti->table);
 }
 
+static int raid_is_congested(void *v, int bits)
+{
+	struct target_callbacks *cb = v;
+	struct raid_set *rs = container_of(cb, struct raid_set,
+					   callbacks);
+	return md_raid5_congested(&rs->md, bits);
+}
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
@@ -443,8 +451,13 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	rs->md.in_sync = 0; /* Assume already marked dirty */
 	mutex_unlock(&rs->md.reconfig_mutex);
 
-	if (!errnum)
-		return 0;
+	if (errnum)
+		goto err;
+
+	rs->callbacks.congested_fn = raid_is_congested;
+	dm_table_add_callbacks(ti->table, &rs->callbacks);
+
+	return 0;
 
 err:
 	if (rs)
@@ -457,6 +470,7 @@ static void raid_dtr(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
+	list_del_init(&rs->callbacks.list);
 	md_stop(&rs->md);
 	context_free(rs);
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 90267f8..88df831 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -71,6 +71,8 @@ struct dm_table {
 	void *event_context;
 
 	struct dm_md_mempools *mempools;
+
+	struct list_head target_callbacks;
 };
 
 /*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&t->devices);
+	INIT_LIST_HEAD(&t->target_callbacks);
 	atomic_set(&t->holders, 0);
 	t->discards_supported = 1;
 
@@ -1229,10 +1232,18 @@ int dm_table_resume_targets(struct dm_table *t)
 	return 0;
 }
 
+void dm_table_add_callbacks(struct dm_table *t,
+			    struct target_callbacks *cb)
+{
+	list_add(&cb->list, &t->target_callbacks);
+}
+EXPORT_SYMBOL_GPL(dm_table_add_callbacks);
+
 int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 {
 	struct dm_dev_internal *dd;
 	struct list_head *devices = dm_table_get_devices(t);
+	struct target_callbacks *cb;
 	int r = 0;
 
 	list_for_each_entry(dd, devices, list) {
@@ -1247,6 +1258,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 				     bdevname(dd->dm_dev.bdev, b));
 	}
 
+	list_for_each_entry(cb, &t->target_callbacks, list)
+		if (cb->congested_fn)
+			r |= cb->congested_fn(cb, bdi_bits);
+
 	return r;
 }
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 2970022..77e2e78 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -193,6 +193,12 @@ struct dm_target {
 	char *error;
 };
 
+/* Each target can link one of these into the table */
+struct target_callbacks {
+	struct list_head list;
+	congested_fn *congested_fn;
+};
+
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);
 
@@ -269,6 +275,12 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 			sector_t start, sector_t len, char *params);
 
 /*
+ * Target_ctr should call this if they need to add any
+ * callback
+ */
+void dm_table_add_callbacks(struct dm_table *t,
+			    struct target_callbacks *cb);
+/*
  * Finally call this to make the table ready for use.
  */
 int dm_table_complete(struct dm_table *t);
-- 
1.7.2.3



* [PATCH v3 6/8] dm: per-target unplug callback support
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
                   ` (4 preceding siblings ...)
  2010-12-21  2:37 ` [PATCH v3 5/8] dm: introduce target callbacks and congestion callback Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 7/8] dm raid: add iterate_devices and io_hints functions Mike Snitzer
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: NeilBrown <neilb@suse.de>

Add per-target unplug callback support and establish the raid_unplug()
callback for dm-raid.
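
As a sketch (reusing the hypothetical "example_set" naming from the
previous patch; the real user is dm-raid, see the diff below), a target
providing an unplug hook looks like:

	static void example_unplug(void *data)
	{
		struct target_callbacks *cb = data;
		struct example_set *es = container_of(cb, struct example_set,
						      callbacks);

		/* use 'es' to kick internal queues so deferred I/O is issued */
	}

	/* registered next to the congestion callback in the target's .ctr: */
	es->callbacks.unplug_fn = example_unplug;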

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c          |   10 ++++++++++
 drivers/md/dm-table.c         |    4 ++++
 include/linux/device-mapper.h |    1 +
 3 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 962f88f..4db3eab 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -371,6 +371,15 @@ static int raid_is_congested(void *v, int bits)
 					   callbacks);
 	return md_raid5_congested(&rs->md, bits);
 }
+
+static void raid_unplug(void *v)
+{
+	struct target_callbacks *cb = v;
+	struct raid_set *rs = container_of(cb, struct raid_set,
+					   callbacks);
+	md_raid5_unplug_device(rs->md.private);
+}
+
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
@@ -455,6 +464,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto err;
 
 	rs->callbacks.congested_fn = raid_is_congested;
+	rs->callbacks.unplug_fn = raid_unplug;
 	dm_table_add_callbacks(ti->table, &rs->callbacks);
 
 	return 0;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 88df831..bd4c4ef 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1283,6 +1283,7 @@ void dm_table_unplug_all(struct dm_table *t)
 {
 	struct dm_dev_internal *dd;
 	struct list_head *devices = dm_table_get_devices(t);
+	struct target_callbacks *cb;
 
 	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1295,6 +1296,9 @@ void dm_table_unplug_all(struct dm_table *t)
 				     dm_device_name(t->md),
 				     bdevname(dd->dm_dev.bdev, b));
 	}
+	list_for_each_entry(cb, &t->target_callbacks, list)
+		if (cb->unplug_fn)
+			cb->unplug_fn(cb);
 }
 
 struct mapped_device *dm_table_get_md(struct dm_table *t)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 77e2e78..92511d2f 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -197,6 +197,7 @@ struct dm_target {
 struct target_callbacks {
 	struct list_head list;
 	congested_fn *congested_fn;
+	void (*unplug_fn)(void *);
 };
 
 int dm_register_target(struct target_type *t);
-- 
1.7.2.3



* [PATCH v3 7/8] dm raid: add iterate_devices and io_hints functions
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
                   ` (5 preceding siblings ...)
  2010-12-21  2:37 ` [PATCH v3 6/8] dm: per-target unplug callback support Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  2:37 ` [PATCH v3 8/8] dm raid: add suspend and resume functions Mike Snitzer
  2010-12-21  3:14 ` [PATCH v3 0/8] dm-raid (raid456) target Neil Brown
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: NeilBrown <neilb@suse.de>

Add an .iterate_devices function, which should be implemented by all DM
targets.

Also add an .io_hints function, because the topology information it
provides (minimum and optimal I/O sizes derived from the chunk size and
stripe width) is particularly important for a raid device.
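
For context, .iterate_devices lets dm core visit every underlying device
of a target (it is used, among other things, to build and validate the
stacked queue limits).  A minimal sketch for a hypothetical
single-device target follows ("example_ctx" is a made-up name; dm-raid's
multi-device version is in the diff below):

	struct example_ctx {		/* hypothetical per-target context */
		struct dm_dev *dev;
		sector_t start;
	};

	static int example_iterate_devices(struct dm_target *ti,
					   iterate_devices_callout_fn fn,
					   void *data)
	{
		struct example_ctx *ec = ti->private;

		/* invoke the callout once per underlying device */
		return fn(ti, ec->dev, ec->start, ti->len, data);
	}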

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c |   33 +++++++++++++++++++++++++++++++++
 1 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 4db3eab..6494833 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -577,6 +577,37 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int raid_iterate_devices(struct dm_target *ti,
+				iterate_devices_callout_fn fn,
+				void *data)
+{
+	struct raid_set *rs = ti->private;
+	int ret = 0;
+	unsigned i = 0;
+
+	for (i = 0; !ret && i < rs->md.raid_disks; i++)
+		if (rs->dev[i].data_dev)
+			ret = fn(ti,
+				 rs->dev[i].data_dev,
+				 0, /* No offset on data devs */
+				 rs->md.dev_sectors,
+				 data);
+
+	return ret;
+}
+
+static void raid_io_hints(struct dm_target *ti,
+			  struct queue_limits *limits)
+{
+	struct raid_set *rs = ti->private;
+	unsigned chunk_size = rs->md.chunk_sectors << 9;
+	raid5_conf_t *conf = rs->md.private;
+
+	blk_limits_io_min(limits, chunk_size);
+	blk_limits_io_opt(limits, chunk_size *
+			  (conf->raid_disks - conf->max_degraded));
+}
+
 static struct target_type raid_target = {
 	.name = "raid",
 	.version = {1, 0, 0},
@@ -585,6 +616,8 @@ static struct target_type raid_target = {
 	.dtr = raid_dtr,
 	.map = raid_map,
 	.status = raid_status,
+	.iterate_devices = raid_iterate_devices,
+	.io_hints = raid_io_hints,
 };
 
 static int __init dm_raid_init(void)
-- 
1.7.2.3



* [PATCH v3 8/8] dm raid: add suspend and resume functions
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
                   ` (6 preceding siblings ...)
  2010-12-21  2:37 ` [PATCH v3 7/8] dm raid: add iterate_devices and io_hints functions Mike Snitzer
@ 2010-12-21  2:37 ` Mike Snitzer
  2010-12-21  3:14 ` [PATCH v3 0/8] dm-raid (raid456) target Neil Brown
  8 siblings, 0 replies; 19+ messages in thread
From: Mike Snitzer @ 2010-12-21  2:37 UTC (permalink / raw)
  To: jbrassow, neilb; +Cc: agk, dm-devel, linux-raid

From: NeilBrown <neilb@suse.de>

Add the .presuspend, .postsuspend and .resume functions a device-mapper
target needs so that the MD array is properly quiesced and restarted
across dm suspend/resume cycles.
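
For context, device-mapper invokes these hooks as presuspend ->
postsuspend when suspending (with I/O flushed in between) and resume
when resuming; here they simply map onto the corresponding MD calls
(a sketch of the intent, not additional code):

	presuspend  -> md_stop_writes()   /* quiesce resync/recovery activity */
	postsuspend -> mddev_suspend()    /* block new I/O, wait for in-flight I/O */
	resume      -> mddev_resume()     /* let I/O flow again */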

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c |   22 ++++++++++++++++++++++
 1 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6494833..91770de 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -608,6 +608,25 @@ static void raid_io_hints(struct dm_target *ti,
 			  (conf->raid_disks - conf->max_degraded));
 }
 
+static void raid_presuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	md_stop_writes(&rs->md);
+}
+
+static void raid_postsuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	mddev_suspend(&rs->md);
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	mddev_resume(&rs->md);
+}
+
 static struct target_type raid_target = {
 	.name = "raid",
 	.version = {1, 0, 0},
@@ -618,6 +637,9 @@ static struct target_type raid_target = {
 	.status = raid_status,
 	.iterate_devices = raid_iterate_devices,
 	.io_hints = raid_io_hints,
+	.presuspend = raid_presuspend,
+	.postsuspend = raid_postsuspend,
+	.resume = raid_resume,
 };
 
 static int __init dm_raid_init(void)
-- 
1.7.2.3



* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
                   ` (7 preceding siblings ...)
  2010-12-21  2:37 ` [PATCH v3 8/8] dm raid: add suspend and resume functions Mike Snitzer
@ 2010-12-21  3:14 ` Neil Brown
  2011-01-05 22:36   ` Mike Snitzer
  8 siblings, 1 reply; 19+ messages in thread
From: Neil Brown @ 2010-12-21  3:14 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: linux-raid, dm-devel, agk

On Mon, 20 Dec 2010 21:37:37 -0500 Mike Snitzer <snitzer@redhat.com> wrote:

> I reviewed all patches and adjusted patch headers in preparation for
> inclusion in Alasdair's tree (upstream target being v2.6.38).  The 2
> md/bitmap changes can obviously go through Neil's MD tree if that is
> preferred.
> 
> - added Copyright to the top of dm-raid.c
> - fixed drivers/md/Kconfig so that DM_RAID will select MD_RAID456
> - removed unused 'conf' variable in raid_status()
> - few misc style and whitespace changes
> 
> Neil,
> Hopefully you're comfortable with the authorship of most of these
> patches still being attributed to you (despite Jon's changes to your
> original patches).
> 
> Please advise, thanks!

I haven't had a chance to have a proper look yet, and I'm about to go on
leave for a couple of weeks.
I'm probably happy with the authorship remaining as-is, but I reserve the
right to change my mind (about anything) once I've had a chance to have a
proper look.

Thanks,
NeilBrown


> Mike
> 
> Jonathan Brassow (2):
>   md/bitmap: revert dm-dirty-log preparation
>   md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer
> 
> NeilBrown (6):
>   md/bitmap: use DIV_ROUND_UP in bitmap_init_from_disk
>   dm raid: skeleton raid456 target support
>   dm: introduce target callbacks and congestion fn
>   dm: unplug callback
>   dm raid: add iterate_devices and io_hints functions
>   dm raid: add suspend and resume functions
> 
>  drivers/md/Kconfig            |   23 ++
>  drivers/md/Makefile           |    1 +
>  drivers/md/bitmap.c           |  139 +++------
>  drivers/md/bitmap.h           |    5 -
>  drivers/md/dm-raid.c          |  664 +++++++++++++++++++++++++++++++++++++++++
>  drivers/md/dm-table.c         |   19 ++
>  drivers/md/md.h               |    5 -
>  drivers/md/raid5.c            |    2 +-
>  include/linux/device-mapper.h |   13 +
>  9 files changed, 766 insertions(+), 105 deletions(-)
>  create mode 100644 drivers/md/dm-raid.c
> 


* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2010-12-21  3:14 ` [PATCH v3 0/8] dm-raid (raid456) target Neil Brown
@ 2011-01-05 22:36   ` Mike Snitzer
  2011-01-06 10:46     ` NeilBrown
  0 siblings, 1 reply; 19+ messages in thread
From: Mike Snitzer @ 2011-01-05 22:36 UTC (permalink / raw)
  To: Neil Brown; +Cc: jbrassow, agk, dm-devel, linux-raid

On Mon, Dec 20 2010 at 10:14pm -0500,
Neil Brown <neilb@suse.de> wrote:

> On Mon, 20 Dec 2010 21:37:37 -0500 Mike Snitzer <snitzer@redhat.com> wrote:
> 
> > I reviewed all patches and adjusted patch headers in preparation for
> > inclusion in Alasdair's tree (upstream target being v2.6.38).  The 2
> > md/bitmap changes can obviously go through Neil's MD tree if that is
> > preferred.
> > 
> > - added Copyright to the top of dm-raid.c
> > - fixed drivers/md/Kconfig so that DM_RAID will select MD_RAID456
> > - removed unused 'conf' variable in raid_status()
> > - few misc style and whitespace changes
> > 
> > Neil,
> > Hopefully you're comfortable with the authorship of most of these
> > patches still being attributed to you (despite Jon's changes to your
> > original patches).
> > 
> > Please advise, thanks!
> 
> I haven't had a chance to have a proper look yet, and I'm about to go on
> leave for a couple of weeks.
> I'm probably happy with the authorship remaining as-is, but I reserve the
> right to change my mind (about anything) once I've had a chance to have a
> proper look.

Hi Neil,

Please advise on how you'd like things to proceed with this initial 8
patch set.  Ideally we'd get them in to linux-next now and ultimately
2.6.38 before the merge window closes.

Thanks,
Mike


* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-05 22:36   ` Mike Snitzer
@ 2011-01-06 10:46     ` NeilBrown
  2011-01-06 14:43       ` Jonathan Brassow
                         ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: NeilBrown @ 2011-01-06 10:46 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: jbrassow, agk, dm-devel, linux-raid

On Wed, 5 Jan 2011 17:36:29 -0500 Mike Snitzer <snitzer@redhat.com> wrote:

> On Mon, Dec 20 2010 at 10:14pm -0500,
> Neil Brown <neilb@suse.de> wrote:
> 
> > On Mon, 20 Dec 2010 21:37:37 -0500 Mike Snitzer <snitzer@redhat.com> wrote:
> > 
> > > I reviewed all patches and adjusted patch headers in preparation for
> > > inclusion in Alasdair's tree (upstream target being v2.6.38).  The 2
> > > md/bitmap changes can obviously go through Neil's MD tree if that is
> > > preferred.
> > > 
> > > - added Copyright to the top of dm-raid.c
> > > - fixed drivers/md/Kconfig so that DM_RAID will select MD_RAID456
> > > - removed unused 'conf' variable in raid_status()
> > > - few misc style and whitespace changes
> > > 
> > > Neil,
> > > Hopefully you're comfortable with the authorship of most of these
> > > patches still being attributed to you (despite Jon's changes to your
> > > original patches).
> > > 
> > > Please advise, thanks!
> > 
> > I haven't had a chance to have a proper look yet, and I'm about to go on
> > leave for a couple of weeks.
> > I'm probably happy with the authorship remaining as-is, but I reserve the
> > right to change my mind (about anything) once I've had a chance to have a
> > proper look.
> 
> Hi Neil,
> 
> Please advise on how you'd like things to proceed with this initial 8
> patch set.  Ideally we'd get them into linux-next now and ultimately
> 2.6.38 before the merge window closes.
> 
> Thanks,
> Mike

Sorry for long delays ... leave is always followed by piles of things that
all need to be done at once :-(

Patches 1 and 2 aren't really needed - I feel inclined not to remove that
stuff until all the dust settles..  So I'll put them at the bottom of my
queue and bring them forward in a couple of releases if that seems
appropriate.

Patch 3 I'll push out through my tree shortly.

The rest I am happy with going out through 'dm' to -next and beyond whenever
you like.

However I will take this opportunity to suggest that the raid parameters
might need a bit of review before they are finalised.

The possible parameters are as follows:
 <chunk_size>		Chunk size in sectors.
 [[no]sync]		Force/Prevent RAID initialization
 [rebuild <idx>]	Rebuild the drive indicated by the index
 [daemon_sleep <ms>]	Time between bitmap daemon work to clear bits
 [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 [max_write_behind <value>]		See '-write-behind=' (man mdadm)
 [stripe_cache <sectors>]		Stripe cache size for higher RAIDs

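(Purely as an illustration -- not taken from the patch set itself -- a
table line using one of the optional parameters might look like:

  0 3906250000 raid raid5_ls 3 64 rebuild 2 3 - 8:2 - 8:18 - 8:34

i.e. <start> <len> raid <raid_type> <#raid_params> <raid_params>
<#raid_devs> followed by <meta_dev> <dev> pairs, with '-' standing in
for an absent metadata device.  The type name, sizes and device numbers
above are invented for the example.)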

'rebuild idx' only allows one device to be rebuilt at a time.  With RAID6 it
is possible that two devices can be ready for rebuild, in which case we would
want to rebuild both of them.
Also if you stop an array during a rebuild you really want to start again
where you were up to.

The most general solution would be to combine the rebuild information with
the list of component devices - either a sector number or something to
indicate that the rebuild is complete (which could be just a v.big sector
number).

I'd much prefer 'daemon_sleep' to be in seconds, with decimals supported if
needed.

Also:

3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>

This doesn't allow offsets into the various devices like dm-stripe,
dm-linear, dm-raid1 all do.

So I suspect a bit of work is needed there.

NeilBrown

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-06 10:46     ` NeilBrown
@ 2011-01-06 14:43       ` Jonathan Brassow
  2011-01-10  0:37         ` NeilBrown
  2011-01-06 15:56       ` [dm-devel] " Phillip Susi
  2011-01-06 21:01       ` Jonathan Brassow
  2 siblings, 1 reply; 19+ messages in thread
From: Jonathan Brassow @ 2011-01-06 14:43 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, dm-devel, agk, Mike Snitzer


[-- Attachment #1.1: Type: text/plain, Size: 2774 bytes --]


On Jan 6, 2011, at 4:46 AM, NeilBrown wrote:
>
> Patches 1 and 2 aren't really needed - I feel inclined not to remove  
> that
> stuff until all the dust settles..  So I'll put them at the bottom  
> of my
> queue and bring them forward in a couple of releases if that seems
> appropriate.

No problem.

> Patch 3 I'll push out through my tree shortly.

thanks

> The rest I am happy with going out through 'dm' to -next and beyond  
> whenever
> you like.
>
> However I will take this opportunity to suggest that the raid  
> parameters
> might need a bit of review before they are finalised.
>
> The possible parameters are as follows:
> <chunk_size>		Chunk size in sectors.
> [[no]sync]		Force/Prevent RAID initialization
> [rebuild <idx>]	Rebuild the drive indicated by the index
> [daemon_sleep <ms>]	Time between bitmap daemon work to clear bits
> [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
> [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
> [max_write_behind <value>]		See '-write-behind=' (man mdadm)
> [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
>
>
> 'rebuild idx' only allows one device to be rebuilt at a time.  With  
> RAID6 it
> is possible that two devices can be ready for rebuild, in which case  
> we would
> want to rebuild both of them.

In which case, you can specify the parameter twice:
"... rebuild 0 rebuild 1..."
No problem there, I think.

> Also if you stop an array during a rebuild you really want to start  
> again
> where you were up to.

This is kept in the superblock (forthcoming patches).  In the case  
where metadata devices are not specified (or can't be specified, as in  
this initial set of patches), the user has two choices: 1) allow the  
array to completely resync, or 2) specify [no]sync.

> Most general solution would be to combine the rebuild information  
> with the
> list of component devices - either a sector number or something to  
> indicate
> that the rebuild is complete (which could be just a v.big sector  
> number).

Is there something more to this, or is the above comment suitable for  
this also?

> I'd much prefer 'daemon_sleep' to be in seconds, with decimals  
> supported if
> needed.

I had this in seconds originally, but device-mapper tables already  
have a precedent for specifying things in ms.  We don't want to have  
each target pick their own units.  (A similar thing can be said for  
values given in sectors.  We may have wanted kiB or B, but sectors is  
the standard.)
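
(One concrete instance of that precedent, purely as an illustration:
the existing dm-delay target already takes its delay argument in
milliseconds, so a table line such as

  0 409600 delay 8:16 0 100

maps 409600 sectors of 8:16 with a 100 ms delay.)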

>
> Also:
>
> 3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
>
> This doesn't allow offsets into the various devices like dm-stripe,
> dm-linear, dm-raid1 all do.

This is a conscious choice.  The offset was discarded because it is  
unneeded.

  brassow

[-- Attachment #1.2: Type: text/html, Size: 4427 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [dm-devel] [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-06 10:46     ` NeilBrown
  2011-01-06 14:43       ` Jonathan Brassow
@ 2011-01-06 15:56       ` Phillip Susi
  2011-01-06 17:37         ` Jonathan Brassow
  2011-01-06 20:59         ` [dm-devel] " Jonathan Brassow
  2011-01-06 21:01       ` Jonathan Brassow
  2 siblings, 2 replies; 19+ messages in thread
From: Phillip Susi @ 2011-01-06 15:56 UTC (permalink / raw)
  To: device-mapper development; +Cc: NeilBrown, Mike Snitzer, linux-raid, agk

On 1/6/2011 5:46 AM, NeilBrown wrote:
> 3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>

Let me get this straight.  You specify a separate device to hold the
metadata and write intent bitmap for each data device?  So for a 3 disk
raid 5, lvm will need to create two logical volumes on each of the 3
physical volumes, one of which will only be a single physical extent,
and will hold the raid metadata and write intent bitmap?

Why not just store the metadata on the main device like mdadm does today?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-06 15:56       ` [dm-devel] " Phillip Susi
@ 2011-01-06 17:37         ` Jonathan Brassow
  2011-01-06 20:59         ` [dm-devel] " Jonathan Brassow
  1 sibling, 0 replies; 19+ messages in thread
From: Jonathan Brassow @ 2011-01-06 17:37 UTC (permalink / raw)
  To: device-mapper development


On Jan 6, 2011, at 9:56 AM, Phillip Susi wrote:

> On 1/6/2011 5:46 AM, NeilBrown wrote:
>> 3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
>
> Let me get this straight.  You specify a separate device to hold the
> metadata and write intent bitmap for each data device?  So for a 3  
> disk
> raid 5, lvm will need to create two logical volumes on each of the 3
> physical volumes, one of which will only be a single physical extent,
> and will hold the raid metadata and write intent bitmap?
>
> Why not just store the metadata on the main device like mdadm does  
> today?

There is no single big reason to do things as I've proposed, just a lot  
of little reasons...

1) Device-mapper already has a few cases where metadata is kept on  
separate devices from the data (snapshots and mirror log) and no cases  
where they are kept together.  This new raid module is similar to the  
mirroring case, where bitmaps are kept separately.

2) It seems a bit funny to specify a length (second param of the  
device-mapper CTR) and then expect the devices to be larger than their  
share of that amount to accommodate metadata.  You might say it is  
funny to have to specify a separate device to hold the metadata, but I  
would again give the mirror log as an example.

3) Where multiple physical devices form a single leg/component of the  
array, the argument for having a metadata device specifically tied to  
its data device as an indivisible unit is weakened.

4) Having the metadata on a separate logical device increases the  
flexibility of its placement.  You could have it at the beginning, in  
the middle, or at the end.  (The middle might actually be preferred  
for performance reasons.)  There are no offset calculations to perform  
in the kernel that depend on metadata placement.

5) Resizing an array might require the resizing of the metadata area.   
Because the devices are separate, there is no need to move around data  
or metadata to accommodate this.  If they were mixed in the same  
device and the metadata was at the beginning, that's a problem if the  
metadata no longer fits in its area.  Likewise, if the metadata were  
at the end of a mixed device, you would have to move it when growing.   
These problems are eliminated.

6) The metadata areas are not necessary in every case.  Some raid  
controllers handle the metadata on their own (dm-raid works with  
these).  You might say it is merely another flag on the CTR line to  
indicate whether to use metadata or not.  Perhaps, but having them  
separate means you can easily convert between the two types.

7) Clustering?  Perhaps one of the weaker arguments, but having the  
metadata separate allows it to easily grow to accommodate a bitmap /  
device / node, for example.  This is really the same argument as  
easily being able to reform/resize the metadata area.

8) Bitmaps/superblocks that are updated often could be placed on  
separate devices, like SSDs, while the data is on spinning media.  I'm  
not necessarily advocating this, but if someone wants to do it, I  
think they should be able to.

9) Flexibility for the future.  Imagine a mirror and you'd like to  
split off a leg - the data portion alone becomes the linear device.   
The metadata device could be discarded, or it could be recombined with  
the data device and reinserted into the array - having just the deltas  
be played back from the original mirror that has remained actively in- 
use.

Each of these reasons is not all that compelling in isolation; but  
together, I think they make a pretty good case.  There is additional  
flexibility here; and this is to be sacrificed for what?  A simpler  
CTR line?  I don't know of anyone who enters these by hand without  
instead using LVM, dm-raid, multipath, etc.  MD does it this way?   
Well, this is device-mapper and it has its own idiosyncrasies and  
precedents.

Also, I understand what you mean by your final question, but for those  
who are new to this I'd like to point out that we /are/ storing the  
metadata on the main physical device, but not the same logical  
device.  [Again, this will be the rule, but is flexible.]

  brassow

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [dm-devel] [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-06 15:56       ` [dm-devel] " Phillip Susi
  2011-01-06 17:37         ` Jonathan Brassow
@ 2011-01-06 20:59         ` Jonathan Brassow
  1 sibling, 0 replies; 19+ messages in thread
From: Jonathan Brassow @ 2011-01-06 20:59 UTC (permalink / raw)
  To: Phillip Susi; +Cc: linux-raid, Kergon Alasdair, Mike Snitzer


On Jan 6, 2011, at 9:56 AM, Phillip Susi wrote:

> On 1/6/2011 5:46 AM, NeilBrown wrote:
>> 3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
>
> Let me get this straight.  You specify a separate device to hold the
> metadata and write intent bitmap for each data device?  So for a 3  
> disk
> raid 5, lvm will need to create two logical volumes on each of the 3
> physical volumes, one of which will only be a single physical extent,
> and will hold the raid metadata and write intent bitmap?
>
> Why not just store the metadata on the main device like mdadm does  
> today?

There is no single big reason to do things as I've proposed, just a lot  
of little reasons...

1) Device-mapper already has a few cases where metadata is kept on  
separate devices from the data (snapshots and mirror log) and no cases  
where they are kept together.  This new raid module is similar to the  
mirroring case, where bitmaps are kept separately.

2) It seems a bit funny to specify a length (second param of the  
device-mapper CTR) and then expect the devices to be larger than their  
share of that amount to accommodate metadata.  You might say it is  
funny to have to specify a separate device to hold the metadata, but I  
would again give the mirror log as an example.

3) Where multiple physical devices form a single leg/component of the  
array, the argument for having a metadata device specifically tied to  
its data device as an indivisible unit is weakened.

4) Having the metadata on a separate logical device increases the  
flexibility of its placement.  You could have it at the beginning, in  
the middle, or at the end.  (The middle might actually be preferred  
for performance reasons.)  There are no offset calculations to perform  
in the kernel that depend on metadata placement.

5) Resizing an array might require the resizing of the metadata area.   
Because the devices are separate, there is no need to move around data  
or metadata to accommodate this.  If they were mixed in the same  
device and the metadata was at the beginning, that's a problem if the  
metadata no longer fits in its area.  Likewise, if the metadata were  
at the end of a mixed device, you would have to move it when growing.   
These problems are eliminated.

6) The metadata areas are not necessary in every case.  Some raid  
controllers handle the metadata on their own (dm-raid works with  
these).  You might say it is merely another flag on the CTR line to  
indicate whether to use metadata or not.  Perhaps, but having them  
separate means you can easily convert between the two types.

7) Clustering?  Perhaps one of the weaker arguments, but having the  
metadata separate allows it to easily grow to accommodate a bitmap /  
device / node, for example.  This is really the same argument as  
easily being able to reform/resize the metadata area.

8) Bitmaps/superblocks that are updated often could be placed on  
separate devices, like SSDs, while the data is on spinning media.  I'm  
not necessarily advocating this, but if someone wants to do it, I  
think they should be able to.

9) Flexibility for the future.  Imagine a mirror and you'd like to  
split off a leg - the data portion alone becomes the linear device.   
The metadata device could be discarded, or it could be recombined with  
the data device and reinserted into the array - having just the deltas  
be played back from the original mirror that has remained actively in- 
use.

Each of these reasons is not all that compelling in isolation; but  
together, I think they make a pretty good case.  There is additional  
flexibility here; and this is to be sacrificed for what?  A simpler  
CTR line?  I don't know of anyone who enters these by hand without  
instead using LVM, dm-raid, multipath, etc.  MD does it this way?   
Well, this is device-mapper and it has its own idiosyncrasies and  
precedents.

Also, I understand what you mean by your final question, but for those  
who are new to this I'd like to point out that we /are/ storing the  
metadata on the main physical device, but not the same logical  
device.  [Again, this will be the rule, but is flexible.]

brassow

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-06 10:46     ` NeilBrown
  2011-01-06 14:43       ` Jonathan Brassow
  2011-01-06 15:56       ` [dm-devel] " Phillip Susi
@ 2011-01-06 21:01       ` Jonathan Brassow
  2 siblings, 0 replies; 19+ messages in thread
From: Jonathan Brassow @ 2011-01-06 21:01 UTC (permalink / raw)
  To: linux-raid

[sorry, this msg bounced earlier today]

On Jan 6, 2011, at 4:46 AM, NeilBrown wrote:
>
> Patches 1 and 2 aren't really needed - I feel inclined not to remove  
> that
> stuff until all the dust settles..  So I'll put them at the bottom  
> of my
> queue and bring them forward in a couple of releases if that seems
> appropriate.

No problem.

> Patch 3 I'll push out through my tree shortly.

thanks

> The rest I am happy with going out through 'dm' to -next and beyond  
> whenever
> you like.
>
> However I will take this opportunity to suggest that the raid  
> parameters
> might need a bit of review before they are finalised.
>
> The possible parameters are as follows:
> <chunk_size>		Chunk size in sectors.
> [[no]sync]		Force/Prevent RAID initialization
> [rebuild <idx>]	Rebuild the drive indicated by the index
> [daemon_sleep <ms>]	Time between bitmap daemon work to clear bits
> [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
> [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
> [max_write_behind <value>]		See '-write-behind=' (man mdadm)
> [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
>
>
> 'rebuild idx' only allows one device to be rebuilt at a time.  With  
> RAID6 it
> is possible that two devices can be ready for rebuild, in which case  
> we would
> want to rebuild both of them.

In which case, you can specify the parameter twice:
"... rebuild 0 rebuild 1..."
No problem there, I think.

> Also if you stop an array during a rebuild you really want to start  
> again
> where you were up to.

This is kept in the superblock (forthcoming patches).  In the case  
where metadata devices are not specified (or can't be specified, as in  
this initial set of patches), the user has two choices: 1) allow the  
array to completely resync, or 2) specify [no]sync.

> Most general solution would be to combine the rebuild information  
> with the
> list of component devices - either a sector number or something to  
> indicate
> that the rebuild is complete (which could be just a v.big sector  
> number).

Is there something more to this, or is the above comment suitable for  
this also?

> I'd much prefer 'daemon_sleep' to be in seconds, with decimals  
> supported if
> needed.

I had this in seconds originally, but device-mapper tables already  
have a precedent for specifying things in ms.  We don't want to have  
each target pick their own units.  (A similar thing can be said for  
values given in sectors.  We may have wanted kiB or B, but sectors is  
the standard.)

>
> Also:
>
> 3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
>
> This doesn't allow offsets into the various devices like dm-stripe,
> dm-linear, dm-raid1 all do.

This is a conscious choice.  The offset was discarded because it is  
unneeded.

  brassow

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-06 14:43       ` Jonathan Brassow
@ 2011-01-10  0:37         ` NeilBrown
  2011-01-10 20:14           ` Jonathan Brassow
  0 siblings, 1 reply; 19+ messages in thread
From: NeilBrown @ 2011-01-10  0:37 UTC (permalink / raw)
  To: Jonathan Brassow; +Cc: Mike Snitzer, agk, dm-devel, linux-raid

On Thu, 6 Jan 2011 08:43:35 -0600 Jonathan Brassow <jbrassow@redhat.com>
wrote:

> 
> On Jan 6, 2011, at 4:46 AM, NeilBrown wrote:
> >
> > Patches 1 and 2 aren't really needed - I feel inclined not to remove  
> > that
> > stuff until all the dust settles..  So I'll put them at the bottom  
> > of my
> > queue and bring them forward in a couple of releases if that seems
> > appropriate.
> 
> No problem.
> 
> > Patch 3 I'll push out through my tree shortly.
> 
> thanks
> 
> > The rest I am happy with going out through 'dm' to -next and beyond  
> > whenever
> > you like.
> >
> > However I will take this opportunity to suggest that the raid  
> > parameters
> > might need a bit of review before they are finalised.
> >
> > The possible parameters are as follows:
> > <chunk_size>		Chunk size in sectors.
> > [[no]sync]		Force/Prevent RAID initialization
> > [rebuild <idx>]	Rebuild the drive indicated by the index
> > [daemon_sleep <ms>]	Time between bitmap daemon work to clear bits
> > [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
> > [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
> > [max_write_behind <value>]		See '-write-behind=' (man mdadm)
> > [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
> >
> >
> > 'rebuild idx' only allows one device to be rebuilt at a time.  With  
> > RAID6 it
> > is possible that two devices can be ready for rebuild, in which case  
> > we would
> > want to rebuild both of them.
> 
> In which case, you can specify the parameter twice:
> "... rebuild 0 rebuild 1..."
> No problem there, I think.

Yes, I see.  I'm used to seeing positional parameters in dm tables and didn't
process properly that these are key/value parameters.


> 
> > Also if you stop an array during a rebuild you really want to start  
> > again
> > where you were up to.
> 
> This is kept in the superblock (forthcoming patches).  In the case  
> where metadata devices are not specified (or can't be specified, as in  
> this initial set of patches), the user has two choices: 1) allow the  
> array to completely resync, or 2) specify [no]sync.

OK... though 'resync' and 'rebuild' are very different things and should not
be confused.  So the [no]sync option should not affect rebuild of new devices.



> 
> > Most general solution would be to combine the rebuild information  
> > with the
> > list of component devices - either a sector number or something to  
> > indicate
> > that the rebuild is complete (which could be just a v.big sector  
> > number).
> 
> Is there something more to this, or is the above comment suitable for  
> this also?

No, I think you've addressed it all.


> 
> > I'd much prefer 'daemon_sleep' to be in seconds, with decimals  
> > supported if
> > needed.
> 
> I had this in seconds originally, but device-mapper tables already  
> have a precedent for specifying things in ms.  We don't want to have  
> each target pick their own units.  (A similar thing can be said for  
> values given in sectors.  We may have wanted kiB or B, but sectors is  
> the standard.)

Fair enough.


> 
> >
> > Also:
> >
> > 3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
> >
> > This doesn't allow offsets into the various devices like dm-stripe,
> > dm-linear, dm-raid1 all do.
> 
> This is a conscious choice.  The offset was discarded because it is  
> unneeded.

Famous last words!!!

When reshaping a RAID5/6 to change the chunk size or layout, or when
converting an N drive RAID5 to N+1 drive RAID6, it makes it a lot easier if
you can change the data offset at the same time.  Then you don't have to
keep backing up data.  md cannot do this yet but I hope to add the code soon
and because 'data_offset' is a distinct value it is conceptually easy to do.
If 'data_offset' is assumed to be zero, then it isn't.

But I don't know to what extent you are planning on supporting reshaping of
arrays....

Thanks,

NeilBrown



> 
>   brassow

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3 0/8] dm-raid (raid456) target
  2011-01-10  0:37         ` NeilBrown
@ 2011-01-10 20:14           ` Jonathan Brassow
  0 siblings, 0 replies; 19+ messages in thread
From: Jonathan Brassow @ 2011-01-10 20:14 UTC (permalink / raw)
  To: NeilBrown; +Cc: Mike Snitzer, agk, dm-devel, linux-raid


On Jan 9, 2011, at 6:37 PM, NeilBrown wrote:

>>> Also:
>>>
>>> 3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
>>>
>>> This doesn't allow offsets into the various devices like dm-stripe,
>>> dm-linear, dm-raid1 all do.
>>
>> This is a conscious choice.  The offset was discarded because it is
>> unneeded.
>
> Famous last words!!!

:)

>
> When reshaping a RAID5/6 to change the chunk size or layout, or when
> converting an N drive RAID5 to N+1 drive RAID6, it makes it a lot  
> easier if
> you can change the data offset at the same time.  Then you don't  
> have to
> keep backing up data.  md cannot do this yet but I hope to add the  
> code soon
> and because 'data_offset' is a distinct value it is conceptually  
> easy to do.
> If 'data_offset' is assumed to be zero, then it isn't.

I guess I don't understand this part.  Why would you need to change  
the data offset?  Especially the offset as it relates to the  
individual drives?

> But I don't know to what extent you are planning on supporting  
> reshaping of
> arrays....

I don't see a reason why this can't be completely supported...

  brassow


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2011-01-10 20:14 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
2010-12-21  2:37 [PATCH v3 0/8] dm-raid (raid456) target Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 1/8] md/bitmap: revert dm-dirty-log preparation Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 2/8] md/bitmap: use DIV_ROUND_UP in bitmap_init_from_disk Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 3/8] md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 4/8] dm raid: skeleton raid456 target support Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 5/8] dm: introduce target callbacks and congestion callback Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 6/8] dm: per-target unplug callback support Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 7/8] dm raid: add iterate_devices and io_hints functions Mike Snitzer
2010-12-21  2:37 ` [PATCH v3 8/8] dm raid: add suspend and resume functions Mike Snitzer
2010-12-21  3:14 ` [PATCH v3 0/8] dm-raid (raid456) target Neil Brown
2011-01-05 22:36   ` Mike Snitzer
2011-01-06 10:46     ` NeilBrown
2011-01-06 14:43       ` Jonathan Brassow
2011-01-10  0:37         ` NeilBrown
2011-01-10 20:14           ` Jonathan Brassow
2011-01-06 15:56       ` [dm-devel] " Phillip Susi
2011-01-06 17:37         ` Jonathan Brassow
2011-01-06 20:59         ` [dm-devel] " Jonathan Brassow
2011-01-06 21:01       ` Jonathan Brassow
