* [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
@ 2009-06-15 17:21 heinzm
  2009-06-16 14:09 ` Christoph Hellwig
  2009-06-18 16:39 ` Jonathan Brassow
  0 siblings, 2 replies; 23+ messages in thread
From: heinzm @ 2009-06-15 17:21 UTC (permalink / raw)
  To: dm-devel

Hi,

this patch series introduces the raid45 target.
Please include upstream.

The raid45 target supports RAID4 mappings with a dedicated and selectable
parity device and RAID5 mappings with rotating parity (left/right (a)symmetric).
Initial and partial resynchronization functionality utilizes the dm
dirty log module.

A stripe cache, which the target uses to update or reconstruct parity,
is based on memory objects managed in a separate, sharable dm-memcache
module, which allows such objects to be allocated as N chunks of page
lists with M pages each.

Message interface parsing is performed in the separate, sharable
dm-message module.

Heinz


Summary of the patch-set
========================
  1/6: dm raid45 target: export region hash functions and add a needed one
  2/6: dm raid45 target: introduce memory cache
  3/6: dm raid45 target: introduce message parser
  4/6: dm raid45 target: export dm_disk from dm.c
  5/6: dm raid45 target: introduce the raid45 target
  6/6: dm raid45 target: add raid45 modules to Makefile and adjust Kconfig

 include/linux/dm-region-hash.h  |     4 ++++
 drivers/md/dm-region-hash.c     |    21 +++++++++++++++++--
 drivers/md/dm-memcache.h        |    68 ++++++++++++++++++++++
 drivers/md/dm-memcache.c        |   301 ++++++++++++++++++++++
 drivers/md/dm-message.h         |    91 ++++++++++++++++++++++++
 drivers/md/dm-message.c         |   183 ++++++++++++++++++++++++
 drivers/md/dm.c                 |     1 +
 drivers/md/dm-raid45.h          |    28 ++++++++++++++++++++++++++
 drivers/md/dm-raid45.c          |  4580 ++++++++++++++++++++++++++
 drivers/md/Makefile             |     3 +++
 drivers/md/Kconfig              |     9 +++++++++

 11 files changed, 5286 insertions(+), 3 deletions(-)


 .../{dm-region-hash.h.orig => dm-region-hash.h}    |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/linux/dm-region-hash.h.orig b/include/linux/dm-region-hash.h
index a9e652a..bfd21cb 100644
--- a/include/linux/dm-region-hash.h.orig
+++ b/include/linux/dm-region-hash.h
@@ -49,6 +49,7 @@ struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh);
  */
 region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio);
 sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region);
+region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector);
 void *dm_rh_region_context(struct dm_region *reg);
 
 /*
@@ -72,11 +73,14 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled);
 int dm_rh_flush(struct dm_region_hash *rh);
 
 /* Inc/dec pending count on regions. */
+void dm_rh_inc(struct dm_region_hash *rh, region_t region);
 void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios);
 void dm_rh_dec(struct dm_region_hash *rh, region_t region);
 
 /* Delay bios on regions. */
 void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
+void dm_rh_delay_by_region(struct dm_region_hash *rh, struct bio *bio,
+			   region_t region);
 
 void dm_rh_mark_nosync(struct dm_region_hash *rh,
 		       struct bio *bio, unsigned done, int error);
 .../md/{dm-region-hash.c.orig => dm-region-hash.c} |   21 +++++++++++++++++--
 1 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-region-hash.c.orig b/drivers/md/dm-region-hash.c
index 7b899be..47b088b 100644
--- a/drivers/md/dm-region-hash.c.orig
+++ b/drivers/md/dm-region-hash.c
@@ -107,10 +107,11 @@ struct dm_region {
 /*
  * Conversion fns
  */
-static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
+region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
 {
 	return sector >> rh->region_shift;
 }
+EXPORT_SYMBOL_GPL(dm_rh_sector_to_region);
 
 sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
 {
@@ -488,7 +489,7 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
 }
 EXPORT_SYMBOL_GPL(dm_rh_update_states);
 
-static void rh_inc(struct dm_region_hash *rh, region_t region)
+void dm_rh_inc(struct dm_region_hash *rh, region_t region)
 {
 	struct dm_region *reg;
 
@@ -510,13 +511,14 @@ static void rh_inc(struct dm_region_hash *rh, region_t region)
 
 	read_unlock(&rh->hash_lock);
 }
+EXPORT_SYMBOL_GPL(dm_rh_inc);
 
 void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 {
 	struct bio *bio;
 
 	for (bio = bios->head; bio; bio = bio->bi_next)
-		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+		dm_rh_inc(rh, dm_rh_bio_to_region(rh, bio));
 }
 EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
 
@@ -677,6 +679,19 @@ void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(dm_rh_delay);
 
+void dm_rh_delay_by_region(struct dm_region_hash *rh,
+			   struct bio *bio, region_t region)
+{
+	struct dm_region *reg;
+
+	/* FIXME: locking. */
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	bio_list_add(&reg->delayed_bios, bio);
+	read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
+
 void dm_rh_stop_recovery(struct dm_region_hash *rh)
 {
 	int i;


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-15 17:21 [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one heinzm
@ 2009-06-16 14:09 ` Christoph Hellwig
  2009-06-16 14:51   ` Heinz Mauelshagen
  2009-06-18 16:39 ` Jonathan Brassow
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2009-06-16 14:09 UTC (permalink / raw)
  To: device-mapper development

On Mon, Jun 15, 2009 at 07:21:05PM +0200, heinzm@redhat.com wrote:
> Hi,
> 
> this patch series introduces the raid45 target.
> Please include upstream.

NACK, no need for another raid target, especially if it happens to be
as broken as the raid1 one.  Please help on making the md raid4/5 code
more useful for your purposes (whatever that may be).


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-16 14:09 ` Christoph Hellwig
@ 2009-06-16 14:51   ` Heinz Mauelshagen
  2009-06-16 17:55     ` Dan Williams
  0 siblings, 1 reply; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-06-16 14:51 UTC (permalink / raw)
  To: device-mapper development

On Tue, 2009-06-16 at 10:09 -0400, Christoph Hellwig wrote:
> On Mon, Jun 15, 2009 at 07:21:05PM +0200, heinzm@redhat.com wrote:
> > Hi,
> > 
> > this patch series introduces the raid45 target.
> > Please include upstream.
> 
> NACK, no need for another raid target, especially if it happens to be
> as broken as the raid1 one.

You didn't review it, it seems?

What are your particular pointers to any raid1 code issues?
Please point to the particular code of concern.

Mind you, we've got clustered mirroring with it, which isn't
supported in MD.

>   Please help on making the md raid4/5 code
> more useful for your purposes (whatever that may be).

Supporting various ATARAID raid5 mappings via dmraid long before md got
external metadata support added.

Heinz



* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-16 14:51   ` Heinz Mauelshagen
@ 2009-06-16 17:55     ` Dan Williams
  2009-06-16 19:11       ` Heinz Mauelshagen
  0 siblings, 1 reply; 23+ messages in thread
From: Dan Williams @ 2009-06-16 17:55 UTC (permalink / raw)
  To: heinzm, device-mapper development; +Cc: Christoph Hellwig, Ed Ciechanowski

On Tue, Jun 16, 2009 at 7:51 AM, Heinz Mauelshagen<heinzm@redhat.com> wrote:
> On Tue, 2009-06-16 at 10:09 -0400, Christoph Hellwig wrote:
>> On Mon, Jun 15, 2009 at 07:21:05PM +0200, heinzm@redhat.com wrote:
>> > Hi,
>> >
>> > this patch series introduces the raid45 target.
>> > Please include upstream.
>>
>> NACK, no need for another raid target, especially if it happens to be
>> as broken as the raid1 one.
>
> You didn't review it it seems ?

Will you be allowing time for review?  This seems a bit late for
2.6.31, as your "Please include upstream" implies.

>>   Please help on making the md raid4/5 code
>> more useful for your purposes (whatever that may be).
>
> Supporting various ATARAID raid5 mappings via dmraid long before md got
> external metadata suport added.

This is an opportunity to investigate what technical hurdles remain to
allow dmraid to use md infrastructure [1] (or as Neil proposed [2] a
new unified virtual block device infrastructure) for activating
external metadata raid5 arrays.  The libata/ide situation shows that
there is precedent for duplicate functionality in the kernel, but my
personal opinion is that we should exhaust the code reuse discussion before
heading down that path.

Regards,
Dan

[1] http://marc.info/?l=linux-raid&m=123300614013042&w=2
[2] http://marc.info/?l=linux-fsdevel&m=124450400028660&w=2


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-16 17:55     ` Dan Williams
@ 2009-06-16 19:11       ` Heinz Mauelshagen
  2009-06-16 19:48         ` Dan Williams
  2009-06-16 22:46         ` Neil Brown
  0 siblings, 2 replies; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-06-16 19:11 UTC (permalink / raw)
  To: Dan Williams
  Cc: Christoph Hellwig, device-mapper development, Ed Ciechanowski

On Tue, 2009-06-16 at 10:55 -0700, Dan Williams wrote:
> On Tue, Jun 16, 2009 at 7:51 AM, Heinz Mauelshagen<heinzm@redhat.com> wrote:
> > On Tue, 2009-06-16 at 10:09 -0400, Christoph Hellwig wrote:
> >> On Mon, Jun 15, 2009 at 07:21:05PM +0200, heinzm@redhat.com wrote:
> >> > Hi,
> >> >
> >> > this patch series introduces the raid45 target.
> >> > Please include upstream.
> >>
> >> NACK, no need for another raid target, especially if it happens to be
> >> as broken as the raid1 one.
> >
> > You didn't review it it seems ?
> 
> Will you be allowing time for review?  This seems a bit late for
> 2.6.31, as your "Please include upstream" implies.

Your help is appreciated, thanks.

As you surely know, your colleagues (in particular in Poland) are
familiar with these patches already, which will help a quick review.

> 
> >>   Please help on making the md raid4/5 code
> >> more useful for your purposes (whatever that may be).
> >
> > Supporting various ATARAID raid5 mappings via dmraid long before md got
> > external metadata suport added.
> 
> This is an opportunity to investigate what technical hurdles remain to
> allow dmraid to use md infrastructure [1] (or as Neil proposed [2] a
> new unified virtual block device infrastructure) for activating
> external metadata raid5 arrays.  The libata/ide situation shows that
> there is precedence for duplicate functionality in the kernel, but my
> personal opinion is that we exhaust the code reuse discussion before
> heading down that path.

dmraid supports 11 different metadata formats for various ATARAID vendor
specific solutions plus DDF1:

asr     : Adaptec HostRAID ASR (0,1,10)
ddf1    : SNIA DDF1 (0,1,4,5,linear)
hpt37x  : Highpoint HPT37X (S,0,1,10,01)
hpt45x  : Highpoint HPT45X (S,0,1,10)
isw     : Intel Software RAID (0,1,5,01)
jmicron : JMicron ATARAID (S,0,1)
lsi     : LSI Logic MegaRAID (0,1,10)
nvidia  : NVidia RAID (S,0,1,10,5)
pdc     : Promise FastTrack (S,0,1,10)
sil     : Silicon Image(tm) Medley(tm) (0,1,10)
via     : VIA Software RAID (S,0,1,10)

9 of those still need porting/testing/... to/with MD external metadata
support after the Intel IMSM MD port has settled, hence Red Hat is going
to continue to support dmraid.

That being said: once the future work on a unified virtual block device
infrastructure is production ready, we're open to use that.

Regards,
Heinz

> 
> Regards,
> Dan
> 
> [1] http://marc.info/?l=linux-raid&m=123300614013042&w=2
> [2] http://marc.info/?l=linux-fsdevel&m=124450400028660&w=2


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-16 19:11       ` Heinz Mauelshagen
@ 2009-06-16 19:48         ` Dan Williams
  2009-06-16 22:46         ` Neil Brown
  1 sibling, 0 replies; 23+ messages in thread
From: Dan Williams @ 2009-06-16 19:48 UTC (permalink / raw)
  To: heinzm, device-mapper development; +Cc: Christoph Hellwig, Ed Ciechanowski

On Tue, Jun 16, 2009 at 12:11 PM, Heinz Mauelshagen<heinzm@redhat.com> wrote:
> dmraid supports 11 different metadata formats for various ATARAID vendor
> specific solutions plus DDF1:
>
> asr     : Adaptec HostRAID ASR (0,1,10)
> ddf1    : SNIA DDF1 (0,1,4,5,linear)
> hpt37x  : Highpoint HPT37X (S,0,1,10,01)
> hpt45x  : Highpoint HPT45X (S,0,1,10)
> isw     : Intel Software RAID (0,1,5,01)
> jmicron : JMicron ATARAID (S,0,1)
> lsi     : LSI Logic MegaRAID (0,1,10)
> nvidia  : NVidia RAID (S,0,1,10,5)
> pdc     : Promise FastTrack (S,0,1,10)
> sil     : Silicon Image(tm) Medley(tm) (0,1,10)
> via     : VIA Software RAID (S,0,1,10)
>
> 9 of those still need porting/testing/... to/with MD external metadata
> support after the Intel IMSM MD port has settled, hence Red Hat is going
> to continue to support dmraid.

What I am personally interested in is a way for the dmraid userspace
to reuse time proven md kernel infrastructure where it makes sense.
Yes, there are benefits to porting the metadata formats to mdadm, but
the point is that there is no immediate need, or even eventual need,
to to do this work.  dmraid already knows how to convert all these
formats into a common dmsetup table format, and md already has knobs
for userspace to create raid arrays from the same information.

Can we not meet in the middle to allow at least dm-raid5 arrays to use md-raid5?

Put another way, do we need ~5000 new lines of kernel code to support
all the functionality that dmraid requires?  What happens when one of
these formats wants to add raid6 - more code duplication?  Can we
incrementally extend md-raid to cover the unmet needs of dmraid, and what
extensions are required?

> That being said: once the future work on a unified virtual block device
> infrastructure is production ready, we're open to use that.

That is the end game, but there are more cooperation opportunities
along the way.


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-16 19:11       ` Heinz Mauelshagen
  2009-06-16 19:48         ` Dan Williams
@ 2009-06-16 22:46         ` Neil Brown
  2009-06-18 16:08           ` Jonathan Brassow
  2009-06-19  1:43           ` Neil Brown
  1 sibling, 2 replies; 23+ messages in thread
From: Neil Brown @ 2009-06-16 22:46 UTC (permalink / raw)
  To: heinzm
  Cc: Christoph Hellwig, Dan Williams, device-mapper development,
	Ed Ciechanowski

On Tuesday June 16, heinzm@redhat.com wrote:
> 
> That being said: once the future work on a unified virtual block device
> infrastructure is production ready, we're open to use that.
> 

I was kind-a hoping that you (and others) would be involved in
developing this unified infrastructure, rather than just waiting for
it.

I think a great first step would be to allow md/raid5 to be used
directly as a dm target, thus turning dm-raid5 into a shim layer over
md/raid5.  The process of doing this would very likely highlight a
lot of the issues we would need to address in creating a unified
framework.

I will try to find time to review your dm-raid5 code with a view to
understanding how it plugs in to dm, and then how the md/raid5 engine
can be used by dm-raid5.

Part of this will be disconnecting the md/raid5 code from any specific
knowledge of a gendisk and a request_queue as I suppose a dm-target
doesn't own any of these.  Also I would probably want the mddev to not
have to be on the "all_mddevs" list, as we would not want a 'dm' raid5
to appear in /proc/mdstat.

NeilBrown


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-16 22:46         ` Neil Brown
@ 2009-06-18 16:08           ` Jonathan Brassow
  2009-06-19  1:43           ` Neil Brown
  1 sibling, 0 replies; 23+ messages in thread
From: Jonathan Brassow @ 2009-06-18 16:08 UTC (permalink / raw)
  To: device-mapper development


On Jun 16, 2009, at 5:46 PM, Neil Brown wrote:

> On Tuesday June 16, heinzm@redhat.com wrote:
>>
>> That being said: once the future work on a unified virtual block  
>> device
>> infrastructure is production ready, we're open to use that.
>>
>
> I was kind-a hoping that you (and others) would be involved in
> developing this unified infrastructure, rather than just waiting for
> it.

I think this is what everyone thinks, which is why there is no  
integration effort.

  brassow


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-15 17:21 [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one heinzm
  2009-06-16 14:09 ` Christoph Hellwig
@ 2009-06-18 16:39 ` Jonathan Brassow
  2009-06-18 20:01   ` Heinz Mauelshagen
  1 sibling, 1 reply; 23+ messages in thread
From: Jonathan Brassow @ 2009-06-18 16:39 UTC (permalink / raw)
  To: device-mapper development

Eliminate the 3rd argument to that function.  You can use  
'dm_rh_bio_to_region'.

  brassow

On Jun 15, 2009, at 12:21 PM, heinzm@redhat.com wrote:

>
> +void dm_rh_delay_by_region(struct dm_region_hash *rh,
> +			   struct bio *bio, region_t region)
> +{
> +	struct dm_region *reg;
> +
> +	/* FIXME: locking. */
> +	read_lock(&rh->hash_lock);
> +	reg = __rh_find(rh, region);
> +	bio_list_add(&reg->delayed_bios, bio);
> +	read_unlock(&rh->hash_lock);
> +}
> +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
> +
> void dm_rh_stop_recovery(struct dm_region_hash *rh)
> {
> 	int i;


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-18 16:39 ` Jonathan Brassow
@ 2009-06-18 20:01   ` Heinz Mauelshagen
  0 siblings, 0 replies; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-06-18 20:01 UTC (permalink / raw)
  To: device-mapper development

On Thu, 2009-06-18 at 11:39 -0500, Jonathan Brassow wrote:
> Eliminate the 3rd argument to that function.  You can use  
> 'dm_rh_bio_to_region'.

No, I can't, because I'm keeping track of regions per single disk, as in
the mirroring code, rather than dividing the whole set's capacity into
regions. This is because a disk is divided into 2^N sized chunks and
regions have to be 2^M >= 2^N sized.

See caller side in dm-raid45.c, function do_io().
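
For illustration, here is a minimal sketch of such a caller (not the actual
do_io() code; the helper name and the sector_on_disk parameter are
assumptions). The region is derived from the chunk's offset on a single
disk, not from the bio's offset into the whole set, which is why
dm_rh_bio_to_region() cannot be used and dm_rh_delay_by_region() takes an
explicit region:

#include <linux/bio.h>
#include <linux/dm-region-hash.h>

/* Delay a write until its per-disk region may proceed. */
static void delay_write_on_disk(struct dm_region_hash *rh, struct bio *bio,
				sector_t sector_on_disk)
{
	/* region computed from the chunk's offset on one disk */
	region_t region = dm_rh_sector_to_region(rh, sector_on_disk);

	dm_rh_delay_by_region(rh, bio, region);	/* queue bio on that region */
}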

Heinz

> 
>   brassow
> 
> On Jun 15, 2009, at 12:21 PM, heinzm@redhat.com wrote:
> 
> >
> > +void dm_rh_delay_by_region(struct dm_region_hash *rh,
> > +			   struct bio *bio, region_t region)
> > +{
> > +	struct dm_region *reg;
> > +
> > +	/* FIXME: locking. */
> > +	read_lock(&rh->hash_lock);
> > +	reg = __rh_find(rh, region);
> > +	bio_list_add(&reg->delayed_bios, bio);
> > +	read_unlock(&rh->hash_lock);
> > +}
> > +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
> > +
> > void dm_rh_stop_recovery(struct dm_region_hash *rh)
> > {
> > 	int i;


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-16 22:46         ` Neil Brown
  2009-06-18 16:08           ` Jonathan Brassow
@ 2009-06-19  1:43           ` Neil Brown
  2009-06-19 10:33             ` Heinz Mauelshagen
  1 sibling, 1 reply; 23+ messages in thread
From: Neil Brown @ 2009-06-19  1:43 UTC (permalink / raw)
  To: heinzm, Dan Williams, device-mapper development,
	Christoph Hellwig, Ed Ciechanowski

On Wednesday June 17, neilb@suse.de wrote:
> 
> I will try to find time to review your dm-raid5 code with a view to
> understanding how it plugs in to dm, and then how the md/raid5 engine
> can be used by dm-raid5.

I've had a bit of a look through the dm-raid5 patches.

Some observations:

- You have your own 'xor' code against which you do a run-time test of
  the 'xor_block' code which md/raid5 uses - then choose the fastest.
  This really should not be necessary.  If you have xor code that runs
  faster than anything in xor_block, it really would be best to submit
  it for inclusion in the common xor code base.

- You have introduced "dm-message" which is a framework for sending
  messages to dm targets.  It is used for adjusting the cache size,
  changing the bandwidth used by resync, and doing things with
  statistics.  Linux already has a framework for sending messages to
  kernel objects.  It is called "sysfs".  It would be much nicer if
  you could use that.

- I don't know what Christoph Hellwig was referring to when he
  suggested that dm-raid1 was broken, but there is one issue with it
  that concerns me and which is (as far as I can tell) broken in
  dm-raid45 as well.
  That issue is summarised in the thread
   http://www.linux-archive.org/device-mapper-development/282987-dm-raid1-data-corruption.html

  The problem is that when a drive fails you allow writes to continue
  without waiting for the metadata to be updated.  I.e. metadata
  updates are asynchronous. 
  The problem scenario for raid5 is:
    - a drive fails while a write is in process
    - the drive is marked as faulty and user space is notified
    - that write, and possibly several others, complete and that 
      status is potentially returned to user-space (e.g. for an fsync)
    - user space acts on that status (e.g. sends an acknowledgement over
      the network)
    - the host crashes before the metadata update completes

    - on reboot, the drive, which just had a transient failure, works
      fine, and the metadata reports that all the drives are good, but
      that the array was active so a parity-resync is needed.
    - All parity blocks are recalculated.  However that write we
      mentioned above is only recorded in the parity block.  So that
      data is lost.

   Now if you get a crash while the array is degraded, data loss is a
   real possibility.  However silent data loss should not be tolerated.

   For this reason, metadata updates reporting failed devices really
   need to be synchronous with respect to writes.
   
- Your algorithm for limiting bandwidth used by resync seems to be
  based on a % concept (which is rounded to the nearest reciprocal,
  so 100%, 50%, 33%, 25%, 20%, 16% etc).  If there is outstanding
  IO, the amount of outstanding resync IO must not exceed the given
  percentage of regular IO.   While it is hard to get resync "just
  right", I think this will have some particularly poor
  characteristics.
  Heavy regular IO load, or a non-existent regular load should be
  handled OK, but I suspect that a light IO load (e.g. one or two
  threads of synchronous requests) would cause the resync to go much
  slower than you would want.

  Also it does not take account of IO to other partitions on a device.
  It is common to have a number of different arrays on the same device
  set.  You really want to resync to back off based on IO to all of
  the device, not just to the array.


All of these are issues that are already addressed in md/raid5.
About the only thing that I saw which does not have a working analogue
in md/raid5 is cluster locking.  However that is "not yet implemented"
in dm-raid5 so that isn't a real difference.

Everything else could be made available by md to dm with only a
moderate amount of effort.
Of the different interface routines you require:
+	.ctr = raid_ctr,

  This would be quite straight forward, though it might be worth
  discussing the details of the args passed in.
  "recovery_bandwidth" (as I say above) I don't think is well chosen,
  and I don't quite see the point of io_size (maybe I'm missing
  something obvious)

+	.dtr = raid_dtr,

  This is trivial. raid5.stop

+	.map = raid_map,

  This is very much like the md/raid5 make_request.

+	.presuspend = raid_presuspend,

  I call this 'quiesce'

+	.postsuspend = raid_postsuspend,

  I'm not sure exactly how this fits in.  Something related to the 
  log...

+	.resume = raid_resume,

  ->quiesce requesting 'resume' rather than 'suspend'

+	.status = raid_status,

  very straight forward

+	.message = raid_message,

  As I said, I would rather this didn't exist.  But it could be
  emulated (except for the statistics) if needed - md can already
  adjust resync speed and cache size.


 What is missing is agreement on how to handle synchronous metadata
 updates.  md waits for the drive failure to be acknowledged.  It is
 automatically marked as 'Blocked' on an error, and user-space must
 clear that flag.

 I suspect, using your model, the approach would be a 'dm-message'
 acknowledging that a given device is faulty.  I would, of course,
 rather this be done via sysfs.

 I will try to look at factoring out the md specific part of md/raid5
 over the next few days (maybe next week) and see how that looks.

NeilBrown


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-19  1:43           ` Neil Brown
@ 2009-06-19 10:33             ` Heinz Mauelshagen
  2009-06-21  0:32               ` Dan Williams
  2009-06-21 12:06               ` Neil Brown
  0 siblings, 2 replies; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-06-19 10:33 UTC (permalink / raw)
  To: Neil Brown
  Cc: Christoph Hellwig, Dan Williams, device-mapper development,
	Ed Ciechanowski

On Fri, 2009-06-19 at 11:43 +1000, Neil Brown wrote:
> On Wednesday June 17, neilb@suse.de wrote:
> > 
> > I will try to find time to review your dm-raid5 code with a view to
> > understanding how it plugs in to dm, and then how the md/raid5 engine
> > can be used by dm-raid5.

Hi Neil.

> 
> I've had a bit of a look through the dm-raid5 patches.

Thanks.

> 
> Some observations:
> 
> - You have your own 'xor' code against which you do a run-time test of
>   the 'xor_block' code which md/raid5 uses - then choose the fasted.
>   This really should not be necessary.  If you have xor code that runs
>   faster than anything in xor_block, it really would be best to submit
>   it for inclusion in the common xor code base.

This is in because it actually shows better performance regularly by
utilizing cache lines etc. more efficiently (tested on Intel, AMD and
Sparc).

If xor_block had always performed best, I'd have dropped that
optimization already.

> 
> - You have introduced "dm-message" which is a frame work for sending
>   messages to dm targets.

dm-message is a parser for messages sent to a device-mapper target's
message interface. It aims at allowing common message parsing functions
to be shared between targets.

The message interface in general allows for runtime state changes of
targets like those you found below.

See "dmsetup message ..."
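
For illustration, a minimal sketch of a target's .message hook as driven by
"dmsetup message <mapped-device> 0 ..." follows; the "bandwidth" keyword and
the raid_set fields are assumptions for this sketch, not the actual
dm-raid45 message syntax:

#include <linux/device-mapper.h>
#include <linux/kernel.h>
#include <linux/string.h>

struct raid_set {			/* hypothetical per-target state */
	unsigned recovery_bandwidth;	/* % of I/O granted to resync */
};

/* invoked for "dmsetup message <mapped-device> 0 bandwidth <percent>" */
static int example_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct raid_set *rs = ti->private;
	unsigned long val;

	if (argc == 2 && !strcmp(argv[0], "bandwidth")) {
		if (strict_strtoul(argv[1], 10, &val) || !val || val > 100)
			return -EINVAL;
		rs->recovery_bandwidth = val;	/* picked up by the resync path */
		return 0;
	}

	return -EINVAL;
}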

> It is used for adjusting the cache size,
>   changing the bandwidth used by resync, and doing things with
>   statistics.  Linux already has a framework for sending messages to
>   kernel objects.  It is called "sysfs".  It would be much nicer if
>   you could use that.

Looks to me like it's heavier to program (I counted ~100 LOC of sysfs
defines and calls in md.c for the 3 files used in md).

That needs some more discussion.

> 
> - I don't know what Christoph Hellwig was referring to when he
>   suggested that dm-raid1 was broken, but there is one issue with it
>   that concerns me and which is (as far as I can tell) broken in
>   dm-raid45 as well.
>   That issue is summarised in the thread
>    http://www.linux-archive.org/device-mapper-development/282987-dm-raid1-data-corruption.html

Yes, the major point being that dm aims to do metadata updates mostly in
userspace, with as little kernel-space metadata access as possible.

> 
>   The problem is that when a drive fails you allow writes to continue
>   without waiting for the metadata to be updated.  I.e. metadata
>   updates are asynchronous. 
>   The problem scenario for raid5 is:
>     - a drive fails while a write is in process
>     - the drive is marked as faulty and user space is notified
>     - that write, and possibly several others, complete and that 
>       status is potentially returned to user-space (e.g. for an fsync)
>     - user space acts on that status (e.g. sends an acknowledgement over
>       the network)
>     - the host crashes before the metadata update completes
>     - on reboot, the drive, which just had a transient failure, works
>       fine, and the metadata reports that all the drives are good, but
>       that the array was active so a parity-resync is needed.
>     - All parity blocks are recalculated.  However that write we
>       mentioned above is only recorded in the parity block.  So that
>       data is lost.
> 

Yes, as in the dm raid1 scenario, writes would have to be blocked until
the metadata has been updated, so that the failing drive is recorded
persistently; then, rather than updating parity (which contains data that
must be preserved), chunks on the transiently failing drive are
recalculated utilizing the intact parity.

> 
>    Now if you get a crash while the array is degraded data loss is a
>    real possibility.  However silent data loss should not be tolerated.
> 
>    For this reason, metadata updates reporting failed devices really
>    need to by synchronous with respect to writes.
> 

Agreed.
   
> - Your algorithm for limiting bandwidth used by resync seems to be
>   based on a % concept (which is rounded to the nearest reciprocal,
>   so 100%, 50%, 33%, 25%, 20%, 16% etc).  If there is outstanding
>   IO, the amount of outstanding resync IO must not exceed the given
>   percentage of regular IO.   While it is hard to get resync "just
>   right", I think this will have some particularly poor
>   characteristics.

My ANALYZEME pointed you there? ;-)
That's why it's in. With that I admitted that I'm not satisfied with the
characteristics yet.

>   Heavy regular IO load, or a non-existent regular load should be
>   handled OK, but I suspect that a light IO load (e.g. one or two
>   threads of synchronous requests) would cause the resync to go much
>   slower than you would want.

I'll look into it further, taking your thoughts into account.
> 
>   Also it does not take account of IO to other partitions on a device.
>   It is common to have a number of different arrays on the same device
>   set.  You really want to resync to back off based on IO to all of
>   the device, not just to the array.

Does MD RAID > 0 cope with that in multi-partition RAID configs?
E.g.:
disk1:part1+disk2:part1 -> md0 and disk1:part2+disk2:part2 -> md1 with
global iostats for disk[12] ?

> 
> 
> All of these are issues that are already addressed in md/raid5.
> About the only thing that I saw which does not have a working analogue
> in md/raid5 is cluster locking.  However that is "not yet implemented"
> in dm-raid5 so that isn't a real difference.

Yes, it's a start for future cluster RAID5 usage.

> 
> Everything else could be made available by md to dm with only a
> moderate amount of effort.
> Of the different interface routines you require:
> +	.ctr = raid_ctr,
> 
>   This would be quite straight forward, though it might be worth
>   discussing the details of the args passed in.
>   "recovery_bandwidth" (as I say above) I don't think is well chosen,

I was aware it needs further tweaking (your points above taken).

You mean the wording here?

>   and I don't quite see the point of io_size (maybe I'm missing
>   something obvious)

Performance related. I studied read-ahead/chunk size/io size pairs and
found examples of better performance with PAGE_SIZE < io_size <= chunk_size,
io_size being 2^N.

Do you have similar experiences to share with MD RAID?

> 
> +	.dtr = raid_dtr,
> 
>   This is trivial. raid5.stop
> 
> +	.map = raid_map,
> 
>   This is very much like the md/raid5 make_request.
> 
> +	.presuspend = raid_presuspend,
> 
>   I call this 'quiesce'
> 
> +	.postsuspend = raid_postsuspend,
> 
>   I'm not sure exactly how this fits in.  Something related to the 
>   log...

We use these distinct presuspend/postsuspend methods to allow targets to
separate the two steps: flush any io in flight in presuspend, then
release/quiesce/... any resource utilized in the first step (e.g. quiesce
the dirty log) in postsuspend.

If MD RAID doesn't need that, stick with presuspend.

> 
> +	.resume = raid_resume,
> 
>   ->quiesce requesting 'resume' rather than 'suspend'
> 
> +	.status = raid_status,
> 
>   very straight forward
> 
> +	.message = raid_message,
> 
>   As I said, I would rather this didn't exist.

I think it's better to offer options to be utilized for different
purposes. The tradeoff is always (some) overlap.

We don't have one filesystem for all use cases ;-)

> But it could be
>   emulated (except for the statistics) if needed - md can already
>   adjust resync speed and cache size.
> 
> 
>  What is missing is agreement on how to handle synchronous metadata
>  updates.  md waits for the drive failure to be acknowledged.  It is
>  automatically marked as 'Blocked' on an error, and user-space must
>  clear that flag.

Yes, Mikulas already put thought into that.
Let's allow him (and others) to join in.


> 
>  I suspect, using your model, the approach would be a 'dm-message'
>  acknowledging that a given device is faulty.  I would, of course,
>  rather this be done via sysfs.
> 
>  I will try to look at factoring out the md specific part of md/raid5
>  over the next few days (maybe next week) and see how that looks.

It'll be interesting to see how it fits into the device-mapper
framework.

I take it you'll port code to create an "md-raid456" target?

Thanks a lot,
Heinz

> 
> NeilBrown


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-19 10:33             ` Heinz Mauelshagen
@ 2009-06-21  0:32               ` Dan Williams
  2009-06-21 12:06               ` Neil Brown
  1 sibling, 0 replies; 23+ messages in thread
From: Dan Williams @ 2009-06-21  0:32 UTC (permalink / raw)
  To: heinzm, device-mapper development; +Cc: Christoph Hellwig, Ed Ciechanowski

On Fri, Jun 19, 2009 at 3:33 AM, Heinz Mauelshagen<heinzm@redhat.com> wrote:
> On Fri, 2009-06-19 at 11:43 +1000, Neil Brown wrote:
>> - You have your own 'xor' code against which you do a run-time test of
>>   the 'xor_block' code which md/raid5 uses - then choose the fasted.
>>   This really should not be necessary.  If you have xor code that runs
>>   faster than anything in xor_block, it really would be best to submit
>>   it for inclusion in the common xor code base.
>
> This is in because it actually shows better performance regularly by
> utilizing cache lines etc. more efficiently (tested on Intel, AMD and
> Sparc).
>
> If xor_block would always have been performed best, I'd dropped that
> optimization already.

Data please.  Especially given the claim that it improves upon the
existing per-architecture hand-tuned assembly routines.  If it is
faster great!  Please send patches to improve the existing code.

>>
>> - You have introduced "dm-message" which is a frame work for sending
>>   messages to dm targets.
>
> dm-message is a parser for messages sent to a device-mapper targets
> message interface. It aims at allowing common message parsing functions
> to be shared between targets.
>
> The message interface in general allows for runtime state changes of
> targets like those you found below.
>
> See "dmsetup message ..."
>
>> It is used for adjusting the cache size,
>>   changing the bandwidth used by resync, and doing things with
>>   statistics.  Linux already has a framework for sending messages to
>>   kernel objects.  It is called "sysfs".  It would be much nicer if
>>   you could use that.
>
> Looks to me like heavier to program (I counted ~100 LOC of sysfs defines
> and calls in md.c) for 3 files used in md.
>
> That needs some more discussion.

Ironically the same argument can be applied to this module.  ~5000 LOC
to re-implement raid5 seems much "heavier to program" than to write
some dmraid ABI compatibility wrappers around the existing md-raid5
code.

[..]
>>   The problem is that when a drive fails you allow writes to continue
>>   without waiting for the metadata to be updated.  I.e. metadata
>>   updates are asynchronous.
>>   The problem scenario for raid5 is:
>>     - a drive fails while a write is in process
>>     - the drive is marked as faulty and user space is notified
>>     - that write, and possibly several others, complete and that
>>       status is potentially returned to user-space (e.g. for an fsync)
>>     - user space acts on that status (e.g. sends an acknowledgement over
>>       the network)
>>     - the host crashes before the metadata update completes
>>     - on reboot, the drive, which just had a transient failure, works
>>       fine, and the metadata reports that all the drives are good, but
>>       that the array was active so a parity-resync is needed.
>>     - All parity blocks are recalculated.  However that write we
>>       mentioned above is only recorded in the parity block.  So that
>>       data is lost.
>>
>
> Yes, like in the dm raid1 scenario, blocking of writes would have to
> occur until metadata has been updated in order to detect the failing
> drive persistently, hence not updating parity (which contains data
> mandatory to preserve) but recalculating chunks on the transiently
> failing drive utilizing intact parity.

Another problem with dm along these lines is the mechanism for
notifying userspace - I am assuming this is still via uevents?  The
problem is that each notification requires a memory allocation which
might fail under load, especially if the blocked array is in the dirty
page write out path.  The userspace notifications and failed device
handling in md are designed to not require memory allocations.

[..]
>>   Also it does not take account of IO to other partitions on a device.
>>   It is common to have a number of different arrays on the same device
>>   set.  You really want to resync to back off based on IO to all of
>>   the device, not just to the array.
>
> Does MD RAID > 0 cope with that in multi-partition RAID configs ?
> E.g.:
> disk1:part1+disk2:part1 -> md0 and disk1:part2+disk2:part2 -> md1 with
> global iostats for disk[12] ?

By default it will hold off resync on other arrays when it notices
that multiple arrays share disks.  A sysfs knob allows parallel
resync.

[..]
>>   As I said, I would rather this didn't exist.
>
> I think its better to offer options to be utilized for different
> purposes. Tradeoff is always (some) overlap.
>
> We don't have one filesystem for all use cases ;-)

That argument does not fly in this case.  md is already suitable for
the dmraid use case (i.e. kernel raid/userspace metadata).

>> But it could be
>>   emulated (except for the statistics) if needed - md can already
>>   adjust resync speed and cache size.
>>
>>
>>  What is missing is agreement on how to handle synchronous metadata
>>  updates.  md waits for the drive failure to be acknowledged.  It is
>>  automatically marked as 'Blocked' on an error, and user-space must
>>  clear that flag.
>
> Yes, Mikulas already put thought into that.
> Let's allow him /and others) to join in.
>
>
>>
>>  I suspect, using your model, the approach would be a 'dm-message'
>>  acknowledging that a given device is faulty.  I would, of course,
>>  rather this be done via sysfs.
>>
>>  I will try to look at factoring out the md specific part of md/raid5
>>  over the next few days (maybe next week) and see how that looks.
>
> That'll be interesting to see how it fits into the device-mapper
> framework
>

Yes, kernel ABI compatibility is the better [1] way to go.  I will
also revive the ideas/patches I had along these lines.

Regards,
Dan

[1]: ...than dm2md in userspace


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-19 10:33             ` Heinz Mauelshagen
  2009-06-21  0:32               ` Dan Williams
@ 2009-06-21 12:06               ` Neil Brown
  2009-06-22 12:25                 ` Neil Brown
  2009-06-22 19:10                 ` Heinz Mauelshagen
  1 sibling, 2 replies; 23+ messages in thread
From: Neil Brown @ 2009-06-21 12:06 UTC (permalink / raw)
  To: heinzm
  Cc: Christoph Hellwig, Dan Williams, device-mapper development,
	Ed Ciechanowski

On Friday June 19, heinzm@redhat.com wrote:
> On Fri, 2009-06-19 at 11:43 +1000, Neil Brown wrote:
> > On Wednesday June 17, neilb@suse.de wrote:
> > > 
> > > I will try to find time to review your dm-raid5 code with a view to
> > > understanding how it plugs in to dm, and then how the md/raid5 engine
> > > can be used by dm-raid5.
> 
> Hi Neil.
> 
> > 
> > I've had a bit of a look through the dm-raid5 patches.
> 
> Thanks.
> 
> > 
> > Some observations:
> > 
> > - You have your own 'xor' code against which you do a run-time test of
> >   the 'xor_block' code which md/raid5 uses - then choose the fasted.
> >   This really should not be necessary.  If you have xor code that runs
> >   faster than anything in xor_block, it really would be best to submit
> >   it for inclusion in the common xor code base.
> 
> This is in because it actually shows better performance regularly by
> utilizing cache lines etc. more efficiently (tested on Intel, AMD and
> Sparc).
> 
> If xor_block would always have been performed best, I'd dropped that
> optimization already.

I'm no expert on this I must admit - I just use the xor code that
others have provided.
However it is my understanding that some of the xor code options were
chosen deliberately because they bypassed the cache.  As the xor
operation operates on data that very probably was not in the cache,
and only touches it once, and touches quite a lot of data, there is
little benefit in having it in the cache, and a possible real cost as it
pushes other data out of the cache.

That said, the "write" process involves copying data from a buffer
into the cache, and then using that data as a source for xor.  In that
case the cache might be beneficial, I'm not sure.
I've occasionally wondered if it might be good to have an xor routine
that does "copy and xor".   This would use more registers than the
current code so we could possibly process fewer blocks at a time, but
we might be processing them faster.  In that case, if we could bypass
the cache to read the data, but use the cache to store the result of
the xor, that would be ideal.  However I have no idea if that is
possible.
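
For what it's worth, a minimal sketch of that "copy and xor" idea (purely
illustrative - not taken from md or dm-raid45, and it sidesteps the
cache-bypass question entirely):

/*
 * Read each source word once, store it into the destination buffer and
 * fold it into the parity accumulator in the same pass.
 */
static void copy_and_xor(unsigned long *dst, unsigned long *parity,
			 const unsigned long *src, unsigned bytes)
{
	unsigned i, words = bytes / sizeof(unsigned long);

	for (i = 0; i < words; i++) {
		unsigned long v = src[i];	/* single load of the source */

		dst[i] = v;			/* the copy */
		parity[i] ^= v;			/* the xor */
	}
}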

The mechanism you use, which is much the same as the one used by
xor_block, calculates speed for the hot-cache case.  It is not clear
that this is often the relevant case - mostly xor is calculated when
the cache is cold.  So it isn't clear if it is generally applicable.
That is why calibrate_xor_blocks sometimes skips the xor_speed test
(in crypto/xor.c).

However if your code performs better than the cache-agnostic code in
xor_block, then we should replace that code with your code, rather
than have two places that try to choose the optimal code.

And I would be very interested to see a discussion about how relevant
the cache bypassing is in real life...  It looks like it was Zach
Brown who introduced this about 10 years ago, presumably in
consultation with Ingo Molnar.

> 
> > 
> > - You have introduced "dm-message" which is a frame work for sending
> >   messages to dm targets.
> 
> dm-message is a parser for messages sent to a device-mapper targets
> message interface. It aims at allowing common message parsing functions
> to be shared between targets.
> 
> The message interface in general allows for runtime state changes of
> targets like those you found below.
> 
> See "dmsetup message ..."
> 
> > It is used for adjusting the cache size,
> >   changing the bandwidth used by resync, and doing things with
> >   statistics.  Linux already has a framework for sending messages to
> >   kernel objects.  It is called "sysfs".  It would be much nicer if
> >   you could use that.
> 
> Looks to me like heavier to program (I counted ~100 LOC of sysfs defines
> and calls in md.c) for 3 files used in md.
> 
> That needs some more discussion.

In dm-raid45 you have 170 lines from "Message interface" to "END
message interface", which is 4 files I think.  Am I counting the same
things?

In any case, it is certainly possible that sysfs usage could be
improved.  We recently added strict_blocks_to_sectors() which extracted
a common parsing task.  Once upon a time, we open coded things with
simple_strtoull and repeated the error checking every time.
There is always room for improvement.

But I think the uniformity across the kernel is the most important
issue here, not the lines of code.

There is a bit of tension here:  your dm-message approach is similar
in style to the way targets are created.  The sysfs approach is
similar in style to many other parts of the kernel.  So should your
new mechanism be consistent with dm or consistent with linux - it
cannot easily be both.

This is really a question that the dm developers as a group need to
consider and answer.  I would like to recommend that transitioning
towards more uniformity with the rest of linux is a good idea, and
here is an excellent opportunity to start.  But it isn't my decision.
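
For comparison, a minimal sketch of what such a knob looks like through
sysfs (the attribute name, the value it controls and the kobject it would
be attached to are assumptions, not code from either patch set):

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned recovery_bandwidth = 10;	/* percent, hypothetical */

static ssize_t bandwidth_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", recovery_bandwidth);
}

static ssize_t bandwidth_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	unsigned long val;

	if (strict_strtoul(buf, 10, &val) || val < 1 || val > 100)
		return -EINVAL;
	recovery_bandwidth = val;
	return count;
}

static struct kobj_attribute bandwidth_attr =
	__ATTR(recovery_bandwidth, 0644, bandwidth_show, bandwidth_store);

/* registered once with sysfs_create_file(kobj, &bandwidth_attr.attr) */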

> > - Your algorithm for limiting bandwidth used by resync seems to be
> >   based on a % concept (which is rounded to the nearest reciprocal,
> >   so 100%, 50%, 33%, 25%, 20%, 16% etc).  If there is outstanding
> >   IO, the amount of outstanding resync IO must not exceed the given
> >   percentage of regular IO.   While it is hard to get resync "just
> >   right", I think this will have some particularly poor
> >   characteristics.
> 
> My ANALYZEME pointed you there ? ;-)
> That's why it's in. I admitted with that that I'm not satisfied with the
> characteristics yet.

Fair enough.
My concern then is that the mechanism for guiding the speed - which is
not yet finalised - is being encoded in the command for creating the
array.  That would make it difficult to change at a later date.
I would suggest not specifying anything when creating the array and
insisting that a default be chosen.
Then you can experiment with mechanism using different dm-message or
sysfs-files with some sort of warning that these are not part of the
api yet.

> > 
> >   Also it does not take account of IO to other partitions on a device.
> >   It is common to have a number of different arrays on the same device
> >   set.  You really want to resync to back off based on IO to all of
> >   the device, not just to the array.
> 
> Does MD RAID > 0 cope with that in multi-partition RAID configs ?
> E.g.:
> disk1:part1+disk2:part1 -> md0 and disk1:part2+disk2:part2 -> md1 with
> global iostats for disk[12] ?

Yes.  'gendisk' has 'sync_io' specifically for this purpose.
When md submits IO for the purpose of resync (or recovery) it adds a
count of the sectors to this counter.  That can be compared with the
regular io statistics for the genhd to see if there is an non-sync IO
happening.  If there is the genhd is assumed to be busy and we backoff
the resync.

This is not a perfect solution as it only goes one level down.
If we had raid1 over raid0 over partitions, then we might not notice
resync IO from elsewhere properly.  But I don't think I've ever had a
complaint about that.
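
Roughly, the back-off check looks like this (a sketch only; the slack value
and the helper are assumptions, though gendisk's sync_io counter and the
part statistics are real):

#include <linux/genhd.h>

/*
 * Compare the growth of a disk's total I/O sector count (minus what
 * resync itself submitted, accounted in sync_io) against the last
 * snapshot; if it grew noticeably, somebody else is using the disk
 * and resync should back off.
 */
static int disk_is_idle(struct gendisk *disk, long *last_events)
{
	long curr_events;

	curr_events = (long)part_stat_read(&disk->part0, sectors[0]) +
		      (long)part_stat_read(&disk->part0, sectors[1]) -
		      atomic_read(&disk->sync_io);

	if (curr_events - *last_events > 64) {	/* arbitrary slack */
		*last_events = curr_events;
		return 0;			/* busy: back off resync */
	}

	return 1;
}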

> > 
> > Everything else could be made available by md to dm with only a
> > moderate amount of effort.
> > Of the different interface routines you require:
> > +	.ctr = raid_ctr,
> > 
> >   This would be quite straight forward, though it might be worth
> >   discussing the details of the args passed in.
> >   "recovery_bandwidth" (as I say above) I don't think is well chosen,
> 
> I was aware it needs further tweaking (your points above taken).
> 
> You mean the wording here ?

The thing that I don't think was well chosen was the choice of a
percentage to guide the speed of resync.

> 
> >   and I don't quite see the point of io_size (maybe I'm missing
> >   something obvious)
> 
> Performance related. I studied read ahead/chunk size/io size pairs and
> found examples of better performance with io_size > PAGE_SIZE <=
> chunk_size. io_size being 2^N.
> 
> Do you have similar experiences to share with MD RAID ?

My position has always been that it is up to lower layers to combine
pages together.  md/raid5 does all IO in pages.  If it is more
efficient for some device to do multiple pages at a time, it should
combine the pages.

The normal plugging mechanisms combined with the elevator should be
enough for this.

We have put more focus into gathering write-pages into whole strips
so that no pre-reading is needed - we can just calculate parity and
write.  This has a significant effect on throughput.
(A 'strip' is like a 'stripe' but it is one page wide, not one chunk
wide).

> > +	.presuspend = raid_presuspend,
> > 
> >   I call this 'quiesce'
> > 
> > +	.postsuspend = raid_postsuspend,
> > 
> >   I'm not sure exactly how this fits in.  Something related to the 
> >   log...
> 
> We use these distinct presuspend/postsuspend methods in order to allow
> for targets to distinguish flushing any io in flight in presuspend and
> release/quiesce/... any resource (e.g. quiesce the dirty log) utilized
> in the first step in postsuspend.
> 
> If MD RAID doesn't need that, stick with presuspend.

OK, thanks.

> > +	.message = raid_message,
> > 
> >   As I said, I would rather this didn't exist.
> 
> I think its better to offer options to be utilized for different
> purposes. Tradeoff is always (some) overlap.
> 
> We don't have one filesystem for all use cases ;-)
> 

No...  I find myself wearing several hats here.

As a developer, I think it is great that you are writing your own
raid5 implementation.  Doing stuff like that is lots of fun and a
valuable learning experience.  As I read through the code there were a
number of times when I thought "that looks rather neat".  Quite
possibly there are ideas in dm-raid5 that I could "steal" to make
md/raid5 better.
dm-message certainly has its merits too (as I said, it fits the dm model).

As a member of the linux kernel community, I want to see linux grow
with a sense of unity.  Where possible, similar tasks should be
addressed in similar ways.  This avoids duplicating mistakes, eases
maintenance, and makes it easier for newcomers to learn.  Establishing
Patterns and following them is a good thing.  Hence my desire to
encourage the use of sysfs and a single xor implementation.

As an employee of a company that sells support, I really don't want a
profusion of different implementations of something that I am seen as
an expert in, as I know I'll end up having to support all of them.
So for very selfish reasons, I'd prefer fewer versions of NFS, fewer
file-systems, and fewer raid5 implementations :-)
Well... fewer that we contract to support anyway.

> > 
> >  I suspect, using your model, the approach would be a 'dm-message'
> >  acknowledging that a given device is faulty.  I would, of course,
> >  rather this be done via sysfs.
> > 
> >  I will try to look at factoring out the md specific part of md/raid5
> >  over the next few days (maybe next week) and see how that looks.
> 
> That'll be interesting to see how it fits into the device-mapper
> framework
> 
> I take it You'll port code to create an "md-raid456" target ?

That's the idea.  I'll let you know when I have something credible.

Thanks,
NeilBrown


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-21 12:06               ` Neil Brown
@ 2009-06-22 12:25                 ` Neil Brown
  2009-06-22 19:10                 ` Heinz Mauelshagen
  1 sibling, 0 replies; 23+ messages in thread
From: Neil Brown @ 2009-06-22 12:25 UTC (permalink / raw)
  To: heinzm, Dan Williams, device-mapper development,
	Christoph Hellwig, Ed Ciechanowski

On Sunday June 21, neilb@suse.de wrote:
> On Friday June 19, heinzm@redhat.com wrote:
> > 
> > > 
> > > - You have introduced "dm-message" which is a frame work for sending
> > >   messages to dm targets.
> > 
> > dm-message is a parser for messages sent to a device-mapper targets
> > message interface. It aims at allowing common message parsing functions
> > to be shared between targets.
> > 
> > The message interface in general allows for runtime state changes of
> > targets like those you found below.
> > 
> > See "dmsetup message ..."
> > 

I didn't process this bit properly before...
When I saw "dm-message.c" I assumed you were setting up a whole new
messaging infrastructure, not just writing a parser.  Sorry about
that.
So I can now see that your approach is quite defensible. 
(I would still like to see dm using sysfs though :-)

> > > 
> > >  I will try to look at factoring out the md specific part of md/raid5
> > >  over the next few days (maybe next week) and see how that looks.
> > 
> > That'll be interesting to see how it fits into the device-mapper
> > framework
> > 
> > I take it You'll port code to create an "md-raid456" target ?
> 
> That's the idea.  I'll let you know when I have something credible.

Well... I now have something credible.
It is only lightly tested.  And parts of it are incredibly ugly.  The
code shouldn't be read as "this is how it should be done" but "this
is an issue that needs to be resolved - here is a hack for now".

I used a lot of your code and hacked md and raid5 until I could plug
them together.

I can feed
 0 12000000 raid456 core 1 1 raid5_la 6 128 -1 -1 -1 -1 nosync 4 3 /dev/sdb 0 /dev/sdc 0 /dev/sdd 0 /dev/sde 0

to "dmsetup create" and get a raid5.
I can change 'nosync' or 'sync' and it will resync.
If I mark one of the devices as needing rebuild, it will do that.
And I can mkfs, and "mount" and "cp -r" and it seems to work.
I can even "dmsetup remove" and then
  mdadm --create /dev/md0 -l5 -pla -n4 -z 2000000 --assume-clean /dev/sd[bcde]
and see the same data.

Some notes:
 - there are a number of raid5 layouts and raid6 layouts that md
   supports but which I have not coded support for.  These are mostly
   very odd layouts designed as transition points when converting a
   raid5 to a raid6.
 - I haven't supported all the possible positions for the parity block
   for raid4 - only '0' and 'N'.  Given that the kernel doesn't do any
   metadata management, user-space could always list the parity block
   first, then the data blocks, and you would only need one layout.
 - I have ignored io_size - md always uses PAGE_SIZE
 - Using "dev_to_initialize" doesn't generalise to raid6, which could
   have two devices to initialise.
 - Depending on metadata support, it can be useful to say "this device
   has been recovered up to sector X" or "the array is in-sync up to
   sector Y", rather than having binary flags for these concepts.
   It might be good to support this in the dm interface.  
   i.e. instead of "sync" or "nosync", have a number saying
   'sync from this sector onwards'.  Have a similar number for each
   drive.
 - The status reports I generate are quite different to what your code
   would generate, largely because I left some bits out.  You could
   put in some dummy values if you want currently-existing code to be
   able to parse it.
 - I haven't tested "dmsetup message" at all yet (it is getting late
   and I am tired :-)
 - I haven't preserved the XX_parm values... I'm not sure if I've lost
   something by not doing that.
 - I haven't supported auto-resize of the stripe cache.
 - I think that if a drive throws an error, the raid5 will freeze
   waiting for the faulty drive to be unblocked - which cannot currently
   be done.  It should alert user-space though.
 - Some of the code is really really ugly.
 - I've done something horrible to get a queue to use for unplugging.
   It doesn't feel right to use the dm device's queue, and there could
   be multiple targets hanging off the one dm-table, and they would
   fight for use of that queue.
 - Due to minor implementation details, you can currently only create
   one dm-raid5 at a time (it will be called 'md-X' in various
   messages and process names).
 - There are other ugly things in the code.
 - There are probably other things I've forgotten to mention.
 (BTW, there is an is_power_of_2() function in include/linux/log2.h)
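
A rough sketch of the per-device "recovered up to sector X" idea from
the notes above (hypothetical - the constructor would have to grow a
resync checkpoint plus one offset per device, which neither patch
currently has):

static void set_recovery_offsets(struct raid_set *rs, sector_t array_synced,
				 sector_t *dev_recovered)
{
	int i;

	/* Array is known clean up to 'array_synced'; resync continues
	 * from there instead of the all-or-nothing sync/nosync flag. */
	rs->md.recovery_cp = array_synced;

	for (i = 0; i < rs->md.raid_disks; i++) {
		/* Each device has been recovered up to its own offset,
		 * replacing the single dev_to_initialize index. */
		rs->dev[i].rdev.recovery_offset = dev_recovered[i];
		if (dev_recovered[i] == MaxSector)
			set_bit(In_sync, &rs->dev[i].rdev.flags);
	}
}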

I am unlikely to find more time to play with this until next week, but
I do hope to do some tidying up in md and raid5 then so some of this
code can be a little less ugly.

The following patch is on top of current linus.git kernel, with the
dm-message patch applied.

NeilBrown


commit b2224304f0d9553d73b1a47ddbdef2f0766dd07a
Author: NeilBrown <neilb@suse.de>
Date:   Mon Jun 22 21:52:32 2009 +1000

    dm-raid456

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675..eab8e52 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -264,4 +264,12 @@ config DM_UEVENT
 	---help---
 	Generate udev events for DM events.
 
+config DM_RAID456
+	tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && MD_RAID456 && EXPERIMENTAL
+	---help---
+	A target that supports RAID4, RAID5 and RAID6 mappings
+
+	If unsure, say N.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 45cc595..dc99ae0 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_DM_RAID456)	+= dm-raid456.o dm-message.o
 
 quiet_cmd_unroll = UNROLL  $@
       cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
new file mode 100644
index 0000000..4bd1867
--- /dev/null
+++ b/drivers/md/dm-raid456.c
@@ -0,0 +1,1062 @@
+static const char *version = "v0.3000md";
+
+#include "md.h"
+#include "raid5.h"
+#include "dm.h"
+#include "dm-message.h"
+
+extern int raid5_congested(void *data, int bits);
+extern int raid5_set_cache_size(mddev_t *mddev, int size);
+extern int do_md_run(mddev_t * mddev);
+extern int do_md_stop(mddev_t * mddev, int mode, int is_open);
+extern int md_make_request(struct request_queue *q, struct bio *bio);
+extern void mddev_suspend(mddev_t *mddev);
+extern void mddev_resume(mddev_t *mddev);
+
+
+/* Factor out to dm.h. */
+/* Reference to array end. */
+#define ARRAY_END(a)    ((a) + ARRAY_SIZE(a))
+#define	SECTORS_PER_PAGE	(PAGE_SIZE >> SECTOR_SHIFT)
+
+/*
+ * Configurable parameters
+ */
+
+/* Minimum/maximum and default # of selectable stripes. */
+#define	STRIPES_MIN		8
+#define	STRIPES_MAX		16384
+#define	STRIPES_DEFAULT		80
+
+/* Maximum and default chunk size in sectors if not set in constructor. */
+#define	CHUNK_SIZE_MIN		8
+#define	CHUNK_SIZE_MAX		16384
+#define	CHUNK_SIZE_DEFAULT	64
+
+/* Default io size in sectors if not set in constructor. */
+#define	IO_SIZE_MIN		CHUNK_SIZE_MIN
+#define	IO_SIZE_DEFAULT		IO_SIZE_MIN
+
+/* Recover io size default in sectors. */
+#define	RECOVER_IO_SIZE_MIN		64
+#define	RECOVER_IO_SIZE_DEFAULT		256
+
+/* Default, minimum and maximum percentage of recover io bandwidth. */
+#define	BANDWIDTH_DEFAULT	10
+#define	BANDWIDTH_MIN		1
+#define	BANDWIDTH_MAX		100
+
+/* # of parallel recovered regions */
+#define RECOVERY_STRIPES_MIN	1
+#define RECOVERY_STRIPES_MAX	64
+#define RECOVERY_STRIPES_DEFAULT	RECOVERY_STRIPES_MIN
+/*
+ * END Configurable parameters
+ */
+
+#define	TARGET	"dm-raid45"
+#define	DM_MSG_PREFIX	TARGET
+
+/* Check value in range. */
+#define	range_ok(i, min, max)	(i >= min && i <= max)
+
+/* Check argument is power of 2. */
+#define POWER_OF_2(a) (!(a & (a - 1)))
+
+/* Factor out to dm.h */
+#define TI_ERR_RET(str, ret) \
+	do { ti->error = str; return ret; } while (0)
+#define TI_ERR(str)     TI_ERR_RET(str, -EINVAL)
+
+
+enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };
+
+struct dm_raid45_locking_type {
+	/* Request a lock on a stripe. */
+	void* (*lock)(sector_t key, enum dm_lock_type type);
+
+	/* Release a lock on a stripe. */
+	void (*unlock)(void *lock_handle);
+};
+
+/*
+ * Stripe cache locking functions
+ */
+/* Dummy lock function for single host RAID4+5. */
+static void *no_lock(sector_t key, enum dm_lock_type type)
+{
+	return &no_lock;
+}
+
+/* Dummy unlock function for single host RAID4+5. */
+static void no_unlock(void *lock_handle)
+{
+}
+
+/* No locking (for single host RAID 4+5). */
+static struct dm_raid45_locking_type locking_none = {
+	.lock = no_lock,
+	.unlock = no_unlock,
+};
+
+struct raid_type {
+	const char *name;		/* RAID algorithm. */
+	const char *descr;		/* Descriptor text for logging. */
+	const unsigned parity_devs;	/* # of parity devices. */
+	const unsigned minimal_devs;	/* minimal # of devices in set. */
+	const unsigned level;		/* RAID level. */
+	const unsigned algorithm;	/* RAID algorithm. */
+};
+
+/* Supported raid types and properties. */
+static struct raid_type raid_types[] = {
+	{"raid4",    "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
+	{"raid5_la", "RAID5 (left asymmetric)",       1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra", "RAID5 (right asymmetric)",      1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid5_ls", "RAID5 (left symmetric)",        1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs", "RAID5 (right symmetric)",       1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid6_zr", "RAID6 (zero restart)",          2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART },
+	{"raid6_nr", "RAID6 (N restart)",             2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc", "RAID6 (N continue)",            2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+/* Return pointer to raid_type structure for raid name. */
+static struct raid_type *get_raid_type(char *name)
+{
+	struct raid_type *r = ARRAY_END(raid_types);
+
+	while (r-- > raid_types) {
+		if (!strcmp(r->name, name))
+			return r;
+	}
+
+	return NULL;
+}
+
+/* FIXME: factor out to dm core. */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+	sector_t r = a;
+
+	sector_div(r, b);
+	*n = r;
+	return a == r * b;
+}
+
+struct raid_dev {
+	struct dm_dev *dev;
+	struct mdk_rdev_s rdev;
+};
+
+struct raid_set {
+	struct dm_target *ti;
+	struct mddev_s md;
+	struct raid_type *raid_type;
+	int raid_parms;
+	struct work_struct ws_do_table_event;
+	struct mdk_personality pers, *oldpers;
+	struct raid_dev dev[0];
+};
+
+/* Throw an event. */
+static void do_table_event(struct work_struct *ws)
+{
+	struct raid_set *rs = container_of(ws, struct raid_set,
+					   ws_do_table_event);
+	dm_table_event(rs->ti->table);
+}
+
+static void dm_raid5_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct raid_set *rs = container_of(mddev, struct raid_set, md);
+	
+	rs->oldpers->error_handler(mddev, rdev);
+	schedule_work(&rs->ws_do_table_event);
+}
+
+/*
+ * Allocate a RAID context (a RAID set)
+ */
+/* Structure for variable RAID parameters. */
+struct variable_parms {
+	int bandwidth;
+	int bandwidth_parm;
+	int chunk_size;
+	int chunk_size_parm;
+	int io_size;
+	int io_size_parm;
+	int stripes;
+	int stripes_parm;
+	int recover_io_size;
+	int recover_io_size_parm;
+	int raid_parms;
+	int recovery;
+	int recovery_stripes;
+	int recovery_stripes_parm;
+};
+
+static struct raid_set *
+context_alloc(struct raid_type *raid_type, struct variable_parms *p,
+	      unsigned raid_devs, sector_t sectors_per_dev,
+	      struct dm_target *ti, unsigned dl_parms, char **argv)
+{
+	/* No dirty log for now */
+	struct raid_set *rs;
+	int len;
+
+	len = sizeof(rs->dev[0]);
+	if (dm_array_too_big(sizeof(*rs), len, raid_devs))
+		goto bad_array;
+
+	len = sizeof(*rs) + raid_devs * len;
+	rs = kzalloc(len, GFP_KERNEL);
+	if (!rs)
+		goto bad_alloc;
+
+	rs->ti = ti;
+	/* initialisations from mddev_find */
+	mutex_init(&rs->md.reconfig_mutex);
+	INIT_LIST_HEAD(&rs->md.disks);
+	INIT_LIST_HEAD(&rs->md.all_mddevs);
+	init_timer(&rs->md.safemode_timer);
+	spin_lock_init(&rs->md.write_lock);
+	init_waitqueue_head(&rs->md.sb_wait);
+	init_waitqueue_head(&rs->md.recovery_wait);
+	rs->md.reshape_position = MaxSector;
+	rs->md.resync_max = MaxSector;
+	/* This is horrible! */
+	rs->md.queue = blk_alloc_queue(GFP_KERNEL);
+	/* initialise unplug timer */
+	blk_queue_make_request(rs->md.queue, NULL);
+	rs->md.queue->queuedata = &rs->md;
+	rs->md.sysfs_state = NULL;
+
+	rs->raid_type = raid_type;
+	rs->md.raid_disks = raid_devs;
+	rs->md.level = raid_type->level;
+	rs->md.dev_sectors = sectors_per_dev;
+	rs->md.persistent = 1;
+	rs->md.external = 1;
+	rs->md.layout = raid_type->algorithm;
+	rs->md.chunk_sectors = p->chunk_size;
+
+	if (p->recovery)
+		rs->md.recovery_cp = 0;
+	else
+		rs->md.recovery_cp = MaxSector;
+
+	rs->md.new_level = rs->md.level;
+	rs->md.new_chunk_sectors = rs->md.chunk_sectors;
+	rs->md.new_layout = rs->md.layout;
+	rs->md.delta_disks = 0;
+
+	INIT_WORK(&rs->ws_do_table_event, do_table_event);
+	return rs;
+
+bad_array:
+	TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));
+
+bad_alloc:
+	TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
+}
+
+/* Free a RAID context (a RAID set). */
+static void context_free(struct raid_set *rs, unsigned p)
+{
+	while (p--)
+		dm_put_device(rs->ti, rs->dev[p].dev);
+
+	blk_put_queue(rs->md.queue);
+	kfree(rs);
+}
+
+/* Log RAID set information to kernel log. */
+static void rs_log(struct raid_set *rs)
+{
+	unsigned p;
+	char buf[BDEVNAME_SIZE];
+	raid5_conf_t *conf = rs->md.private;
+
+	for (p = 0; p < rs->md.raid_disks; p++)
+		DMINFO("/dev/%s is raid disk %u",
+		       bdevname(rs->dev[p].dev->bdev, buf), p);
+
+	DMINFO("%d sectors chunk size, %u stripes\n"
+	       "%s set with %u devices",
+	       rs->md.chunk_sectors,
+	       conf->max_nr_stripes,
+	       rs->raid_type->descr, rs->md.raid_disks);
+}
+/* Get all devices and offsets. */
+static int dev_parms(struct raid_set *rs, char **argv, int dev_to_init, int *p)
+{
+	struct dm_target *ti = rs->ti;
+
+	for (*p = 0; *p < rs->md.raid_disks; (*p)++, argv += 2) {
+		int r;
+		unsigned long long tmp;
+		struct raid_dev *dev = rs->dev + *p;
+
+		/* Get offset and device. */
+		if (sscanf(argv[1], "%llu", &tmp) != 1 ||
+		    tmp > rs->md.dev_sectors)
+			/* FIXME this test doesn't make sense */
+			TI_ERR("Invalid RAID device offset parameter");
+
+		dev->rdev.data_offset = tmp;
+		r = dm_get_device(ti, *argv, tmp,
+				  rs->md.dev_sectors,
+				  dm_table_get_mode(ti->table), &dev->dev);
+		if (r)
+			TI_ERR_RET("RAID device lookup failure", r);
+
+		/* avoid duplicates */
+		for (r=0; r < *p; r++)
+			if (dev->dev->bdev == rs->dev[r].dev->bdev) {
+				dm_put_device(ti, dev->dev);
+				TI_ERR_RET("Duplicate RAID device", -ENXIO);
+			}
+		/* initialise rest of 'rdev' - from md_import_device */
+		dev->rdev.desc_nr = -1;
+		dev->rdev.saved_raid_disk = -1;
+		dev->rdev.raid_disk = *p;
+		dev->rdev.flags = 0;
+		if (*p != dev_to_init)
+			set_bit(In_sync, &dev->rdev.flags);
+		atomic_set(&dev->rdev.nr_pending, 0);
+		atomic_set(&dev->rdev.read_errors, 0);
+		atomic_set(&dev->rdev.corrected_errors, 0);
+		init_waitqueue_head(&dev->rdev.blocked_wait);
+		dev->rdev.sb_start = 0;
+		dev->rdev.sb_size = 0;
+		/* and from bind_rdev_to_array */
+		dev->rdev.mddev = &rs->md;
+		dev->rdev.sysfs_state = NULL;
+		dev->rdev.bdev = dev->dev->bdev;
+		list_add(&dev->rdev.same_set, &rs->md.disks);
+	}
+
+	return 0;
+}
+
+/* Parse optional locking parameters. */
+static int get_raid_locking_parms(struct dm_target *ti, char **argv,
+				  int *locking_parms,
+				  struct dm_raid45_locking_type **locking_type)
+{
+	if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
+		char *lckstr = argv[1];
+		size_t lcksz = strlen(lckstr);
+
+		if (!strnicmp(lckstr, "none", lcksz)) {
+			*locking_type = &locking_none;
+			*locking_parms = 2;
+			return 0;	/* don't fall through to the defaults below */
+		} else if (!strnicmp(lckstr, "cluster", lcksz)) {
+			DMERR("locking type \"%s\" not yet implemented",
+			      lckstr);
+			return -EINVAL;
+		} else {
+			DMERR("unknown locking type \"%s\"", lckstr);
+			return -EINVAL;
+		}
+	}
+
+	*locking_parms = 0;
+	*locking_type = &locking_none;
+	return 0;
+}
+
+
+/* Set backing device read ahead properties of RAID set. */
+static void rs_set_read_ahead(struct raid_set *rs,
+			      unsigned sectors, unsigned stripes)
+{
+	unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
+	struct mapped_device *md = dm_table_get_md(rs->ti->table);
+	struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
+
+	/* Set read-ahead for the RAID set and the component devices. */
+	if (ra_pages) {
+		unsigned p = rs->md.raid_disks;
+		int data = p - 1;
+		if (rs->md.level == 6)
+			data --;
+
+		bdi->ra_pages = stripes * ra_pages * data;
+
+		while (p--) {
+			struct request_queue *q =
+				bdev_get_queue(rs->dev[p].dev->bdev);
+
+			q->backing_dev_info.ra_pages = ra_pages;
+		}
+	}
+
+	dm_put(md);
+}
+
+/* RAID set congested function. */
+static int rs_congested(void *congested_data, int bdi_bits)
+{
+	int r;
+	unsigned p;
+	struct raid_set *rs = congested_data;
+
+	r = raid5_congested(&rs->md, bdi_bits);
+	if (r)
+		;
+	else for (r = 0, p = rs->md.raid_disks; !r && p--; ) {
+		/* If any of our component devices are overloaded. */
+		struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
+
+		r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+	}
+
+	return r;
+}
+
+/* Set congested function. */
+static void rs_set_congested_fn(struct raid_set *rs)
+{
+	struct mapped_device *md = dm_table_get_md(rs->ti->table);
+	struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
+
+	/* Set congested function and data. */
+	bdi->congested_fn = rs_congested;
+	bdi->congested_data = rs;
+	dm_put(md);
+}
+
+/* Set recovery bandwidth */
+static void
+recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
+{
+	/* hack to convert a percent to K/s assuming the device
+	 * can manage 100M/sec
+	 */
+	rs->md.sync_speed_min = bandwidth * 1024;
+}
+
+/* Get recovery bandwidth */
+static unsigned
+recover_get_bandwidth(struct raid_set *rs)
+{
+	return rs->md.sync_speed_min / 1024;
+}
+
+/* Handle variable number of RAID parameters. */
+static int get_raid_variable_parms(struct dm_target *ti, char **argv, 
+				   struct variable_parms *vp)
+{
+	int p, value;
+	struct {
+		int action; /* -1: skip, 0: no power2 check, 1: power2 check */
+		char *errmsg;
+		int min, max;
+		int *var, *var2, *var3;
+	} argctr[] = {
+		{ 1,
+		  "Invalid chunk size; must be -1 or 2^^n and <= 16384",
+ 		  IO_SIZE_MIN, CHUNK_SIZE_MAX,
+		  &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
+		{ 0,
+		  "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
+		  STRIPES_MIN, STRIPES_MAX,
+		  &vp->stripes_parm, &vp->stripes, NULL },
+		{ 1,
+		  "Invalid io size; must be -1 or >= 8, 2^^n and less equal "
+		  "min(BIO_MAX_SECTORS/2, chunk size)",
+		  IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
+		  &vp->io_size_parm, &vp->io_size, NULL },
+		{ 1,
+		  "Invalid recovery io size; must be -1 or "
+		  "2^^n and less equal BIO_MAX_SECTORS/2",
+		  RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
+		  &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
+		{ 0,
+		  "Invalid recovery bandwidth percentage; "
+		  "must be -1 or > 0 and <= 100",
+		  BANDWIDTH_MIN, BANDWIDTH_MAX,
+		  &vp->bandwidth_parm, &vp->bandwidth, NULL },
+		/* Handle sync argument separately in loop. */
+		{ -1,
+		  "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
+		{ 0,
+		  "Invalid number of recovery stripes; "
+		  "must be -1, > 0 and <= 16384",
+		  RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
+		  &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
+	}, *varp;
+
+	/* Fetch # of variable raid parameters. */
+	if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
+	    !range_ok(vp->raid_parms, 0, 7))
+		TI_ERR("Bad variable raid parameters number");
+
+	/* Preset variable RAID parameters. */
+	vp->chunk_size = CHUNK_SIZE_DEFAULT;
+	vp->io_size = IO_SIZE_DEFAULT;
+	vp->stripes = STRIPES_DEFAULT;
+	vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
+	vp->bandwidth = BANDWIDTH_DEFAULT;
+	vp->recovery = 1;
+	vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
+
+	/* Walk the array of argument constraints for all given ones. */
+	for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
+	     	BUG_ON(varp >= ARRAY_END(argctr));
+
+		/* Special case for "[no]sync" string argument. */
+		if (varp->action < 0) {
+			if (!strcmp(*argv, "sync"))
+				;
+			else if (!strcmp(*argv, "nosync"))
+				vp->recovery = 0;
+			else
+				TI_ERR(varp->errmsg);
+
+			argv++;
+			continue;
+		}
+
+		/*
+		 * Special case for io_size depending
+		 * on previously set chunk size.
+		 */
+		if (p == 2)
+			varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
+
+		if (sscanf(*(argv++), "%d", &value) != 1 ||
+		    (value != -1 &&
+		     ((varp->action && !POWER_OF_2(value)) ||
+		      !range_ok(value, varp->min, varp->max))))
+			TI_ERR(varp->errmsg);
+
+		*varp->var = value;
+		if (value != -1) {
+			if (varp->var2)
+				*varp->var2 = value;
+			if (varp->var3)
+				*varp->var3 = value;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Construct a RAID4/5 mapping:
+ *
+ * log_type #log_params <log_params> \
+ * raid_type [#parity_dev] #raid_variable_params <raid_params> \
+ * [locking "none"/"cluster"]
+ * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
+ *
+ * log_type = "core"/"disk",
+ * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
+ * log_params = [dirty_log_path] region_size [[no]sync])
+ *
+ * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs",
+ *              "raid6_zr", "raid6_nr", "raid6_nc",
+ *
+ * #parity_dev = N if raid_type = "raid4"
+ * o N = -1: pick default = last device
+ * o N == 0 or == #raid_devs-1: parity device index
+ *
+ * #raid_variable_params = 0-7; raid_params (-1 = default):
+ *   [chunk_size [#stripes [io_size [recover_io_size \
+ *    [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
+ *   o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
+ *     and <= CHUNK_SIZE_MAX)
+ *   o #stripes is number of stripes allocated to stripe cache
+ *     (must be > 1 and < STRIPES_MAX)
+ *   o io_size (io unit size per device in sectors; must be 2^^n and > 8)
+ *   o recover_io_size (io unit size per device for recovery in sectors;
+ *     must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
+ *   o %recovery_bandwidth is the maximum amount spent for recovery during
+ *     application io (1-100%)
+ *   o recovery switch = [sync|nosync]
+ *   o #recovery_stripes is the number of recovery stripes used for
+ *     parallel recovery of the RAID set
+ * If raid_variable_params = 0, defaults will be used.
+ * Any raid_variable_param can be set to -1 to apply a default
+ *
+ * #raid_devs = N (N >= 2)
+ *
+ * #dev_to_initialize = N
+ * -1: initialize parity on all devices
+ * >= 0 and < #raid_devs: initialize the respective device; used to force
+ * reconstruction of a failed device's content after replacement
+ *
+ * <dev_path> = device_path (eg, /dev/sdd1)
+ * <offset>   = begin at offset on <dev_path>
+ *
+ */
+#define	MIN_PARMS	13
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int dev_to_init, dl_parms, i, locking_parms,
+	    parity_parm, pi = -1, r, raid_devs;
+	sector_t tmp, sectors_per_dev;
+	struct dm_raid45_locking_type *locking;
+	struct raid_set *rs;
+	struct raid_type *raid_type;
+	struct variable_parms parms;
+
+	/* Ensure minimum number of parameters. */
+	if (argc < MIN_PARMS)
+		TI_ERR("Not enough parameters");
+
+	/* Fetch # of dirty log parameters. */
+	if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
+	    !range_ok(dl_parms, 1, 4711)) /* ;-) */
+		TI_ERR("Bad dirty log parameters number");
+
+	/* Check raid_type. */
+	raid_type = get_raid_type(argv[dl_parms + 2]);
+	if (!raid_type)
+		TI_ERR("Bad raid type");
+
+	/* In case of RAID4, parity drive is selectable. */
+	parity_parm = !!(raid_type->level == 4);
+
+	/* Handle variable number of RAID parameters. */
+	r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
+				    &parms);
+	if (r)
+		return r;
+
+	/* Handle any locking parameters. */
+	r = get_raid_locking_parms(ti,
+				   argv + dl_parms + parity_parm +
+				   parms.raid_parms + 4,
+				   &locking_parms, &locking);
+	if (r)
+		return r;
+
+	/* # of raid devices. */
+	i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
+	if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
+	    raid_devs < raid_type->minimal_devs)
+		TI_ERR("Invalid number of raid devices");
+
+	/* In case of RAID4, check parity drive index is in limits. */
+	if (raid_type->algorithm == ALGORITHM_PARITY_0) {
+		/* Fetch index of parity device. */
+		if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
+		    (pi != -1 && pi != 0 && pi != raid_devs - 1))
+			TI_ERR("Invalid RAID4 parity device index");
+	}
+
+	/*
+	 * Index of device to initialize starts at 0
+	 *
+	 * o -1 -> don't initialize a selected device;
+	 *         initialize parity conforming to algorithm
+	 * o 0..raid_devs-1 -> initialize respective device
+	 *   (used for reconstruction of a replaced device)
+	 */
+	if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
+		   locking_parms + 5], "%d", &dev_to_init) != 1 ||
+	    !range_ok(dev_to_init, -1, raid_devs - 1))
+		TI_ERR("Invalid number for raid device to initialize");
+
+	/* Check # of raid device arguments. */
+	if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
+	    2 * raid_devs)
+		TI_ERR("Wrong number of raid device/offset arguments");
+
+	/*
+	 * Check that the table length is divisible
+	 * without remainder by (raid_devs - parity_devs)
+	 */
+	if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
+		      &sectors_per_dev))
+		TI_ERR("Target length not divisible by number of data devices");
+
+	/*
+	 * Check that the device size is
+	 * divisible without remainder by chunk size
+	 */
+	if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
+		TI_ERR("Device length not divisible by chunk_size");
+
+	/****************************************************************
+	 * Now that we checked the constructor arguments ->
+	 * let's allocate the RAID set
+	 ****************************************************************/
+	rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
+			   ti, dl_parms, argv);
+	if (IS_ERR(rs))
+		return PTR_ERR(rs);
+
+
+	if (rs->md.layout == ALGORITHM_PARITY_0 &&
+	    pi != 0)
+		rs->md.layout = ALGORITHM_PARITY_N;
+
+	recover_set_bandwidth(rs, parms.bandwidth);
+
+	/* Get the device/offset tupels. */
+	argv += dl_parms + 6 + parity_parm + parms.raid_parms;
+	r = dev_parms(rs, argv, dev_to_init, &i);
+	if (r)
+		goto err;
+
+	/* Set backing device information (eg. read ahead). */
+	rs_set_read_ahead(rs, 2 * rs->md.chunk_sectors /* sectors per device */,
+			      2 /* # of stripes */);
+	rs_set_congested_fn(rs); /* Set congested function. */
+
+	rs->raid_parms = parms.raid_parms;
+	/*
+	 * Make sure that dm core only hands maximum chunk_size
+	 * length down and pays attention to io boundaries.
+	 * This is only needed for reads.  If a read is within one chunk,
+	 * we can bypass the cache.
+	 */
+	ti->split_io = rs->md.chunk_sectors;
+	ti->private = rs;
+
+	/* Initialize work queue to handle this RAID set's io. */
+	mutex_lock(&rs->md.reconfig_mutex); 
+	r = do_md_run(&rs->md);
+
+	/* Now this is *really* horrible, but I need a call-back
+	 * when an error is thrown
+	 */
+	rs->oldpers = rs->md.pers;
+	rs->pers = *rs->md.pers;
+	rs->pers.error_handler = dm_raid5_error;
+	rs->md.pers = &rs->pers;
+	mutex_unlock(&rs->md.reconfig_mutex); 
+	if (r)
+		goto err;
+	rs->md.safemode = 0;
+	rs->md.safemode_delay = 0;
+	rs->md.in_sync = 0;
+	rs->md.ro = 0;
+	/* Now we can adjust the cache size */
+	raid5_set_cache_size(&rs->md, parms.stripes);
+
+	rs_log(rs); /* Log information about RAID set. */
+	return 0;
+
+err:
+	context_free(rs, i);
+	return r;
+}
+
+/*
+ * Destruct a raid mapping
+ */
+static void raid_dtr(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	int d = rs->md.raid_disks;
+
+	mutex_lock(&rs->md.reconfig_mutex);
+	mddev_resume(&rs->md);
+	do_md_stop(&rs->md, 2, 100);
+	mutex_unlock(&rs->md.reconfig_mutex);
+
+	context_free(rs, d);
+}
+
+/* Raid mapping function. */
+static int raid_map(struct dm_target *ti, struct bio *bio,
+		    union map_info *map_context)
+{
+	struct raid_set *rs = ti->private;
+	struct request_queue *q = rs->md.queue;
+	md_make_request(q, bio);
+	return DM_MAPIO_SUBMITTED;
+}
+
+/* Device suspend. */
+static void raid_presuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	mutex_lock(&rs->md.reconfig_mutex);
+	mddev_suspend(&rs->md);
+	mutex_unlock(&rs->md.reconfig_mutex);
+}
+
+/* Device resume. */
+static void raid_resume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	mutex_lock(&rs->md.reconfig_mutex);
+	mddev_resume(&rs->md);
+	mutex_unlock(&rs->md.reconfig_mutex);
+}
+
+static int raid_status(struct dm_target *ti, status_type_t type,
+		       char *result, unsigned maxlen)
+{
+	unsigned p, sz = 0;
+	char buf[BDEVNAME_SIZE];
+	struct raid_set *rs = ti->private;
+	raid5_conf_t *conf = rs->md.private;
+
+	int raid_parms[] = {
+		rs->md.chunk_sectors,
+		conf->max_nr_stripes,
+		PAGE_SIZE,
+		PAGE_SIZE,
+		recover_get_bandwidth(rs),
+		-2,
+		conf->max_nr_stripes,
+	};
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+
+		DMEMIT("%u ", rs->md.raid_disks);
+
+		for (p = 0; p < rs->md.raid_disks; p++)
+			DMEMIT("%s ",
+			       format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
+
+		DMEMIT("2 ");
+		for (p = 0; p < rs->md.raid_disks; p++) {
+			DMEMIT("%c", !test_bit(Faulty, &rs->dev[p].rdev.flags)
+			       ? 'A' : 'D');
+
+			if (!test_bit(In_sync, &rs->dev[p].rdev.flags))
+				DMEMIT("i");
+			if (test_bit(Blocked, &rs->dev[p].rdev.flags))
+				DMEMIT("b");
+		}
+
+		DMEMIT(" %llu/%llu ",
+		       (unsigned long long) rs->md.curr_resync_completed,
+		       (unsigned long long) rs->md.dev_sectors);
+
+		break;
+	case STATUSTYPE_TABLE:
+		/* fake as core_status with sector size of 1 */
+		DMEMIT("core 2 1 ");
+
+		DMEMIT("%s %u ", rs->raid_type->name, rs->raid_parms);
+
+		for (p = 0; p < rs->raid_parms; p++) {
+			if (raid_parms[p] > -2)
+				DMEMIT("%d ", raid_parms[p]);
+			else
+				DMEMIT("%s ", rs->md.recovery_cp == MaxSector ?
+					      "sync" : "nosync");
+		}
+
+		DMEMIT("%u %d ", rs->md.raid_disks, -1);
+
+		for (p = 0; p < rs->md.raid_disks; p++)
+			DMEMIT("%s %llu ",
+			       format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
+			       (unsigned long long) rs->dev[p].rdev.data_offset);
+	}
+
+	return 0;
+}
+
+/*
+ * Message interface
+ */
+enum raid_msg_actions {
+	act_bw,			/* Recovery bandwidth switch. */
+	act_dev,		/* Device failure switch. */
+	act_overwrite,		/* Stripe overwrite check. */
+	act_stats,		/* Development statistics switch. */
+	act_sc,			/* Stripe cache switch. */
+
+	act_on,			/* Set entity on. */
+	act_off,		/* Set entity off. */
+	act_reset,		/* Reset entity. */
+
+	act_set = act_on,	/* Set # absolute. */
+	act_grow = act_off,	/* Grow # by an amount. */
+	act_shrink = act_reset,	/* Shrink # by an amount. */
+};
+
+/* Turn a delta into an absolute value. */
+static int _absolute(unsigned long action, int act, int r)
+{
+	/* Make delta absolute. */
+	if (test_bit(act_set, &action))
+		;
+	else if (test_bit(act_grow, &action))
+		r += act;
+	else if (test_bit(act_shrink, &action))
+		r = act - r;
+	else
+		r = -EINVAL;
+
+	return r;
+}
+
+ /* Change recovery io bandwidth. */
+static int bandwidth_change(struct dm_msg *msg, void *context)
+{
+	struct raid_set *rs = context;
+	int act = recover_get_bandwidth(rs);
+	int bandwidth = DM_MSG_INT_ARG(msg);
+
+	if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+		/* Make delta bandwidth absolute. */
+		bandwidth = _absolute(msg->action, act, bandwidth);
+
+		/* Check range. */
+		if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+			recover_set_bandwidth(rs, bandwidth);
+			return 0;
+		}
+	}
+
+	set_bit(dm_msg_ret_arg, &msg->ret);
+	set_bit(dm_msg_ret_inval, &msg->ret);
+	return -EINVAL;
+}
+
+
+/* Resize the stripe cache. */
+static int sc_resize(struct dm_msg *msg, void *context)
+{
+	int act, stripes;
+	struct raid_set *rs = context;
+	raid5_conf_t *conf = rs->md.private;
+
+	stripes = DM_MSG_INT_ARG(msg);
+	if (stripes > 0) {
+		mutex_lock(&rs->md.reconfig_mutex);
+		act = conf->max_nr_stripes;
+
+		/* Make delta stripes absolute. */
+		stripes = _absolute(msg->action, act, stripes);
+
+		/*
+		 * Check range and that the # of stripes changes.
+		 */
+		if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
+		    stripes != conf->max_nr_stripes) {
+			raid5_set_cache_size(&rs->md, stripes);
+			mutex_unlock(&rs->md.reconfig_mutex);
+			return 0;
+		}
+		mutex_unlock(&rs->md.reconfig_mutex);
+	}
+
+	set_bit(dm_msg_ret_arg, &msg->ret);
+	set_bit(dm_msg_ret_inval, &msg->ret);
+	return -EINVAL;
+}
+
+/* Parse the RAID message action. */
+/*
+ * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'	# e.g 'ba se 50'
+ * "o[verwrite]  {on,of[f],r[eset]}'		# e.g. 'o of'
+ * 'sta[tistics] {on,of[f],r[eset]}'		# e.g. 'stat of'
+ * 'str[ipecache] {se[t],g[row],sh[rink]} #'	# e.g. 'stripe set 1024'
+ *
+ */
+static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	/* Variables to store the parsed parameters in. */
+	static int i[2];
+	static unsigned long *i_arg[] = {
+		(unsigned long *) i + 0,
+		(unsigned long *) i + 1,
+	};
+
+	/* Declare all message option strings. */
+	static char *str_sgs[] = { "set", "grow", "shrink" };
+#if 0
+	static char *str_oor[] = { "on", "off", "reset" };
+#endif
+	/* Declare all actions. */
+	static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
+#if 0
+	static unsigned long act_oor[] = { act_on, act_off, act_reset };
+#endif
+	/* Bandwidth option. */
+	static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
+	static struct dm_message_argument bw_args = {
+		1, i_arg, { dm_msg_int_t }
+	};
+
+#if 0
+	static struct dm_message_argument null_args = {
+		0, NULL, { dm_msg_int_t }
+	};
+#endif
+	/* Stripecache option. */
+	static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
+
+	/* Declare messages. */
+	static struct dm_msg_spec specs[] = {
+		{ "bandwidth", act_bw, &bw_opt, &bw_args,
+		  0, bandwidth_change },
+		{ "stripecache", act_sc, &stripe_opt, &bw_args,
+		  0, sc_resize },
+	};
+
+	/* The message for the parser. */
+	struct dm_msg msg = {
+		.num_specs = ARRAY_SIZE(specs),
+		.specs = specs,
+	};
+
+	return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
+}
+/*
+ * END message interface
+ */
+
+static struct target_type raid_target = {
+	.name = "raid456",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = raid_ctr,
+	.dtr = raid_dtr,
+	.map = raid_map,
+	.presuspend = raid_presuspend,
+	.resume = raid_resume,
+	.status = raid_status,
+	.message = raid_message,
+};
+
+static void init_exit(const char *bad_msg, const char *good_msg, int r)
+{
+	if (r)
+		DMERR("Failed to %sregister target [%d]", bad_msg, r);
+	else
+		DMINFO("%s %s", good_msg, version);
+}
+
+static int __init dm_raid_init(void)
+{
+	int r = dm_register_target(&raid_target);
+
+	init_exit("", "initialized", r);
+
+	/* avoid this being called under a lock */
+	init_emergency_isa_pool();
+	return r;
+}
+
+static void __exit dm_raid_exit(void)
+{
+	dm_unregister_target(&raid_target);
+	init_exit("un", "exit", 0);
+}
+
+/* Module hooks. */
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com> and NeilBrown <neilb@suse.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 09be637..942ce83 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -208,7 +208,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
  * call has finished, the bio has been linked into some internal structure
  * and so is visible to ->quiesce(), so we don't need the refcount any more.
  */
-static int md_make_request(struct request_queue *q, struct bio *bio)
+int md_make_request(struct request_queue *q, struct bio *bio)
 {
 	mddev_t *mddev = q->queuedata;
 	int rv;
@@ -238,29 +238,34 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 
 	return rv;
 }
+EXPORT_SYMBOL_GPL(md_make_request);
 
-static void mddev_suspend(mddev_t *mddev)
+void mddev_suspend(mddev_t *mddev)
 {
 	BUG_ON(mddev->suspended);
 	mddev->suspended = 1;
 	synchronize_rcu();
 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 	mddev->pers->quiesce(mddev, 1);
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	if (mddev->unit) {
+		md_unregister_thread(mddev->thread);
+		mddev->thread = NULL;
+	}
 	/* we now know that no code is executing in the personality module,
 	 * except possibly the tail end of a ->bi_end_io function, but that
 	 * is certain to complete before the module has a chance to get
 	 * unloaded
 	 */
 }
+EXPORT_SYMBOL_GPL(mddev_suspend);
 
-static void mddev_resume(mddev_t *mddev)
+void mddev_resume(mddev_t *mddev)
 {
 	mddev->suspended = 0;
 	wake_up(&mddev->sb_wait);
 	mddev->pers->quiesce(mddev, 0);
 }
+EXPORT_SYMBOL_GPL(mddev_resume);
 
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
@@ -2981,8 +2986,8 @@ array_state_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%s\n", array_states[st]);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro, int is_open);
-static int do_md_run(mddev_t * mddev);
+int do_md_stop(mddev_t * mddev, int ro, int is_open);
+int do_md_run(mddev_t * mddev);
 static int restart_array(mddev_t *mddev);
 
 static ssize_t
@@ -3957,11 +3962,11 @@ static void md_safemode_timeout(unsigned long data)
 
 static int start_dirty_degraded;
 
-static int do_md_run(mddev_t * mddev)
+int do_md_run(mddev_t * mddev)
 {
 	int err;
 	mdk_rdev_t *rdev;
-	struct gendisk *disk;
+	struct gendisk *disk = NULL;
 	struct mdk_personality *pers;
 
 	if (list_empty(&mddev->disks))
@@ -4016,14 +4021,16 @@ static int do_md_run(mddev_t * mddev)
 				return -EINVAL;
 			}
 		}
-		sysfs_notify_dirent(rdev->sysfs_state);
+		if (rdev->sysfs_state)
+			sysfs_notify_dirent(rdev->sysfs_state);
 	}
 
-	md_probe(mddev->unit, NULL, NULL);
-	disk = mddev->gendisk;
-	if (!disk)
-		return -ENOMEM;
-
+	if (mddev->unit) {
+		md_probe(mddev->unit, NULL, NULL);
+		disk = mddev->gendisk;
+		if (!disk)
+			return -ENOMEM;
+	}
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
@@ -4044,7 +4051,7 @@ static int do_md_run(mddev_t * mddev)
 	}
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
-	if (pers->level >= 4 && pers->level <= 6)
+	if (pers->level >= 4 && pers->level <= 6 && mddev->gendisk)
 		/* Cannot support integrity (yet) */
 		blk_integrity_unregister(mddev->gendisk);
 
@@ -4123,7 +4130,7 @@ static int do_md_run(mddev_t * mddev)
 		bitmap_destroy(mddev);
 		return err;
 	}
-	if (mddev->pers->sync_request) {
+	if (mddev->pers->sync_request && mddev->unit) {
 		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
 			printk(KERN_WARNING
 			       "md: cannot register extra attributes for %s\n",
@@ -4139,6 +4146,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
 	mddev->in_sync = 1;
 
+	if (mddev->unit)
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0) {
 			char nm[20];
@@ -4153,7 +4161,8 @@ static int do_md_run(mddev_t * mddev)
 	if (mddev->flags)
 		md_update_sb(mddev, 0);
 
-	set_capacity(disk, mddev->array_sectors);
+	if (disk)
+		set_capacity(disk, mddev->array_sectors);
 
 	/* If there is a partially-recovered drive we need to
 	 * start recovery here.  If we leave it to md_check_recovery,
@@ -4187,13 +4196,15 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->changed = 1;
 	md_new_event(mddev);
-	sysfs_notify_dirent(mddev->sysfs_state);
-	if (mddev->sysfs_action)
+	if (mddev->unit) {
+		sysfs_notify_dirent(mddev->sysfs_state);
 		sysfs_notify_dirent(mddev->sysfs_action);
-	sysfs_notify(&mddev->kobj, NULL, "degraded");
-	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+		sysfs_notify(&mddev->kobj, NULL, "degraded");
+		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(do_md_run);
 
 static int restart_array(mddev_t *mddev)
 {
@@ -4250,7 +4261,7 @@ static void restore_bitmap_write_access(struct file *file)
  *   1 - switch to readonly
  *   2 - stop but do not disassemble array
  */
-static int do_md_stop(mddev_t * mddev, int mode, int is_open)
+int do_md_stop(mddev_t * mddev, int mode, int is_open)
 {
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
@@ -4283,7 +4294,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		case 2: /* stop */
 			bitmap_flush(mddev);
 			md_super_wait(mddev);
-			if (mddev->ro)
+			if (mddev->ro && disk)
 				set_disk_ro(disk, 0);
 
 			mddev->pers->stop(mddev);
@@ -4295,8 +4306,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 				mddev->private = &md_redundancy_group;
 			mddev->pers = NULL;
 			/* tell userspace to handle 'inactive' */
-			sysfs_notify_dirent(mddev->sysfs_state);
+			if (mddev->sysfs_state)
+				sysfs_notify_dirent(mddev->sysfs_state);
 
+			if (mddev->unit)
 			list_for_each_entry(rdev, &mddev->disks, same_set)
 				if (rdev->raid_disk >= 0) {
 					char nm[20];
@@ -4304,7 +4317,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 					sysfs_remove_link(&mddev->kobj, nm);
 				}
 
-			set_capacity(disk, 0);
+			if (disk)
+				set_capacity(disk, 0);
 			mddev->changed = 1;
 
 			if (mddev->ro)
@@ -4315,7 +4329,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			mddev->in_sync = 1;
 			md_update_sb(mddev, 1);
 		}
-		if (mode == 1)
+		if (mode == 1 && disk)
 			set_disk_ro(disk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
@@ -4382,12 +4396,15 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
 			mdname(mddev));
 	err = 0;
-	blk_integrity_unregister(disk);
-	md_new_event(mddev);
-	sysfs_notify_dirent(mddev->sysfs_state);
+	if (disk) {
+		blk_integrity_unregister(disk);
+		md_new_event(mddev);
+		sysfs_notify_dirent(mddev->sysfs_state);
+	}
 out:
 	return err;
 }
+EXPORT_SYMBOL_GPL(do_md_stop);
 
 #ifndef MODULE
 static void autorun_array(mddev_t *mddev)
@@ -6076,6 +6093,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 
 	BUG_ON(mddev->ro == 1);
 	if (mddev->ro == 2) {
+		printk("ro2\n");
 		/* need to switch to read/write */
 		mddev->ro = 0;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -6087,6 +6105,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
 	if (mddev->in_sync) {
+		printk("insync\n");
 		spin_lock_irq(&mddev->write_lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
@@ -6330,8 +6349,10 @@ void md_do_sync(mddev_t *mddev)
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed =
 				mddev->curr_resync;
-			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
-			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+			if (mddev->unit) {
+				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+				sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+			}
 		}
 
 		if (j >= mddev->resync_max)
@@ -6445,14 +6466,16 @@ void md_do_sync(mddev_t *mddev)
 					rdev->recovery_offset = mddev->curr_resync;
 		}
 	}
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	if (mddev->unit)
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
  skip:
 	mddev->curr_resync = 0;
 	mddev->curr_resync_completed = 0;
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
-	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+	if (mddev->unit)
+		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	wake_up(&resync_wait);
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -6602,7 +6625,7 @@ void md_check_recovery(mddev_t *mddev)
 			if (mddev->safemode == 1)
 				mddev->safemode = 0;
 			spin_unlock_irq(&mddev->write_lock);
-			if (did_change)
+			if (did_change && mddev->sysfs_state)
 				sysfs_notify_dirent(mddev->sysfs_state);
 		}
 
@@ -6611,7 +6634,8 @@ void md_check_recovery(mddev_t *mddev)
 
 		list_for_each_entry(rdev, &mddev->disks, same_set)
 			if (test_and_clear_bit(StateChanged, &rdev->flags))
-				sysfs_notify_dirent(rdev->sysfs_state);
+				if (rdev->sysfs_state)
+					sysfs_notify_dirent(rdev->sysfs_state);
 
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -6629,6 +6653,7 @@ void md_check_recovery(mddev_t *mddev)
 				/* success...*/
 				/* activate any spares */
 				if (mddev->pers->spare_active(mddev))
+					if (mddev->unit)
 					sysfs_notify(&mddev->kobj, NULL,
 						     "degraded");
 			}
@@ -6647,6 +6672,7 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			if (mddev->unit)
 			sysfs_notify_dirent(mddev->sysfs_action);
 			md_new_event(mddev);
 			goto unlock;
@@ -6709,7 +6735,8 @@ void md_check_recovery(mddev_t *mddev)
 				mddev->recovery = 0;
 			} else
 				md_wakeup_thread(mddev->sync_thread);
-			sysfs_notify_dirent(mddev->sysfs_action);
+			if (mddev->unit)
+				sysfs_notify_dirent(mddev->sysfs_action);
 			md_new_event(mddev);
 		}
 	unlock:
@@ -6726,7 +6753,8 @@ void md_check_recovery(mddev_t *mddev)
 
 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
-	sysfs_notify_dirent(rdev->sysfs_state);
+	if (rdev->sysfs_state)
+		sysfs_notify_dirent(rdev->sysfs_state);
 	wait_event_timeout(rdev->blocked_wait,
 			   !test_bit(Blocked, &rdev->flags),
 			   msecs_to_jiffies(5000));
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f9f991e..6fe6960 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3323,7 +3323,7 @@ static void raid5_unplug_device(struct request_queue *q)
 	unplug_slaves(mddev);
 }
 
-static int raid5_congested(void *data, int bits)
+int raid5_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
 	raid5_conf_t *conf = mddev->private;
@@ -3340,6 +3340,7 @@ static int raid5_congested(void *data, int bits)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(raid5_congested);
 
 /* We want read requests to align with chunks where possible,
  * but write requests don't need to.
@@ -3422,6 +3423,7 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
 }
 
 
+static void *bio_fs_destructor;
 /*
  *  The "raid5_align_endio" should check if the read succeeded and if it
  *  did, call bio_endio on the original bio (having bio_put the new bio
@@ -3436,9 +3438,10 @@ static void raid5_align_endio(struct bio *bi, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	mdk_rdev_t *rdev;
 
+	mddev = (void*)bi->bi_destructor;
+	bi->bi_destructor = bio_fs_destructor;
 	bio_put(bi);
 
-	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
 	conf = mddev->private;
 	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
@@ -3502,6 +3505,8 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	 */
 	align_bi->bi_end_io  = raid5_align_endio;
 	align_bi->bi_private = raid_bio;
+	bio_fs_destructor = align_bi->bi_destructor;
+	align_bi->bi_destructor = (void*)mddev;
 	/*
 	 *	compute position
 	 */
@@ -3537,6 +3542,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 		return 1;
 	} else {
 		rcu_read_unlock();
+		align_bi->bi_destructor = bio_fs_destructor;
 		bio_put(align_bi);
 		return 0;
 	}
@@ -3613,12 +3619,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
 	md_write_start(mddev, bi);
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bi));
-	part_stat_unlock();
-
+	if (mddev->gendisk) {
+		cpu = part_stat_lock();
+		part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+		part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+			      bio_sectors(bi));
+		part_stat_unlock();
+	}
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(q,bi))
@@ -4192,23 +4199,14 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
 		return 0;
 }
 
-static ssize_t
-raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
+int raid5_set_cache_size(mddev_t *mddev, int size)
 {
 	raid5_conf_t *conf = mddev->private;
-	unsigned long new;
 	int err;
 
-	if (len >= PAGE_SIZE)
-		return -EINVAL;
-	if (!conf)
-		return -ENODEV;
-
-	if (strict_strtoul(page, 10, &new))
-		return -EINVAL;
-	if (new <= 16 || new > 32768)
+	if (size <= 16 || size > 32768)
 		return -EINVAL;
-	while (new < conf->max_nr_stripes) {
+	while (size < conf->max_nr_stripes) {
 		if (drop_one_stripe(conf))
 			conf->max_nr_stripes--;
 		else
@@ -4217,11 +4215,32 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 	err = md_allow_write(mddev);
 	if (err)
 		return err;
-	while (new > conf->max_nr_stripes) {
+	while (size > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
 		else break;
 	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(raid5_set_cache_size);
+	
+static ssize_t
+raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
+{
+	raid5_conf_t *conf = mddev->private;
+	unsigned long new;
+	int err;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (strict_strtoul(page, 10, &new))
+		return -EINVAL;
+	err = raid5_set_cache_size(mddev, new);
+	if (err)
+		return err;
 	return len;
 }
 
@@ -4593,6 +4612,7 @@ static int run(mddev_t *mddev)
 	}
 
 	/* Ok, everything is just fine now */
+	if (mddev->unit)
 	if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
 		printk(KERN_WARNING
 		       "raid5: failed to create sysfs attributes for %s\n",
@@ -4637,6 +4657,7 @@ static int stop(mddev_t *mddev)
 	kfree(conf->stripe_hashtbl);
 	mddev->queue->backing_dev_info.congested_fn = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+	if (mddev->unit)
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
 	kfree(conf->disks);
 	kfree(conf);


* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-21 12:06               ` Neil Brown
  2009-06-22 12:25                 ` Neil Brown
@ 2009-06-22 19:10                 ` Heinz Mauelshagen
  2009-07-02 12:52                   ` Heinz Mauelshagen
  1 sibling, 1 reply; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-06-22 19:10 UTC (permalink / raw)
  To: Neil Brown
  Cc: Christoph Hellwig, Dan Williams, device-mapper development,
	Ed Ciechanowski

On Sun, 2009-06-21 at 22:06 +1000, Neil Brown wrote:
> On Friday June 19, heinzm@redhat.com wrote:
> > On Fri, 2009-06-19 at 11:43 +1000, Neil Brown wrote:
> > > On Wednesday June 17, neilb@suse.de wrote:
> > > > 
> > > > I will try to find time to review your dm-raid5 code with a view to
> > > > understanding how it plugs in to dm, and then how the md/raid5 engine
> > > > can be used by dm-raid5.
> > 
> > Hi Neil.
> > 
> > > 
> > > I've had a bit of a look through the dm-raid5 patches.
> > 
> > Thanks.
> > 
> > > 
> > > Some observations:
> > > 
> > > - You have your own 'xor' code against which you do a run-time test of
> > >   the 'xor_block' code which md/raid5 uses - then choose the fasted.
> > >   This really should not be necessary.  If you have xor code that runs
> > >   faster than anything in xor_block, it really would be best to submit
> > >   it for inclusion in the common xor code base.
> > 
> > This is in because it actually shows better performance regularly by
> > utilizing cache lines etc. more efficiently (tested on Intel, AMD and
> > Sparc).
> > 
> > If xor_block would always have been performed best, I'd dropped that
> > optimization already.
> 
> I'm no expert on this I must admit - I just use the xor code that
> others have provided.
> However it is my understanding that some of the xor code options were
> chosen deliberately because they bypassed the cache.  As the xor
> operation operates on data that very probably was not in the cache,
> and only touches it once, and touches quite a lot of data, there is
> little benefit having it in the cache, and a possible real cost as it
> pushes other data out of the cache.
> 
> That said, the "write" process involves copying data from a buffer
> into the cache, and then using that data as a source for xor.  In that
> case the cache might be beneficial, I'm not sure.
> I've occasionally wondered if it might be good to have an xor routine
> that does "copy and xor".   This would use more registers than the
> current code so we could possibly process fewer blocks at a time, but
> we might be processing them faster.  In that case, if we could bypass
> the cache to read the data, but use the cache to store the result of
> the xor, that would be ideal.  However I have no idea if that is
> possible.
> 
> The mechanism you use, which is much the same as the one used by
> xor_block, calculates speed for the hot-cache case.  It is not clear
> that this is often a relevant cache - mostly xor is calculated when
> the cache is cold.  So it isn't clear if it is generally applicable.
> That is why calibrate_xor_blocks sometimes skips the xor_speed test
> (in crypto/xor.c).

I can actually perform cold-cache runs, likely with only a little code
change, which should allow us to find out empirically whether my
optimization is worth it.

Not sure, though, whether I'll be able to do this before heading out
to LinuxTag for the rest of this week.
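
To make the "copy and xor" idea above concrete, a minimal sketch
(purely illustrative, not taken from either patch set, and ignoring
the cache-bypass question entirely):

static void copy_and_xor(unsigned long *dst, unsigned long *parity,
			 const unsigned long *src, unsigned bytes)
{
	unsigned i;

	for (i = 0; i < bytes / sizeof(*src); i++) {
		dst[i] = src[i];	/* fill the in-cache copy */
		parity[i] ^= src[i];	/* accumulate parity in the same pass */
	}
}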

> 
> However if your code performs better than the cache-agnostic code in
> xor_block, then we should replace that code with your code, rather
> than have two places that try to choose the optimal code.

Sure, we'll see after some more investigation.

For the time being, it can stay in the raid45 target until we either
come to the conclusion that it should get moved underneath crypto's
generic API or get dropped altogether.

> 
> And I would be very interested to see a discussion about how relevant
> the cache bypassing is in real life...  It looks like it was Zach
> Brown who introduced this about 10 years ago, presumably in
> consultation with Ingo Molnar.
> 
> > 
> > > 
> > > - You have introduced "dm-message" which is a frame work for sending
> > >   messages to dm targets.
> > 
> > dm-message is a parser for messages sent to a device-mapper targets
> > message interface. It aims at allowing common message parsing functions
> > to be shared between targets.
> > 
> > The message interface in general allows for runtime state changes of
> > targets like those you found below.
> > 
> > See "dmsetup message ..."
> > 
> > > It is used for adjusting the cache size,
> > >   changing the bandwidth used by resync, and doing things with
> > >   statistics.  Linux already has a framework for sending messages to
> > >   kernel objects.  It is called "sysfs".  It would be much nicer if
> > >   you could use that.
> > 
> > Looks to me like heavier to program (I counted ~100 LOC of sysfs defines
> > and calls in md.c) for 3 files used in md.
> > 
> > That needs some more discussion.
> 
> In dm-raid45 you have 170 lines from "Message interface" to "END
> message interface", which is 4 files I think.  Am I counting the same
> things?

Yes.

> 
> In any case, it is certainly possible that sysfs usage could be
> improved.  We recently added strict_blocks_to_sectors() which extracted
> a common parsing task.  Once upon a time, we open coded things with
> simple_strtoull and repeated the error checking every time.
> There is always room for improvement.
> 
> But I think the uniformity across the kernel is the most important
> issue here, not the lines of code.

Well, it should be balanced between these possibly exclusive extremes.

Nobody wants to add huge amounts of code to maintain just to allow for
uniformity without enhancing shared code upfront.

> 
> There is a bit of tension here:  your dm-message approach is similar
> in style to the way targets are created.  The sysfs approach is
> similar in style to many other parts of the kernel.  So should your
> new mechanism be consistent with dm or consistent with linux - it
> cannot easily be both.
> 
> This is really a question that the dm developers as a group need to
> consider and answer.  I would like to recommend that transitioning
> towards more uniformity with the rest of linux is a good idea, and
> here is excellent opportunity to start.  But it isn't my decision.

We're meeting in Berlin at LinuxTag so this is going to be an item
there...

> 
> > > - Your algorithm for limiting bandwidth used by resync seems to be
> > >   based on a % concept (which is rounded to the nearest reciprocal,
> > >   so 100%, 50%, 33%, 25%, 20%, 16% etc).  If there is outstanding
> > >   IO, the amount of outstanding resync IO must not exceed the given
> > >   percentage of regular IO.   While it is hard to get resync "just
> > >   right", I think this will have some particularly poor
> > >   characteristics.
> > 
> > My ANALYZEME pointed you there ? ;-)
> > That's why it's in. I admitted with that that I'm not satisfied with the
> > characteristics yet.
> 
> Fair enough.
> My concern then is that the mechanism for guiding the speed - which is
> not yet finalised - is being encoded in the command for creating the
> array.  That would make it difficult to change at a later date.
> I would suggest not specifying anything when creating the array and
> insisting that a default be chosen.

It is already not mandatory to specify it, and a default will be set.
I'm not aware of any application utilizing it yet.

> Then you can experiment with mechanism using different dm-message or
> sysfs-files with some sort of warning that these are not part of the
> api yet.

I got that freedom already, with the tiny flaw of an optional parameter
being supported. If this is perceived to be bad, I can surely drop it.

> 
> > > 
> > >   Also it does not take account of IO to other partitions on a device.
> > >   It is common to have a number of different arrays on the same device
> > >   set.  You really want to resync to back off based on IO to all of
> > >   the device, not just to the array.
> > 
> > Does MD RAID > 0 cope with that in multi-partition RAID configs ?
> > E.g.:
> > disk1:part1+disk2:part1 -> md0 and disk1:part2+disk2:part2 -> md1 with
> > global iostats for disk[12] ?
> 
> Yes.  'gendisk' has 'sync_io' specifically for this purpose.
> When md submits IO for the purpose of resync (or recovery) it adds a
> count of the sectors to this counter.  That can be compared with the
> regular io statistics for the genhd to see if there is an non-sync IO
> happening.  If there is the genhd is assumed to be busy and we backoff
> the resync.
> 
> This is not a perfect solution as it only goes one level down.
> If we had raid1 over raid0 over partitions, then we might not notice
> resync IO from elsewhere properly.  But I don't think I've ever had a
> complaint about that.

Thanks for the explanation.
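
For reference, the back-off check described above is roughly the
following (a simplified sketch modelled on md's is_mddev_idle(); the
real code allows some slack and handles initialisation):

static int non_resync_io_seen(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	int busy = 0;

	list_for_each_entry(rdev, &mddev->disks, same_set) {
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
		/* Sectors the whole disk has seen, minus what md itself
		 * submitted for resync (accounted via disk->sync_io). */
		long curr = part_stat_read(&disk->part0, sectors[READ]) +
			    part_stat_read(&disk->part0, sectors[WRITE]) -
			    atomic_read(&disk->sync_io);

		/* Anything beyond what we saw last time came from
		 * regular application IO -> back off the resync. */
		if (curr - rdev->last_events > 64)
			busy = 1;
		rdev->last_events = curr;
	}
	return busy;
}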

> 
> > > 
> > > Everything else could be made available by md to dm with only a
> > > moderate amount of effort.
> > > Of the different interface routines you require:
> > > +	.ctr = raid_ctr,
> > > 
> > >   This would be quite straight forward, though it might be worth
> > >   discussing the details of the args passed in.
> > >   "recovery_bandwidth" (as I say above) I don't think is well chosen,
> > 
> > I was aware it needs further tweaking (your points above taken).
> > 
> > You mean the wording here ?
> 
> The thing that I don't think was well chosen was the choice of a
> percentage to guide the speed of resync.

I actually like percent of total bandwidth, because it allows for
maximum throughput when there's no application IO, while limiting
resync to the respective, adjustable bandwidth percentage during
application IO.

If the set is moved to other HW with different bandwidth, this still
holds up, whereas absolute throughput changes the ratio.

Of course, we could support both absolute and relative.
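
(For reference, the interim hack in the patch above maps that
percentage straight onto md's sync_speed_min assuming a device doing
roughly 100 MB/s, so e.g. "dmsetup message <dev> 0 bandwidth set 50"
ends up as sync_speed_min = 50 * 1024 KB/s, i.e. about 50 MB/s.)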

> 
> > 
> > >   and I don't quite see the point of io_size (maybe I'm missing
> > >   something obvious)
> > 
> > Performance related. I studied read ahead/chunk size/io size pairs and
> > found examples of better performance with io_size > PAGE_SIZE <=
> > chunk_size. io_size being 2^N.
> > 
> > Do you have similar experiences to share with MD RAID ?
> 
> My position has always been that it is up to lower layers to combine
> pages together.  md/raid5 does all IO in pages.  If it is more
> efficient for some device to do multiple pages as a time, it should
> combine the pages.

I can share that standpoint, but I have had tests showing otherwise.

> 
> The normal plugging mechanisms combined with the elevator should be
> enough for this.

I will run a series of tests when I get to this in order to finally
decide how to go about it.

> 
> We have put more focus in to gathering write-pages into whole strips
> so that no pre-reading is needed - we can just calculate parity and
> write.  This has a significant effect on throughput.

Yes, the raid45 target does the same.
It's a huge win to avoid the read-before-write penalty altogether when
a stripe gets completely overwritten.
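
A minimal sketch of that decision (illustrative only, not the actual
stripe cache code):

	enum write_method { RECONSTRUCT_WRITE, READ_MODIFY_WRITE };

	/*
	 * If every data chunk of the stripe is being overwritten, parity can
	 * be computed from the new data alone and nothing has to be read
	 * first; otherwise old data/parity must be read back in.
	 */
	static inline enum write_method
	choose_write_method(unsigned dirty_data_chunks, unsigned data_chunks)
	{
		return dirty_data_chunks == data_chunks ?
		       RECONSTRUCT_WRITE : READ_MODIFY_WRITE;
	}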

> (A 'strip' is like a 'stripe' but it is one page wide, not one chunk
> wide).
> 
> > > +	.presuspend = raid_presuspend,
> > > 
> > >   I call this 'quiesce'
> > > 
> > > +	.postsuspend = raid_postsuspend,
> > > 
> > >   I'm not sure exactly how this fits in.  Something related to the 
> > >   log...
> > 
> > We use these distinct presuspend/postsuspend methods in order to allow
> > for targets to distinguish flushing any io in flight in presuspend and
> > release/quiesce/... any resource (e.g. quiesce the dirty log) utilized
> > in the first step in postsuspend.
> > 
> > If MD RAID doesn't need that, stick with presuspend.
> 
> OK, thanks.
> 
> > > +	.message = raid_message,
> > > 
> > >   As I said, I would rather this didn't exist.
> > 
> > I think its better to offer options to be utilized for different
> > purposes. Tradeoff is always (some) overlap.
> > 
> > We don't have one filesystem for all use cases ;-)
> > 
> 
> No...  I find myself wearing several hats here.
> 
> As a developer, I think it is great that you are writing your own
> raid5 implementation.  Doing stuff like that is lots of fun and a
> valuable learning experience.  As I read through the code there were a
> number of times when I thought "that looks rather neat".

Thanks :-)

> Quite
> possibly there are ideas in dm-raid5 that I could "steal" to make
> md/raid5 better.

I wouldn't bet that you won't ;-)

> dm-message certainly has its merits too (as I said, it fits the dm model).
> 
> As a member of the linux kernel community, I want to see linux grow
> with a sense of unity.  Where possible, similar tasks should be
> addressed in similar ways.  This avoids duplicating mistakes, eases
> maintenance, and makes it easier for newcomers to learn.  Establishing
> patterns and following them is a good thing.  Hence my desire to
> encourage the use of sysfs and a single xor implementation.

An understandable standpoint that I share. It takes us back to the dm team
discussion mentioned above.

> 
> As an employee of a company that sells support, I really don't want a
> profusion of different implementations of something that I am seen as
> an expert in, as I know I'll end up having to support all of them.
> So for very selfish reasons, I'd prefer fewer versions of NFS, fewer
> file-systems, and fewer raid5 implementations :-)
> Well... fewer that we contract to support anyway.
> 

We might end up with one...

But we can take advantage of impulses from different implementations to
arrive at better solutions, which won't happen if there's just one.

> > > 
> > >  I suspect, using your model, the approach would be a 'dm-message'
> > >  acknowledging that a given device is faulty.  I would, of course,
> > >  rather this be done via sysfs.
> > > 
> > >  I will try to look at factoring out the md specific part of md/raid5
> > >  over the next few days (maybe next week) and see how that looks.
> > 
> > That'll be interesting to see how it fits into the device-mapper
> > framework
> > 
> > I take it You'll port code to create an "md-raid456" target ?
> 
> That's the idea.  I'll let you know when I have something credible.

Ok, we were thinking alike.

Regards,
Heinz

> 
> Thanks,
> NeilBrown

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-06-22 19:10                 ` Heinz Mauelshagen
@ 2009-07-02 12:52                   ` Heinz Mauelshagen
  2009-07-06  3:21                     ` Neil Brown
  0 siblings, 1 reply; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-07-02 12:52 UTC (permalink / raw)
  To: device-mapper development
  Cc: Christoph Hellwig, Dan Williams, Ed Ciechanowski

[-- Attachment #1: Type: text/plain, Size: 6495 bytes --]

On Mon, 2009-06-22 at 21:10 +0200, Heinz Mauelshagen wrote:
> On Sun, 2009-06-21 at 22:06 +1000, Neil Brown wrote:
> > On Friday June 19, heinzm@redhat.com wrote:
> > > On Fri, 2009-06-19 at 11:43 +1000, Neil Brown wrote:
> > > > On Wednesday June 17, neilb@suse.de wrote:
> > > > > 
> > > > > I will try to find time to review your dm-raid5 code with a view to
> > > > > understanding how it plugs in to dm, and then how the md/raid5 engine
> > > > > can be used by dm-raid5.
> > > 
> > > Hi Neil.
> > > 
> > > > 
> > > > I've had a bit of a look through the dm-raid5 patches.
> > > 
> > > Thanks.
> > > 
> > > > 
> > > > Some observations:
> > > > 
> > > > - You have your own 'xor' code against which you do a run-time test of
> > > >   the 'xor_block' code which md/raid5 uses - then choose the fasted.
> > > >   This really should not be necessary.  If you have xor code that runs
> > > >   faster than anything in xor_block, it really would be best to submit
> > > >   it for inclusion in the common xor code base.
> > > 
> > > This is in because it actually shows better performance regularly by
> > > utilizing cache lines etc. more efficiently (tested on Intel, AMD and
> > > Sparc).
> > > 
> > > If xor_block would always have been performed best, I'd dropped that
> > > optimization already.
<SNIP>

Dan, Neil,

as mentioned before I left for LinuxTag last week, here comes an initial
take on dm-raid45 warm/cold CPU cache xor speed optimization metrics.

This shall give us the basis to decide whether to keep or drop the dm-raid45
internal xor optimization magic or move (part of) it into the crypto
subsystem.

Heinz


Howto:
------
I added a loop to xor_optimize() in dm-raid45.c that walks the list of
recovery stripes, to allow for overcommitting the cache, plus some variables
to display the absolute minimum and maximum number of xor runs performed
per cycle for xor_blocks() and for the dm-raid45 built-in xor optimization.

In order to make results more deterministic, I run xor_speed() for <= 5 ticks.

See diff vs. dm-devel dm-raid45 patch (submitted Jun 15th) attached.

Tests being performed on the following 2 systems:

   hostname: a4
   2.6.31-rc1@250HZ timer frequency
   Core i7 920@3.4GHz, 8 MB 3rd Level Cache
   6GB RAM

   hostname: t4
   2.6.31-rc1@250HZ timer frequency
   2 CPU Opteron 280@2.4GHz, 2*1 MB 2nd Level Cache
   2GB RAM

with the xor optimization being the only load on the systems.


I've performed test runs on each of those with the following mapping tables
and 128 iterations for each of them, which represents a small array case
with 3 drives per set, running the xor optimization on a single core:

Intel:
0 58720256 raid45 core 2 8192 nosync  raid5_la 7 -1 -1 -1 512 10 nosync 1  3 -1 /dev/mapper/error1 0 /dev/mapper/error2 0 /dev/mapper/error3 0
...
0 58720256 raid45 core 2 8192 nosync  raid5_la 7 -1 -1 -1 512 10 nosync 13  3 -1 /dev/mapper/error1 0 /dev/mapper/error2 0 /dev/mapper/error3 0

Opteron:
0 58720256 raid45 core 2 8192 nosync  raid5_la 7 -1 -1 -1 256 10 nosync 1  3 -1 /dev/mapper/error1 0 /dev/mapper/error2 0 /dev/mapper/error3 0
...
0 58720256 raid45 core 2 8192 nosync  raid5_la 7 -1 -1 -1 256 10 nosync 13  3 -1 /dev/mapper/error1 0 /dev/mapper/error2 0 /dev/mapper/error3 0


Because no actual IO is being performed, I just mapped to error targets
(table used: "0 2199023255552 error"; I know it's large but it doesn't matter).

The number following the 2nd nosync parameter is the number of recovery
stripes, with an io size of 512 sectors = 256 kilobytes per chunk or
256 sectors = 128 kilobytes per chunk respectively,
i.e. a working set of 768/384 kilobytes per recovery stripe.
These values ensure that results differ in the per mille range
(i.e. more than 100 cycles per test run) where appropriate.


The systems run out of cache at
~ >= 8 stripes on the Intel ((8192 - 2048 code) / (512 / 2) / 3)
and
~ >= 0 stripes on the Opteron system ((1024 - 768 code) / (256 / 2) / 3),
assuming some cache utilization for code and other data.
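
As a quick sanity check, the arithmetic behind those two break-even points
(shell, using the cache/code sizes assumed above):

# stripes before cache exhaustion = (cache_kb - code_kb) / (chunk_kb * chunks_per_stripe)
echo $(( (8192 - 2048) / (256 * 3) ))   # Intel:   8 stripes (512 sectors = 256 KB chunks)
echo $(( (1024 - 768)  / (128 * 3) ))   # Opteron: 0 stripes (256 sectors = 128 KB chunks)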

See the raw kernel log extracts created by these test runs, attached
in a tarball together with the script used to extract the metrics.


Intel results with 128 iterations each:
---------------------------------------

1 stripe  : NB:10 111/80 HM:118 111/82
2 stripes : NB:25 113/87 HM:103 112/91
3 stripes : NB:24 115/93 HM:104 114/93
4 stripes : NB:48 114/93 HM:80 114/93
5 stripes : NB:38 113/94 HM:90 114/94
6 stripes : NB:25 116/94 HM:103 114/94
7 stripes : NB:25 115/95 HM:103 115/95
8 stripes : NB:62 117/96 HM:66 116/95 <<<--- cold cache starts here
9 stripes : NB:66 117/96 HM:62 116/95
10 stripes: NB:73 117/96 HM:55 114/95
11 stripes: NB:63 114/96 HM:65 112/95
12 stripes: NB:51 111/96 HM:77 110/95
13 stripes: NB:65 109/96 HM:63 112/95

NB: number of xor_blocks() parity calculations winning per 128 iterations
HM: number of dm-raid45 xor() parity calculations equal to/winning
    xor_blocks per 128 iterations
NN/MM: count of maximum/minimum calculations achieved per iteration in <= 5 ticks.

Opteron results with 128 iterations each:
-----------------------------------------
1 stripe  : NB:0 30/20 HM:128 64/53
2 stripes : NB:0 31/21 HM:128 68/55
3 stripes : NB:0 31/22 HM:128 68/57
4 stripes : NB:0 32/22 HM:128 70/61
5 stripes : NB:0 32/22 HM:128 70/63
6 stripes : NB:0 35/22 HM:128 70/64
7 stripes : NB:0 32/23 HM:128 69/63
8 stripes : NB:0 44/23 HM:128 76/65
9 stripes : NB:0 43/23 HM:128 73/65
10 stripes: NB:0 35/23 HM:128 72/64
11 stripes: NB:0 35/24 HM:128 72/64
12 stripes: NB:0 33/24 HM:128 72/65
13 stripes: NB:0 33/23 HM:128 71/64


Test analysis:
--------------
I must have done something wrong ;-)

On the Opteron, dm-raid45 xor() outperforms xor_blocks() by far.
No warm cache significance visible.

On the Intel, dm-raid45 xor() performs slightly better on warm cache
vs. xor_blocks() performing slightly better on cold cache, which may be
the result of the lack of prefetching in dm-raid45 xor().
xor_blocks() achieves a slightly better maximum in 8 of 13 runs vs.
xor() in 2 test runs; in 3 runs they achieve the same maximum.

This is not deterministic:
min/max vary by up to > 200% on the Opteron
and up to 46% on the Intel.


Questions/Recommendations:
--------------------------
Review the code changes and the data analysis please.

Review the test cases and argue whether those are valid,
or recommend different ones please.

Can we make this more deterministic (e.g. use prefetching for dm-raid45 xor())?

Regards,
Heinz


[-- Attachment #2: xor_performance_metrics --]
[-- Type: application/x-shellscript, Size: 808 bytes --]

[-- Attachment #3: dm-raid45-2.6.31-rc1.patch --]
[-- Type: text/x-patch, Size: 6876 bytes --]

 {drivers => /usr/src/linux/drivers}/md/dm-raid45.c |  139 ++++++++++++++------
 1 files changed, 98 insertions(+), 41 deletions(-)

diff --git a/drivers/md/dm-raid45.c b/drivers/md/dm-raid45.c
index 0c33fea..6ace975 100644
--- a/drivers/md/dm-raid45.c
+++ b/linux/drivers/md/dm-raid45.c
@@ -8,7 +8,28 @@
  *
  * Linux 2.6 Device Mapper RAID4 and RAID5 target.
  *
- * Supports:
+ * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
+ *
+ *
+ * Supports the following ATARAID vendor solutions (and SNIA DDF):
+ *
+ * 	Adaptec HostRAID ASR
+ * 	SNIA DDF1
+ * 	Highpoint 37x
+ * 	Highpoint 45x
+ *	Intel IMSM
+ *	Jmicron ATARAID
+ *	LSI Logic MegaRAID
+ *	NVidia RAID
+ *	Promise FastTrack
+ *	Silicon Image Medley
+ *	VIA Software RAID
+ *
+ * via the dmraid application.
+ *
+ *
+ * Features:
+ *
  *	o RAID4 with dedicated and selectable parity device
  *	o RAID5 with rotating parity (left+right, symmetric+asymmetric)
  *	o recovery of out of sync device for initial
@@ -37,7 +58,7 @@
  * ANALYZEME: recovery bandwidth
  */ 
 
-static const char *version = "v0.2594p";
+static const char *version = "v0.2596l";
 
 #include "dm.h"
 #include "dm-memcache.h"
@@ -101,9 +122,6 @@ static const char *version = "v0.2594p";
 /* Check value in range. */
 #define	range_ok(i, min, max)	(i >= min && i <= max)
 
-/* Check argument is power of 2. */
-#define POWER_OF_2(a) (!(a & (a - 1)))
-
 /* Structure access macros. */
 /* Derive raid_set from stripe_cache pointer. */
 #define	RS(x)	container_of(x, struct raid_set, sc)
@@ -1848,10 +1866,10 @@ struct xor_func {
 	xor_function_t f;
 	const char *name;
 } static xor_funcs[] = {
-	{ xor_8,   "xor_8"  },
-	{ xor_16,  "xor_16" },
-	{ xor_32,  "xor_32" },
 	{ xor_64,  "xor_64" },
+	{ xor_32,  "xor_32" },
+	{ xor_16,  "xor_16" },
+	{ xor_8,   "xor_8"  },
 	{ xor_blocks_wrapper, "xor_blocks" },
 };
 
@@ -3114,10 +3132,10 @@ static void _do_endios(struct raid_set *rs, struct stripe *stripe,
 		SetStripeReconstructed(stripe);
 
 		/* FIXME: reschedule to be written in case of read. */
-		// if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
-		// 	chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
-		// 	stripe_chunks_rw(stripe);
-		// }
+		/* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
+			chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
+			stripe_chunks_rw(stripe);
+		} */
 
 		stripe->idx.recover = -1;
 	}
@@ -3257,7 +3275,7 @@ static void do_ios(struct raid_set *rs, struct bio_list *ios)
 		/* Check for recovering regions. */
 		sector = _sector(rs, bio);
 		r = region_state(rs, sector, DM_RH_RECOVERING);
-		if (unlikely(r && bio_data_dir(bio) == WRITE)) {
+		if (unlikely(r)) {
 			delay++;
 			/* Wait writing to recovering regions. */
 			dm_rh_delay_by_region(rh, bio,
@@ -3409,64 +3427,104 @@ static unsigned mbpers(struct raid_set *rs, unsigned speed)
 /*
  * Discover fastest xor algorithm and # of chunks combination.
  */
-/* Calculate speed for algorithm and # of chunks. */
+/* Calculate speed of particular algorithm and # of chunks. */
 static unsigned xor_speed(struct stripe *stripe)
 {
+	int ticks = 5;
 	unsigned r = 0;
-	unsigned long j;
 
-	/* Wait for next tick. */
-	for (j = jiffies; j == jiffies; )
-		;
+	/* Do xors for a few ticks. */
+	while (ticks--) {
+		unsigned xors = 0;
+		unsigned long j = jiffies;
+
+		while (j == jiffies) {
+			mb();
+			common_xor(stripe, stripe->io.size, 0, 0);
+			mb();
+			xors++;
+			mb();
+		}
 
-	/* Do xors for a full tick. */
-	for (j = jiffies; j == jiffies; ) {
-		mb();
-		common_xor(stripe, stripe->io.size, 0, 0);
-		mb();
-		r++;
+		if (xors > r)
+			r = xors;
 	}
 
 	return r;
 }
 
+/* Define for xor multi recovery stripe optimization runs. */
+#define DMRAID45_XOR_TEST
+
 /* Optimize xor algorithm for this RAID set. */
 static unsigned xor_optimize(struct raid_set *rs)
 {
-	unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0;
+	unsigned chunks_max = 2, p, speed_max = 0;
 	struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
 	struct stripe *stripe;
+unsigned speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;
 
 	BUG_ON(list_empty(&rs->recover.stripes));
+#ifndef DMRAID45_XOR_TEST
 	stripe = list_first_entry(&rs->recover.stripes, struct stripe,
 				  lists[LIST_RECOVER]);
 
 	/* Must set uptodate so that xor() will belabour chunks. */
-	while (p--)
+	for (p = rs->set.raid_devs; p-- ;)
 		SetChunkUptodate(CHUNK(stripe, p));
+#endif
 
 	/* Try all xor functions. */
 	while (f-- > xor_funcs) {
 		unsigned speed;
 
-		/* Set actual xor function for common_xor(). */
-		rs->xor.f = f;
-		rs->xor.chunks = (f->f == xor_blocks_wrapper ?
-				  (MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1;
-
-		while (rs->xor.chunks-- > 2) {
-			speed = xor_speed(stripe);
-			if (speed > speed_max) {
-				speed_max = speed;
-				chunks_max = rs->xor.chunks;
-				f_max = f;
+#ifdef DMRAID45_XOR_TEST
+		list_for_each_entry(stripe, &rs->recover.stripes,
+				    lists[LIST_RECOVER]) {
+				for (p = rs->set.raid_devs; p-- ;)
+					SetChunkUptodate(CHUNK(stripe, p));
+#endif
+	
+			/* Set actual xor function for common_xor(). */
+			rs->xor.f = f;
+			rs->xor.chunks = (f->f == xor_blocks_wrapper ?
+					  (MAX_XOR_BLOCKS + 1) :
+					  XOR_CHUNKS_MAX) + 1;
+	
+			while (rs->xor.chunks-- > 2) {
+				speed = xor_speed(stripe);
+
+#ifdef DMRAID45_XOR_TEST
+				if (f->f == xor_blocks_wrapper) {
+					if (speed > speed_xor_blocks)
+						speed_xor_blocks = speed;
+				} else if (speed > speed_hm)
+					speed_hm = speed;
+	
+				if (speed < speed_min)
+					speed_min = speed;
+#endif
+
+				if (speed > speed_max) {
+					speed_max = speed;
+					chunks_max = rs->xor.chunks;
+					f_max = f;
+				}
 			}
+#ifdef DMRAID45_XOR_TEST
 		}
+#endif
 	}
 
-	/* Memorize optimum parameters. */
+	/* Memorize optimal parameters. */
 	rs->xor.f = f_max;
 	rs->xor.chunks = chunks_max;
+#ifdef DMRAID45_XOR_TEST
+	DMINFO("%s stripes=%u min=%u xor_blocks=%u hm=%u max=%u",
+	       speed_max == speed_hm ? "HM" : "NB",
+	       rs->recover.recovery_stripes, speed_min,
+	       speed_xor_blocks, speed_hm, speed_max);
+#endif
 	return speed_max;
 }
 
@@ -3786,7 +3844,7 @@ static int get_raid_variable_parms(struct dm_target *ti, char **argv,
 		  "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
 		{ 0,
 		  "Invalid number of recovery stripes;"
-		  "must be -1, > 0 and <= 16384",
+		  "must be -1, > 0 and <= 64",
 		  RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
 		  &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
 	}, *varp;
@@ -3831,7 +3889,7 @@ static int get_raid_variable_parms(struct dm_target *ti, char **argv,
 
 		if (sscanf(*(argv++), "%d", &value) != 1 ||
 		    (value != -1 &&
-		     ((varp->action && !POWER_OF_2(value)) ||
+		     ((varp->action && !is_power_of_2(value)) ||
 		      !range_ok(value, varp->min, varp->max))))
 			TI_ERR(varp->errmsg);
 

[-- Attachment #4: xor_optomize_test_data.tar.bz2 --]
[-- Type: application/x-bzip-compressed-tar, Size: 17107 bytes --]

[-- Attachment #5: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-07-02 12:52                   ` Heinz Mauelshagen
@ 2009-07-06  3:21                     ` Neil Brown
  2009-07-07 18:38                       ` Doug Ledford
  2009-07-08 18:56                       ` Heinz Mauelshagen
  0 siblings, 2 replies; 23+ messages in thread
From: Neil Brown @ 2009-07-06  3:21 UTC (permalink / raw)
  To: heinzm
  Cc: Christoph Hellwig, device-mapper development, Dan Williams,
	Ed Ciechanowski

On Thursday July 2, heinzm@redhat.com wrote:
> 
> Dan, Neil,
> 
> like mentioned before I left to LinuxTag last week, here comes an initial
> take on dm-raid45 warm/cold CPU cache xor speed optimization metrics.
> 
> This shall give us the base to decide to keep or drop the dm-raid45
> internal xor optimization magic or move (part of) it into the crypto
> subsystem.

Thanks for doing this.
> 
> 
> Intel results with 128 iterations each:
> ---------------------------------------
> 
> 1 stripe  : NB:10 111/80 HM:118 111/82
> 2 stripes : NB:25 113/87 HM:103 112/91
> 3 stripes : NB:24 115/93 HM:104 114/93
> 4 stripes : NB:48 114/93 HM:80 114/93
> 5 stripes : NB:38 113/94 HM:90 114/94
> 6 stripes : NB:25 116/94 HM:103 114/94
> 7 stripes : NB:25 115/95 HM:103 115/95
> 8 stripes : NB:62 117/96 HM:66 116/95 <<<--- cold cache starts here
> 9 stripes : NB:66 117/96 HM:62 116/95
> 10 stripes: NB:73 117/96 HM:55 114/95
> 11 stripes: NB:63 114/96 HM:65 112/95
> 12 stripes: NB:51 111/96 HM:77 110/95
> 13 stripes: NB:65 109/96 HM:63 112/95

These results seem to suggest that the two different routines provide
very similar results on this hardware, particularly when the cache is cold.
The high degree of variability might be because you have dropped this:

> -	/* Wait for next tick. */
> -	for (j = jiffies; j == jiffies; )
> -		;

??
Without that, it could be running the test over anything from 4 to 5
jiffies.
I note that do_xor_speed in crypto/xor.c doesn't synchronise at the
start either.  I think that is a bug.
The variability seems to generally be close to 20%, which is consistent
with the difference between 4 and 5.

Could you put that loop back in and re-test?
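
Something along these lines is what I mean (sketch only, jiffies-based
like the original code):

	unsigned long j;
	unsigned xors = 0;

	/* Synchronise to a tick boundary first ... */
	for (j = jiffies; j == jiffies; )
		;

	/* ... then count xor runs for exactly one full tick. */
	for (j = jiffies; j == jiffies; ) {
		common_xor(stripe, stripe->io.size, 0, 0);
		xors++;
	}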

> 
> Opteron results with 128 iterations each:
> -----------------------------------------
> 1 stripe  : NB:0 30/20 HM:128 64/53
> 2 stripes : NB:0 31/21 HM:128 68/55
> 3 stripes : NB:0 31/22 HM:128 68/57
> 4 stripes : NB:0 32/22 HM:128 70/61
> 5 stripes : NB:0 32/22 HM:128 70/63
> 6 stripes : NB:0 35/22 HM:128 70/64
> 7 stripes : NB:0 32/23 HM:128 69/63
> 8 stripes : NB:0 44/23 HM:128 76/65
> 9 stripes : NB:0 43/23 HM:128 73/65
> 10 stripes: NB:0 35/23 HM:128 72/64
> 11 stripes: NB:0 35/24 HM:128 72/64
> 12 stripes: NB:0 33/24 HM:128 72/65
> 13 stripes: NB:0 33/23 HM:128 71/64

Here your code seems to be 2-3 times faster!
Can you check which function xor_block is using?
If it is :
  xor: automatically using best checksumming function: ....
then it might be worth disabling that test in calibrate_xor_blocks and
see if it picks one that ends up being faster.

There is still the fact that by using the cache for data that will be
accessed once, we are potentially slowing down the rest of the system.
i.e. the reason to avoid the cache is not just because it won't
benefit the xor much, but because it will hurt other users.
I don't know how to measure that effect :-(
But if avoiding the cache makes xor 1/3 the speed of using the cache
even though it is cold, then it would be hard to justify not using the
cache I think.

> 
> Questions/Recommendations:
> --------------------------
> Review the code changes and the data analysis please.

It seems to mostly make sense
 - the 'wait for next tick' should stay
 - it would be interesting to see what the final choice of 'chunks'
   was (i.e. how many to xor together at a time).
  

Thanks!

NeilBrown

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-07-06  3:21                     ` Neil Brown
@ 2009-07-07 18:38                       ` Doug Ledford
  2009-07-10 15:23                         ` Heinz Mauelshagen
  2009-07-08 18:56                       ` Heinz Mauelshagen
  1 sibling, 1 reply; 23+ messages in thread
From: Doug Ledford @ 2009-07-07 18:38 UTC (permalink / raw)
  To: device-mapper development
  Cc: Christoph Hellwig, heinzm, Dan Williams, Ed Ciechanowski


[-- Attachment #1.1: Type: text/plain, Size: 5141 bytes --]

On Jul 5, 2009, at 11:21 PM, Neil Brown wrote:
> Here your code seems to be 2-3 times faster!
> Can you check which function xor_block is using?
> If it is :
>  xor: automatically using best checksumming function: ....
> then it might be worth disabling that test in calibrate_xor_blocks and
> see if it picks one that ends up being faster.
>
> There is still the fact that by using the cache for data that will be
> accessed once, we are potentially slowing down the rest of the system.
> i.e. the reason to avoid the cache is not just because it won't
> benefit the xor much, but because it will hurt other users.
> I don't know how to measure that effect :-(
> But if avoiding the cache makes xor 1/3 the speed of using the cache
> even though it is cold, then it would be hard to justify not using the
> cache I think.

So, Heinz and I are actually both looking at xor speed issues, but  
from two different perspectives.  While he's comparing some of the  
dmraid45 xor stuff to the xor_blocks routine in crypto/, I'm  
specifically looking at that "automatically using best checksumming  
function" routine.  For the last 9 or so years, we've automatically  
opted for the SSE + non-temporal store routine specifically because  
it's not supposed to pollute cache.  However, after even just a  
cursory reading of the current Intel architecture optimization guide,  
it's obvious that our SSE routine is getting rather aged, and I think  
the routine is in serious need of an overhaul.  This is something I'm  
currently looking into.  But, that raises the question of how to  
decide whether or not to use it, either in its current form or any new  
form it might take.  As you point out, the tradeoff between cache  
polluting and non-cache polluting is hard to quantify.

We made a significant error when we originally wrote the SSE routines,  
and Heinz just duplicated it.  Specifically, we tested performance on  
a quiescent system.  For the SSE routines, I think this is a *major*  
error.  The prefetch instructions need to be timed such that the  
prefetch happens at roughly the right point in time to compensate for  
the memory latency in getting the data to L1/L2 cache prior to use by  
the CPU.  Unfortunately, memory latency in a system that is quiescent  
is drastically different than latency in a system with several CPUs  
actively competing for RAM resources on top of 100MB/s+ of DMA  
traffic, etc.  When we optimized the routines in a quiescent state, I  
think we got our prefetches too close to when the data was needed by  
the CPU under real world use conditions and that's impacting the  
operation of the routines today (or maybe we did get it right, but  
changes in CPU speed relative to memory latency have caused the best  
prefetch point to change over time, either way the current SSE xor  
routine appears to be seriously underperforming in my benchmark tests).
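
(To illustrate what I mean by the prefetch point, a purely hypothetical
scalar loop -- the distance of 256 bytes is made up and is exactly the
kind of constant that a quiescent-system benchmark tunes wrongly:)

	static void xor_with_prefetch(unsigned long *dst, const unsigned long *src,
				      unsigned words)
	{
		unsigned i;

		for (i = 0; i < words; i++) {
			/* Prefetch ~256 bytes ahead of use; 1 = prefetch for write. */
			__builtin_prefetch(&src[i + 32]);
			__builtin_prefetch(&dst[i + 32], 1);
			dst[i] ^= src[i];
		}
	}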

Likewise, Heinz's tests were comparing cold cache to hot cache and  
trying to find a break over point where we switch from one to the  
other.  But that question necessarily depends on other factors in the  
system including what other cores on the same die are doing as that  
impacts the same cache.

So if the error was to not test and optimize these routines under  
load, then the right course of action would be to do the opposite.   
And that leads me to believe that the best way to quantify the  
difference between cache polluting and non-cache polluting should  
likewise not be done on a quiescent system with a micro benchmark.   
Instead, we need a holistic performance test to get the truly best xor  
algorithm.  In my current setup, the disks are so much faster than the  
single threaded xor thread that the bottleneck is the xor speed.  So,  
what does it matter if the xor routine doesn't pollute cache if the  
raid is so slow that programs are stuck in I/O wait all the time as  
the raid5 thread runs non-stop?  Likewise, who cares what the top  
speed of a cache polluting xor routine is if in the process it evicts  
so many cache pages belonging to the processes doing real work on the  
system that now cache reload becomes the bottleneck.  The ultimate  
goal of either approach is overall *system* speed, not micro benchmark  
speed.  I would suggest a specific, system wide workload test that  
involves a filesystem on a device that uses the particular raid level  
and parity routine you want to test, and then you need to run that  
system workload and get a total time required to perform that specific  
work set, CPU time versus idle+I/O wait time in completing that work  
set, etc.  Repeat the test for the various algorithms you wish to  
test, then analyze the results and go from there.  I don't think  
you're going to get a valid run time test for this, instead we would  
likely need to create a few heuristic rules that, combined with  
specific CPU properties, cause us to choose the right routine for the  
machine.

--

Doug Ledford <dledford@redhat.com>

GPG KeyID: CFBFF194
http://people.redhat.com/dledford

InfiniBand Specific RPMS
http://people.redhat.com/dledford/Infiniband





[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 203 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-07-06  3:21                     ` Neil Brown
  2009-07-07 18:38                       ` Doug Ledford
@ 2009-07-08 18:56                       ` Heinz Mauelshagen
  1 sibling, 0 replies; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-07-08 18:56 UTC (permalink / raw)
  To: Neil Brown
  Cc: Christoph Hellwig, device-mapper development, Dan Williams,
	Ed Ciechanowski

On Mon, 2009-07-06 at 13:21 +1000, Neil Brown wrote:
> On Thursday July 2, heinzm@redhat.com wrote:
> > 
> > Dan, Neil,

Hi,

back after > 4 days of Internet outage caused by lightning :-(

I'll respond to Neil's comments here in order to have a comparable
microbenchmark based on his recommended change
(and one bug I fixed; see below).

> > 
> > like mentioned before I left to LinuxTag last week, here comes an initial
> > take on dm-raid45 warm/cold CPU cache xor speed optimization metrics.
> > 
> > This shall give us the base to decide to keep or drop the dm-raid45
> > internal xor optimization magic or move (part of) it into the crypto
> > subsystem.
> 
> Thanks for doing this.

You're welcome.

> > 
> > 
> > Intel results with 128 iterations each:
> > ---------------------------------------
> > 
> > 1 stripe  : NB:10 111/80 HM:118 111/82
> > 2 stripes : NB:25 113/87 HM:103 112/91
> > 3 stripes : NB:24 115/93 HM:104 114/93
> > 4 stripes : NB:48 114/93 HM:80 114/93
> > 5 stripes : NB:38 113/94 HM:90 114/94
> > 6 stripes : NB:25 116/94 HM:103 114/94
> > 7 stripes : NB:25 115/95 HM:103 115/95
> > 8 stripes : NB:62 117/96 HM:66 116/95 <<<--- cold cache starts here
> > 9 stripes : NB:66 117/96 HM:62 116/95
> > 10 stripes: NB:73 117/96 HM:55 114/95
> > 11 stripes: NB:63 114/96 HM:65 112/95
> > 12 stripes: NB:51 111/96 HM:77 110/95
> > 13 stripes: NB:65 109/96 HM:63 112/95
> 
> These results seem to suggest that the two different routines provide
> very similar results on this hardware, particularly when the cache is cold.
> The high degree of variability might be because you have dropped this:
> 
> > -	/* Wait for next tick. */
> > -	for (j = jiffies; j == jiffies; )
> > -		;
> ??
> Without that, it could be running the test over anything from 4 to 5
> jiffies.
> I note that do_xor_speed in crypto/xor.c doesn't synchronise at the
> start either.  I think that is a bug.
> The variability seem to generally be close to 20%, which is consistent
> with the difference between 4 and 5.
> 
> Could you put that loop back in and re-test?
> 

Reintroduced that and reran the tests.

In addition to that I fixed a flaw which led to
dm-raid45.c:xor_optimize() running xor_speed() with more chunks than raid
devices, which doesn't make sense and led to longer test runs and
erroneous chunk values (e.g. 7 when only 3 raid devices were configured).
Hence we could end up with an algorithm claiming it was selected
for more chunks than there are raid devices.
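
The fix boils down to clamping the chunk count (a sketch of the idea,
not the literal patch hunk):

	/* Never try to xor more chunks than there are raid devices. */
	if (rs->xor.chunks > rs->set.raid_devs)
		rs->xor.chunks = rs->set.raid_devs;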

Here's the new results:

Intel Core i7:
--------------
1 stripe  : NB:54 114/94 HM:74 113/93
2 stripes : NB:57 116/94 HM:71 115/94
3 stripes : NB:64 115/94 HM:64 114/94
4 stripes : NB:51 112/94 HM:77 114/94
5 stripes : NB:77 115/94 HM:51 114/94
6 stripes : NB:25 111/89 HM:103 105/90
7 stripes : NB:13 105/91 HM:115 111/90
8 stripes : NB:27 108/92 HM:101 111/93
9 stripes : NB:29 113/92 HM:99 114/93
10 stripes: NB:41 110/92 HM:87 112/93
11 stripes: NB:34 105/92 HM:94 107/93
12 stripes: NB:51 114/93 HM:77 114/93
13 stripes: NB:54 115/94 HM:74 114/93
14 stripes: NB:64 115/94 HM:64 114/93


AMD Opteron:
------------
1 stripe  : NB:0 25/17 HM:128 48/38
2 stripes : NB:0 24/18 HM:128 46/36
3 stripes : NB:0 25/18 HM:128 47/37
4 stripes : NB:0 27/19 HM:128 48/41
5 stripes : NB:0 30/18 HM:128 49/40
6 stripes : NB:0 27/19 HM:128 49/40
7 stripes : NB:0 29/18 HM:128 49/39
8 stripes : NB:0 26/19 HM:128 49/40
9 stripes : NB:0 28/19 HM:128 51/41
10 stripes: NB:0 28/18 HM:128 50/41
11 stripes: NB:0 31/19 HM:128 49/40
12 stripes: NB:0 28/19 HM:128 50/40
13 stripes: NB:0 26/19 HM:128 50/40
14 stripes: NB:0 27/20 HM:128 49/40


Still too much variability...



> > 
> > Opteron results with 128 iterations each:
> > -----------------------------------------
> > 1 stripe  : NB:0 30/20 HM:128 64/53
> > 2 stripes : NB:0 31/21 HM:128 68/55
> > 3 stripes : NB:0 31/22 HM:128 68/57
> > 4 stripes : NB:0 32/22 HM:128 70/61
> > 5 stripes : NB:0 32/22 HM:128 70/63
> > 6 stripes : NB:0 35/22 HM:128 70/64
> > 7 stripes : NB:0 32/23 HM:128 69/63
> > 8 stripes : NB:0 44/23 HM:128 76/65
> > 9 stripes : NB:0 43/23 HM:128 73/65
> > 10 stripes: NB:0 35/23 HM:128 72/64
> > 11 stripes: NB:0 35/24 HM:128 72/64
> > 12 stripes: NB:0 33/24 HM:128 72/65
> > 13 stripes: NB:0 33/23 HM:128 71/64
> 
> Here your code seems to be 2-3 times faster!
> Can you check which function xor_block is using?
> If it is :
>   xor: automatically using best checksumming function: ....
> then it might be worth disabling that test in calibrate_xor_blocks and
> see if it picks one that ends up being faster.

The same generic_sse function is picked on both archs, whether chosen
automatically or measured, with obvious variability:

[37414.875236] xor: automatically using best checksumming function:
generic_sse
[37414.893930]    generic_sse: 12619.000 MB/sec
[37414.893932] xor: using function: generic_sse (12619.000 MB/sec)
[37445.679501] xor: measuring software checksum speed
[37445.696829]    generic_sse: 15375.000 MB/sec
[37445.696830] xor: using function: generic_sse (15375.000 MB/sec)


Will get to Doug's recommendation to run loaded benchmarks tomorrow...

Heinz

> 
> There is still the fact that by using the cache for data that will be
> accessed once, we are potentially slowing down the rest of the system.
> i.e. the reason to avoid the cache is not just because it won't
> benefit the xor much, but because it will hurt other users.
> I don't know how to measure that effect :-(
> But if avoiding the cache makes xor 1/3 the speed of using the cache
> even though it is cold, then it would be hard to justify not using the
> cache I think.
> 
> > 
> > Questions/Recommendations:
> > --------------------------
> > Review the code changes and the data analysis please.
> 
> It seems to mostly make sense
>  - the 'wait for next tick' should stay
>  - it would be interesting to see what the final choice of 'chunks'
>    was (i.e. how many to xor together at a time).
>   
> 
> Thanks!
> 
> NeilBrown

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-07-07 18:38                       ` Doug Ledford
@ 2009-07-10 15:23                         ` Heinz Mauelshagen
  2009-07-11 12:44                           ` Doug Ledford
  0 siblings, 1 reply; 23+ messages in thread
From: Heinz Mauelshagen @ 2009-07-10 15:23 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Christoph Hellwig, device-mapper development, Dan Williams,
	Ed Ciechanowski

[-- Attachment #1: Type: text/plain, Size: 4912 bytes --]

On Tue, 2009-07-07 at 14:38 -0400, Doug Ledford wrote:
> On Jul 5, 2009, at 11:21 PM, Neil Brown wrote:
> > Here your code seems to be 2-3 times faster!
> > Can you check which function xor_block is using?
> > If it is :
> >  xor: automatically using best checksumming function: ....
> > then it might be worth disabling that test in calibrate_xor_blocks and
> > see if it picks one that ends up being faster.
> >
> > There is still the fact that by using the cache for data that will be
> > accessed once, we are potentially slowing down the rest of the system.
> > i.e. the reason to avoid the cache is not just because it won't
> > benefit the xor much, but because it will hurt other users.
> > I don't know how to measure that effect :-(
> > But if avoiding the cache makes xor 1/3 the speed of using the cache
> > even though it is cold, then it would be hard to justify not using the
> > cache I think.
> 
> So, Heinz and I are actually both looking at xor speed issues, but  
> from two different perspectives.  While he's comparing some of the  
> dmraid45 xor stuff to the xor_blocks routine in crypto/, I'm  
<SNIP>
> So if the error was to not test and optimize these routines under  
> load, then the right course of action would be to do the opposite.   
> And that leads me to believe that the best way to quantify the  
> difference between cache polluting and non-cache polluting should  
> likewise not be done on a quiescent system with a micro benchmark.   
> Instead, we need a holistic performance test to get the truly best xor  
> algorithm.  In my current setup, the disks are so much faster than the  
> single threaded xor thread that the bottleneck is the xor speed.  So,  
> what does it matter if the xor routine doesn't pollute cache if the  
> raid is so slow that programs are stuck in I/O wait all the time as  
> the raid5 thread runs non-stop?  Likewise, who cares what the top  
> speed of a cache polluting xor routine is if in the process it evicts  
> so many cache pages belonging to the processes doing real work on the  
> system that now cache reload becomes the bottleneck.  The ultimate  
> goal of either approach is overall *system* speed, not micro benchmark  
> speed.  I would suggest a specific, system wide workload test that  
> involves a filesystem on a device that uses the particular raid level  
> and parity routine you want to test, and then you need to run that  
> system workload and get a total time required to perform that specific  
> work set, CPU time versus idle+I/O wait time in completing that work  
> set, etc.  Repeat the test for the various algorithms you wish to  
> test, then analyze the results and go from there.  I don't think  
> you're going to get a valid run time test for this, instead we would  
> likely need to create a few heuristic rules that, combined with  
> specific CPU properties, cause us to choose the right routine for the  
> machine.

Doug,

I extended dm-raid45's message interface to support changing the xor
algorithm and # of chunks, allowing for changes of the algorithm being
used at runtime.

This I used to perform a bunch of mkfs write intensive tests on the
Intel Core i7 system as an initial write load test case. The tests have
been run on 8 disks faked onto one SSD using LVM (~200MB sustained
writes throughput):

for a in xor_blocks
do
	for c in $(seq 2 6)
	do
		echo -e "$a $c\n---------------"
		dmsetup message r5 0 xor $a $c
		for i in $(seq 6)
		do
			time mkfs -t ext3 /dev/mapper/r5
		done
	done
done > xor_blocks.out 2>&1

for a in xor_8 xor_16 xor_32 xor_64
do
	for c in $(seq 2 8)
	do
		echo -e "$a $c\n---------------"
		dmsetup message r5 0 xor $a $c
		for i in $(seq 6)
		do
			time mkfs -t ext3 /dev/mapper/r5
		done
	done
done > xor_8-64.out 2>&1

Mapping table for r5:
0 146800640 raid45 core 2 8192 nosync  raid5_la 7 64 128 8 -1 10 nosync 1  8 -1 \
/dev/tst/raiddev_1 0 /dev/tst/raiddev_2 0 /dev/tst/raiddev_3 0 /dev/tst/raiddev_4 0 \
/dev/tst/raiddev_5 0 /dev/tst/raiddev_6 0 /dev/tst/raiddev_7 0 /dev/tst/raiddev_8 0

I attached filtered output files xor_blocks_1.txt and xor_8-64_1.txt,
which contain the time information for all the above algorithm/#chunks
settings.


Real time minima:

# egrep '^real' xor_blocks_1.txt|sort|head -1
real    0m14.508s
# egrep '^real' xor_8-64_1.txt|sort|head -1
real    0m14.430s


System time minima:

[root@a4 dm-tests]# egrep '^sys' xor_blocks_1.txt|sort|head -1
sys     0m0.460s
# egrep '^sys' xor_8-64_1.txt|sort|head -1
sys     0m0.444s

User time is negligible.


This mkfs test case indicates better performance for certain dm-raid45
xor() settings vs. xor_blocks(). I can get to dbench etc. after my
vacation in week 31.


Heinz


> 
> --
> 
> Doug Ledford <dledford@redhat.com>
> 
> GPG KeyID: CFBFF194
> http://people.redhat.com/dledford
> 
> InfiniBand Specific RPMS
> http://people.redhat.com/dledford/Infiniband
> 
> 
> 
> 

[-- Attachment #2: xor_blocks_1.txt --]
[-- Type: text/plain, Size: 1405 bytes --]

xor_blocks 2
---------------
real	0m14.513s
user	0m0.000s
sys	0m0.568s
real	0m14.721s
user	0m0.012s
sys	0m0.476s
real	0m14.792s
user	0m0.016s
sys	0m0.568s
real	0m15.037s
user	0m0.008s
sys	0m0.512s
real	0m14.514s
user	0m0.016s
sys	0m0.564s
real	0m14.508s
user	0m0.024s
sys	0m0.512s
xor_blocks 3
---------------
real	0m14.786s
user	0m0.008s
sys	0m0.504s
real	0m14.538s
user	0m0.004s
sys	0m0.504s
real	0m14.738s
user	0m0.012s
sys	0m0.516s
real	0m14.704s
user	0m0.016s
sys	0m0.520s
real	0m14.767s
user	0m0.016s
sys	0m0.500s
real	0m14.510s
user	0m0.020s
sys	0m0.556s
xor_blocks 4
---------------
real	0m14.643s
user	0m0.004s
sys	0m0.536s
real	0m14.647s
user	0m0.032s
sys	0m0.512s
real	0m14.748s
user	0m0.020s
sys	0m0.552s
real	0m14.825s
user	0m0.024s
sys	0m0.520s
real	0m14.829s
user	0m0.008s
sys	0m0.512s
real	0m14.515s
user	0m0.004s
sys	0m0.536s
xor_blocks 5
---------------
real	0m14.764s
user	0m0.008s
sys	0m0.524s
real	0m14.593s
user	0m0.012s
sys	0m0.540s
real	0m14.783s
user	0m0.012s
sys	0m0.504s
real	0m14.632s
user	0m0.008s
sys	0m0.512s
real	0m14.806s
user	0m0.008s
sys	0m0.488s
real	0m14.780s
user	0m0.012s
sys	0m0.528s
xor_blocks 6
---------------
real	0m14.813s
user	0m0.012s
sys	0m0.512s
real	0m14.725s
user	0m0.008s
sys	0m0.524s
real	0m14.518s
user	0m0.016s
sys	0m0.460s
real	0m14.784s
user	0m0.028s
sys	0m0.548s
real	0m14.994s
user	0m0.012s
sys	0m0.516s
real	0m14.803s
user	0m0.012s
sys	0m0.512s

[-- Attachment #3: xor_8-64_1.txt --]
[-- Type: text/plain, Size: 7749 bytes --]

xor_8 2
---------------
real	0m14.518s
user	0m0.024s
sys	0m0.504s
real	0m14.611s
user	0m0.016s
sys	0m0.508s
real	0m14.838s
user	0m0.020s
sys	0m0.500s
real	0m14.837s
user	0m0.008s
sys	0m0.512s
real	0m14.652s
user	0m0.024s
sys	0m0.460s
real	0m14.954s
user	0m0.016s
sys	0m0.556s
xor_8 3
---------------
real	0m14.866s
user	0m0.004s
sys	0m0.560s
real	0m14.736s
user	0m0.008s
sys	0m0.560s
real	0m14.643s
user	0m0.012s
sys	0m0.444s
real	0m14.817s
user	0m0.012s
sys	0m0.556s
real	0m14.644s
user	0m0.008s
sys	0m0.496s
real	0m14.747s
user	0m0.008s
sys	0m0.568s
xor_8 4
---------------
real	0m14.504s
user	0m0.000s
sys	0m0.568s
real	0m14.889s
user	0m0.012s
sys	0m0.516s
real	0m14.813s
user	0m0.020s
sys	0m0.500s
real	0m14.781s
user	0m0.020s
sys	0m0.496s
real	0m14.657s
user	0m0.012s
sys	0m0.500s
real	0m14.810s
user	0m0.020s
sys	0m0.488s
xor_8 5
---------------
real	0m14.805s
user	0m0.016s
sys	0m0.524s
real	0m14.956s
user	0m0.024s
sys	0m0.520s
real	0m14.619s
user	0m0.012s
sys	0m0.468s
real	0m14.902s
user	0m0.008s
sys	0m0.484s
real	0m14.800s
user	0m0.008s
sys	0m0.512s
real	0m14.866s
user	0m0.008s
sys	0m0.516s
xor_8 6
---------------
real	0m14.834s
user	0m0.032s
sys	0m0.476s
real	0m14.661s
user	0m0.008s
sys	0m0.560s
real	0m14.809s
user	0m0.016s
sys	0m0.528s
real	0m14.828s
user	0m0.016s
sys	0m0.568s
real	0m14.801s
user	0m0.008s
sys	0m0.516s
real	0m14.811s
user	0m0.012s
sys	0m0.524s
xor_8 7
---------------
real	0m14.889s
user	0m0.020s
sys	0m0.520s
real	0m14.525s
user	0m0.012s
sys	0m0.548s
real	0m14.767s
user	0m0.008s
sys	0m0.560s
real	0m14.803s
user	0m0.012s
sys	0m0.584s
real	0m14.641s
user	0m0.016s
sys	0m0.608s
real	0m14.810s
user	0m0.016s
sys	0m0.500s
xor_8 8
---------------
real	0m14.719s
user	0m0.016s
sys	0m0.540s
real	0m14.825s
user	0m0.016s
sys	0m0.572s
real	0m14.842s
user	0m0.008s
sys	0m0.552s
real	0m14.811s
user	0m0.016s
sys	0m0.508s
real	0m14.518s
user	0m0.012s
sys	0m0.544s
real	0m14.768s
user	0m0.024s
sys	0m0.500s
xor_16 2
---------------
real	0m14.839s
user	0m0.008s
sys	0m0.576s
real	0m14.517s
user	0m0.020s
sys	0m0.528s
real	0m14.810s
user	0m0.008s
sys	0m0.532s
real	0m14.888s
user	0m0.028s
sys	0m0.520s
real	0m14.811s
user	0m0.012s
sys	0m0.544s
real	0m14.794s
user	0m0.012s
sys	0m0.472s
xor_16 3
---------------
real	0m14.766s
user	0m0.008s
sys	0m0.512s
real	0m14.809s
user	0m0.020s
sys	0m0.488s
real	0m14.582s
user	0m0.008s
sys	0m0.500s
real	0m14.767s
user	0m0.008s
sys	0m0.552s
real	0m14.899s
user	0m0.008s
sys	0m0.528s
real	0m14.812s
user	0m0.004s
sys	0m0.524s
xor_16 4
---------------
real	0m14.827s
user	0m0.004s
sys	0m0.528s
real	0m14.769s
user	0m0.008s
sys	0m0.588s
real	0m14.541s
user	0m0.012s
sys	0m0.572s
real	0m14.788s
user	0m0.016s
sys	0m0.592s
real	0m15.482s
user	0m0.004s
sys	0m0.568s
real	0m14.780s
user	0m0.020s
sys	0m0.524s
xor_16 5
---------------
real	0m14.686s
user	0m0.024s
sys	0m0.500s
real	0m14.782s
user	0m0.012s
sys	0m0.468s
real	0m14.802s
user	0m0.008s
sys	0m0.456s
real	0m14.896s
user	0m0.008s
sys	0m0.548s
real	0m14.821s
user	0m0.004s
sys	0m0.532s
real	0m14.806s
user	0m0.028s
sys	0m0.492s
xor_16 6
---------------
real	0m14.735s
user	0m0.004s
sys	0m0.576s
real	0m14.926s
user	0m0.024s
sys	0m0.564s
real	0m14.912s
user	0m0.016s
sys	0m0.528s
real	0m14.830s
user	0m0.016s
sys	0m0.492s
real	0m14.751s
user	0m0.020s
sys	0m0.524s
real	0m14.492s
user	0m0.012s
sys	0m0.500s
xor_16 7
---------------
real	0m14.821s
user	0m0.016s
sys	0m0.444s
real	0m14.714s
user	0m0.012s
sys	0m0.476s
real	0m14.956s
user	0m0.008s
sys	0m0.544s
real	0m14.755s
user	0m0.012s
sys	0m0.552s
real	0m14.605s
user	0m0.004s
sys	0m0.488s
real	0m14.750s
user	0m0.012s
sys	0m0.564s
xor_16 8
---------------
real	0m14.702s
user	0m0.012s
sys	0m0.460s
real	0m14.797s
user	0m0.012s
sys	0m0.472s
real	0m14.629s
user	0m0.016s
sys	0m0.572s
real	0m14.841s
user	0m0.012s
sys	0m0.488s
real	0m14.768s
user	0m0.020s
sys	0m0.472s
real	0m14.483s
user	0m0.008s
sys	0m0.532s
xor_32 2
---------------
real	0m19.783s
user	0m0.004s
sys	0m0.528s
real	0m14.670s
user	0m0.012s
sys	0m0.448s
real	0m14.913s
user	0m0.020s
sys	0m0.496s
real	0m14.816s
user	0m0.012s
sys	0m0.524s
real	0m14.874s
user	0m0.016s
sys	0m0.560s
real	0m14.815s
user	0m0.004s
sys	0m0.572s
xor_32 3
---------------
real	0m14.751s
user	0m0.016s
sys	0m0.512s
real	0m14.605s
user	0m0.008s
sys	0m0.508s
real	0m14.699s
user	0m0.004s
sys	0m0.576s
real	0m14.674s
user	0m0.004s
sys	0m0.512s
real	0m14.872s
user	0m0.012s
sys	0m0.540s
real	0m14.801s
user	0m0.024s
sys	0m0.504s
xor_32 4
---------------
real	0m14.780s
user	0m0.028s
sys	0m0.504s
real	0m14.802s
user	0m0.008s
sys	0m0.500s
real	0m14.624s
user	0m0.008s
sys	0m0.516s
real	0m14.779s
user	0m0.028s
sys	0m0.536s
real	0m14.953s
user	0m0.012s
sys	0m0.544s
real	0m14.571s
user	0m0.016s
sys	0m0.500s
xor_32 5
---------------
real	0m14.843s
user	0m0.008s
sys	0m0.544s
real	0m14.822s
user	0m0.016s
sys	0m0.540s
real	0m14.583s
user	0m0.016s
sys	0m0.520s
real	0m15.138s
user	0m0.008s
sys	0m0.508s
real	0m14.718s
user	0m0.012s
sys	0m0.548s
real	0m14.547s
user	0m0.012s
sys	0m0.552s
xor_32 6
---------------
real	0m14.744s
user	0m0.012s
sys	0m0.488s
real	0m14.856s
user	0m0.016s
sys	0m0.532s
real	0m14.717s
user	0m0.024s
sys	0m0.552s
real	0m14.777s
user	0m0.008s
sys	0m0.564s
real	0m14.761s
user	0m0.016s
sys	0m0.496s
real	0m14.706s
user	0m0.012s
sys	0m0.560s
xor_32 7
---------------
real	0m14.790s
user	0m0.004s
sys	0m0.568s
real	0m14.797s
user	0m0.016s
sys	0m0.488s
real	0m14.708s
user	0m0.012s
sys	0m0.512s
real	0m14.838s
user	0m0.016s
sys	0m0.512s
real	0m14.748s
user	0m0.008s
sys	0m0.476s
real	0m14.507s
user	0m0.008s
sys	0m0.512s
xor_32 8
---------------
real	0m15.055s
user	0m0.004s
sys	0m0.468s
real	0m14.839s
user	0m0.016s
sys	0m0.564s
real	0m14.551s
user	0m0.020s
sys	0m0.468s
real	0m14.789s
user	0m0.020s
sys	0m0.488s
real	0m14.495s
user	0m0.004s
sys	0m0.556s
real	0m14.852s
user	0m0.032s
sys	0m0.552s
xor_64 2
---------------
real	0m14.749s
user	0m0.028s
sys	0m0.472s
real	0m14.576s
user	0m0.016s
sys	0m0.544s
real	0m14.880s
user	0m0.004s
sys	0m0.496s
real	0m14.789s
user	0m0.016s
sys	0m0.588s
real	0m14.504s
user	0m0.020s
sys	0m0.568s
real	0m14.847s
user	0m0.016s
sys	0m0.548s
xor_64 3
---------------
real	0m14.812s
user	0m0.012s
sys	0m0.492s
real	0m23.521s
user	0m0.012s
sys	0m0.552s
real	0m14.580s
user	0m0.004s
sys	0m0.552s
real	0m14.711s
user	0m0.028s
sys	0m0.524s
real	0m14.817s
user	0m0.016s
sys	0m0.544s
real	0m14.773s
user	0m0.008s
sys	0m0.468s
xor_64 4
---------------
real	0m14.722s
user	0m0.008s
sys	0m0.516s
real	0m14.881s
user	0m0.008s
sys	0m0.520s
real	0m14.821s
user	0m0.012s
sys	0m0.520s
real	0m15.190s
user	0m0.020s
sys	0m0.456s
real	0m14.780s
user	0m0.016s
sys	0m0.448s
real	0m14.762s
user	0m0.004s
sys	0m0.564s
xor_64 5
---------------
real	0m14.688s
user	0m0.016s
sys	0m0.488s
real	0m14.559s
user	0m0.004s
sys	0m0.528s
real	0m14.829s
user	0m0.020s
sys	0m0.520s
real	0m14.818s
user	0m0.016s
sys	0m0.500s
real	0m14.812s
user	0m0.008s
sys	0m0.500s
real	0m14.804s
user	0m0.004s
sys	0m0.480s
xor_64 6
---------------
real	0m14.742s
user	0m0.024s
sys	0m0.476s
real	0m14.882s
user	0m0.020s
sys	0m0.528s
real	0m14.589s
user	0m0.012s
sys	0m0.512s
real	0m14.832s
user	0m0.004s
sys	0m0.504s
real	0m14.638s
user	0m0.012s
sys	0m0.444s
real	0m14.767s
user	0m0.008s
sys	0m0.536s
xor_64 7
---------------
real	0m14.790s
user	0m0.012s
sys	0m0.560s
real	0m14.749s
user	0m0.016s
sys	0m0.476s
real	0m14.430s
user	0m0.016s
sys	0m0.540s
real	0m14.694s
user	0m0.012s
sys	0m0.556s
real	0m14.567s
user	0m0.016s
sys	0m0.488s
real	0m14.753s
user	0m0.016s
sys	0m0.536s
xor_64 8
---------------
real	0m14.816s
user	0m0.008s
sys	0m0.544s
real	0m14.704s
user	0m0.020s
sys	0m0.516s
real	0m14.613s
user	0m0.012s
sys	0m0.548s
real	0m14.900s
user	0m0.008s
sys	0m0.532s
real	0m14.586s
user	0m0.012s
sys	0m0.464s
real	0m14.692s
user	0m0.016s
sys	0m0.520s

[-- Attachment #4: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-07-10 15:23                         ` Heinz Mauelshagen
@ 2009-07-11 12:44                           ` Doug Ledford
  2009-07-12  2:56                             ` Dan Williams
  0 siblings, 1 reply; 23+ messages in thread
From: Doug Ledford @ 2009-07-11 12:44 UTC (permalink / raw)
  To: heinzm
  Cc: Christoph Hellwig, device-mapper development, Dan Williams,
	Ed Ciechanowski


[-- Attachment #1.1: Type: text/plain, Size: 3248 bytes --]

On Jul 10, 2009, at 11:23 AM, Heinz Mauelshagen wrote:
> Doug,
>
> I extended dm-raid45's message interface to support changing the xor
> algorithm and # of chunks, allowing for changes of the algorithm being
> used at runtime.

Very useful indeed.  I may send you some routines to be tested at some  
point in the future if you don't mind ;-)

> This I used to perform a bunch of mkfs write intensive tests on the
> Intel Core i7 system as an initial write load test case. The tests  
> have
> been run on 8 disks faked onto one SSD using LVM (~200MB sustained
> writes throughput):

That's a little slower than I think you need for a good test.  I'm not  
even sure I'm satisfied that my current SATA array is sufficient and I  
can get at least 500MB/s of write throughput to the disks using a  
raid0, possibly more if I can get a better eSATA port.

> for a in xor_blocks
> do
> 	for c in $(seq 2 6)
> 	do
> 		echo -e "$a $c\n---------------"
> 		dmsetup message r5 0 xor $a $c
> 		for i in $(seq 6)do
> 			time mkfs -t ext3 /dev/mapper/r5
> 		done
> 	done
> done > xor_blocks.out 2>&1
>
> for a in xor_8 xor_16 xor_32 xor_64
> do
> 	for c in $(seq 2 8)
> 	do
> 		echo -e "$a $c\n---------------"
> 		dmsetup message r5 0 xor $a $c
> 		for i in $(seq 6)
> 		do
> 			time mkfs -t ext3 /dev/mapper/r5
> 		done
> 	done
> done > xor_8-64.out 2>&1
>
> Mapping table for r5:
> 0 146800640 raid45 core 2 8192 nosync  raid5_la 7 64 128 8 -1 10  
> nosync 1  8 -1 \
> /dev/tst/raiddev_1 0 /dev/tst/raiddev_2 0 /dev/tst/raiddev_3 0 /dev/ 
> tst/raiddev_4 0 \
> /dev/tst/raiddev_5 0 /dev/tst/raiddev_6 0 /dev/tst/raiddev_7 0 /dev/ 
> tst/raiddev_8 0
>
> I attached filtered output files xor_blocks_1.txt and xor_8-64_1.txt,
> which contain the time information for all the above algorithm/#chunks
> settings.
>
>
> Real time minima:
>
> # egrep '^real' xor_blocks_1.txt|sort|head -1
> real    0m14.508s
> # egrep '^real' xor_8-64_1.txt|sort|head -1
> real    0m14.430s
>
>
> System time minima:
>
> [root@a4 dm-tests]# egrep '^sys' xor_blocks_1.txt|sort|head -1
> sys     0m0.460s
> # egrep '^sys' xor_8-64_1.txt|sort|head -1
> sys     0m0.444s
>
> User time is negligible.
>
>
> This mkfs test case indicates better performance for certain dm-raid45
> xor() settings vs. xor_blocks(). I can get to dbench etc. after my
> vacation in week 31.

Thanks.  This isn't too far off from what I would expect.  I would say  
that real world loads fall all along a spectrum from "create lots of  
writes, but do little to no work" to "does lots of work, and only  
sporadically writes".  It's the latter end of this spectrum that is
most likely to be helped by the cache avoiding routines, while the  
former is not.  So, one of the tests I had in mind was to use  
something like timing a complete kernel build, or doing a database  
load/report cycle, or some other things like that.  Things that do  
actual work in the foreground while the raid is kept busy in the  
background.  Of course, testing all the various points along the  
spectrum is needed, so this test gets us the first.


--

Doug Ledford <dledford@redhat.com>

GPG KeyID: CFBFF194
http://people.redhat.com/dledford

InfiniBand Specific RPMS
http://people.redhat.com/dledford/Infiniband





[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 203 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one
  2009-07-11 12:44                           ` Doug Ledford
@ 2009-07-12  2:56                             ` Dan Williams
  0 siblings, 0 replies; 23+ messages in thread
From: Dan Williams @ 2009-07-12  2:56 UTC (permalink / raw)
  To: device-mapper development; +Cc: Christoph Hellwig, heinzm, Ed Ciechanowski

On Sat, Jul 11, 2009 at 5:44 AM, Doug Ledford<dledford@redhat.com> wrote:
> Thanks.  This isn't too far off from what I would expect.  I would say that
> real world loads fall all along a spectrum from "create lots of writes, but
> do little to no work" to "does lots of work, and only sporadically writes".
>  It's the later end of this spectrum that is most likely to be helped by the
> cache avoiding routines, while the former is not.  So, one of the tests I
> had in mind was to use something like timing a complete kernel build, or
> doing a database load/report cycle, or some other things like that.  Things
> that do actual work in the foreground while the raid is kept busy in the
> background.  Of course, testing all the various points along the spectrum is
> needed, so this test gets us the first.

This reminds me of the testing I did when quantifying the benefit of
hardware accelerated raid5.  I played with kernel builds while
resyncing to show the cache and cpu-utilization savings, but never got
that to settle on a solid number.  I took a look at Con Kolivas'
"contest" benchmark [1], but ultimately just published plain iozone
data [2].  The interesting bit is that cpu limited random writes saw
more throughput improvement than streaming writes because the i/o
processing did not need to compete with management of the stripe
cache.

--
Dan

[1]: http://users.on.net/~ckolivas/contest/
[2]: http://sourceforge.net/projects/xscaleiop/files/MD RAID
Acceleration/iop-iozone-graphs-20061010.tar.bz2/download

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2009-07-12  2:56 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-15 17:21 [PATCH 1/6] dm raid45 target: export region hash functions and add a needed one heinzm
2009-06-16 14:09 ` Christoph Hellwig
2009-06-16 14:51   ` Heinz Mauelshagen
2009-06-16 17:55     ` Dan Williams
2009-06-16 19:11       ` Heinz Mauelshagen
2009-06-16 19:48         ` Dan Williams
2009-06-16 22:46         ` Neil Brown
2009-06-18 16:08           ` Jonathan Brassow
2009-06-19  1:43           ` Neil Brown
2009-06-19 10:33             ` Heinz Mauelshagen
2009-06-21  0:32               ` Dan Williams
2009-06-21 12:06               ` Neil Brown
2009-06-22 12:25                 ` Neil Brown
2009-06-22 19:10                 ` Heinz Mauelshagen
2009-07-02 12:52                   ` Heinz Mauelshagen
2009-07-06  3:21                     ` Neil Brown
2009-07-07 18:38                       ` Doug Ledford
2009-07-10 15:23                         ` Heinz Mauelshagen
2009-07-11 12:44                           ` Doug Ledford
2009-07-12  2:56                             ` Dan Williams
2009-07-08 18:56                       ` Heinz Mauelshagen
2009-06-18 16:39 ` Jonathan Brassow
2009-06-18 20:01   ` Heinz Mauelshagen
