All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
@ 2009-07-13 14:11 David Woodhouse
  2009-07-15 19:23 ` Dan Williams
  0 siblings, 1 reply; 34+ messages in thread
From: David Woodhouse @ 2009-07-13 14:11 UTC (permalink / raw)
  To: chris.mason, linux-btrfs; +Cc: neilb, linux-raid

We'll want to use these in btrfs too.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/md/Kconfig                           |    5 +-
 drivers/md/Makefile                          |   76 -------------------------
 lib/Kconfig                                  |    3 +
 lib/Makefile                                 |    1 +
 lib/raid6/Makefile                           |   78 ++++++++++++++++++++++++++
 {drivers/md => lib/raid6}/mktables.c         |    0
 {drivers/md => lib/raid6}/raid6algos.c       |    0
 {drivers/md => lib/raid6}/raid6altivec.uc    |    0
 {drivers/md => lib/raid6}/raid6int.uc        |    0
 {drivers/md => lib/raid6}/raid6mmx.c         |    0
 {drivers/md => lib/raid6}/raid6recov.c       |    0
 {drivers/md => lib/raid6}/raid6sse1.c        |    0
 {drivers/md => lib/raid6}/raid6sse2.c        |    0
 {drivers/md => lib/raid6}/raid6test/Makefile |    0
 {drivers/md => lib/raid6}/raid6test/test.c   |    0
 {drivers/md => lib/raid6}/raid6x86.h         |    0
 {drivers/md => lib/raid6}/unroll.pl          |    0
 17 files changed, 83 insertions(+), 80 deletions(-)
 create mode 100644 lib/raid6/Makefile
 rename {drivers/md => lib/raid6}/mktables.c (100%)
 rename {drivers/md => lib/raid6}/raid6algos.c (100%)
 rename {drivers/md => lib/raid6}/raid6altivec.uc (100%)
 rename {drivers/md => lib/raid6}/raid6int.uc (100%)
 rename {drivers/md => lib/raid6}/raid6mmx.c (100%)
 rename {drivers/md => lib/raid6}/raid6recov.c (100%)
 rename {drivers/md => lib/raid6}/raid6sse1.c (100%)
 rename {drivers/md => lib/raid6}/raid6sse2.c (100%)
 rename {drivers/md => lib/raid6}/raid6test/Makefile (100%)
 rename {drivers/md => lib/raid6}/raid6test/test.c (100%)
 rename {drivers/md => lib/raid6}/raid6x86.h (100%)
 rename {drivers/md => lib/raid6}/unroll.pl (100%)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675..42bf294 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -121,7 +121,7 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
-	select MD_RAID6_PQ
+	select RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
 	---help---
@@ -152,9 +152,6 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MD_RAID6_PQ
-	tristate
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 45cc595..2c0b697 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -10,13 +10,6 @@ dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 dm-mirror-y	+= dm-raid1.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
-raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
-		   raid6int1.o raid6int2.o raid6int4.o \
-		   raid6int8.o raid6int16.o raid6int32.o \
-		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
-		   raid6altivec8.o \
-		   raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y	+= mktables
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
@@ -27,7 +20,6 @@ obj-$(CONFIG_MD_LINEAR)		+= linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
-obj-$(CONFIG_MD_RAID6_PQ)	+= raid6_pq.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
@@ -40,75 +32,7 @@ obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
-quiet_cmd_unroll = UNROLL  $@
-      cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
-                   < $< > $@ || ( rm -f $@ && exit 1 )
-
-ifeq ($(CONFIG_ALTIVEC),y)
-altivec_flags := -maltivec -mabi=altivec
-endif
-
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
 endif
 
-targets += raid6int1.c
-$(obj)/raid6int1.c:   UNROLL := 1
-$(obj)/raid6int1.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int2.c
-$(obj)/raid6int2.c:   UNROLL := 2
-$(obj)/raid6int2.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int4.c
-$(obj)/raid6int4.c:   UNROLL := 4
-$(obj)/raid6int4.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int8.c
-$(obj)/raid6int8.c:   UNROLL := 8
-$(obj)/raid6int8.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int16.c
-$(obj)/raid6int16.c:  UNROLL := 16
-$(obj)/raid6int16.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int32.c
-$(obj)/raid6int32.c:  UNROLL := 32
-$(obj)/raid6int32.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec1.o += $(altivec_flags)
-targets += raid6altivec1.c
-$(obj)/raid6altivec1.c:   UNROLL := 1
-$(obj)/raid6altivec1.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec2.o += $(altivec_flags)
-targets += raid6altivec2.c
-$(obj)/raid6altivec2.c:   UNROLL := 2
-$(obj)/raid6altivec2.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec4.o += $(altivec_flags)
-targets += raid6altivec4.c
-$(obj)/raid6altivec4.c:   UNROLL := 4
-$(obj)/raid6altivec4.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec8.o += $(altivec_flags)
-targets += raid6altivec8.c
-$(obj)/raid6altivec8.c:   UNROLL := 8
-$(obj)/raid6altivec8.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
-	$(call if_changed,unroll)
-
-quiet_cmd_mktable = TABLE   $@
-      cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
-
-targets += raid6tables.c
-$(obj)/raid6tables.c: $(obj)/mktables FORCE
-	$(call if_changed,mktable)
diff --git a/lib/Kconfig b/lib/Kconfig
index 8ade0a7..18e0b5f 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -7,6 +7,9 @@ config BINARY_PRINTF
 
 menu "Library routines"
 
+config RAID6_PQ
+	tristate
+
 config BITREVERSE
 	tristate
 
diff --git a/lib/Makefile b/lib/Makefile
index 33a40e4..7f394cd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 obj-$(CONFIG_LZO_COMPRESS) += lzo/
 obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
+obj-$(CONFIG_RAID6_PQ) += raid6/
 
 lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o
 lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
new file mode 100644
index 0000000..b2fe4ba
--- /dev/null
+++ b/lib/raid6/Makefile
@@ -0,0 +1,78 @@
+obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
+
+raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
+		   raid6int1.o raid6int2.o raid6int4.o \
+		   raid6int8.o raid6int16.o raid6int32.o \
+		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
+		   raid6altivec8.o \
+		   raid6mmx.o raid6sse1.o raid6sse2.o
+hostprogs-y	+= mktables
+
+quiet_cmd_unroll = UNROLL  $@
+      cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
+                   < $< > $@ || ( rm -f $@ && exit 1 )
+
+ifeq ($(CONFIG_ALTIVEC),y)
+altivec_flags := -maltivec -mabi=altivec
+endif
+
+targets += raid6int1.c
+$(obj)/raid6int1.c:   UNROLL := 1
+$(obj)/raid6int1.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int2.c
+$(obj)/raid6int2.c:   UNROLL := 2
+$(obj)/raid6int2.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int4.c
+$(obj)/raid6int4.c:   UNROLL := 4
+$(obj)/raid6int4.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int8.c
+$(obj)/raid6int8.c:   UNROLL := 8
+$(obj)/raid6int8.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int16.c
+$(obj)/raid6int16.c:  UNROLL := 16
+$(obj)/raid6int16.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+targets += raid6int32.c
+$(obj)/raid6int32.c:  UNROLL := 32
+$(obj)/raid6int32.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec1.o += $(altivec_flags)
+targets += raid6altivec1.c
+$(obj)/raid6altivec1.c:   UNROLL := 1
+$(obj)/raid6altivec1.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec2.o += $(altivec_flags)
+targets += raid6altivec2.c
+$(obj)/raid6altivec2.c:   UNROLL := 2
+$(obj)/raid6altivec2.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec4.o += $(altivec_flags)
+targets += raid6altivec4.c
+$(obj)/raid6altivec4.c:   UNROLL := 4
+$(obj)/raid6altivec4.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+CFLAGS_raid6altivec8.o += $(altivec_flags)
+targets += raid6altivec8.c
+$(obj)/raid6altivec8.c:   UNROLL := 8
+$(obj)/raid6altivec8.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+	$(call if_changed,unroll)
+
+quiet_cmd_mktable = TABLE   $@
+      cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
+
+targets += raid6tables.c
+$(obj)/raid6tables.c: $(obj)/mktables FORCE
+	$(call if_changed,mktable)
diff --git a/drivers/md/mktables.c b/lib/raid6/mktables.c
similarity index 100%
rename from drivers/md/mktables.c
rename to lib/raid6/mktables.c
diff --git a/drivers/md/raid6algos.c b/lib/raid6/raid6algos.c
similarity index 100%
rename from drivers/md/raid6algos.c
rename to lib/raid6/raid6algos.c
diff --git a/drivers/md/raid6altivec.uc b/lib/raid6/raid6altivec.uc
similarity index 100%
rename from drivers/md/raid6altivec.uc
rename to lib/raid6/raid6altivec.uc
diff --git a/drivers/md/raid6int.uc b/lib/raid6/raid6int.uc
similarity index 100%
rename from drivers/md/raid6int.uc
rename to lib/raid6/raid6int.uc
diff --git a/drivers/md/raid6mmx.c b/lib/raid6/raid6mmx.c
similarity index 100%
rename from drivers/md/raid6mmx.c
rename to lib/raid6/raid6mmx.c
diff --git a/drivers/md/raid6recov.c b/lib/raid6/raid6recov.c
similarity index 100%
rename from drivers/md/raid6recov.c
rename to lib/raid6/raid6recov.c
diff --git a/drivers/md/raid6sse1.c b/lib/raid6/raid6sse1.c
similarity index 100%
rename from drivers/md/raid6sse1.c
rename to lib/raid6/raid6sse1.c
diff --git a/drivers/md/raid6sse2.c b/lib/raid6/raid6sse2.c
similarity index 100%
rename from drivers/md/raid6sse2.c
rename to lib/raid6/raid6sse2.c
diff --git a/drivers/md/raid6test/Makefile b/lib/raid6/raid6test/Makefile
similarity index 100%
rename from drivers/md/raid6test/Makefile
rename to lib/raid6/raid6test/Makefile
diff --git a/drivers/md/raid6test/test.c b/lib/raid6/raid6test/test.c
similarity index 100%
rename from drivers/md/raid6test/test.c
rename to lib/raid6/raid6test/test.c
diff --git a/drivers/md/raid6x86.h b/lib/raid6/raid6x86.h
similarity index 100%
rename from drivers/md/raid6x86.h
rename to lib/raid6/raid6x86.h
diff --git a/drivers/md/unroll.pl b/lib/raid6/unroll.pl
similarity index 100%
rename from drivers/md/unroll.pl
rename to lib/raid6/unroll.pl
-- 
1.6.2.5

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-13 14:11 [PATCH 1/4] md: Factor out RAID6 algorithms into lib/ David Woodhouse
@ 2009-07-15 19:23 ` Dan Williams
  2009-07-15 20:16   ` Chris Mason
  2009-07-16 17:38   ` H. Peter Anvin
  0 siblings, 2 replies; 34+ messages in thread
From: Dan Williams @ 2009-07-15 19:23 UTC (permalink / raw)
  To: David Woodhouse; +Cc: chris.mason, linux-btrfs, neilb, linux-raid

On Mon, Jul 13, 2009 at 7:11 AM, David Woodhouse<dwmw2@infradead.org> wrote:
> We'll want to use these in btrfs too.
>
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>

Do you suspect that btrfs will also want to perform these operations
asynchronously?  I am preparing an updated release of the raid6
offload patch kit, but the previous WIP release can be browsed at:

http://git.kernel.org/?p=linux/kernel/git/djbw/async_tx.git;a=shortlog;h=raid6

The routines are housed in crypto/async_tx/async_pq.c and
crypto/async_tx/async_raid6_recov.c.

I also wonder if the raid6 algos are a better fit under crypto/ alongside xor?

--
Dan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-15 19:23 ` Dan Williams
@ 2009-07-15 20:16   ` Chris Mason
  2009-07-15 22:11       ` Dan Williams
  2009-07-16 17:38   ` H. Peter Anvin
  1 sibling, 1 reply; 34+ messages in thread
From: Chris Mason @ 2009-07-15 20:16 UTC (permalink / raw)
  To: Dan Williams; +Cc: David Woodhouse, linux-btrfs, neilb, linux-raid

On Wed, Jul 15, 2009 at 12:23:47PM -0700, Dan Williams wrote:
> On Mon, Jul 13, 2009 at 7:11 AM, David Woodhouse<dwmw2@infradead.org> wrote:
> > We'll want to use these in btrfs too.
> >
> > Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> 
> Do you suspect that btrfs will also want to perform these operations
> asynchronously?  I am preparing an updated release of the raid6
> offload patch kit, but the previous WIP release can be browsed at:

The short answer is that we'll definitely want to use the async code
where it is available.  Btrfs is already wired up to hand off CPU
intensive parts of IO submission (compression, checksumming) to helper
threads.  There's some extra ordering so that we can have N threads
checksumming but still send down the actual bios in the proper order.

Once the btrfs raid5/6 is working well, I'll wire it into the helper
threads as well.  Are the raid offload engines fast enough that we won't
want the helper threads when they are available?

-chris

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-15 20:16   ` Chris Mason
@ 2009-07-15 22:11       ` Dan Williams
  0 siblings, 0 replies; 34+ messages in thread
From: Dan Williams @ 2009-07-15 22:11 UTC (permalink / raw)
  To: Chris Mason, Dan Williams, David Woodhouse, linux-btrfs, neilb,
	linux-raid

On Wed, Jul 15, 2009 at 1:16 PM, Chris Mason<chris.mason@oracle.com> wrote:
> On Wed, Jul 15, 2009 at 12:23:47PM -0700, Dan Williams wrote:
>> On Mon, Jul 13, 2009 at 7:11 AM, David Woodhouse<dwmw2@infradead.org> wrote:
>> > We'll want to use these in btrfs too.
>> >
>> > Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
>>
>> Do you suspect that btrfs will also want to perform these operations
>> asynchronously?  I am preparing an updated release of the raid6
>> offload patch kit, but the previous WIP release can be browsed at:
>
> The short answer is that we'll definitely want to use the async code
> where it is available.  Btrfs is already wired up to hand off CPU
> intensive parts of IO submission (compression, checksumming) to helper
> threads.  There's some extra ordering so that we can have N threads
> checksumming but still send down the actual bios in the proper order.
>
> Once the btrfs raid5/6 is working well, I'll wire it into the helper
> threads as well.  Are the raid offload engines fast enough that we won't
> want the helper threads when they are available?

The api is such that you will not be able to tell ahead of time if the
operation is to be offloaded or carried out synchronously (unless of
course you disable offload by CONFIG_ASYNC_TX_DMA=n).  The current
channel allocator hands out channels on a percpu basis so the
scheduling of the helper threads can be used as a proxy for scheduling
the offload engines.  I have considered allowing overloaded channels
to spill work back on to the cpu[1], but I need more data on whether
this is worthwhile.

--
Dan

[1]: http://www.intel.com/design/iio/iop341_42.htm
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
@ 2009-07-15 22:11       ` Dan Williams
  0 siblings, 0 replies; 34+ messages in thread
From: Dan Williams @ 2009-07-15 22:11 UTC (permalink / raw)
  To: Chris Mason, Dan Williams, David Woodhouse, linux-btrfs, neilb,
	linux-raid

On Wed, Jul 15, 2009 at 1:16 PM, Chris Mason<chris.mason@oracle.com> wrote:
> On Wed, Jul 15, 2009 at 12:23:47PM -0700, Dan Williams wrote:
>> On Mon, Jul 13, 2009 at 7:11 AM, David Woodhouse<dwmw2@infradead.org> wrote:
>> > We'll want to use these in btrfs too.
>> >
>> > Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
>>
>> Do you suspect that btrfs will also want to perform these operations
>> asynchronously?  I am preparing an updated release of the raid6
>> offload patch kit, but the previous WIP release can be browsed at:
>
> The short answer is that we'll definitely want to use the async code
> where it is available.  Btrfs is already wired up to hand off CPU
> intensive parts of IO submission (compression, checksumming) to helper
> threads.  There's some extra ordering so that we can have N threads
> checksumming but still send down the actual bios in the proper order.
>
> Once the btrfs raid5/6 is working well, I'll wire it into the helper
> threads as well.  Are the raid offload engines fast enough that we won't
> want the helper threads when they are available?

The api is such that you will not be able to tell ahead of time if the
operation is to be offloaded or carried out synchronously (unless of
course you disable offload by CONFIG_ASYNC_TX_DMA=n).  The current
channel allocator hands out channels on a percpu basis so the
scheduling of the helper threads can be used as a proxy for scheduling
the offload engines.  I have considered allowing overloaded channels
to spill work back on to the cpu[1], but I need more data on whether
this is worthwhile.

--
Dan

[1]: http://www.intel.com/design/iio/iop341_42.htm
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-15 19:23 ` Dan Williams
  2009-07-15 20:16   ` Chris Mason
@ 2009-07-16 17:38   ` H. Peter Anvin
  2009-07-17 14:22     ` Ric Wheeler
  1 sibling, 1 reply; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-16 17:38 UTC (permalink / raw)
  To: Dan Williams; +Cc: David Woodhouse, chris.mason, linux-btrfs, neilb, linux-raid

Dan Williams wrote:
> On Mon, Jul 13, 2009 at 7:11 AM, David Woodhouse<dwmw2@infradead.org> wrote:
>> We'll want to use these in btrfs too.
>>
>> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> 
> Do you suspect that btrfs will also want to perform these operations
> asynchronously?  I am preparing an updated release of the raid6
> offload patch kit, but the previous WIP release can be browsed at:
> 
> http://git.kernel.org/?p=linux/kernel/git/djbw/async_tx.git;a=shortlog;h=raid6
> 
> The routines are housed in crypto/async_tx/async_pq.c and
> crypto/async_tx/async_raid6_recov.c.
> 
> I also wonder if the raid6 algos are a better fit under crypto/ alongside xor?
> 

I am also sitting on a set of synchronous (CPU) acceleration patches for 
RAID-6 recovery, just waiting for the APIs to stabilize.

	-hpa


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-16 17:38   ` H. Peter Anvin
@ 2009-07-17 14:22     ` Ric Wheeler
  2009-07-17 15:20       ` H. Peter Anvin
  0 siblings, 1 reply; 34+ messages in thread
From: Ric Wheeler @ 2009-07-17 14:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

On 07/16/2009 01:38 PM, H. Peter Anvin wrote:
> Dan Williams wrote:
>> On Mon, Jul 13, 2009 at 7:11 AM, David Woodhouse<dwmw2@infradead.org> 
>> wrote:
>>> We'll want to use these in btrfs too.
>>>
>>> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
>>
>> Do you suspect that btrfs will also want to perform these operations
>> asynchronously?  I am preparing an updated release of the raid6
>> offload patch kit, but the previous WIP release can be browsed at:
>>
>> http://git.kernel.org/?p=linux/kernel/git/djbw/async_tx.git;a=shortlog;h=raid6 
>>
>>
>> The routines are housed in crypto/async_tx/async_pq.c and
>> crypto/async_tx/async_raid6_recov.c.
>>
>> I also wonder if the raid6 algos are a better fit under crypto/ 
>> alongside xor?
>>
>
> I am also sitting on a set of synchronous (CPU) acceleration patches 
> for RAID-6 recovery, just waiting for the APIs to stabilize.
>
>     -hpa
>

Worth sharing a pointer to a really neat set of papers that describe 
open source friendly RAID6 and erasure encoding algorithms that were 
presented last year and this at FAST:

http://www.cs.utk.edu/~plank/plank/papers/papers.html

If I remember correctly, James Plank's papers also have implemented and 
benchmarked the various encodings,

Ric



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 14:22     ` Ric Wheeler
@ 2009-07-17 15:20       ` H. Peter Anvin
  2009-07-17 15:35         ` Ric Wheeler
  0 siblings, 1 reply; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-17 15:20 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

Ric Wheeler wrote:
> 
> Worth sharing a pointer to a really neat set of papers that describe 
> open source friendly RAID6 and erasure encoding algorithms that were 
> presented last year and this at FAST:
> 
> http://www.cs.utk.edu/~plank/plank/papers/papers.html
> 
> If I remember correctly, James Plank's papers also have implemented and 
> benchmarked the various encodings,
> 

I have seen the papers; I'm not sure it really makes that much 
difference.  One of the things that bugs me about these papers is that 
he compares to *his* implementation of my optimizations, but not to my 
code.  In real life implementations, on commodity hardware, we're 
limited by memory and disk performance, not by CPU utilization.

	-hpa


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:20       ` H. Peter Anvin
@ 2009-07-17 15:35         ` Ric Wheeler
  2009-07-17 15:40           ` H. Peter Anvin
  0 siblings, 1 reply; 34+ messages in thread
From: Ric Wheeler @ 2009-07-17 15:35 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

On 07/17/2009 11:20 AM, H. Peter Anvin wrote:
> Ric Wheeler wrote:
>>
>> Worth sharing a pointer to a really neat set of papers that describe
>> open source friendly RAID6 and erasure encoding algorithms that were
>> presented last year and this at FAST:
>>
>> http://www.cs.utk.edu/~plank/plank/papers/papers.html
>>
>> If I remember correctly, James Plank's papers also have implemented
>> and benchmarked the various encodings,
>>
>
> I have seen the papers; I'm not sure it really makes that much
> difference. One of the things that bugs me about these papers is that he
> compares to *his* implementation of my optimizations, but not to my
> code. In real life implementations, on commodity hardware, we're limited
> by memory and disk performance, not by CPU utilization.
>
> -hpa
>

Fair enough - I thought that his coverage of the other open source friendly 
encodings beyond RAID6 was actually quite interesting.

If you have specifics that you found unconvincing in his work, I am pretty sure 
that he would be delighted to hear from you first hand. James seemed to me to be 
very reasonable and very much a pro-Linux academic, so I would love to be able 
to get him and his grad students aligned in a useful way for us :-)

ric


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:35         ` Ric Wheeler
@ 2009-07-17 15:40           ` H. Peter Anvin
  2009-07-17 15:47             ` Ric Wheeler
  0 siblings, 1 reply; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-17 15:40 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

Ric Wheeler wrote:
>>
>> I have seen the papers; I'm not sure it really makes that much
>> difference. One of the things that bugs me about these papers is that he
>> compares to *his* implementation of my optimizations, but not to my
>> code. In real life implementations, on commodity hardware, we're limited
>> by memory and disk performance, not by CPU utilization.
>>
> 
> Fair enough - I thought that his coverage of the other open source 
> friendly encodings beyond RAID6 was actually quite interesting.
> 
> If you have specifics that you found unconvincing in his work, I am 
> pretty sure that he would be delighted to hear from you first hand. 
> James seemed to me to be very reasonable and very much a pro-Linux 
> academic, so I would love to be able to get him and his grad students 
> aligned in a useful way for us :-)
> 

The main flaw, as I said, is in the phrase "as implemented by the 
Jerasure library".  He's comparing his own implementations of various 
algorithms, not optimized implementations.

The bottom line is pretty much this: the cost of changing the encoding 
would appear to outweigh the benefit.  I'm not trying to claim the Linux 
RAID-6 implementation is optimal, but it is simple and appears to be 
fast enough that the math isn't the bottleneck.

	-hpa

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:40           ` H. Peter Anvin
@ 2009-07-17 15:47             ` Ric Wheeler
  2009-07-17 15:49               ` H. Peter Anvin
  2009-07-17 15:51               ` H. Peter Anvin
  0 siblings, 2 replies; 34+ messages in thread
From: Ric Wheeler @ 2009-07-17 15:47 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

On 07/17/2009 11:40 AM, H. Peter Anvin wrote:
> Ric Wheeler wrote:
>>>
>>> I have seen the papers; I'm not sure it really makes that much
>>> difference. One of the things that bugs me about these papers is that he
>>> compares to *his* implementation of my optimizations, but not to my
>>> code. In real life implementations, on commodity hardware, we're limited
>>> by memory and disk performance, not by CPU utilization.
>>>
>>
>> Fair enough - I thought that his coverage of the other open source
>> friendly encodings beyond RAID6 was actually quite interesting.
>>
>> If you have specifics that you found unconvincing in his work, I am
>> pretty sure that he would be delighted to hear from you first hand.
>> James seemed to me to be very reasonable and very much a pro-Linux
>> academic, so I would love to be able to get him and his grad students
>> aligned in a useful way for us :-)
>>
>
> The main flaw, as I said, is in the phrase "as implemented by the
> Jerasure library". He's comparing his own implementations of various
> algorithms, not optimized implementations.
>
> The bottom line is pretty much this: the cost of changing the encoding
> would appear to outweigh the benefit. I'm not trying to claim the Linux
> RAID-6 implementation is optimal, but it is simple and appears to be
> fast enough that the math isn't the bottleneck.
>
> -hpa

Cost? Think about how to get free grad student hours testing out things that you
might or might not want to leverage on down the road :-)

ric


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:47             ` Ric Wheeler
@ 2009-07-17 15:49               ` H. Peter Anvin
  2009-07-17 15:58                 ` Ric Wheeler
                                   ` (2 more replies)
  2009-07-17 15:51               ` H. Peter Anvin
  1 sibling, 3 replies; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-17 15:49 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

Ric Wheeler wrote:
>>
>> The bottom line is pretty much this: the cost of changing the encoding
>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>> RAID-6 implementation is optimal, but it is simple and appears to be
>> fast enough that the math isn't the bottleneck.
> 
> Cost? Think about how to get free grad student hours testing out things
> that you might or might not want to leverage on down the road :-)
> 

Cost, yes, of changing an on-disk format.

	-hpa


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:47             ` Ric Wheeler
  2009-07-17 15:49               ` H. Peter Anvin
@ 2009-07-17 15:51               ` H. Peter Anvin
  1 sibling, 0 replies; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-17 15:51 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

Ric Wheeler wrote:
>>
>> The main flaw, as I said, is in the phrase "as implemented by the
>> Jerasure library". He's comparing his own implementations of various
>> algorithms, not optimized implementations.
>>
>> The bottom line is pretty much this: the cost of changing the encoding
>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>> RAID-6 implementation is optimal, but it is simple and appears to be
>> fast enough that the math isn't the bottleneck.
> 
> Cost? Think about how to get free grad student hours testing out things
> that you might or might not want to leverage on down the road :-)
> 

Anyway... I don't really care too much.  If someone wants to redesign 
the Linux RAID-6 and Neil decides to take it I'm not going to object. 
I'm also not very likely to do any work on it.

	-hpa


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:49               ` H. Peter Anvin
@ 2009-07-17 15:58                 ` Ric Wheeler
  2009-07-17 18:59                   ` Alex Elsayed
  2009-07-17 19:12                   ` Gregory Maxwell
  2009-07-18 11:53                   ` David Woodhouse
  2 siblings, 1 reply; 34+ messages in thread
From: Ric Wheeler @ 2009-07-17 15:58 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dan Williams, David Woodhouse, chris.mason, linux-btrfs, neilb,
	linux-raid

On 07/17/2009 11:49 AM, H. Peter Anvin wrote:
> Ric Wheeler wrote:
>>>
>>> The bottom line is pretty much this: the cost of changing the encoding
>>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>>> RAID-6 implementation is optimal, but it is simple and appears to be
>>> fast enough that the math isn't the bottleneck.
>>
>> Cost? Think about how to get free grad student hours testing out
>> things that you might or might not want to leverage on down the road :-)
>>
>
> Cost, yes, of changing an on-disk format.
>
> -hpa
>

Putting RAID6 behind us, we still might be interested in the other encodings 
that are in:

"A Performance Evaluation and Examination of Open-Source Erasure Coding 
Libraries For Storage"

http://www.cs.utk.edu/~plank/plank/papers/FAST-2009.html

since they give us even more flexibility....

ric

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:58                 ` Ric Wheeler
@ 2009-07-17 18:59                   ` Alex Elsayed
  2009-07-17 19:02                     ` Alex Elsayed
  0 siblings, 1 reply; 34+ messages in thread
From: Alex Elsayed @ 2009-07-17 18:59 UTC (permalink / raw)
  To: linux-btrfs; +Cc: linux-raid

Ric Wheeler wrote:

> On 07/17/2009 11:49 AM, H. Peter Anvin wrote:
>> Ric Wheeler wrote:
>>>>
>>>> The bottom line is pretty much this: the cost of changing the encoding
>>>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>>>> RAID-6 implementation is optimal, but it is simple and appears to be
>>>> fast enough that the math isn't the bottleneck.
>>>
>>> Cost? Thank about how to get free grad student hours testing out
>>> things that you might or might not want to leverage on down the road :-)
>>>
>>
>> Cost, yes, of changing an on-disk format.
>>
>> -hpa
>>
> 
> Putting RAID6 behind us, we still might be interested in the other 
> encodings 
> that are in:
> 
> "A Performance Evaluation and Examination of Open-Source Erasure Coding 
> Libraries For Storage"
> 
> http://www.cs.utk.edu/~plank/plank/papers/FAST-2009.html
> 
> since they give us even more flexibility....

Of course, there's also the fact that, using (essentially unchanged) the 
current code for Reed-Solomon coding, it's completely doable to have 
arbitrary NxM redundancy up to (N + M) < 256 disks (this limit is due to the 
current maximum of 8 for symsize [referred to as 'w' in the below paper] in 
rs_init. If increased to 16, the maximum number of disks would be 65535). 
It's also space-optimal for all combinations of N (checksum) and M (data).

http://www.cs.utk.edu/~plank/plank/papers/CS-96-332.html even describes an 
implementation _very_ similar to the current code, right down to using a 
table for the logarithm and inverse logarithm calculations.

Also, (referencing the earlier-posted paper comparing open-source coding 
techniques), Cauchy Reed-Solomon codes seem to maintain most of the benefits 
of the current system (including the ability to provide NxM redundancy, 
while still retaining the property of being space-optimal), with significant 
performance gains. It also provides an optimization for the RAID6 case, so 
once again the common case would get a benefit over less common cases (as 
with Mr. Anvin's RAID6 optimization in the current system)

However, I will have to dispute that the other methods provide more 
flexibility - Cauchy Reed-Solomon codes are at best a horizontal move there, 
and the other systems are restricted to (or at very least, far more 
effective in) RAID6 systems.



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 18:59                   ` Alex Elsayed
@ 2009-07-17 19:02                     ` Alex Elsayed
  2009-07-29 18:16                       ` H. Peter Anvin
  0 siblings, 1 reply; 34+ messages in thread
From: Alex Elsayed @ 2009-07-17 19:02 UTC (permalink / raw)
  To: linux-btrfs; +Cc: linux-raid

Alex Elsayed wrote:

> Ric Wheeler wrote:
> 
>> On 07/17/2009 11:49 AM, H. Peter Anvin wrote:
>>> Ric Wheeler wrote:
>>>>>
>>>>> The bottom line is pretty much this: the cost of changing the encoding
>>>>> would appear to outweigh the benefit. I'm not trying to claim the 
>>>>> Linux
>>>>> RAID-6 implementation is optimal, but it is simple and appears to be
>>>>> fast enough that the math isn't the bottleneck.
>>>>
>>>> Cost? Thank about how to get free grad student hours testing out
>>>> things that you might or might not want to leverage on down the road 
>>>> :-)
>>>>
>>>
>>> Cost, yes, of changing an on-disk format.
>>>
>>> -hpa
>>>
>> 
>> Putting RAID6 behind us, we still might be interested in the other 
> encodings 
>> that are in:
>> 
>> "A Performance Evaluation and Examination of Open-Source Erasure Coding 
>> Libraries For Storage"
>> 
>> http://www.cs.utk.edu/~plank/plank/papers/FAST-2009.html
>> 
>> since they give us even more flexibility....
> 
> Of course, there's also the fact that, using (essentially unchanged) the 
> current code for Reed-Solomon coding, it's completely doable to have 
> arbitrary NxM redundancy up to (N + M) < 256 disks (this limit is due to 
> the 
> current maximum of 8 for symsize [referred to as 'w' in the below paper] 
> in 
> rs_init. If increased to 16, the maximum number of disks would be 65535). 
> It's also space-optimal for all combinations of N (checksum) and M (data).
> 
> http://www.cs.utk.edu/~plank/plank/papers/CS-96-332.html even describes an 
> implementation _very_ similar to the current code, right down to using a 
> table for the logarithm and inverse logarithm calculations.
> 
> Also, (referencing the earlier-posted paper comparing open-source coding 
> techniques), Cauchy Reed-Solomon codes seem to maintain most of the 
> benefits 
> of the current system (including the ability to provide NxM redundancy, 
> while still retaining the property of being space-optimal), with 
> significant 
> performance gains. It also provides an optimization for the RAID6 case, so 
> once again the common case would get a benefit over less common cases (as 
> with Mr. Anvin's RAID6 optimization in the current system)
> 
> However, I will have to dispute that the other methods provide more 
> flexibility - Cauchy Reed-Solomon codes are at best a horizontal move 
> there, 
> and the other systems are restricted to (or at very least, far more 
> effective in) RAID6 systems.

Whoops, got N (data) and M (checksum) backwards.




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:49               ` H. Peter Anvin
@ 2009-07-17 19:12                   ` Gregory Maxwell
  2009-07-17 19:12                   ` Gregory Maxwell
  2009-07-18 11:53                   ` David Woodhouse
  2 siblings, 0 replies; 34+ messages in thread
From: Gregory Maxwell @ 2009-07-17 19:12 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ric Wheeler, Dan Williams, David Woodhouse, chris.mason,
	linux-btrfs, neilb, linux-raid

On Fri, Jul 17, 2009 at 11:49 AM, H. Peter Anvin<hpa@zytor.com> wrote:
> Ric Wheeler wrote:
>>>
>>> The bottom line is pretty much this: the cost of changing the encoding
>>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>>> RAID-6 implementation is optimal, but it is simple and appears to be
>>> fast enough that the math isn't the bottleneck.
>>
>> Cost? Thank about how to get free grad student hours testing out things
>> that you might or might not want to leverage on down the road :-)
>>
>
> Cost, yes, of changing an on-disk format.

In the context of BTRFS the only costs are development of the new
erasure code and integration as well as the possible cost of carrying
around two erasure codes (one for MD backwards compatibility, one for
BTRFS).

The availability of arbitrary M of N erasure codes is more attractive
for BTRFS than MD because of the increased granularity. I.e. with
BTRFS you might want a smaller stripe width because some sub-volume
spans fewer spindles, or increased parity blocks if you need increased
redundancy for some small set of files.

On the other hand— for performance reasons if you want additional
redundancy beyond 'RAID6' you might just be better off writing more
duplicates of the data blocks instead of additional RS syndromes since
while the additional data blocks are not optimal from an error
recovery perspective they will provide a performance gain of being
able to spread reads out across additional disks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
@ 2009-07-17 19:12                   ` Gregory Maxwell
  0 siblings, 0 replies; 34+ messages in thread
From: Gregory Maxwell @ 2009-07-17 19:12 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ric Wheeler, Dan Williams, David Woodhouse, chris.mason,
	linux-btrfs, neilb, linux-raid

On Fri, Jul 17, 2009 at 11:49 AM, H. Peter Anvin<hpa@zytor.com> wrote:
> Ric Wheeler wrote:
>>>
>>> The bottom line is pretty much this: the cost of changing the encoding
>>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>>> RAID-6 implementation is optimal, but it is simple and appears to be
>>> fast enough that the math isn't the bottleneck.
>>
>> Cost? Thank about how to get free grad student hours testing out things
>> that you might or might not want to leverage on down the road :-)
>>
>
> Cost, yes, of changing an on-disk format.

In the context of BTRFS the only costs are development of the new
erasure code and integration as well as the possible cost of carrying
around two erasure codes (one for MD backwards compatibility, one for
BTRFS).

The availability of arbitrary M of N erasure codes is more attractive
for BTRFS than MD because of the increased granularity. I.e. with
BTRFS you might want a smaller stripe width because some sub-volume
spans fewer spindles, or increased parity blocks if you need increased
redundancy for some small set of files.

On the other hand— for performance reasons if you want additional
redundancy beyond 'RAID6' you might just be better off writing more
duplicates of the data blocks instead of additional RS syndromes since
while the additional data blocks are not optimal from an error
recovery perspective they will provide a performance gain of being
able to spread reads out across additional disks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 15:49               ` H. Peter Anvin
@ 2009-07-18 11:53                   ` David Woodhouse
  2009-07-17 19:12                   ` Gregory Maxwell
  2009-07-18 11:53                   ` David Woodhouse
  2 siblings, 0 replies; 34+ messages in thread
From: David Woodhouse @ 2009-07-18 11:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ric Wheeler, Dan Williams, chris.mason, linux-btrfs, neilb, linux-raid

On Fri, 2009-07-17 at 11:49 -0400, H. Peter Anvin wrote:
> Ric Wheeler wrote:
> >>
> >> The bottom line is pretty much this: the cost of changing the encoding
> >> would appear to outweigh the benefit. I'm not trying to claim the Linux
> >> RAID-6 implementation is optimal, but it is simple and appears to be
> >> fast enough that the math isn't the bottleneck.
> > 
> > Cost? Thank about how to get free grad student hours testing out things 
> > that you might or might not want to leverage on down the road :-)
> > 
> 
> Cost, yes, of changing an on-disk format.

Personally, I don't care about that -- I'm utterly uninterested in the
legacy RAID-6 setup where it pretends to be a normal disk. I think that
model is as fundamentally wrong as flash devices making the similar
pretence.

I'm only interested in what we can use directly within btrfs -- and
ideally I do want something which gives me an _arbitrary_ number of
redundant blocks, rather than limiting me to 2. But the legacy code is
good enough for now¹.

When I get round to wanting more, I was thinking of lifting something
like http://git.infradead.org/mtd-utils.git?a=blob;f=fec.c to start
with, and maybe hoping that someone cleverer will come up with something
better.

The less I have to deal with Galois Fields, the happier I'll be.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation

¹ Well, kind of. The xor_blocks() function will silently screw you over
  if you ask it to handle more than 5 blocks at a time.

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
@ 2009-07-18 11:53                   ` David Woodhouse
  0 siblings, 0 replies; 34+ messages in thread
From: David Woodhouse @ 2009-07-18 11:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ric Wheeler, Dan Williams, chris.mason, linux-btrfs, neilb, linux-raid

On Fri, 2009-07-17 at 11:49 -0400, H. Peter Anvin wrote:
> Ric Wheeler wrote:
> >>
> >> The bottom line is pretty much this: the cost of changing the encoding
> >> would appear to outweigh the benefit. I'm not trying to claim the Linux
> >> RAID-6 implementation is optimal, but it is simple and appears to be
> >> fast enough that the math isn't the bottleneck.
> > 
> > Cost? Thank about how to get free grad student hours testing out things 
> > that you might or might not want to leverage on down the road :-)
> > 
> 
> Cost, yes, of changing an on-disk format.

Personally, I don't care about that -- I'm utterly uninterested in the
legacy RAID-6 setup where it pretends to be a normal disk. I think that
model is as fundamentally wrong as flash devices making the similar
pretence.

I'm only interested in what we can use directly within btrfs -- and
ideally I do want something which gives me an _arbitrary_ number of
redundant blocks, rather than limiting me to 2. But the legacy code is
good enough for now¹.

When I get round to wanting more, I was thinking of lifting something
like http://git.infradead.org/mtd-utils.git?a=blob;f=fec.c to start
with, and maybe hoping that someone cleverer will come up with something
better.

The less I have to deal with Galois Fields, the happier I'll be.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation

¹ Well, kind of. The xor_blocks() function will silently screw you over
  if you ask it to handle more than 5 blocks at a time.

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 11:53                   ` David Woodhouse
@ 2009-07-18 12:45                     ` H. Peter Anvin
  -1 siblings, 0 replies; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-18 12:45 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Ric Wheeler, Dan Williams, chris.mason, linux-btrfs, neilb, linux-raid

David Woodhouse wrote:
> 
> I'm only interested in what we can use directly within btrfs -- and
> ideally I do want something which gives me an _arbitrary_ number of
> redundant blocks, rather than limiting me to 2. But the legacy code is
> good enough for now¹.
> 
> When I get round to wanting more, I was thinking of lifting something
> like http://git.infradead.org/mtd-utils.git?a=blob;f=fec.c to start
> with, and maybe hoping that someone cleverer will come up with something
> better.
> 
> The less I have to deal with Galois Fields, the happier I'll be.
> 

Well, if you want something with more than 2-block redundancy you need 
something other than the existing RAID-6 code which, as you know, is a 
special case of general Reed-Solomon coding that I happen to have spent 
a lot of time optimizing.  The FEC code is not optimized at all if I can 
tell, and certainly doesn't use SSE in any way -- never mind the GF 
accelerators that are starting to appear.  That doesn't mean it 
*couldn't*, just that no one has done the work to either implement it or 
prove it can't be done.

Either way, perhaps the Plank paper that Ric pointed to could be useful 
as a starting point; it's probably worth taking their performance 
numbers with a *major* grain of salt: their implementation of RAID-6 
"RS-Opt" which is supposed to be equivalent to my code performs at
400 MB/s, which is less than Pentium III-era performance of the real 
world code (they compare not to real code but to their own 
implementation in Java, called "Jerasure".)  Implementability using real 
array instruction sets is key to decent performance.

	-hpa
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
@ 2009-07-18 12:45                     ` H. Peter Anvin
  0 siblings, 0 replies; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-18 12:45 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Ric Wheeler, Dan Williams, chris.mason, linux-btrfs, neilb, linux-raid

David Woodhouse wrote:
> 
> I'm only interested in what we can use directly within btrfs -- and
> ideally I do want something which gives me an _arbitrary_ number of
> redundant blocks, rather than limiting me to 2. But the legacy code is
> good enough for now¹.
> 
> When I get round to wanting more, I was thinking of lifting something
> like http://git.infradead.org/mtd-utils.git?a=blob;f=fec.c to start
> with, and maybe hoping that someone cleverer will come up with something
> better.
> 
> The less I have to deal with Galois Fields, the happier I'll be.
> 

Well, if you want something with more than 2-block redundancy you need 
something other than the existing RAID-6 code which, as you know, is a 
special case of general Reed-Solomon coding that I happen to have spent 
a lot of time optimizing.  The FEC code is not optimized at all if I can 
tell, and certainly doesn't use SSE in any way -- never mind the GF 
accelerators that are starting to appear.  That doesn't mean it 
*couldn't*, just that no one has done the work to either implement it or 
prove it can't be done.

Either way, perhaps the Plank paper that Ric pointed to could be useful 
as a starting point; it's probably worth taking their performance 
numbers with a *major* grain of salt: their implementation of RAID-6 
"RS-Opt" which is supposed to be equivalent to my code performs at
400 MB/s, which is less than Pentium III-era performance of the real 
world code (they compare not to real code but to their own 
implementation in Java, called "Jerasure".)  Implementability using real 
array instruction sets is key to decent performance.

	-hpa
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 11:53                   ` David Woodhouse
@ 2009-07-18 12:49                     ` Ric Wheeler
  -1 siblings, 0 replies; 34+ messages in thread
From: Ric Wheeler @ 2009-07-18 12:49 UTC (permalink / raw)
  To: David Woodhouse
  Cc: H. Peter Anvin, Ric Wheeler, Dan Williams, chris.mason,
	linux-btrfs, neilb, linux-raid, plank

On 07/18/2009 07:53 AM, David Woodhouse wrote:
> On Fri, 2009-07-17 at 11:49 -0400, H. Peter Anvin wrote:
>    
>> Ric Wheeler wrote:
>>      
>>>> The bottom line is pretty much this: the cost of changing the encoding
>>>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>>>> RAID-6 implementation is optimal, but it is simple and appears to be
>>>> fast enough that the math isn't the bottleneck.
>>>>          
>>> Cost? Thank about how to get free grad student hours testing out things
>>> that you might or might not want to leverage on down the road :-)
>>>
>>>        
>> Cost, yes, of changing an on-disk format.
>>      
>
> Personally, I don't care about that -- I'm utterly uninterested in the
> legacy RAID-6 setup where it pretends to be a normal disk. I think that
> model is as fundamentally wrong as flash devices making the similar
> pretence.
>
> I'm only interested in what we can use directly within btrfs -- and
> ideally I do want something which gives me an _arbitrary_ number of
> redundant blocks, rather than limiting me to 2. But the legacy code is
> good enough for now¹.
>
> When I get round to wanting more, I was thinking of lifting something
> like http://git.infradead.org/mtd-utils.git?a=blob;f=fec.c to start
> with, and maybe hoping that someone cleverer will come up with something
> better.
>
> The less I have to deal with Galois Fields, the happier I'll be.
>
>    

I think that we are generally fine with the RAID5/6 support given a 
small number of drives. The fancier erasure encodings are much more 
interesting when you have a large number of drives - for example, we 
just ordered 4 shelves of SATA drives (15/shelf) that will be driven by 
a single server. You can certainly imagine profiling a lot of 
interesting variations with that many things to play with.

Ric


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
@ 2009-07-18 12:49                     ` Ric Wheeler
  0 siblings, 0 replies; 34+ messages in thread
From: Ric Wheeler @ 2009-07-18 12:49 UTC (permalink / raw)
  To: David Woodhouse
  Cc: H. Peter Anvin, Ric Wheeler, Dan Williams, chris.mason,
	linux-btrfs, neilb, linux-raid, plank

On 07/18/2009 07:53 AM, David Woodhouse wrote:
> On Fri, 2009-07-17 at 11:49 -0400, H. Peter Anvin wrote:
>    
>> Ric Wheeler wrote:
>>      
>>>> The bottom line is pretty much this: the cost of changing the encoding
>>>> would appear to outweigh the benefit. I'm not trying to claim the Linux
>>>> RAID-6 implementation is optimal, but it is simple and appears to be
>>>> fast enough that the math isn't the bottleneck.
>>>>          
>>> Cost? Thank about how to get free grad student hours testing out things
>>> that you might or might not want to leverage on down the road :-)
>>>
>>>        
>> Cost, yes, of changing an on-disk format.
>>      
>
> Personally, I don't care about that -- I'm utterly uninterested in the
> legacy RAID-6 setup where it pretends to be a normal disk. I think that
> model is as fundamentally wrong as flash devices making the similar
> pretence.
>
> I'm only interested in what we can use directly within btrfs -- and
> ideally I do want something which gives me an _arbitrary_ number of
> redundant blocks, rather than limiting me to 2. But the legacy code is
> good enough for now¹.
>
> When I get round to wanting more, I was thinking of lifting something
> like http://git.infradead.org/mtd-utils.git?a=blob;f=fec.c to start
> with, and maybe hoping that someone cleverer will come up with something
> better.
>
> The less I have to deal with Galois Fields, the happier I'll be.
>
>    

I think that we are generally fine with the RAID5/6 support given a 
small number of drives. The fancier erasure encodings are much more 
interesting when you have a large number of drives - for example, we 
just ordered 4 shelves of SATA drives (15/shelf) that will be driven by 
a single server. You can certainly imagine profiling a lot of 
interesting variations with that many things to play with.

Ric


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 11:53                   ` David Woodhouse
@ 2009-07-18 16:26                     ` Dan Williams
  -1 siblings, 0 replies; 34+ messages in thread
From: Dan Williams @ 2009-07-18 16:26 UTC (permalink / raw)
  To: David Woodhouse
  Cc: H. Peter Anvin, Ric Wheeler, chris.mason, linux-btrfs, neilb, linux-raid

On Sat, Jul 18, 2009 at 4:53 AM, David Woodhouse<dwmw2@infradead.org> wrote:
> On Fri, 2009-07-17 at 11:49 -0400, H. Peter Anvin wrote:
>> Cost, yes, of changing an on-disk format.
>
> Personally, I don't care about that -- I'm utterly uninterested in the
> legacy RAID-6 setup where it pretends to be a normal disk. I think that
> model is as fundamentally wrong as flash devices making the similar
> pretence.

I can understand the frustration of these details being irretrievably
hidden behind a proprietary interface out of the filesystem's control.
 However, this is not the case with Linux software RAID.  I suspect
that there is room for more interaction with even "legacy" filesystems
to communicate things like: "don't worry about initializing that
region of the disk it's all free space", "don't bother resyncing on
dirty shutdown, if power-loss interrupts a write I guarantee I will
replay the entire stripe to you at a later date", or "hey, that last
block I read doesn't checksum, can you come up with a different
version?"

I was under the impression that btrfs wanted to leverage md's stripe
handling logic as well, seems that is not the case?

--
Dan

> ¹ Well, kind of. The xor_blocks() function will silently screw you over
>  if you ask it to handle more than 5 blocks at a time.

async_xor() handles arbitrary block counts.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
@ 2009-07-18 16:26                     ` Dan Williams
  0 siblings, 0 replies; 34+ messages in thread
From: Dan Williams @ 2009-07-18 16:26 UTC (permalink / raw)
  To: David Woodhouse
  Cc: H. Peter Anvin, Ric Wheeler, chris.mason, linux-btrfs, neilb, linux-raid

On Sat, Jul 18, 2009 at 4:53 AM, David Woodhouse<dwmw2@infradead.org> wrote:
> On Fri, 2009-07-17 at 11:49 -0400, H. Peter Anvin wrote:
>> Cost, yes, of changing an on-disk format.
>
> Personally, I don't care about that -- I'm utterly uninterested in the
> legacy RAID-6 setup where it pretends to be a normal disk. I think that
> model is as fundamentally wrong as flash devices making the similar
> pretence.

I can understand the frustration of these details being irretrievably
hidden behind a proprietary interface out of the filesystem's control.
 However, this is not the case with Linux software RAID.  I suspect
that there is room for more interaction with even "legacy" filesystems
to communicate things like: "don't worry about initializing that
region of the disk it's all free space", "don't bother resyncing on
dirty shutdown, if power-loss interrupts a write I guarantee I will
replay the entire stripe to you at a later date", or "hey, that last
block I read doesn't checksum, can you come up with a different
version?"

I was under the impression that btrfs wanted to leverage md's stripe
handling logic as well, seems that is not the case?

--
Dan

> ¹ Well, kind of. The xor_blocks() function will silently screw you over
>  if you ask it to handle more than 5 blocks at a time.

async_xor() handles arbitrary block counts.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 16:26                     ` Dan Williams
  (?)
@ 2009-07-18 18:42                     ` David Woodhouse
  2009-07-18 20:04                       ` Dan Williams
  -1 siblings, 1 reply; 34+ messages in thread
From: David Woodhouse @ 2009-07-18 18:42 UTC (permalink / raw)
  To: Dan Williams
  Cc: H. Peter Anvin, Ric Wheeler, chris.mason, linux-btrfs, neilb, linux-raid

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1643 bytes --]

On Sat, 18 Jul 2009, Dan Williams wrote:

> On Sat, Jul 18, 2009 at 4:53 AM, David Woodhouse<dwmw2@infradead.org> wrote:
>> On Fri, 2009-07-17 at 11:49 -0400, H. Peter Anvin wrote:
>>> Cost, yes, of changing an on-disk format.
>>
>> Personally, I don't care about that -- I'm utterly uninterested in the
>> legacy RAID-6 setup where it pretends to be a normal disk. I think that
>> model is as fundamentally wrong as flash devices making the similar
>> pretence.
>
> I can understand the frustration of these details being irretrievably
> hidden behind a proprietary interface out of the filesystem's control.
> However, this is not the case with Linux software RAID.  I suspect
> that there is room for more interaction with even "legacy" filesystems
> to communicate things like: "don't worry about initializing that
> region of the disk it's all free space", "don't bother resyncing on
> dirty shutdown, if power-loss interrupts a write I guarantee I will
> replay the entire stripe to you at a later date", or "hey, that last
> block I read doesn't checksum, can you come up with a different
> version?"
>
> I was under the impression that btrfs wanted to leverage md's stripe
> handling logic as well, seems that is not the case?

No. We do a bunch of the stuff you mention above, but entirely within the 
file system so we don't have to invent a bunch of layering violations just 
to work around a broken design.

>> ¹ Well, kind of. The xor_blocks() function will silently screw you over
>>  if you ask it to handle more than 5 blocks at a time.
>
> async_xor() handles arbitrary block counts.

That's useful to know; thanks.

-- 
dwmw2

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 12:45                     ` H. Peter Anvin
  (?)
@ 2009-07-18 18:50                     ` Alex Elsayed
  2009-07-18 18:52                       ` Alex Elsayed
  -1 siblings, 1 reply; 34+ messages in thread
From: Alex Elsayed @ 2009-07-18 18:50 UTC (permalink / raw)
  To: linux-btrfs; +Cc: linux-raid

H. Peter Anvin wrote:
> implementation in Java, called "Jerasure".)  Implementability using real 
> array instruction sets is key to decent performance.
Actually, it is made clear in the paper that Jerasure is written as a C 
library, and Clearsafe is the only Java implementation. Don't let the name 
fool you. ;D




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 18:50                     ` Alex Elsayed
@ 2009-07-18 18:52                       ` Alex Elsayed
  2009-07-29 18:20                         ` H. Peter Anvin
  0 siblings, 1 reply; 34+ messages in thread
From: Alex Elsayed @ 2009-07-18 18:52 UTC (permalink / raw)
  To: linux-btrfs; +Cc: linux-raid

Alex Elsayed wrote:

> H. Peter Anvin wrote:
>> implementation in Java, called "Jerasure".)  Implementability using real 
>> array instruction sets is key to decent performance.
> Actually, it is made clear in the paper that Jerasure is written as a C 
> library, and Clearsafe is the only Java implementation. Don't let the name 
> fool you. ;D
And again, I make a typo. s/Clearsafe/Cleversafe/.




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 18:42                     ` David Woodhouse
@ 2009-07-18 20:04                       ` Dan Williams
  2009-07-19 18:04                         ` David Woodhouse
  0 siblings, 1 reply; 34+ messages in thread
From: Dan Williams @ 2009-07-18 20:04 UTC (permalink / raw)
  To: David Woodhouse
  Cc: H. Peter Anvin, Ric Wheeler, chris.mason, linux-btrfs, neilb, linux-raid

On Sat, Jul 18, 2009 at 11:42 AM, David Woodhouse<dwmw2@infradead.org> wrote:
> On Sat, 18 Jul 2009, Dan Williams wrote:
>> I was under the impression that btrfs wanted to leverage md's stripe
>> handling logic as well, seems that is not the case?
>
> No. We do a bunch of the stuff you mention above, but entirely within the
> file system so we don't have to invent a bunch of layering violations just
> to work around a broken design.

Sure, a layering violation for an existing filesystem.  For btrfs, at
LSF'09, we briefly talked about breaking out more than just the
erasure codes from software-raid into a "libraid".  At some point in
the i/o path a btrfs stripe operation becomes indistinguishable from a
raid5,6 operation so at first glance there appears to be room to share
common infrastructure like portions of handle_stripe for example.

--
Dan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 20:04                       ` Dan Williams
@ 2009-07-19 18:04                         ` David Woodhouse
  2009-07-20  5:21                           ` H. Peter Anvin
  0 siblings, 1 reply; 34+ messages in thread
From: David Woodhouse @ 2009-07-19 18:04 UTC (permalink / raw)
  To: Dan Williams
  Cc: H. Peter Anvin, Ric Wheeler, chris.mason, linux-btrfs, neilb, linux-raid

On Sat, 2009-07-18 at 13:04 -0700, Dan Williams wrote:
> On Sat, Jul 18, 2009 at 11:42 AM, David Woodhouse<dwmw2@infradead.org> wrote:
> > On Sat, 18 Jul 2009, Dan Williams wrote:
> >> I was under the impression that btrfs wanted to leverage md's stripe
> >> handling logic as well, seems that is not the case?
> >
> > No. We do a bunch of the stuff you mention above, but entirely within the
> > file system so we don't have to invent a bunch of layering violations just
> > to work around a broken design.
> 
> Sure, a layering violation for an existing filesystem.  For btrfs, at
> LSF'09, we briefly talked about breaking out more than just the
> erasure codes from software-raid into a "libraid".  At some point in
> the i/o path a btrfs stripe operation becomes indistinguishable from a
> raid5,6 operation so at first glance there appears to be room to share
> common infrastructure like portions of handle_stripe for example.

At this point we've actually implemented the fundamental parts of
RAID[56] support in btrfs, and it's looking like all we really want is
the arithmetic routines.

Perhaps that's because I'm insufficiently familiar with the
handle_stripe() function to which you refer. Would you like to
take a look at http://git.infradead.org/users/dwmw2/btrfs-raid56.git and
attempt to convince me that I should be reusing more?

Criticism in 'diff -up' form is always welcome... :)

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-19 18:04                         ` David Woodhouse
@ 2009-07-20  5:21                           ` H. Peter Anvin
  0 siblings, 0 replies; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-20  5:21 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Dan Williams, Ric Wheeler, chris.mason, linux-btrfs, neilb, linux-raid

David Woodhouse wrote:
> 
> At this point we've actually implemented the fundamental parts of
> RAID[56] support in btrfs, and it's looking like all we really want is
> the arithmetic routines.
> 

Given that you have no legacy requirements, and that supporting more
than two disks may be interesting, it may very well be worth spending
some time looking at new codes now rather than later.  Part of that
investigation, though, is going to have to be if and how they can be
accelerated.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-17 19:02                     ` Alex Elsayed
@ 2009-07-29 18:16                       ` H. Peter Anvin
  0 siblings, 0 replies; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-29 18:16 UTC (permalink / raw)
  To: Alex Elsayed; +Cc: linux-raid, linux-btrfs

>>
>> http://www.cs.utk.edu/~plank/plank/papers/CS-96-332.html even describes an 
>> implementation _very_ similar to the current code, right down to using a 
>> table for the logarithm and inverse logarithm calculations.
>>

We don't use a table for logarithm and inverse logarithm calculations.
Any time you do a table lookup you commit suicide from a performance
standpoint.

	-hpa


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/4] md: Factor out RAID6 algorithms into lib/
  2009-07-18 18:52                       ` Alex Elsayed
@ 2009-07-29 18:20                         ` H. Peter Anvin
  0 siblings, 0 replies; 34+ messages in thread
From: H. Peter Anvin @ 2009-07-29 18:20 UTC (permalink / raw)
  To: Alex Elsayed; +Cc: linux-raid, linux-btrfs

On 07/18/2009 11:52 AM, Alex Elsayed wrote:
> Alex Elsayed wrote:
> 
>> H. Peter Anvin wrote:
>>> implementation in Java, called "Jerasure".)  Implementability using real 
>>> array instruction sets is key to decent performance.
>> Actually, it is made clear in the paper that Jerasure is written as a C 
>> library, and Clearsafe is the only Java implementation. Don't let the name 
>> fool you. ;D
> And again, I make a typo. s/Clearsafe/Cleversafe/.
> 

It's still their own implementation of poor quality.  That it is poor
quality shows in the numbers, which are dramatically lower than the
actual Linux implementation -- by an order of magnitude or more.  In
other words, they build a strawman and knock it down.

The actual Linux implementation blasts any of the numbers they have in
their paper, which I presume means they haven't optimized any of them.
As such, all the numbers are meaningless.

	-hpa


^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2009-07-29 18:20 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-07-13 14:11 [PATCH 1/4] md: Factor out RAID6 algorithms into lib/ David Woodhouse
2009-07-15 19:23 ` Dan Williams
2009-07-15 20:16   ` Chris Mason
2009-07-15 22:11     ` Dan Williams
2009-07-15 22:11       ` Dan Williams
2009-07-16 17:38   ` H. Peter Anvin
2009-07-17 14:22     ` Ric Wheeler
2009-07-17 15:20       ` H. Peter Anvin
2009-07-17 15:35         ` Ric Wheeler
2009-07-17 15:40           ` H. Peter Anvin
2009-07-17 15:47             ` Ric Wheeler
2009-07-17 15:49               ` H. Peter Anvin
2009-07-17 15:58                 ` Ric Wheeler
2009-07-17 18:59                   ` Alex Elsayed
2009-07-17 19:02                     ` Alex Elsayed
2009-07-29 18:16                       ` H. Peter Anvin
2009-07-17 19:12                 ` Gregory Maxwell
2009-07-17 19:12                   ` Gregory Maxwell
2009-07-18 11:53                 ` David Woodhouse
2009-07-18 11:53                   ` David Woodhouse
2009-07-18 12:45                   ` H. Peter Anvin
2009-07-18 12:45                     ` H. Peter Anvin
2009-07-18 18:50                     ` Alex Elsayed
2009-07-18 18:52                       ` Alex Elsayed
2009-07-29 18:20                         ` H. Peter Anvin
2009-07-18 12:49                   ` Ric Wheeler
2009-07-18 12:49                     ` Ric Wheeler
2009-07-18 16:26                   ` Dan Williams
2009-07-18 16:26                     ` Dan Williams
2009-07-18 18:42                     ` David Woodhouse
2009-07-18 20:04                       ` Dan Williams
2009-07-19 18:04                         ` David Woodhouse
2009-07-20  5:21                           ` H. Peter Anvin
2009-07-17 15:51               ` H. Peter Anvin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.