All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] zlib: Optimize inffast when copying direct from output
@ 2009-11-10  9:03 Joakim Tjernlund
  0 siblings, 0 replies; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-10  9:03 UTC (permalink / raw)
  To: linux-mtd; +Cc: Joakim Tjernlund

JFFS2 uses a lower compression ratio and inflate always
ends up in the "copy direct from output" case.
This patch tries to optimize the direct copy procedure.
It uses get_unaligned(), but only in one place.
The copy loop just above this one could also use this
optimization, but I haven't done so as I have not tested whether it
is a win there too.
On my MPC8321 this is about 17% faster on my JFFS2 root FS
than the original.
---

 Would like some testing of the PowerPC boot wrapper and
 a LE target before sending it upstream.

 arch/powerpc/boot/Makefile |    4 ++-
 lib/zlib_inflate/inffast.c |   48 +++++++++++++++++++++++++++++++++----------
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9ae7b7e..98e4c4f 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,7 +20,7 @@
 all: $(obj)/zImage
 
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		 -fno-strict-aliasing -Os -msoft-float -pipe \
+		 -fno-strict-aliasing -Os -msoft-float -pipe -D__KERNEL__\
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
 		 -isystem $(shell $(CROSS32CC) -print-file-name=include)
 BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
@@ -34,6 +34,8 @@ BOOTCFLAGS	+= -fno-stack-protector
 endif
 
 BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS	+= -include include/linux/autoconf.h -Iarch/powerpc/include
+BOOTCFLAGS	+= -Iinclude
 
 DTS_FLAGS	?= -p 1024
 
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 8550b0c..0c7fa3d 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/zutil.h>
+#include <asm/unaligned.h>
 #include "inftrees.h"
 #include "inflate.h"
 #include "inffast.h"
@@ -24,9 +25,11 @@
 #ifdef POSTINC
 #  define OFF 0
 #  define PUP(a) *(a)++
+#  define UP_UNALIGNED(a) get_unaligned((a)++)
 #else
 #  define OFF 1
 #  define PUP(a) *++(a)
+#  define UP_UNALIGNED(a) get_unaligned(++(a))
 #endif
 
 /*
@@ -239,18 +242,41 @@ void inflate_fast(z_streamp strm, unsigned start)
                     }
                 }
                 else {
+		    unsigned short *sout;
+		    unsigned long loops;
+
                     from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    /* minimum length is three */
+		    /* Align out addr */
+		    if (!((long)(out - 1 + OFF)) & 1) {
+			PUP(out) = PUP(from);
+			len--;
+		    }
+		    sout = (unsigned short *)(out - OFF);
+		    if (dist > 2 ) {
+			unsigned short *sfrom;
+
+			sfrom = (unsigned short *)(from - OFF);
+			loops = len >> 1;
+			do
+			    PUP(sout) = UP_UNALIGNED(sfrom);
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			from = (unsigned char *)sfrom + OFF;
+		    } else { /* dist == 1 or dist == 2 */
+			unsigned short pat16;
+
+			pat16 = *(sout-2+2*OFF);
+			if (dist == 1)
+			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
+			loops = len >> 1;
+			do
+			    PUP(sout) = pat16;
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+		    }
+		    if (len & 1)
+			PUP(out) = PUP(from);
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
-- 
1.6.4.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] zlib: Optimize inffast when copying direct from output
  2009-11-26  8:46       ` Benjamin Herrenschmidt
@ 2009-11-26  9:02         ` Joakim Tjernlund
  0 siblings, 0 replies; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-26  9:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 26/11/2009 09:46:58:
>
> On Thu, 2009-11-26 at 09:30 +0100, Joakim Tjernlund wrote:
> > > I'm not sure its going to work to use get_unaligned() like that on all
> > > archs .. it might be definitely something to discuss on some more
> > > appropriate mailing list.
> >
> > Oh, why not? Is that because I am using it wrongly or because xx_unaligned
> > is impl. incorrectly on some archs?
>
> I'm just not sure it works in boot wrappers in case archs ... I suppose
> it does but I haven't actually checked :-)

I am not aware of any boot wrappers other than PowerPC's, and I believe ppc
should handle it, as unaligned accesses aren't a problem there.

      Jocke

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] zlib: Optimize inffast when copying direct from output
  2009-11-26  8:30     ` Joakim Tjernlund
@ 2009-11-26  8:46       ` Benjamin Herrenschmidt
  2009-11-26  9:02         ` Joakim Tjernlund
  0 siblings, 1 reply; 8+ messages in thread
From: Benjamin Herrenschmidt @ 2009-11-26  8:46 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev

On Thu, 2009-11-26 at 09:30 +0100, Joakim Tjernlund wrote:
> > I'm not sure its going to work to use get_unaligned() like that on all
> > archs .. it might be definitely something to discuss on some more
> > appropriate mailing list.
> 
> Oh, why not? Is that because I am using it wrongly or because xx_unaligned
> is impl. incorrectly on some archs? 

I'm just not sure it works in boot wrappers in case archs ... I suppose
it does but I haven't actually checked :-)

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] zlib: Optimize inffast when copying direct from output
  2009-11-24  3:12   ` Benjamin Herrenschmidt
@ 2009-11-26  8:30     ` Joakim Tjernlund
  2009-11-26  8:46       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-26  8:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 24/11/2009 04:12:43:
>
> On Tue, 2009-11-10 at 10:00 +0100, Joakim Tjernlund wrote:
> > JFFS2 uses lesser compression ratio and inflate always
> > ends up in "copy direct from output" case.
> > This patch tries to optimize the direct copy procedure.
> > Uses get_unaligned() but only in one place.
> > The copy loop just above this one can also use this
> > optimization, but I havn't done so as I have not tested if it
> > is a win there too.
> > On my MPC8321 this is about 17% faster on my JFFS2 root FS
> > than the original.
> > ---
> >
> >  Would like some testing of the PowerPC boot wrapper and
> >  a LE target before sending it upstream.
>
> Well, you should probably submit that patch to lkml then :-)

I have (with LE fixes); Andrew Morton has it.

>
> I'm not sure its going to work to use get_unaligned() like that on all
> archs .. it might be definitely something to discuss on some more
> appropriate mailing list.

Oh, why not? Is that because I am using it wrongly or because xx_unaligned
is implemented incorrectly on some archs?

      Jocke

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] zlib: Optimize inffast when copying direct from output
  2009-11-10  9:00 ` Joakim Tjernlund
@ 2009-11-24  3:12   ` Benjamin Herrenschmidt
  2009-11-26  8:30     ` Joakim Tjernlund
  0 siblings, 1 reply; 8+ messages in thread
From: Benjamin Herrenschmidt @ 2009-11-24  3:12 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev

On Tue, 2009-11-10 at 10:00 +0100, Joakim Tjernlund wrote:
> JFFS2 uses lesser compression ratio and inflate always
> ends up in "copy direct from output" case.
> This patch tries to optimize the direct copy procedure.
> Uses get_unaligned() but only in one place.
> The copy loop just above this one can also use this
> optimization, but I havn't done so as I have not tested if it
> is a win there too.
> On my MPC8321 this is about 17% faster on my JFFS2 root FS
> than the original.
> ---
> 
>  Would like some testing of the PowerPC boot wrapper and
>  a LE target before sending it upstream.

Well, you should probably submit that patch to lkml then :-)

I'm not sure it's going to work to use get_unaligned() like that on all
archs ... it is definitely something to discuss on some more
appropriate mailing list.

Cheers,
Ben.

>  arch/powerpc/boot/Makefile |    4 ++-
>  lib/zlib_inflate/inffast.c |   48 +++++++++++++++++++++++++++++++++----------
>  2 files changed, 40 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
> index 9ae7b7e..98e4c4f 100644
> --- a/arch/powerpc/boot/Makefile
> +++ b/arch/powerpc/boot/Makefile
> @@ -20,7 +20,7 @@
>  all: $(obj)/zImage
>  
>  BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
> -		 -fno-strict-aliasing -Os -msoft-float -pipe \
> +		 -fno-strict-aliasing -Os -msoft-float -pipe -D__KERNEL__\
>  		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
>  		 -isystem $(shell $(CROSS32CC) -print-file-name=include)
>  BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
> @@ -34,6 +34,8 @@ BOOTCFLAGS	+= -fno-stack-protector
>  endif
>  
>  BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj)
> +BOOTCFLAGS	+= -include include/linux/autoconf.h -Iarch/powerpc/include
> +BOOTCFLAGS	+= -Iinclude
>  
>  DTS_FLAGS	?= -p 1024
>  
> diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
> index 8550b0c..0c7fa3d 100644
> --- a/lib/zlib_inflate/inffast.c
> +++ b/lib/zlib_inflate/inffast.c
> @@ -4,6 +4,7 @@
>   */
>  
>  #include <linux/zutil.h>
> +#include <asm/unaligned.h>
>  #include "inftrees.h"
>  #include "inflate.h"
>  #include "inffast.h"
> @@ -24,9 +25,11 @@
>  #ifdef POSTINC
>  #  define OFF 0
>  #  define PUP(a) *(a)++
> +#  define UP_UNALIGNED(a) get_unaligned((a)++)
>  #else
>  #  define OFF 1
>  #  define PUP(a) *++(a)
> +#  define UP_UNALIGNED(a) get_unaligned(++(a))
>  #endif
>  
>  /*
> @@ -239,18 +242,41 @@ void inflate_fast(z_streamp strm, unsigned start)
>                      }
>                  }
>                  else {
> +		    unsigned short *sout;
> +		    unsigned long loops;
> +
>                      from = out - dist;          /* copy direct from output */
> -                    do {                        /* minimum length is three */
> -                        PUP(out) = PUP(from);
> -                        PUP(out) = PUP(from);
> -                        PUP(out) = PUP(from);
> -                        len -= 3;
> -                    } while (len > 2);
> -                    if (len) {
> -                        PUP(out) = PUP(from);
> -                        if (len > 1)
> -                            PUP(out) = PUP(from);
> -                    }
> +                    /* minimum length is three */
> +		    /* Align out addr */
> +		    if (!((long)(out - 1 + OFF)) & 1) {
> +			PUP(out) = PUP(from);
> +			len--;
> +		    }
> +		    sout = (unsigned short *)(out - OFF);
> +		    if (dist > 2 ) {
> +			unsigned short *sfrom;
> +
> +			sfrom = (unsigned short *)(from - OFF);
> +			loops = len >> 1;
> +			do
> +			    PUP(sout) = UP_UNALIGNED(sfrom);
> +			while (--loops);
> +			out = (unsigned char *)sout + OFF;
> +			from = (unsigned char *)sfrom + OFF;
> +		    } else { /* dist == 1 or dist == 2 */
> +			unsigned short pat16;
> +
> +			pat16 = *(sout-2+2*OFF);
> +			if (dist == 1)
> +			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
> +			loops = len >> 1;
> +			do
> +			    PUP(sout) = pat16;
> +			while (--loops);
> +			out = (unsigned char *)sout + OFF;
> +		    }
> +		    if (len & 1)
> +			PUP(out) = PUP(from);
>                  }
>              }
>              else if ((op & 64) == 0) {          /* 2nd level distance code */

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] zlib: Optimize inffast when copying direct from output
@ 2009-11-12  9:04 Joakim Tjernlund
  0 siblings, 0 replies; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-12  9:04 UTC (permalink / raw)
  To: roel kluin, akpm, Richard Purdie, linux-kernel; +Cc: Joakim Tjernlund

JFFS2 uses a lower compression ratio and inflate always
ends up in the "copy direct from output" case.
This patch tries to optimize the direct copy procedure.
It uses get_unaligned(), but only in one place.
The copy loop just above this one could also use this
optimization, but I haven't done so as I have not tested whether it
is a win there too.
On my MPC8321 this is about 17% faster on my JFFS2 root FS
than the original.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
---

 This version replaces all previous versions.
 Changes:
 - Fix alignment check (Roel Kluin)
 - Fix problem for LE targets.

 arch/powerpc/boot/Makefile |    4 ++-
 lib/zlib_inflate/inffast.c |   55 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9ae7b7e..98e4c4f 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,7 +20,7 @@
 all: $(obj)/zImage
 
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		 -fno-strict-aliasing -Os -msoft-float -pipe \
+		 -fno-strict-aliasing -Os -msoft-float -pipe -D__KERNEL__\
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
 		 -isystem $(shell $(CROSS32CC) -print-file-name=include)
 BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
@@ -34,6 +34,8 @@ BOOTCFLAGS	+= -fno-stack-protector
 endif
 
 BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS	+= -include include/linux/autoconf.h -Iarch/powerpc/include
+BOOTCFLAGS	+= -Iinclude
 
 DTS_FLAGS	?= -p 1024
 
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 8550b0c..c6740ae 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -4,6 +4,8 @@
  */
 
 #include <linux/zutil.h>
+#include <asm/unaligned.h>
+#include <asm/byteorder.h>
 #include "inftrees.h"
 #include "inflate.h"
 #include "inffast.h"
@@ -24,9 +26,11 @@
 #ifdef POSTINC
 #  define OFF 0
 #  define PUP(a) *(a)++
+#  define UP_UNALIGNED(a) get_unaligned((a)++)
 #else
 #  define OFF 1
 #  define PUP(a) *++(a)
+#  define UP_UNALIGNED(a) get_unaligned(++(a))
 #endif
 
 /*
@@ -239,18 +243,47 @@ void inflate_fast(z_streamp strm, unsigned start)
                     }
                 }
                 else {
+		    unsigned short *sout;
+		    unsigned long loops;
+
                     from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    /* minimum length is three */
+		    /* Align out addr */
+		    if (!((long)(out - 1 + OFF) & 1)) {
+			PUP(out) = PUP(from);
+			len--;
+		    }
+		    sout = (unsigned short *)(out - OFF);
+		    if (dist > 2 ) {
+			unsigned short *sfrom;
+
+			sfrom = (unsigned short *)(from - OFF);
+			loops = len >> 1;
+			do
+			    PUP(sout) = UP_UNALIGNED(sfrom);
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			from = (unsigned char *)sfrom + OFF;
+		    } else { /* dist == 1 or dist == 2 */
+			unsigned short pat16;
+
+			pat16 = *(sout-2+2*OFF);
+			if (dist == 1)
+#if defined(__BIG_ENDIAN)
+			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
+#elif defined(__LITTLE_ENDIAN)
+			    pat16 = (pat16 & 0xff00) | ((pat16 & 0xff00 ) >> 8);
+#else
+#error __BIG_ENDIAN nor __LITTLE_ENDIAN is defined
+#endif
+			loops = len >> 1;
+			do
+			    PUP(sout) = pat16;
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+		    }
+		    if (len & 1)
+			PUP(out) = PUP(from);
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
-- 
1.6.4.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH] zlib: Optimize inffast when copying direct from output
       [not found] <Received>
@ 2009-11-10  9:00 ` Joakim Tjernlund
  2009-11-24  3:12   ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-10  9:00 UTC (permalink / raw)
  To: linuxppc-dev

JFFS2 uses a lower compression ratio and inflate always
ends up in the "copy direct from output" case.
This patch tries to optimize the direct copy procedure.
It uses get_unaligned(), but only in one place.
The copy loop just above this one could also use this
optimization, but I haven't done so as I have not tested whether it
is a win there too.
On my MPC8321 this is about 17% faster on my JFFS2 root FS
than the original.
---

 Would like some testing of the PowerPC boot wrapper and
 a LE target before sending it upstream.

 arch/powerpc/boot/Makefile |    4 ++-
 lib/zlib_inflate/inffast.c |   48 +++++++++++++++++++++++++++++++++----------
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9ae7b7e..98e4c4f 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,7 +20,7 @@
 all: $(obj)/zImage
 
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		 -fno-strict-aliasing -Os -msoft-float -pipe \
+		 -fno-strict-aliasing -Os -msoft-float -pipe -D__KERNEL__\
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
 		 -isystem $(shell $(CROSS32CC) -print-file-name=include)
 BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
@@ -34,6 +34,8 @@ BOOTCFLAGS	+= -fno-stack-protector
 endif
 
 BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS	+= -include include/linux/autoconf.h -Iarch/powerpc/include
+BOOTCFLAGS	+= -Iinclude
 
 DTS_FLAGS	?= -p 1024
 
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 8550b0c..0c7fa3d 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/zutil.h>
+#include <asm/unaligned.h>
 #include "inftrees.h"
 #include "inflate.h"
 #include "inffast.h"
@@ -24,9 +25,11 @@
 #ifdef POSTINC
 #  define OFF 0
 #  define PUP(a) *(a)++
+#  define UP_UNALIGNED(a) get_unaligned((a)++)
 #else
 #  define OFF 1
 #  define PUP(a) *++(a)
+#  define UP_UNALIGNED(a) get_unaligned(++(a))
 #endif
 
 /*
@@ -239,18 +242,41 @@ void inflate_fast(z_streamp strm, unsigned start)
                     }
                 }
                 else {
+		    unsigned short *sout;
+		    unsigned long loops;
+
                     from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    /* minimum length is three */
+		    /* Align out addr */
+		    if (!((long)(out - 1 + OFF)) & 1) {
+			PUP(out) = PUP(from);
+			len--;
+		    }
+		    sout = (unsigned short *)(out - OFF);
+		    if (dist > 2 ) {
+			unsigned short *sfrom;
+
+			sfrom = (unsigned short *)(from - OFF);
+			loops = len >> 1;
+			do
+			    PUP(sout) = UP_UNALIGNED(sfrom);
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			from = (unsigned char *)sfrom + OFF;
+		    } else { /* dist == 1 or dist == 2 */
+			unsigned short pat16;
+
+			pat16 = *(sout-2+2*OFF);
+			if (dist == 1)
+			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
+			loops = len >> 1;
+			do
+			    PUP(sout) = pat16;
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+		    }
+		    if (len & 1)
+			PUP(out) = PUP(from);
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
-- 
1.6.4.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH] zlib: Optimize inffast when copying direct from output
@ 2009-11-08  9:53 Joakim Tjernlund
  0 siblings, 0 replies; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-08  9:53 UTC (permalink / raw)
  To: akpm, Richard Purdie, linux-kernel; +Cc: Joakim Tjernlund

JFFS2 uses a lower compression ratio and inflate always
ends up in the "copy direct from output" case.
This patch tries to optimize the copy procedure for
archs that have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined.
On my MPC8321 this is about 14% faster on my JFFS2 root FS
than the original.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
---
 lib/zlib_inflate/inffast.c |   35 +++++++++++++++++++++++++++++++++++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 8550b0c..0588fbf 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -240,6 +240,40 @@ void inflate_fast(z_streamp strm, unsigned start)
                 }
                 else {
                     from = out - dist;          /* copy direct from output */
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+                    /* minimum length is three */
+		    if (dist > 2 ) {
+			unsigned short *sout = (unsigned short *)(out - OFF);
+			unsigned short *sfrom = (unsigned short *)(from - OFF);
+			unsigned long loops = len >> 1;
+
+			do
+			    PUP(sout) = PUP(sfrom);
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			from = (unsigned char *)sfrom + OFF;
+			if (len & 1)
+			    PUP(out) = PUP(from);
+		    } else if (dist == 2) {
+			unsigned short *sout = (unsigned short *)(out - OFF);
+			unsigned short pat16;
+			unsigned long loops = len >> 1;
+
+			pat16 = *(sout-2+2*OFF);
+			do
+			    PUP(sout) = pat16;
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			if (len & 1)
+			    PUP(out) = PUP(from);
+		    } else {
+			unsigned char pat8 = *(out - 1 + OFF);
+
+			do {
+			    PUP(out) = pat8;
+			} while (--len);
+		    }
+#else
                     do {                        /* minimum length is three */
                         PUP(out) = PUP(from);
                         PUP(out) = PUP(from);
@@ -251,6 +285,7 @@ void inflate_fast(z_streamp strm, unsigned start)
                         if (len > 1)
                             PUP(out) = PUP(from);
                     }
+#endif
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
-- 
1.6.4.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2009-11-26  9:06 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-10  9:03 [PATCH] zlib: Optimize inffast when copying direct from output Joakim Tjernlund
  -- strict thread matches above, loose matches on Subject: below --
2009-11-12  9:04 Joakim Tjernlund
     [not found] <Received>
2009-11-10  9:00 ` Joakim Tjernlund
2009-11-24  3:12   ` Benjamin Herrenschmidt
2009-11-26  8:30     ` Joakim Tjernlund
2009-11-26  8:46       ` Benjamin Herrenschmidt
2009-11-26  9:02         ` Joakim Tjernlund
2009-11-08  9:53 Joakim Tjernlund

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.