* [PATCH 1/2] powerpc: Add 64bit optimised memcmp
@ 2015-01-21 1:27 Anton Blanchard
2015-01-21 1:27 ` [PATCH 2/2] powerpc: Add memcmp testcase Anton Blanchard
2015-01-21 9:26 ` [PATCH 1/2] powerpc: Add 64bit optimised memcmp Arnd Bergmann
0 siblings, 2 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-21 1:27 UTC
To: benh, paulus, mpe, segher; +Cc: linuxppc-dev
I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte-at-a-time
compares with no loop unrolling. We can do much, much better.
Optimise the loop in a few ways:
- Unroll the byte-at-a-time loop
- For large (at least 32 byte) comparisons that are also 8 byte
aligned, use an unrolled modulo-scheduled loop using 8 byte
loads. This is similar to our glibc memcmp.
A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:
baseline: 29.93 s
modified: 1.70 s
Just over 17x faster.
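The benchmark itself was not posted. A minimal reconstruction, timed
externally with time(1), might look like the following - the 8192-byte
size and 10000000 iterations come from the changelog, while the equal
buffer contents are an assumption, chosen so every call compares the
full length:

	#include <string.h>
	#include <stdlib.h>

	#define SIZE 8192
	#define ITERATIONS 10000000

	int main(void)
	{
		char *a = malloc(SIZE);
		char *b = malloc(SIZE);
		unsigned long i;

		if (!a || !b)
			return 1;

		memset(a, 0x5a, SIZE);
		memset(b, 0x5a, SIZE);

		/* equal buffers: each call walks all 8192 bytes */
		for (i = 0; i < ITERATIONS; i++)
			if (memcmp(a, b, SIZE))
				abort();

		free(a);
		free(b);
		return 0;
	}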
v2: Incorporated some suggestions from Segher:
- Use andi. instead of rldicl.
- Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare
and was a relic from a previous version.
- Don't use cr5, we have plans to use that CR field for fast local
atomics.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
arch/powerpc/lib/Makefile | 3 +-
arch/powerpc/lib/memcmp_64.S | 239 +++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/lib/string.S | 2 +
3 files changed, 243 insertions(+), 1 deletion(-)
create mode 100644 arch/powerpc/lib/memcmp_64.S
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 1b01159..5526156 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o
obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
usercopy_64.o mem_64.o hweight_64.o \
- copyuser_power7.o string_64.o copypage_power7.o
+ copyuser_power7.o string_64.o copypage_power7.o \
+ memcmp_64.o
ifeq ($(CONFIG_GENERIC_CSUM),)
obj-y += checksum_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC64) += checksum_wrappers_64.o
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
new file mode 100644
index 0000000..8953d23
--- /dev/null
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -0,0 +1,239 @@
+/*
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+
+#define off8 r6
+#define off16 r7
+#define off24 r8
+
+#define rA r9
+#define rB r10
+#define rC r11
+#define rD r27
+#define rE r28
+#define rF r29
+#define rG r30
+#define rH r31
+
+#ifdef __LITTLE_ENDIAN__
+#define LD ldbrx
+#else
+#define LD ldx
+#endif
+
+_GLOBAL(memcmp)
+ cmpdi cr1,r5,0
+
+ /* Use the short loop unless both strings are 8B aligned */
+ or r6,r3,r4
+ andi. r6,r6,7
+
+ /* Use the short loop if length is less than 32B */
+ cmpdi cr6,r5,31
+
+ beq cr1,.Lzero
+ bne .Lshort
+ bgt cr6,.Llong
+
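+ /* Byte-at-a-time loop, unrolled 4x; CTR carries the remaining length */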
+.Lshort:
+ mtctr r5
+
+1: lbz rA,0(r3)
+ lbz rB,0(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,1(r3)
+ lbz rB,1(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,2(r3)
+ lbz rB,2(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,3(r3)
+ lbz rB,3(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+
+ addi r3,r3,4
+ addi r4,r4,4
+
+ bdnz 1b
+
+.Lzero:
+ li r3,0
+ blr
+
+.Lnon_zero:
+ mr r3,rC
+ blr
+
+.Llong:
+ li off8,8
+ li off16,16
+ li off24,24
+
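+ /* Save non-volatiles in the ABI red zone below r1; no frame needed */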
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+
+ srdi r0,r5,5
+ mtctr r0
+ andi. r5,r5,31
+
+ LD rA,0,r3
+ LD rB,0,r4
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lfirst32
+
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr6,rE,rF
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lsecond32
+
+ .balign 16
+
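+ /*
+ * Modulo-scheduled main loop: compare the four doubleword pairs
+ * loaded by the previous iteration while issuing the next four loads.
+ */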
+1: LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+ bne cr6,.LcmpEF
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr6,rE,rF
+ bne cr7,.LcmpGH
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdnz 1b
+
+.Lsecond32:
+ cmpld cr1,rC,rD
+ bne cr6,.LcmpEF
+
+ cmpld cr6,rE,rF
+ bne cr7,.LcmpGH
+
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ bne cr1,.LcmpCD
+ bne cr6,.LcmpEF
+ bne cr7,.LcmpGH
+
+.Ltail:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+
+ cmpdi r5,0
+ beq .Lzero
+ b .Lshort
+
+.Lfirst32:
+ cmpld cr1,rC,rD
+ cmpld cr6,rE,rF
+ cmpld cr7,rG,rH
+
+ bne cr0,.LcmpAB
+ bne cr1,.LcmpCD
+ bne cr6,.LcmpEF
+ bne cr7,.LcmpGH
+
+ b .Ltail
+
+.LcmpAB:
+ li r3,1
+ bgt cr0,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpCD:
+ li r3,1
+ bgt cr1,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpEF:
+ li r3,1
+ bgt cr6,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpGH:
+ li r3,1
+ bgt cr7,.Lout
+ li r3,-1
+
+.Lout:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ blr
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 1b5a0a0..c80fb49 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -93,6 +93,7 @@ _GLOBAL(strlen)
subf r3,r3,r4
blr
+#ifdef CONFIG_PPC32
_GLOBAL(memcmp)
PPC_LCMPI 0,r5,0
beq- 2f
@@ -106,6 +107,7 @@ _GLOBAL(memcmp)
blr
2: li r3,0
blr
+#endif
_GLOBAL(memchr)
PPC_LCMPI 0,r5,0
--
2.1.0
* [PATCH 2/2] powerpc: Add memcmp testcase
2015-01-21 1:27 [PATCH 1/2] powerpc: Add 64bit optimised memcmp Anton Blanchard
@ 2015-01-21 1:27 ` Anton Blanchard
2015-01-21 9:26 ` [PATCH 1/2] powerpc: Add 64bit optimised memcmp Arnd Bergmann
1 sibling, 0 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-21 1:27 UTC
To: benh, paulus, mpe, segher; +Cc: linuxppc-dev
Add a testcase for the new ppc64 memcmp.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
.../testing/selftests/powerpc/stringloops/Makefile | 21 +++++
.../selftests/powerpc/stringloops/asm/ppc_asm.h | 9 ++
.../selftests/powerpc/stringloops/memcmp_64.S | 1 +
.../selftests/powerpc/stringloops/test_memcmp.c | 103 +++++++++++++++++++++
4 files changed, 134 insertions(+)
create mode 100644 tools/testing/selftests/powerpc/stringloops/Makefile
create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
create mode 120000 tools/testing/selftests/powerpc/stringloops/memcmp_64.S
create mode 100644 tools/testing/selftests/powerpc/stringloops/test_memcmp.c
diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
new file mode 100644
index 0000000..159a299
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -0,0 +1,21 @@
+# The loops are all 64-bit code
+CFLAGS += -m64
+CFLAGS += -I$(CURDIR)
+CFLAGS += -D SELFTEST
+
+PROGS := test_memcmp
+EXTRA_SOURCES := memcmp_64.S ../harness.c
+
+all: $(PROGS)
+
+$(PROGS): $(EXTRA_SOURCES)
+
+run_tests: all
+ @-for PROG in $(PROGS); do \
+ ./$$PROG; \
+ done;
+
+clean:
+ rm -f $(PROGS) *.o
+
+.PHONY: all run_tests clean
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
new file mode 100644
index 0000000..11bece8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -0,0 +1,9 @@
+#include <ppc-asm.h>
+
+#ifndef r1
+#define r1 sp
+#endif
+
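+/* Rename the kernel entry point to test_memcmp so this file can be
+ * linked into a userspace binary next to the libc memcmp. */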
+#define _GLOBAL(A) FUNC_START(test_ ## A)
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_64.S b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
new file mode 120000
index 0000000..9bc87e4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcmp_64.S
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/stringloops/test_memcmp.c b/tools/testing/selftests/powerpc/stringloops/test_memcmp.c
new file mode 100644
index 0000000..17417dd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/test_memcmp.c
@@ -0,0 +1,103 @@
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../utils.h"
+
+#define SIZE 256
+#define ITERATIONS 10000
+
+int test_memcmp(const void *s1, const void *s2, size_t n);
+
+/* test all offsets and lengths */
+static void test_one(char *s1, char *s2)
+{
+ unsigned long offset, size;
+
+ for (offset = 0; offset < SIZE; offset++) {
+ for (size = 0; size < (SIZE-offset); size++) {
+ int x, y;
+ unsigned long i;
+
+ y = memcmp(s1+offset, s2+offset, size);
+ x = test_memcmp(s1+offset, s2+offset, size);
+
+ if (((x ^ y) < 0) && /* flag a mismatch when the signs differ, */
+ ((x | y) != 0)) { /* but not when both results are zero */
+ printf("memcmp returned %d, should have returned %d (offset %ld size %ld)\n", x, y, offset, size);
+
+ for (i = offset; i < offset+size; i++)
+ printf("%02x ", s1[i]);
+ printf("\n");
+
+ for (i = offset; i < offset+size; i++)
+ printf("%02x ", s2[i]);
+ printf("\n");
+ abort();
+ }
+ }
+ }
+}
+
+static int testcase(void)
+{
+ char *s1;
+ char *s2;
+ unsigned long i;
+
+ s1 = memalign(128, SIZE);
+ if (!s1) {
+ perror("memalign");
+ exit(1);
+ }
+
+ s2 = memalign(128, SIZE);
+ if (!s2) {
+ perror("memalign");
+ exit(1);
+ }
+
+ srandom(1);
+
+ for (i = 0; i < ITERATIONS; i++) {
+ unsigned long j;
+ unsigned long change;
+
+ for (j = 0; j < SIZE; j++)
+ s1[j] = random();
+
+ memcpy(s2, s1, SIZE);
+
+ /* change one byte */
+ change = random() % SIZE;
+ s2[change] = random() & 0xff;
+
+ test_one(s1, s2);
+ }
+
+ srandom(1);
+
+ for (i = 0; i < ITERATIONS; i++) {
+ unsigned long j;
+ unsigned long change;
+
+ for (j = 0; j < SIZE; j++)
+ s1[j] = random();
+
+ memcpy(s2, s1, SIZE);
+
+ /* change multiple bytes, 1/8 of total */
+ for (j = 0; j < SIZE / 8; j++) {
+ change = random() % SIZE;
+ s2[change] = random() & 0xff;
+ }
+
+ test_one(s1, s2);
+ }
+
+ return 0;
+}
+
+int main(void)
+{
+ return test_harness(testcase, "memcmp");
+}
--
2.1.0
* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
2015-01-21 1:27 [PATCH 1/2] powerpc: Add 64bit optimised memcmp Anton Blanchard
2015-01-21 1:27 ` [PATCH 2/2] powerpc: Add memcmp testcase Anton Blanchard
@ 2015-01-21 9:26 ` Arnd Bergmann
2015-01-21 12:06 ` Anton Blanchard
1 sibling, 1 reply; 10+ messages in thread
From: Arnd Bergmann @ 2015-01-21 9:26 UTC
To: linuxppc-dev; +Cc: paulus, Anton Blanchard
On Wednesday 21 January 2015 12:27:38 Anton Blanchard wrote:
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte-at-a-time
> compares with no loop unrolling. We can do much, much better.
>
> Optimise the loop in a few ways:
>
> - Unroll the byte-at-a-time loop
>
> - For large (at least 32 byte) comparisons that are also 8 byte
> aligned, use an unrolled modulo-scheduled loop using 8 byte
> loads. This is similar to our glibc memcmp.
>
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
>
> baseline: 29.93 s
>
> modified: 1.70 s
>
> Just over 17x faster.
>
> v2: Incorporated some suggestions from Segher:
>
> - Use andi. instead of rldicl.
>
> - Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare
> and was a relic from a previous version.
>
> - Don't use cr5, we have plans to use that CR field for fast local
> atomics.
>
> Signed-off-by: Anton Blanchard <anton@samba.org>
Would it help to also add a way for an architecture to override
memcmp_pages() with its own implementation? That way you could
skip the unaligned part, hardcode the loop counter and avoid the
preempt_disable() in kmap_atomic().
Arnd
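For reference, ksm's memcmp_pages() at this point is essentially a
kmap_atomic() pair wrapped around memcmp() - roughly the following
(a sketch from memory, not part of either patch):

	static int memcmp_pages(struct page *page1, struct page *page2)
	{
		char *addr1, *addr2;
		int ret;

		addr1 = kmap_atomic(page1);
		addr2 = kmap_atomic(page2);
		ret = memcmp(addr1, addr2, PAGE_SIZE);
		kunmap_atomic(addr2);
		kunmap_atomic(addr1);
		return ret;
	}

Since ppc64 has no highmem, an arch override could use page_address()
directly and call a compare loop with PAGE_SIZE and page alignment baked
in, avoiding the preempt_disable() in kmap_atomic() that Arnd mentions.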
* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
2015-01-21 9:26 ` [PATCH 1/2] powerpc: Add 64bit optimised memcmp Arnd Bergmann
@ 2015-01-21 12:06 ` Anton Blanchard
0 siblings, 0 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-21 12:06 UTC
To: Arnd Bergmann; +Cc: linuxppc-dev, paulus
Hi Arnd,
> Would it help to also add a way for an architecture to override
> memcmp_pages() with its own implementation? That way you could
> skip the unaligned part, hardcode the loop counter and avoid the
> preempt_disable() in kmap_atomic().
Good idea. We could also have a generic implementation that did long
comparisons, so everyone would get a decent speed up.
Anton
* RE: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
2015-01-12 6:55 ` Joakim Tjernlund
@ 2015-01-12 9:45 ` David Laight
0 siblings, 0 replies; 10+ messages in thread
From: David Laight @ 2015-01-12 9:45 UTC
To: 'Joakim Tjernlund', anton; +Cc: paulus, linuxppc-dev
From: Joakim Tjernlund
> On Mon, 2015-01-12 at 11:55 +1100, Anton Blanchard wrote:
> > Hi David,
> >
> > > The unrolled loop (deleted) looks excessive.
> > > On a modern cpu with multiple execution units you can usually
> > > manage to get the loop overhead to execute in parallel to the
> > > actual 'work'.
> > > So I suspect that a much simpler 'word at a time' loop will be
> > > almost as fast - especially in the case where the code isn't
> > > already in the cache and the compare is relatively short.
> >
> > I'm always keen to keep things as simple as possible, but your loop
> > is over 50% slower. Once the loop hits a steady state you are going
> > to run into front end issues with instruction fetch on POWER8.
Interesting, I'm not an expert on ppc scheduling, but on my old x86
Athlon 700 (I think it was that one) a similar loop ran as fast as
'rep movsw'.
> Out of curiosity, does preincrement make any difference (or can gcc
> do that for you nowadays)?
It will only change register pressure slightly, and might allow any
execution delays to be filled - but that is very processor dependent.
Actually you probably want to do 'a += 2' somewhere to reduce the
instruction count.
Similarly the end condition needs to compare one of the pointers.
Elsewhere (not ppc) I've used (++p)[-1] instead of *p++ to move the
increment before the load to get better scheduling.
> 	a1 = *a;
> 	b1 = *b;
> 	while {
> 		a2 = *++a;
> 		b2 = *++b;
> 		if (a1 != a2)
That should have been a1 != b1
> 			break;
> 		a1 = *++a;
> 		b1 = *++b;
> 	} while (a2 != a1);
and a2 != b2
	David
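Putting David's two fixes together, a compilable rendition of the
software-pipelined loop could look like this. It is illustrative only:
it assumes word-aligned pointers, an even word count of at least two,
and big-endian word ordering (the merged asm handles little-endian with
byte-reversed ldbrx loads):

	#include <stddef.h>

	/* words must be even and >= 2 */
	static int wordcmp(const unsigned long *a, const unsigned long *b,
			   size_t words)
	{
		unsigned long a1 = a[0], b1 = b[0];
		unsigned long a2, b2;
		size_t i;

		/* compare the pair loaded last time while the loads of
		 * the next pair are in flight */
		for (i = 1; i + 2 < words; i += 2) {
			a2 = a[i];
			b2 = b[i];
			if (a1 != b1)
				return a1 < b1 ? -1 : 1;
			a1 = a[i + 1];
			b1 = b[i + 1];
			if (a2 != b2)
				return a2 < b2 ? -1 : 1;
		}

		/* epilogue: the pair already loaded plus the final pair */
		a2 = a[words - 1];
		b2 = b[words - 1];
		if (a1 != b1)
			return a1 < b1 ? -1 : 1;
		if (a2 != b2)
			return a2 < b2 ? -1 : 1;
		return 0;
	}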
* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
2015-01-12 0:55 ` Anton Blanchard
@ 2015-01-12 6:55 ` Joakim Tjernlund
2015-01-12 9:45 ` David Laight
0 siblings, 1 reply; 10+ messages in thread
From: Joakim Tjernlund @ 2015-01-12 6:55 UTC
To: anton; +Cc: paulus, David.Laight, linuxppc-dev
On Mon, 2015-01-12 at 11:55 +1100, Anton Blanchard wrote:
> Hi David,
>
> > The unrolled loop (deleted) looks excessive.
> > On a modern cpu with multiple execution units you can usually
> > manage to get the loop overhead to execute in parallel to the
> > actual 'work'.
> > So I suspect that a much simpler 'word at a time' loop will be
> > almost as fast - especially in the case where the code isn't
> > already in the cache and the compare is relatively short.
>
> I'm always keen to keep things as simple as possible, but your loop
> is over 50% slower. Once the loop hits a steady state you are going
> to run into front end issues with instruction fetch on POWER8.
>
Out of curiosity, does preincrement make any difference (or can gcc do
that for you nowadays)?
	a1 = *a;
	b1 = *b;
	while {
		a2 = *++a;
		b2 = *++b;
		if (a1 != a2)
			break;
		a1 = *++a;
		b1 = *++b;
	} while (a2 != a1);
 Jocke
> Anton
>
> > Try something based on:
> > 	a1 = *a++;
> > 	b1 = *b++;
> > 	while {
> > 		a2 = *a++;
> > 		b2 = *b++;
> > 		if (a1 != a2)
> > 			break;
> > 		a1 = *a++;
> > 		b1 = *b++;
> > 	} while (a2 != a1);
> >
> > David
> >
* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
2015-01-09 10:06 ` David Laight
@ 2015-01-12 0:55 ` Anton Blanchard
2015-01-12 6:55 ` Joakim Tjernlund
0 siblings, 1 reply; 10+ messages in thread
From: Anton Blanchard @ 2015-01-12 0:55 UTC
To: David Laight; +Cc: paulus, linuxppc-dev
Hi David,
> The unrolled loop (deleted) looks excessive.
> On a modern cpu with multiple execution units you can usually
> manage to get the loop overhead to execute in parallel to the
> actual 'work'.
> So I suspect that a much simpler 'word at a time' loop will be
> almost as fast - especially in the case where the code isn't
> already in the cache and the compare is relatively short.
I'm always keen to keep things as simple as possible, but your loop is
over 50% slower. Once the loop hits a steady state you are going to run
into front end issues with instruction fetch on POWER8.
Anton
> Try something based on:
> a1 = *a++;
> b1 = *b++;
> while {
> a2 = *a++;
> b2 = *b++;
> if (a1 != a2)
> break;
> a1 = *a++;
> b1 = *b++;
> } while (a2 != a1);
>
> David
>
* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
2015-01-09 1:56 Anton Blanchard
2015-01-09 10:06 ` David Laight
@ 2015-01-09 11:01 ` Adhemerval Zanella
1 sibling, 0 replies; 10+ messages in thread
From: Adhemerval Zanella @ 2015-01-09 11:01 UTC
To: linuxppc-dev
On 08-01-2015 23:56, Anton Blanchard wrote:
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte-at-a-time
> compares with no loop unrolling. We can do much, much better.
>
> Optimise the loop in a few ways:
>
> - Unroll the byte-at-a-time loop
>
> - For large (at least 32 byte) comparisons that are also 8 byte
> aligned, use an unrolled modulo-scheduled loop using 8 byte
> loads. This is similar to our glibc memcmp.
>
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
>
> baseline: 29.93 s
>
> modified: 1.70 s
>
> Just over 17x faster.
>
> Signed-off-by: Anton Blanchard <anton@samba.org>
>
Why not use the glibc implementations instead? All of them (ppc64, power4,
and power7) avoid byte-at-a-time compares for unaligned inputs, while
showing the same performance as this new implementation for aligned ones.
To give you an example, an 8192-byte compare with input alignments of
63/18 shows:
__memcmp_power7: 320 cycles
__memcmp_power4: 320 cycles
__memcmp_ppc64: 340 cycles
this memcmp: 3185 cycles
* RE: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
2015-01-09 1:56 Anton Blanchard
@ 2015-01-09 10:06 ` David Laight
2015-01-12 0:55 ` Anton Blanchard
2015-01-09 11:01 ` Adhemerval Zanella
1 sibling, 1 reply; 10+ messages in thread
From: David Laight @ 2015-01-09 10:06 UTC
To: 'Anton Blanchard', benh, paulus, mpe; +Cc: linuxppc-dev
From: Anton Blanchard
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte-at-a-time
> compares with no loop unrolling. We can do much, much better.
>
> Optimise the loop in a few ways:
>
> - Unroll the byte-at-a-time loop
>
> - For large (at least 32 byte) comparisons that are also 8 byte
>   aligned, use an unrolled modulo-scheduled loop using 8 byte
>   loads. This is similar to our glibc memcmp.
>
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
>
> baseline:	29.93 s
>
> modified:	 1.70 s
>
> Just over 17x faster.
The unrolled loop (deleted) looks excessive.
On a modern cpu with multiple execution units you can usually
manage to get the loop overhead to execute in parallel to the
actual 'work'.
So I suspect that a much simpler 'word at a time' loop will be
almost as fast - especially in the case where the code isn't
already in the cache and the compare is relatively short.
Try something based on:
	a1 = *a++;
	b1 = *b++;
	while {
		a2 = *a++;
		b2 = *b++;
		if (a1 != a2)
			break;
		a1 = *a++;
		b1 = *b++;
	} while (a2 != a1);
	David
* [PATCH 1/2] powerpc: Add 64bit optimised memcmp
@ 2015-01-09 1:56 Anton Blanchard
2015-01-09 10:06 ` David Laight
2015-01-09 11:01 ` Adhemerval Zanella
0 siblings, 2 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-09 1:56 UTC
To: benh, paulus, mpe; +Cc: linuxppc-dev
I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte-at-a-time
compares with no loop unrolling. We can do much, much better.
Optimise the loop in a few ways:
- Unroll the byte-at-a-time loop
- For large (at least 32 byte) comparisons that are also 8 byte
aligned, use an unrolled modulo-scheduled loop using 8 byte
loads. This is similar to our glibc memcmp.
A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:
baseline: 29.93 s
modified: 1.70 s
Just over 17x faster.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
arch/powerpc/lib/Makefile | 3 +-
arch/powerpc/lib/memcmp_64.S | 233 +++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/lib/string.S | 2 +
3 files changed, 237 insertions(+), 1 deletion(-)
create mode 100644 arch/powerpc/lib/memcmp_64.S
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 1b01159..5526156 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o
obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
usercopy_64.o mem_64.o hweight_64.o \
- copyuser_power7.o string_64.o copypage_power7.o
+ copyuser_power7.o string_64.o copypage_power7.o \
+ memcmp_64.o
ifeq ($(CONFIG_GENERIC_CSUM),)
obj-y += checksum_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC64) += checksum_wrappers_64.o
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
new file mode 100644
index 0000000..d71880b
--- /dev/null
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -0,0 +1,233 @@
+/*
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+
+#define off8 r6
+#define off16 r7
+#define off24 r8
+
+#define rA r9
+#define rB r10
+#define rC r11
+#define rD r27
+#define rE r28
+#define rF r29
+#define rG r30
+#define rH r31
+
+#ifdef __LITTLE_ENDIAN__
+#define LD ldbrx
+#else
+#define LD ldx
+#endif
+
+_GLOBAL(memcmp)
+ cmpdi cr1,r5,0
+
+ /* Use the short loop unless both strings are 8B aligned */
+ or r6,r3,r4
+ rldicl. r6,r6,0,(64-3)
+
+ /* Use the short loop if length is less than 32B */
+ cmpdi cr5,r5,31
+
+ beq cr1,.Lzero
+ bne .Lshort
+ bgt cr5,.Llong
+
+.Lshort:
+ mtctr r5
+
+1: lbz rA,0(r3)
+ lbz rB,0(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,1(r3)
+ lbz rB,1(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,2(r3)
+ lbz rB,2(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,3(r3)
+ lbz rB,3(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+
+ addi r3,r3,4
+ addi r4,r4,4
+
+ bdnzt eq,1b
+
+.Lzero:
+ li r3,0
+ blr
+
+.Lnon_zero:
+ mr r3,rC
+ blr
+
+.Llong:
+ li off8,8
+ li off16,16
+ li off24,24
+
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+
+ srdi r0,r5,5
+ mtctr r0
+ andi. r5,r5,31
+
+ LD rA,0,r3
+ LD rB,0,r4
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lfirst32
+
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr5,rE,rF
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr6,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lsecond32
+
+ .balign 16
+
+1: LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+ bne cr5,.LcmpEF
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr5,rE,rF
+ bne cr6,.LcmpGH
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr6,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdnz 1b
+
+.Lsecond32:
+ cmpld cr1,rC,rD
+ bne cr5,.LcmpEF
+
+ cmpld cr5,rE,rF
+ bne cr6,.LcmpGH
+
+ cmpld cr6,rG,rH
+ bne cr0,.LcmpAB
+
+ bne cr1,.LcmpCD
+ bne cr5,.LcmpEF
+ bne cr6,.LcmpGH
+
+.Ltail:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+
+ cmpdi r5,0
+ beq .Lzero
+ b .Lshort
+
+.Lfirst32:
+ cmpld cr1,rC,rD
+ cmpld cr5,rE,rF
+ cmpld cr6,rG,rH
+
+ bne cr0,.LcmpAB
+ bne cr1,.LcmpCD
+ bne cr5,.LcmpEF
+ bne cr6,.LcmpGH
+
+ b .Ltail
+
+.LcmpAB:
+ li r3,1
+ bgt cr0,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpCD:
+ li r3,1
+ bgt cr1,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpEF:
+ li r3,1
+ bgt cr5,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpGH:
+ li r3,1
+ bgt cr6,.Lout
+ li r3,-1
+
+.Lout:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ blr
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 1b5a0a0..c80fb49 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -93,6 +93,7 @@ _GLOBAL(strlen)
subf r3,r3,r4
blr
+#ifdef CONFIG_PPC32
_GLOBAL(memcmp)
PPC_LCMPI 0,r5,0
beq- 2f
@@ -106,6 +107,7 @@ _GLOBAL(memcmp)
blr
2: li r3,0
blr
+#endif
_GLOBAL(memchr)
PPC_LCMPI 0,r5,0
--
2.1.0
end of thread
Thread overview: 10+ messages
2015-01-21 1:27 [PATCH 1/2] powerpc: Add 64bit optimised memcmp Anton Blanchard
2015-01-21 1:27 ` [PATCH 2/2] powerpc: Add memcmp testcase Anton Blanchard
2015-01-21 9:26 ` [PATCH 1/2] powerpc: Add 64bit optimised memcmp Arnd Bergmann
2015-01-21 12:06 ` Anton Blanchard
-- strict thread matches above, loose matches on Subject: below --
2015-01-09 1:56 Anton Blanchard
2015-01-09 10:06 ` David Laight
2015-01-12 0:55 ` Anton Blanchard
2015-01-12 6:55 ` Joakim Tjernlund
2015-01-12 9:45 ` David Laight
2015-01-09 11:01 ` Adhemerval Zanella