* [PATCH 1/2] powerpc: Add 64bit optimised memcmp
@ 2015-01-21  1:27 Anton Blanchard
  2015-01-21  1:27 ` [PATCH 2/2] powerpc: Add memcmp testcase Anton Blanchard
  2015-01-21  9:26 ` [PATCH 1/2] powerpc: Add 64bit optimised memcmp Arnd Bergmann
  0 siblings, 2 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-21  1:27 UTC (permalink / raw)
  To: benh, paulus, mpe, segher; +Cc: linuxppc-dev

I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte at a
time compares with no loop unrolling. We can do much much better.

Optimise the loop in a few ways:

- Unroll the byte at a time loop

- For large (at least 32 byte) comparisons that are also 8 byte
  aligned, use an unrolled modulo scheduled loop using 8 byte
  loads. This is similar to our glibc memcmp.
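
For readers unfamiliar with the term, "modulo scheduled" here means the
loads for the next 32 byte block are issued before the compares of the
current block have resolved (software pipelining). A rough C-level sketch
of that shape - purely illustrative, not the implementation below; the
name, the 0/1 result and the missing tail/alignment handling are all
editorial assumptions:

#include <stddef.h>
#include <stdint.h>

/*
 * Illustration of the modulo scheduled shape only.  n must be a non-zero
 * multiple of 32 and both pointers 8 byte aligned; returns 0 on match,
 * 1 on mismatch (the byte ordering of a real memcmp result is ignored).
 */
static int blockcmp32(const uint64_t *a, const uint64_t *b, size_t n)
{
	size_t blocks = n / 32;
	uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3];
	uint64_t b0 = b[0], b1 = b[1], b2 = b[2], b3 = b[3];

	while (--blocks) {
		a += 4; b += 4;
		/* start the next block's loads ... */
		uint64_t na0 = a[0], na1 = a[1], na2 = a[2], na3 = a[3];
		uint64_t nb0 = b[0], nb1 = b[1], nb2 = b[2], nb3 = b[3];
		/* ... while resolving the previous block's compares */
		if (a0 != b0 || a1 != b1 || a2 != b2 || a3 != b3)
			return 1;
		a0 = na0; a1 = na1; a2 = na2; a3 = na3;
		b0 = nb0; b1 = nb1; b2 = nb2; b3 = nb3;
	}
	/* epilogue: the final block was loaded but not yet compared */
	return a0 != b0 || a1 != b1 || a2 != b2 || a3 != b3;
}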

A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:

baseline:	29.93 s

modified:	 1.70 s

Just over 17x faster.
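
For reference, a harness along these lines reproduces the shape of the
measurement (this program is a reconstruction, not the one actually used;
the kernel numbers were taken against the new routine rather than the C
library memcmp):

#include <stdio.h>
#include <string.h>
#include <time.h>

#define SIZE 8192
#define ITERATIONS 10000000UL

int main(void)
{
	static unsigned char a[SIZE], b[SIZE];
	struct timespec t0, t1;
	unsigned long i;
	int sink = 0;

	memset(a, 0x5a, SIZE);
	memset(b, 0x5a, SIZE);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < ITERATIONS; i++) {
		sink |= memcmp(a, b, SIZE);
		/* barrier so the compiler cannot hoist the call out of the loop */
		asm volatile("" : : "r" (sink) : "memory");
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%lu iterations: %.2f s (result %d)\n", ITERATIONS,
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9,
	       sink);
	return 0;
}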

v2: Incorporated some suggestions from Segher:

- Use andi. instead of rldicl.

- Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare
  and was a relic from a previous version.

- Don't use cr5, we have plans to use that CR field for fast local
  atomics.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/lib/Makefile    |   3 +-
 arch/powerpc/lib/memcmp_64.S | 233 +++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/lib/string.S    |   2 +
 3 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcmp_64.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 1b01159..5526156 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
 
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   usercopy_64.o mem_64.o hweight_64.o \
-			   copyuser_power7.o string_64.o copypage_power7.o
+			   copyuser_power7.o string_64.o copypage_power7.o \
+			   memcmp_64.o
 ifeq ($(CONFIG_GENERIC_CSUM),)
 obj-y			+= checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
new file mode 100644
index 0000000..8953d23
--- /dev/null
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -0,0 +1,233 @@
+/*
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+
+#define off8	r6
+#define off16	r7
+#define off24	r8
+
+#define rA	r9
+#define rB	r10
+#define rC	r11
+#define rD	r27
+#define rE	r28
+#define rF	r29
+#define rG	r30
+#define rH	r31
+
+#ifdef __LITTLE_ENDIAN__
+#define LD	ldbrx
+#else
+#define LD	ldx
+#endif
+
+_GLOBAL(memcmp)
+	cmpdi	cr1,r5,0
+
+	/* Use the short loop if both strings are not 8B aligned */
+	or	r6,r3,r4
+	andi.	r6,r6,7
+
+	/* Use the short loop if length is less than 32B */
+	cmpdi	cr6,r5,31
+
+	beq	cr1,.Lzero
+	bne	.Lshort
+	bgt	cr6,.Llong
+
+.Lshort:
+	mtctr	r5
+
+1:	lbz	rA,0(r3)
+	lbz	rB,0(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,1(r3)
+	lbz	rB,1(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,2(r3)
+	lbz	rB,2(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,3(r3)
+	lbz	rB,3(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+
+	addi	r3,r3,4
+	addi	r4,r4,4
+
+	bdnz	1b
+
+.Lzero:
+	li	r3,0
+	blr
+
+.Lnon_zero:
+	mr	r3,rC
+	blr
+
+.Llong:
+	li	off8,8
+	li	off16,16
+	li	off24,24
+
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+
+	srdi	r0,r5,5
+	mtctr	r0
+	andi.	r5,r5,31
+
+	LD	rA,0,r3
+	LD	rB,0,r4
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdz	.Lfirst32
+
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr1,rC,rD
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+	cmpld	cr6,rE,rF
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+	cmpld	cr7,rG,rH
+	bne	cr0,.LcmpAB
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+	bne	cr1,.LcmpCD
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdz	.Lsecond32
+
+	.balign	16
+
+1:	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr1,rC,rD
+	bne	cr6,.LcmpEF
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+	cmpld	cr6,rE,rF
+	bne	cr7,.LcmpGH
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+	cmpld	cr7,rG,rH
+	bne	cr0,.LcmpAB
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+	bne	cr1,.LcmpCD
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdnz	1b
+
+.Lsecond32:
+	cmpld	cr1,rC,rD
+	bne	cr6,.LcmpEF
+
+	cmpld	cr6,rE,rF
+	bne	cr7,.LcmpGH
+
+	cmpld	cr7,rG,rH
+	bne	cr0,.LcmpAB
+
+	bne	cr1,.LcmpCD
+	bne	cr6,.LcmpEF
+	bne	cr7,.LcmpGH
+
+.Ltail:
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lshort
+
+.Lfirst32:
+	cmpld	cr1,rC,rD
+	cmpld	cr6,rE,rF
+	cmpld	cr7,rG,rH
+
+	bne	cr0,.LcmpAB
+	bne	cr1,.LcmpCD
+	bne	cr6,.LcmpEF
+	bne	cr7,.LcmpGH
+
+	b	.Ltail
+
+.LcmpAB:
+	li	r3,1
+	bgt	cr0,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpCD:
+	li	r3,1
+	bgt	cr1,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpEF:
+	li	r3,1
+	bgt	cr6,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpGH:
+	li	r3,1
+	bgt	cr7,.Lout
+	li	r3,-1
+
+.Lout:
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	blr
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 1b5a0a0..c80fb49 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -93,6 +93,7 @@ _GLOBAL(strlen)
 	subf	r3,r3,r4
 	blr
 
+#ifdef CONFIG_PPC32
 _GLOBAL(memcmp)
 	PPC_LCMPI 0,r5,0
 	beq-	2f
@@ -106,6 +107,7 @@ _GLOBAL(memcmp)
 	blr
 2:	li	r3,0
 	blr
+#endif
 
 _GLOBAL(memchr)
 	PPC_LCMPI 0,r5,0
-- 
2.1.0


* [PATCH 2/2] powerpc: Add memcmp testcase
  2015-01-21  1:27 [PATCH 1/2] powerpc: Add 64bit optimised memcmp Anton Blanchard
@ 2015-01-21  1:27 ` Anton Blanchard
  2015-01-21  9:26 ` [PATCH 1/2] powerpc: Add 64bit optimised memcmp Arnd Bergmann
  1 sibling, 0 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-21  1:27 UTC (permalink / raw)
  To: benh, paulus, mpe, segher; +Cc: linuxppc-dev

Add a testcase for the new ppc64 memcmp.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 .../testing/selftests/powerpc/stringloops/Makefile |  21 +++++
 .../selftests/powerpc/stringloops/asm/ppc_asm.h    |   7 ++
 .../selftests/powerpc/stringloops/memcmp_64.S      |   1 +
 .../selftests/powerpc/stringloops/test_memcmp.c    | 103 +++++++++++++++++++++
 4 files changed, 132 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/Makefile
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
 create mode 120000 tools/testing/selftests/powerpc/stringloops/memcmp_64.S
 create mode 100644 tools/testing/selftests/powerpc/stringloops/test_memcmp.c

diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
new file mode 100644
index 0000000..159a299
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -0,0 +1,21 @@
+# The loops are all 64-bit code
+CFLAGS += -m64
+CFLAGS += -I$(CURDIR)
+CFLAGS += -D SELFTEST
+
+PROGS := test_memcmp
+EXTRA_SOURCES := memcmp_64.S ../harness.c
+
+all: $(PROGS)
+
+$(PROGS): $(EXTRA_SOURCES)
+
+run_tests: all
+	@-for PROG in $(PROGS); do \
+		./$$PROG; \
+	done;
+
+clean:
+	rm -f $(PROGS) *.o
+
+.PHONY: all run_tests clean
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
new file mode 100644
index 0000000..11bece8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -0,0 +1,7 @@
+#include <ppc-asm.h>
+
+#ifndef r1
+#define r1 sp
+#endif
+
+#define _GLOBAL(A) FUNC_START(test_ ## A)
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_64.S b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
new file mode 120000
index 0000000..9bc87e4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcmp_64.S
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/stringloops/test_memcmp.c b/tools/testing/selftests/powerpc/stringloops/test_memcmp.c
new file mode 100644
index 0000000..17417dd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/test_memcmp.c
@@ -0,0 +1,103 @@
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../utils.h"
+
+#define SIZE 256
+#define ITERATIONS 10000
+
+int test_memcmp(const void *s1, const void *s2, size_t n);
+
+/* test all offsets and lengths */
+static void test_one(char *s1, char *s2)
+{
+	unsigned long offset, size;
+
+	for (offset = 0; offset < SIZE; offset++) {
+		for (size = 0; size < (SIZE-offset); size++) {
+			int x, y;
+			unsigned long i;
+
+			y = memcmp(s1+offset, s2+offset, size);
+			x = test_memcmp(s1+offset, s2+offset, size);
+
+			if (((x ^ y) < 0) &&	/* Trick to compare sign */
+				((x | y) != 0)) { /* check for zero */
+				printf("memcmp returned %d, should have returned %d (offset %ld size %ld)\n", x, y, offset, size);
+
+				for (i = offset; i < offset+size; i++)
+					printf("%02x ", s1[i]);
+				printf("\n");
+
+				for (i = offset; i < offset+size; i++)
+					printf("%02x ", s2[i]);
+				printf("\n");
+				abort();
+			}
+		}
+	}
+}
+
+static int testcase(void)
+{
+	char *s1;
+	char *s2;
+	unsigned long i;
+
+	s1 = memalign(128, SIZE);
+	if (!s1) {
+		perror("memalign");
+		exit(1);
+	}
+
+	s2 = memalign(128, SIZE);
+	if (!s2) {
+		perror("memalign");
+		exit(1);
+	}
+
+	srandom(1);
+
+	for (i = 0; i < ITERATIONS; i++) {
+		unsigned long j;
+		unsigned long change;
+
+		for (j = 0; j < SIZE; j++)
+			s1[j] = random();
+
+		memcpy(s2, s1, SIZE);
+
+		/* change one byte */
+		change = random() % SIZE;
+		s2[change] = random() & 0xff;
+
+		test_one(s1, s2);
+	}
+
+	srandom(1);
+
+	for (i = 0; i < ITERATIONS; i++) {
+		unsigned long j;
+		unsigned long change;
+
+		for (j = 0; j < SIZE; j++)
+			s1[j] = random();
+
+		memcpy(s2, s1, SIZE);
+
+		/* change multiple bytes, 1/8 of total */
+		for (j = 0; j < SIZE / 8; j++) {
+			change = random() % SIZE;
+			s2[change] = random() & 0xff;
+		}
+
+		test_one(s1, s2);
+	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(testcase, "memcmp");
+}
-- 
2.1.0


* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
  2015-01-21  1:27 [PATCH 1/2] powerpc: Add 64bit optimised memcmp Anton Blanchard
  2015-01-21  1:27 ` [PATCH 2/2] powerpc: Add memcmp testcase Anton Blanchard
@ 2015-01-21  9:26 ` Arnd Bergmann
  2015-01-21 12:06   ` Anton Blanchard
  1 sibling, 1 reply; 10+ messages in thread
From: Arnd Bergmann @ 2015-01-21  9:26 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: paulus, Anton Blanchard

On Wednesday 21 January 2015 12:27:38 Anton Blanchard wrote:
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte at a
> time compares with no loop unrolling. We can do much much better.
> 
> Optimise the loop in a few ways:
> 
> - Unroll the byte at a time loop
> 
> - For large (at least 32 byte) comparisons that are also 8 byte
>   aligned, use an unrolled modulo scheduled loop using 8 byte
>   loads. This is similar to our glibc memcmp.
> 
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
> 
> baseline:	29.93 s
> 
> modified:	 1.70 s
> 
> Just over 17x faster.
> 
> v2: Incorporated some suggestions from Segher:
> 
> - Use andi. instead of rldicl.
> 
> - Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare
>   and was a relic from a previous version.
> 
> - Don't use cr5, we have plans to use that CR field for fast local
>   atomics.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org>

Would it help to also add a way for an architecture to override
memcmp_pages() with its own implementation? That way you could
skip the unaligned part, hardcode the loop counter and avoid the
preempt_disable() in kmap_atomic().
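
For illustration, such a hook might look roughly like this (sketch only -
the weak default/arch override split, the file placement and the ppc64
page_address() shortcut are assumptions, nothing here is from this series):

#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

/* generic fallback, e.g. somewhere in mm/ */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

/*
 * Possible ppc64 override, built in a separate arch/powerpc/ file: no
 * highmem, so page_address() can be used directly (no kmap_atomic(),
 * hence no preempt_disable()), and the length is a compile-time constant.
 */
int memcmp_pages(struct page *page1, struct page *page2)
{
	return memcmp(page_address(page1), page_address(page2), PAGE_SIZE);
}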

	Arnd


* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
  2015-01-21  9:26 ` [PATCH 1/2] powerpc: Add 64bit optimised memcmp Arnd Bergmann
@ 2015-01-21 12:06   ` Anton Blanchard
  0 siblings, 0 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-21 12:06 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, paulus

Hi Arnd,

> Would it help to also add a way for an architecture to override
> memcmp_pages() with its own implementation? That way you could
> skip the unaligned part, hardcode the loop counter and avoid the
> preempt_disable() in kmap_atomic().

Good idea. We could also have a generic implementation that did long
comparisons, so everyone would get a decent speed up.

Anton


* RE: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
  2015-01-12  6:55     ` Joakim Tjernlund
@ 2015-01-12  9:45       ` David Laight
  0 siblings, 0 replies; 10+ messages in thread
From: David Laight @ 2015-01-12  9:45 UTC (permalink / raw)
  To: 'Joakim Tjernlund', anton; +Cc: paulus, linuxppc-dev

From: Joakim Tjernlund
> On Mon, 2015-01-12 at 11:55 +1100, Anton Blanchard wrote:
> > Hi David,
> >
> > > The unrolled loop (deleted) looks excessive.
> > > On a modern cpu with multiple execution units you can usually
> > > manage to get the loop overhead to execute in parallel to the
> > > actual 'work'.
> > > So I suspect that a much simpler 'word at a time' loop will be almost
> > > as fast - especially in the case where the code isn't
> > > already in the cache and the compare is relatively short.
> >
> > I'm always keen to keep things as simple as possible, but your loop is
> > over 50% slower. Once the loop hits a steady state you are going to run
> > into front end issues with instruction fetch on POWER8.

Interesting, I'm not an expert on ppc scheduling, but on my old x86 Athlon
700 (I think it was that one) a similar loop ran as fast as 'rep movsw'.

> Out of curiosity, does preincrement make any difference (or can gcc do
> that for you nowadays)?

It will only change register pressure slightly, and might allow any
execution delays to be filled - but that is very processor dependent.
Actually you probably want to do 'a += 2' somewhere to reduce the
instruction count.
Similarly the end condition needs to compare one of the pointers.

Elsewhere (not ppc) I've used (++p)[-1] instead of *p++ to move the
increment before the load to get better scheduling.

>          a1 = *a;
>          b1 = *b;
>          while {
>                  a2 = *++a;
>                  b2 = *++b;
>                  if (a1 != a2)
     That should have been a1 != b1
>                  	break;
>                  a1 = *++a;
>                  b1 = *++b;
>          } while (a2 != a1);
     and a2 != b2

	David
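
For reference, a compilable form of the loop with those fixes folded in
might look like this (do/while instead of the bare "while", the a1/b1 and
a2/b2 compares, and an explicit end check; the name and the differ/equal
only result are illustrative, not from the thread):

#include <stddef.h>

/*
 * nwords must be even and at least 2, and both buffers word aligned.
 * The byte ordering needed for a real memcmp sign is not handled here.
 */
static int wordcmp_differs(const unsigned long *a, const unsigned long *b,
			   size_t nwords)
{
	const unsigned long *end = a + nwords;
	unsigned long a1 = *a++, b1 = *b++;
	unsigned long a2, b2;

	do {
		a2 = *a++;
		b2 = *b++;
		if (a1 != b1)
			return 1;	/* first word of the pair differs */
		if (a == end)
			break;
		a1 = *a++;
		b1 = *b++;
		if (a2 != b2)
			return 1;	/* second word of the pair differs */
	} while (a < end);

	return a2 != b2;		/* last loaded pair not yet checked */
}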


* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
  2015-01-12  0:55   ` Anton Blanchard
@ 2015-01-12  6:55     ` Joakim Tjernlund
  2015-01-12  9:45       ` David Laight
  0 siblings, 1 reply; 10+ messages in thread
From: Joakim Tjernlund @ 2015-01-12  6:55 UTC (permalink / raw)
  To: anton; +Cc: paulus, David.Laight, linuxppc-dev


On Mon, 2015-01-12 at 11:55 +1100, Anton Blanchard wrote:
> Hi David,
>
> > The unrolled loop (deleted) looks excessive.
> > On a modern cpu with multiple execution units you can usually
> > manage to get the loop overhead to execute in parallel to the
> > actual 'work'.
> > So I suspect that a much simpler 'word at a time' loop will be almost
> > as fast - especially in the case where the code isn't
> > already in the cache and the compare is relatively short.
>
> I'm always keen to keep things as simple as possible, but your loop is
> over 50% slower. Once the loop hits a steady state you are going to run
> into front end issues with instruction fetch on POWER8.
>

Out of curiosity, does preincrement make any difference (or can gcc do
that for you nowadays)?

         a1 = *a;
         b1 = *b;
         while {
                 a2 = *++a;
                 b2 = *++b;
                 if (a1 != a2)
                 break;
                 a1 = *++a;
                 b1 = *++b;
         } while (a2 != a1);

 Jocke

> Anton
>
> > Try something based on:
> >         a1 = *a++;
> >         b1 = *b++;
> >         while {
> >                 a2 = *a++;
> >                 b2 = *b++;
> >                 if (a1 != a2)
> >                 break;
> >                 a1 = *a++;
> >                 b1 = *b++;
> >         } while (a2 != a1);
> >
> >         David
> >
>
> _______________________________________________
> Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev


* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
  2015-01-09 10:06 ` David Laight
@ 2015-01-12  0:55   ` Anton Blanchard
  2015-01-12  6:55     ` Joakim Tjernlund
  0 siblings, 1 reply; 10+ messages in thread
From: Anton Blanchard @ 2015-01-12  0:55 UTC (permalink / raw)
  To: David Laight; +Cc: paulus, linuxppc-dev

Hi David,

> The unrolled loop (deleted) looks excessive.
> On a modern cpu with multiple execution units you can usually
> manage to get the loop overhead to execute in parallel to the
> actual 'work'.
> So I suspect that a much simpler 'word at a time' loop will be
> almost as fast - especially in the case where the code isn't
> already in the cache and the compare is relatively short.

I'm always keen to keep things as simple as possible, but your loop is
over 50% slower. Once the loop hits a steady state you are going to run
into front end issues with instruction fetch on POWER8.

Anton

> Try something based on:
> 	a1 = *a++;
> 	b1 = *b++;
> 	while {
> 		a2 = *a++;
> 		b2 = *b++;
> 		if (a1 != a2)
> 			break;
> 		a1 = *a++;
> 		b1 = *b++;
> 	} while (a2 != a1);
> 
> 	David
> 


* Re: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
  2015-01-09  1:56 Anton Blanchard
  2015-01-09 10:06 ` David Laight
@ 2015-01-09 11:01 ` Adhemerval Zanella
  1 sibling, 0 replies; 10+ messages in thread
From: Adhemerval Zanella @ 2015-01-09 11:01 UTC (permalink / raw)
  To: linuxppc-dev

On 08-01-2015 23:56, Anton Blanchard wrote:
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte at a
> time compares with no loop unrolling. We can do much much better.
>
> Optimise the loop in a few ways:
>
> - Unroll the byte at a time loop
>
> - For large (at least 32 byte) comparisons that are also 8 byte
>   aligned, use an unrolled modulo scheduled loop using 8 byte
>   loads. This is similar to our glibc memcmp.
>
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
>
> baseline:	29.93 s
>
> modified:	 1.70 s
>
> Just over 17x faster.
>
> Signed-off-by: Anton Blanchard <anton@samba.org>
>
Why not use the glibc implementations instead? All of them (ppc64, power4,
and power7) avoid byte-at-a-time compares for unaligned inputs, while
showing the same performance as this new implementation for aligned ones.
To give an example, an 8192-byte compare with input alignments of 63/18
shows:

__memcmp_power7:  320 cycles
__memcmp_power4:  320 cycles
__memcmp_ppc64:   340 cycles
this memcmp:     3185 cycles
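
For what it's worth, the 8 byte loop can also be kept for misaligned
inputs even without the shift-and-merge scheme the glibc routines use; a
minimal sketch (illustrative names, and the sign of a real memcmp result
is not handled):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Unaligned loads through memcpy(), which the compiler lowers to plain
 * loads on targets that handle unaligned access.  This is not what the
 * glibc routines above do; it only shows the wide loop need not fall
 * back to byte-at-a-time compares.
 */
static uint64_t load64(const unsigned char *p)
{
	uint64_t v;

	memcpy(&v, p, sizeof(v));
	return v;
}

static int differ(const void *s1, const void *s2, size_t n)
{
	const unsigned char *a = s1, *b = s2;

	while (n >= 8) {
		if (load64(a) != load64(b))
			return 1;
		a += 8;
		b += 8;
		n -= 8;
	}
	while (n--) {
		if (*a++ != *b++)
			return 1;
	}
	return 0;
}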


* RE: [PATCH 1/2] powerpc: Add 64bit optimised memcmp
  2015-01-09  1:56 Anton Blanchard
@ 2015-01-09 10:06 ` David Laight
  2015-01-12  0:55   ` Anton Blanchard
  2015-01-09 11:01 ` Adhemerval Zanella
  1 sibling, 1 reply; 10+ messages in thread
From: David Laight @ 2015-01-09 10:06 UTC (permalink / raw)
  To: 'Anton Blanchard', benh, paulus, mpe; +Cc: linuxppc-dev

From: Anton Blanchard
> I noticed ksm spending quite a lot of time in memcmp on a large
> KVM box. The current memcmp loop is very unoptimised - byte at a
> time compares with no loop unrolling. We can do much much better.
>
> Optimise the loop in a few ways:
>
> - Unroll the byte at a time loop
>
> - For large (at least 32 byte) comparisons that are also 8 byte
>   aligned, use an unrolled modulo scheduled loop using 8 byte
>   loads. This is similar to our glibc memcmp.
>
> A simple microbenchmark testing 10000000 iterations of an 8192 byte
> memcmp was used to measure the performance:
>
> baseline:	29.93 s
>
> modified:	 1.70 s
>
> Just over 17x faster.

The unrolled loop (deleted) looks excessive.
On a modern cpu with multiple execution units you can usually
manage to get the loop overhead to execute in parallel to the
actual 'work'.
So I suspect that a much simpler 'word at a time' loop will be
almost as fast - especially in the case where the code isn't
already in the cache and the compare is relatively short.
Try something based on:
	a1 = *a++;
	b1 = *b++;
	while {
		a2 = *a++;
		b2 = *b++;
		if (a1 != a2)
			break;
		a1 = *a++;
		b1 = *b++;
	} while (a2 != a1);

	David


* [PATCH 1/2] powerpc: Add 64bit optimised memcmp
@ 2015-01-09  1:56 Anton Blanchard
  2015-01-09 10:06 ` David Laight
  2015-01-09 11:01 ` Adhemerval Zanella
  0 siblings, 2 replies; 10+ messages in thread
From: Anton Blanchard @ 2015-01-09  1:56 UTC (permalink / raw)
  To: benh, paulus, mpe; +Cc: linuxppc-dev

I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte at a
time compares with no loop unrolling. We can do much much better.

Optimise the loop in a few ways:

- Unroll the byte at a time loop

- For large (at least 32 byte) comparisons that are also 8 byte
  aligned, use an unrolled modulo scheduled loop using 8 byte
  loads. This is similar to our glibc memcmp.

A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:

baseline:	29.93 s

modified:	 1.70 s

Just over 17x faster.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/lib/Makefile    |   3 +-
 arch/powerpc/lib/memcmp_64.S | 233 +++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/lib/string.S    |   2 +
 3 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcmp_64.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 1b01159..5526156 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
 
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   usercopy_64.o mem_64.o hweight_64.o \
-			   copyuser_power7.o string_64.o copypage_power7.o
+			   copyuser_power7.o string_64.o copypage_power7.o \
+			   memcmp_64.o
 ifeq ($(CONFIG_GENERIC_CSUM),)
 obj-y			+= checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
new file mode 100644
index 0000000..d71880b
--- /dev/null
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -0,0 +1,233 @@
+/*
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+
+#define off8	r6
+#define off16	r7
+#define off24	r8
+
+#define rA	r9
+#define rB	r10
+#define rC	r11
+#define rD	r27
+#define rE	r28
+#define rF	r29
+#define rG	r30
+#define rH	r31
+
+#ifdef __LITTLE_ENDIAN__
+#define LD	ldbrx
+#else
+#define LD	ldx
+#endif
+
+_GLOBAL(memcmp)
+	cmpdi	cr1,r5,0
+
+	/* Use the short loop if both strings are not 8B aligned */
+	or	r6,r3,r4
+	rldicl.	r6,r6,0,(64-3)
+
+	/* Use the short loop if length is less than 32B */
+	cmpdi	cr5,r5,31
+
+	beq	cr1,.Lzero
+	bne	.Lshort
+	bgt	cr5,.Llong
+
+.Lshort:
+	mtctr	r5
+
+1:	lbz	rA,0(r3)
+	lbz	rB,0(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,1(r3)
+	lbz	rB,1(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,2(r3)
+	lbz	rB,2(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+	bdz	.Lzero
+
+	lbz	rA,3(r3)
+	lbz	rB,3(r4)
+	subf.	rC,rB,rA
+	bne	.Lnon_zero
+
+	addi	r3,r3,4
+	addi	r4,r4,4
+
+	bdnzt	eq,1b
+
+.Lzero:
+	li	r3,0
+	blr
+
+.Lnon_zero:
+	mr	r3,rC
+	blr
+
+.Llong:
+	li	off8,8
+	li	off16,16
+	li	off24,24
+
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+
+	srdi	r0,r5,5
+	mtctr	r0
+	andi.	r5,r5,31
+
+	LD	rA,0,r3
+	LD	rB,0,r4
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdz	.Lfirst32
+
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr1,rC,rD
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+	cmpld	cr5,rE,rF
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+	cmpld	cr6,rG,rH
+	bne	cr0,.LcmpAB
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+	bne	cr1,.LcmpCD
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdz	.Lsecond32
+
+	.balign	16
+
+1:	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr1,rC,rD
+	bne	cr5,.LcmpEF
+
+	LD	rC,off8,r3
+	LD	rD,off8,r4
+	cmpld	cr5,rE,rF
+	bne	cr6,.LcmpGH
+
+	LD	rE,off16,r3
+	LD	rF,off16,r4
+	cmpld	cr6,rG,rH
+	bne	cr0,.LcmpAB
+
+	LD	rG,off24,r3
+	LD	rH,off24,r4
+	cmpld	cr0,rA,rB
+	bne	cr1,.LcmpCD
+
+	addi	r3,r3,32
+	addi	r4,r4,32
+
+	bdnz	1b
+
+.Lsecond32:
+	cmpld	cr1,rC,rD
+	bne	cr5,.LcmpEF
+
+	cmpld	cr5,rE,rF
+	bne	cr6,.LcmpGH
+
+	cmpld	cr6,rG,rH
+	bne	cr0,.LcmpAB
+
+	bne	cr1,.LcmpCD
+	bne	cr5,.LcmpEF
+	bne	cr6,.LcmpGH
+
+.Ltail:
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lshort
+
+.Lfirst32:
+	cmpld	cr1,rC,rD
+	cmpld	cr5,rE,rF
+	cmpld	cr6,rG,rH
+
+	bne	cr0,.LcmpAB
+	bne	cr1,.LcmpCD
+	bne	cr5,.LcmpEF
+	bne	cr6,.LcmpGH
+
+	b	.Ltail
+
+.LcmpAB:
+	li	r3,1
+	bgt	cr0,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpCD:
+	li	r3,1
+	bgt	cr1,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpEF:
+	li	r3,1
+	bgt	cr5,.Lout
+	li	r3,-1
+	b	.Lout
+
+.LcmpGH:
+	li	r3,1
+	bgt	cr6,.Lout
+	li	r3,-1
+
+.Lout:
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	blr
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 1b5a0a0..c80fb49 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -93,6 +93,7 @@ _GLOBAL(strlen)
 	subf	r3,r3,r4
 	blr
 
+#ifdef CONFIG_PPC32
 _GLOBAL(memcmp)
 	PPC_LCMPI 0,r5,0
 	beq-	2f
@@ -106,6 +107,7 @@ _GLOBAL(memcmp)
 	blr
 2:	li	r3,0
 	blr
+#endif
 
 _GLOBAL(memchr)
 	PPC_LCMPI 0,r5,0
-- 
2.1.0

