* [PATCH v2 0/5] powerpc/lib: Optimisation of string functions (mainly for PPC32)
@ 2018-05-17 10:49 Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 1/5] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
                   ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: Christophe Leroy @ 2018-05-17 10:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

This series intends to optimise the string functions for PPC32 in the
same spirit as what was already done for PPC64.

The first patch moves the PPC32 specific functions from string.S into
a dedicated file named string_32.S.
The second patch rewrites __clear_user() using the dcbz instruction.
The third patch rewrites memcmp() to compare 32-bit words instead
of comparing byte by byte.
The fourth patch removes the NUL size verification from the assembly
functions so that GCC can optimise it out when the size is constant.
The last patch inlines memcmp() for constant sizes <= 16.

As shown in the individual commit logs, the second, third and last patches
provide significant improvements.

Changes in v2:
- Moved out the patch removing the hot loop alignment on PPC32
- Squashed the changes related to NUL size verification in a single patch
- Reordered the patches in a more logical order
- Modified the inlining patch to avoid a warning about the impossibility to version symbols.

Christophe Leroy (5):
  powerpc/lib: move PPC32 specific functions out of string.S
  powerpc/lib: optimise 32 bits __clear_user()
  powerpc/lib: optimise PPC32 memcmp
  powerpc/lib: inline string functions NUL size verification
  powerpc/lib: inline memcmp() for small constant sizes

 arch/powerpc/include/asm/string.h      |  91 ++++++++++++++++++++-
 arch/powerpc/kernel/prom_init_check.sh |   2 +-
 arch/powerpc/lib/Makefile              |   5 +-
 arch/powerpc/lib/memcmp_64.S           |   8 ++
 arch/powerpc/lib/string.S              |  75 ++++-------------
 arch/powerpc/lib/string_32.S           | 145 +++++++++++++++++++++++++++++++++
 6 files changed, 259 insertions(+), 67 deletions(-)
 create mode 100644 arch/powerpc/lib/string_32.S

-- 
2.13.3


* [PATCH v2 1/5] powerpc/lib: move PPC32 specific functions out of string.S
  2018-05-17 10:49 [PATCH v2 0/5] powerpc/lib: Optimisation of string functions (mainly for PPC32) Christophe Leroy
@ 2018-05-17 10:49 ` Christophe Leroy
  2018-05-17 13:33   ` Segher Boessenkool
  2018-05-17 10:49 ` [PATCH v2 2/5] powerpc/lib: optimise 32 bits __clear_user() Christophe Leroy
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Christophe Leroy @ 2018-05-17 10:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

In preparation of optimisation patches, move PPC32 specific
memcmp() and __clear_user() into string_32.S

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/Makefile    |  5 +--
 arch/powerpc/lib/string.S    | 61 ------------------------------------
 arch/powerpc/lib/string_32.S | 74 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 63 deletions(-)
 create mode 100644 arch/powerpc/lib/string_32.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..2c9b8c0adf22 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
 			       memcpy_power7.o
 
 obj64-y	+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-	   string_64.o memcpy_64.o memcmp_64.o pmem.o
+	   memcpy_64.o memcmp_64.o pmem.o
 
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
 
-obj-y			+= checksum_$(BITS).o checksum_wrappers.o
+obj-y			+= checksum_$(BITS).o checksum_wrappers.o \
+			   string_$(BITS).o
 
 obj-y			+= sstep.o ldstfp.o quad.o
 obj64-y			+= quad.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index a026d8fa8a99..0ef189847337 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -59,23 +59,6 @@ _GLOBAL(strncmp)
 	blr
 EXPORT_SYMBOL(strncmp)
 
-#ifdef CONFIG_PPC32
-_GLOBAL(memcmp)
-	PPC_LCMPI 0,r5,0
-	beq-	2f
-	mtctr	r5
-	addi	r6,r3,-1
-	addi	r4,r4,-1
-1:	lbzu	r3,1(r6)
-	lbzu	r0,1(r4)
-	subf.	r3,r0,r3
-	bdnzt	2,1b
-	blr
-2:	li	r3,0
-	blr
-EXPORT_SYMBOL(memcmp)
-#endif
-
 _GLOBAL(memchr)
 	PPC_LCMPI 0,r5,0
 	beq-	2f
@@ -91,47 +74,3 @@ _GLOBAL(memchr)
 2:	li	r3,0
 	blr
 EXPORT_SYMBOL(memchr)
-
-#ifdef CONFIG_PPC32
-_GLOBAL(__clear_user)
-	addi	r6,r3,-4
-	li	r3,0
-	li	r5,0
-	cmplwi	0,r4,4
-	blt	7f
-	/* clear a single word */
-11:	stwu	r5,4(r6)
-	beqlr
-	/* clear word sized chunks */
-	andi.	r0,r6,3
-	add	r4,r0,r4
-	subf	r6,r0,r6
-	srwi	r0,r4,2
-	andi.	r4,r4,3
-	mtctr	r0
-	bdz	7f
-1:	stwu	r5,4(r6)
-	bdnz	1b
-	/* clear byte sized chunks */
-7:	cmpwi	0,r4,0
-	beqlr
-	mtctr	r4
-	addi	r6,r6,3
-8:	stbu	r5,1(r6)
-	bdnz	8b
-	blr
-90:	mr	r3,r4
-	blr
-91:	mfctr	r3
-	slwi	r3,r3,2
-	add	r3,r3,r4
-	blr
-92:	mfctr	r3
-	blr
-
-	EX_TABLE(11b, 90b)
-	EX_TABLE(1b, 91b)
-	EX_TABLE(8b, 92b)
-
-EXPORT_SYMBOL(__clear_user)
-#endif
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
new file mode 100644
index 000000000000..ab8c4f5f31b6
--- /dev/null
+++ b/arch/powerpc/lib/string_32.S
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * String handling functions for PowerPC32
+ *
+ * Copyright (C) 2018 CS Systemes d'Information
+ *
+ * Author: Christophe Leroy <christophe.leroy@c-s.fr>
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+
+	.text
+
+_GLOBAL(memcmp)
+	PPC_LCMPI 0,r5,0
+	beq-	2f
+	mtctr	r5
+	addi	r6,r3,-1
+	addi	r4,r4,-1
+1:	lbzu	r3,1(r6)
+	lbzu	r0,1(r4)
+	subf.	r3,r0,r3
+	bdnzt	2,1b
+	blr
+2:	li	r3,0
+	blr
+EXPORT_SYMBOL(memcmp)
+
+_GLOBAL(__clear_user)
+	addi	r6,r3,-4
+	li	r3,0
+	li	r5,0
+	cmplwi	0,r4,4
+	blt	7f
+	/* clear a single word */
+11:	stwu	r5,4(r6)
+	beqlr
+	/* clear word sized chunks */
+	andi.	r0,r6,3
+	add	r4,r0,r4
+	subf	r6,r0,r6
+	srwi	r0,r4,2
+	andi.	r4,r4,3
+	mtctr	r0
+	bdz	7f
+1:	stwu	r5,4(r6)
+	bdnz	1b
+	/* clear byte sized chunks */
+7:	cmpwi	0,r4,0
+	beqlr
+	mtctr	r4
+	addi	r6,r6,3
+8:	stbu	r5,1(r6)
+	bdnz	8b
+	blr
+90:	mr	r3,r4
+	blr
+91:	mfctr	r3
+	slwi	r3,r3,2
+	add	r3,r3,r4
+	blr
+92:	mfctr	r3
+	blr
+
+	EX_TABLE(11b, 90b)
+	EX_TABLE(1b, 91b)
+	EX_TABLE(8b, 92b)
+
+EXPORT_SYMBOL(__clear_user)
-- 
2.13.3


* [PATCH v2 2/5] powerpc/lib: optimise 32 bits __clear_user()
  2018-05-17 10:49 [PATCH v2 0/5] powerpc/lib: Optimisation of string functions (mainly for PPC32) Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 1/5] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
@ 2018-05-17 10:49 ` Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 3/5] powerpc/lib: optimise PPC32 memcmp Christophe Leroy
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Christophe Leroy @ 2018-05-17 10:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

Rewrite clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.

This code is a copy/paste of memset(), with some modifications
in order to retrieve the remaining number of bytes to be cleared,
as it needs to be returned in case of error.
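
For reference, the structure the new assembly follows is roughly this
(a simplified C sketch, not the kernel code: the 32-byte line size is an
assumption, dcbz is modelled with memset(), and the exception-table
handling that lets __clear_user() return the number of uncleared bytes
on a fault is left out):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHELINE_BYTES 32	/* assumed L1 cache line size, as on the 8xx */

static void clear_range(char *p, size_t n)
{
	/* head: bytes up to the next cache line boundary */
	size_t head = -(uintptr_t)p & (CACHELINE_BYTES - 1);

	if (head > n)
		head = n;
	memset(p, 0, head);
	p += head;
	n -= head;

	/* middle: whole cache lines; one dcbz each in the real code */
	while (n >= CACHELINE_BYTES) {
		memset(p, 0, CACHELINE_BYTES);	/* stands in for dcbz */
		p += CACHELINE_BYTES;
		n -= CACHELINE_BYTES;
	}

	/* tail: remaining bytes */
	memset(p, 0, n);
}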

On an MPC885, throughput is almost doubled:

Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s

After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s

On an MPC8321, throughput is multiplied by 2.12:

Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s

After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index ab8c4f5f31b6..2c11c2019b69 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -13,6 +13,7 @@
 #include <asm/errno.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/cache.h>
 
 	.text
 
@@ -31,44 +32,78 @@ _GLOBAL(memcmp)
 	blr
 EXPORT_SYMBOL(memcmp)
 
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
 _GLOBAL(__clear_user)
-	addi	r6,r3,-4
-	li	r3,0
-	li	r5,0
-	cmplwi	0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero.  This requires that the destination
+ * area is cacheable.
+ */
+	cmplwi	cr0, r4, 4
+	mr	r10, r3
+	li	r3, 0
 	blt	7f
-	/* clear a single word */
-11:	stwu	r5,4(r6)
+
+11:	stw	r3, 0(r10)
 	beqlr
-	/* clear word sized chunks */
-	andi.	r0,r6,3
-	add	r4,r0,r4
-	subf	r6,r0,r6
-	srwi	r0,r4,2
-	andi.	r4,r4,3
+	andi.	r0, r10, 3
+	add	r11, r0, r4
+	subf	r6, r0, r10
+
+	clrlwi	r7, r6, 32 - LG_CACHELINE_BYTES
+	add	r8, r7, r11
+	srwi	r9, r8, LG_CACHELINE_BYTES
+	addic.	r9, r9, -1	/* total number of complete cachelines */
+	ble	2f
+	xori	r0, r7, CACHELINE_MASK & ~3
+	srwi.	r0, r0, 2
+	beq	3f
+	mtctr	r0
+4:	stwu	r3, 4(r6)
+	bdnz	4b
+3:	mtctr	r9
+	li	r7, 4
+10:	dcbz	r7, r6
+	addi	r6, r6, CACHELINE_BYTES
+	bdnz	10b
+	clrlwi	r11, r8, 32 - LG_CACHELINE_BYTES
+	addi	r11, r11, 4
+
+2:	srwi	r0 ,r11 ,2
 	mtctr	r0
-	bdz	7f
-1:	stwu	r5,4(r6)
+	bdz	6f
+1:	stwu	r3, 4(r6)
 	bdnz	1b
-	/* clear byte sized chunks */
-7:	cmpwi	0,r4,0
+6:	andi.	r11, r11, 3
 	beqlr
-	mtctr	r4
-	addi	r6,r6,3
-8:	stbu	r5,1(r6)
+	mtctr	r11
+	addi	r6, r6, 3
+8:	stbu	r3, 1(r6)
 	bdnz	8b
 	blr
-90:	mr	r3,r4
+
+7:	cmpwi	cr0, r4, 0
+	beqlr
+	mtctr	r4
+	addi	r6, r10, -1
+9:	stbu	r3, 1(r6)
+	bdnz	9b
 	blr
-91:	mfctr	r3
-	slwi	r3,r3,2
-	add	r3,r3,r4
+
+90:	mr	r3, r4
 	blr
-92:	mfctr	r3
+91:	add	r3, r10, r4
+	subf	r3, r6, r3
 	blr
 
 	EX_TABLE(11b, 90b)
+	EX_TABLE(4b, 91b)
+	EX_TABLE(10b, 91b)
 	EX_TABLE(1b, 91b)
-	EX_TABLE(8b, 92b)
+	EX_TABLE(8b, 91b)
+	EX_TABLE(9b, 91b)
 
 EXPORT_SYMBOL(__clear_user)
-- 
2.13.3


* [PATCH v2 3/5] powerpc/lib: optimise PPC32 memcmp
  2018-05-17 10:49 [PATCH v2 0/5] powerpc/lib: Optimisation of string functions (mainly for PPC32) Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 1/5] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 2/5] powerpc/lib: optimise 32 bits __clear_user() Christophe Leroy
@ 2018-05-17 10:49 ` Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 4/5] powerpc/lib: inline string functions NUL size verification Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes Christophe Leroy
  4 siblings, 0 replies; 13+ messages in thread
From: Christophe Leroy @ 2018-05-17 10:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

Currently, memcmp() compares two chunks of memory
byte by byte.

This patch optimises the comparison by comparing word by word.

A small benchmark performed on an 8xx, comparing two chunks
of 512 bytes 100000 times, gives:

Before : 5852274 TB ticks
After:   1488638 TB ticks

This is almost 4 times faster.
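
The word-by-word approach is roughly equivalent to the following C sketch
(an illustration, not the kernel code; load_be32() is a hypothetical helper,
and unlike the assembly this returns -1/0/1 rather than the raw subtraction):

#include <stddef.h>
#include <stdint.h>

/* hypothetical helper: load 4 bytes as a big-endian value */
static uint32_t load_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static int memcmp_words(const void *a, const void *b, size_t n)
{
	const unsigned char *p = a, *q = b;

	/* compare 4 bytes at a time */
	for (; n >= 4; n -= 4, p += 4, q += 4) {
		uint32_t x = load_be32(p), y = load_be32(q);

		if (x != y)
			return x < y ? -1 : 1;
	}

	/* tail: remaining 0 to 3 bytes */
	for (; n; n--, p++, q++)
		if (*p != *q)
			return *p - *q;

	return 0;
}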

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/string_32.S | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 2c11c2019b69..5c0e77baa9c7 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -20,13 +20,41 @@
 _GLOBAL(memcmp)
 	PPC_LCMPI 0,r5,0
 	beq-	2f
-	mtctr	r5
-	addi	r6,r3,-1
-	addi	r4,r4,-1
-1:	lbzu	r3,1(r6)
-	lbzu	r0,1(r4)
-	subf.	r3,r0,r3
-	bdnzt	2,1b
+	srawi.	r7, r5, 2		/* Divide len by 4 */
+	mr	r6, r3
+	beq-	3f
+	mtctr	r7
+	li	r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	r3, r6, r7
+	lwbrx	r0, r4, r7
+#else
+	lwzx	r3, r6, r7
+	lwzx	r0, r4, r7
+#endif
+	addi	r7, r7, 4
+	subf.	r3, r0, r3
+	bdnzt	eq, 1b
+	bnelr
+	andi.	r5, r5, 3
+	beqlr
+3:	cmplwi	cr1, r5, 2
+	blt-	cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+	lhbrx	r3, r6, r7
+	lhbrx	r0, r4, r7
+#else
+	lhzx	r3, r6, r7
+	lhzx	r0, r4, r7
+#endif
+	addi	r7, r7, 2
+	subf.	r3, r0, r3
+	beqlr	cr1
+	bnelr
+4:	lbzx	r3, r6, r7
+	lbzx	r0, r4, r7
+	subf.	r3, r0, r3
 	blr
 2:	li	r3,0
 	blr
-- 
2.13.3


* [PATCH v2 4/5] powerpc/lib: inline string functions NUL size verification
  2018-05-17 10:49 [PATCH v2 0/5] powerpc/lib: Optimisation of string functions (mainly for PPC32) Christophe Leroy
                   ` (2 preceding siblings ...)
  2018-05-17 10:49 ` [PATCH v2 3/5] powerpc/lib: optimise PPC32 memcmp Christophe Leroy
@ 2018-05-17 10:49 ` Christophe Leroy
  2018-05-17 10:49 ` [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes Christophe Leroy
  4 siblings, 0 replies; 13+ messages in thread
From: Christophe Leroy @ 2018-05-17 10:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

Many calls to memcmp(), strncmp(), strncpy() and memchr()
are done with a constant size.

This patch gives GCC a chance to optimise out
the NUL size verification.

This is only done when CONFIG_FORTIFY_SOURCE is not set, because
when CONFIG_FORTIFY_SOURCE is set, other inline versions of the
functions are defined in linux/string.h and conflict with ours.
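
The mechanism, in short (a stand-alone sketch, not the actual asm/string.h
code; memcmp_inline() and the dummy __memcmp() below are just stand-ins):

#include <stddef.h>

/* dummy stand-in for the out-of-line assembly implementation */
int __memcmp(const void *p, const void *q, size_t size)
{
	const unsigned char *a = p, *b = q;

	for (; size; size--, a++, b++)
		if (*a != *b)
			return *a - *b;
	return 0;
}

static inline int memcmp_inline(const void *p, const void *q, size_t size)
{
	if (__builtin_expect(!size, 0))
		return 0;
	return __memcmp(p, q, size);
}

int cmp8(const void *a, const void *b)
{
	/* size is a non-zero constant: GCC drops the !size test and this
	 * compiles to a plain tail call to __memcmp() */
	return memcmp_inline(a, b, 8);
}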

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/string.h      | 45 +++++++++++++++++++++++++++++++---
 arch/powerpc/kernel/prom_init_check.sh |  2 +-
 arch/powerpc/lib/memcmp_64.S           |  8 ++++++
 arch/powerpc/lib/string.S              | 14 +++++++++++
 arch/powerpc/lib/string_32.S           |  8 ++++++
 5 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..35f1aaad9b50 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -15,17 +15,56 @@
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE
 
 extern char * strcpy(char *,const char *);
-extern char * strncpy(char *,const char *, __kernel_size_t);
 extern __kernel_size_t strlen(const char *);
 extern int strcmp(const char *,const char *);
-extern int strncmp(const char *, const char *, __kernel_size_t);
 extern char * strcat(char *, const char *);
 extern void * memset(void *,int,__kernel_size_t);
 extern void * memcpy(void *,const void *,__kernel_size_t);
 extern void * memmove(void *,const void *,__kernel_size_t);
+extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
+
+#ifdef CONFIG_FORTIFY_SOURCE
+
+extern char * strncpy(char *,const char *, __kernel_size_t);
+extern int strncmp(const char *, const char *, __kernel_size_t);
 extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
-extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
+
+#else
+
+extern char *__strncpy(char *,const char *, __kernel_size_t);
+extern int __strncmp(const char *, const char *, __kernel_size_t);
+extern int __memcmp(const void *,const void *,__kernel_size_t);
+extern void *__memchr(const void *,int,__kernel_size_t);
+
+static inline char *strncpy(char *p, const char *q, __kernel_size_t size)
+{
+	if (unlikely(!size))
+		return p;
+	return __strncpy(p, q, size);
+}
+
+static inline int strncmp(const char *p, const char *q, __kernel_size_t size)
+{
+	if (unlikely(!size))
+		return 0;
+	return __strncmp(p, q, size);
+}
+
+static inline int memcmp(const void *p,const void *q,__kernel_size_t size)
+{
+	if (unlikely(!size))
+		return 0;
+	return __memcmp(p, q, size);
+}
+
+static inline void *memchr(const void *p, int c, __kernel_size_t size)
+{
+	if (unlikely(!size))
+		return NULL;
+	return __memchr(p, c, size);
+}
+#endif
 
 #ifdef CONFIG_PPC64
 #define __HAVE_ARCH_MEMSET32
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index acb6b9226352..2d87e5f9d87b 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -18,7 +18,7 @@
 
 WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush
 _end enter_prom memcpy memset reloc_offset __secondary_hold
-__secondary_hold_acknowledge __secondary_hold_spinloop __start
+__secondary_hold_acknowledge __secondary_hold_spinloop __start  __strncmp
 strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224
 reloc_got2 kernstart_addr memstart_addr linux_banner _stext
 __prom_init_toc_start __prom_init_toc_end btext_setup_display TOC."
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..9b28286b85cf 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -29,8 +29,14 @@
 #define LD	ldx
 #endif
 
+#ifndef CONFIG_FORTIFY_SOURCE
+#define memcmp __memcmp
+#endif
+
 _GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
 	cmpdi	cr1,r5,0
+#endif
 
 	/* Use the short loop if both strings are not 8B aligned */
 	or	r6,r3,r4
@@ -39,7 +45,9 @@ _GLOBAL(memcmp)
 	/* Use the short loop if length is less than 32B */
 	cmpdi	cr6,r5,31
 
+#ifdef CONFIG_FORTIFY_SOURCE
 	beq	cr1,.Lzero
+#endif
 	bne	.Lshort
 	bgt	cr6,.Llong
 
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 0ef189847337..2521c159e644 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -14,12 +14,20 @@
 #include <asm/export.h>
 
 	.text
+
+#ifndef CONFIG_FORTIFY_SOURCE
+#define strncpy __strncpy
+#define strncmp __strncmp
+#define memchr __memchr
+#endif
 	
 /* This clears out any unused part of the destination buffer,
    just as the libc version does.  -- paulus */
 _GLOBAL(strncpy)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beqlr
+#endif
 	mtctr	r5
 	addi	r6,r3,-1
 	addi	r4,r4,-1
@@ -40,8 +48,10 @@ _GLOBAL(strncpy)
 EXPORT_SYMBOL(strncpy)
 
 _GLOBAL(strncmp)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beq-	2f
+#endif
 	mtctr	r5
 	addi	r5,r3,-1
 	addi	r4,r4,-1
@@ -55,13 +65,17 @@ _GLOBAL(strncmp)
 	beqlr	1
 	bdnzt	eq,1b
 	blr
+#ifdef CONFIG_FORTIFY_SOURCE
 2:	li	r3,0
 	blr
+#endif
 EXPORT_SYMBOL(strncmp)
 
 _GLOBAL(memchr)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beq-	2f
+#endif
 	mtctr	r5
 	addi	r3,r3,-1
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 5c0e77baa9c7..15f6fa175ec1 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -17,9 +17,15 @@
 
 	.text
 
+#ifndef CONFIG_FORTIFY_SOURCE
+#define memcmp __memcmp
+#endif
+
 _GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beq-	2f
+#endif
 	srawi.	r7, r5, 2		/* Divide len by 4 */
 	mr	r6, r3
 	beq-	3f
@@ -56,8 +62,10 @@ _GLOBAL(memcmp)
 	lbzx	r0, r4, r7
 	subf.	r3, r0, r3
 	blr
+#ifdef CONFIG_FORTIFY_SOURCE
 2:	li	r3,0
 	blr
+#endif
 EXPORT_SYMBOL(memcmp)
 
 CACHELINE_BYTES = L1_CACHE_BYTES
-- 
2.13.3


* [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes
  2018-05-17 10:49 [PATCH v2 0/5] powerpc/lib: Optimisation of string functions (mainly for PPC32) Christophe Leroy
                   ` (3 preceding siblings ...)
  2018-05-17 10:49 ` [PATCH v2 4/5] powerpc/lib: inline string functions NUL size verification Christophe Leroy
@ 2018-05-17 10:49 ` Christophe Leroy
  2018-05-17 13:03   ` Mathieu Malaterre
  2018-05-17 13:55   ` Segher Boessenkool
  4 siblings, 2 replies; 13+ messages in thread
From: Christophe Leroy @ 2018-05-17 10:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

In my 8xx configuration, I get 208 calls to memcmp().
Within those 208 calls, about half of them have constant sizes:
46 have a size of 8, 17 have a size of 16, and only a few have a
size over 16. Other fixed sizes are mostly 4, 6 and 10.

This patch inlines calls to memcmp() when the size
is constant and lower than or equal to 16.

In my 8xx configuration, this reduces the number of calls
to memcmp() from 208 to 123.

The following table shows the number of TB timeticks needed to perform
a constant size memcmp() before and after the patch, depending on
the size:

	Before	After	Improvement
01:	 7577	 5682	25%
02:	41668	 5682	86%
03:	51137	13258	74%
04:	45455	 5682	87%
05:	58713	13258	77%
06:	58712	13258	77%
07:	68183	20834	70%
08:	56819	15153	73%
09:	70077	28411	60%
10:	70077	28411	60%
11:	79546	35986	55%
12:	68182	28411	58%
13:	81440	35986	55%
14:	81440	39774	51%
15:	94697	43562	54%
16:	79546	37881	52%
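
As an example of the kind of call site this targets (an illustrative caller,
not taken from the kernel; it only relies on sizeof() being a compile-time
constant):

#include <linux/string.h>

struct pkt_hdr {			/* made-up example structure */
	unsigned char id[6];
};

static int same_id(const struct pkt_hdr *x, const struct pkt_hdr *y)
{
	/* sizeof(x->id) is a compile-time constant (6), so with this patch
	 * the call inlines to __memcmp4() ? : __memcmp2() instead of
	 * branching to the out-of-line memcmp() */
	return memcmp(x->id, y->id, sizeof(x->id)) == 0;
}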

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/string.h | 46 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 35f1aaad9b50..80cf0f9605dd 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -4,6 +4,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/kernel.h>
+
 #define __HAVE_ARCH_STRNCPY
 #define __HAVE_ARCH_STRNCMP
 #define __HAVE_ARCH_MEMSET
@@ -51,10 +53,54 @@ static inline int strncmp(const char *p, const char *q, __kernel_size_t size)
 	return __strncmp(p, q, size);
 }
 
+static inline int __memcmp1(const void *p, const void *q, int off)
+{
+	return *(u8*)(p + off) - *(u8*)(q + off);
+}
+
+static inline int __memcmp2(const void *p, const void *q, int off)
+{
+	return be16_to_cpu(*(u16*)(p + off)) - be16_to_cpu(*(u16*)(q + off));
+}
+
+static inline int __memcmp4(const void *p, const void *q, int off)
+{
+	return be32_to_cpu(*(u32*)(p + off)) - be32_to_cpu(*(u32*)(q + off));
+}
+
+static inline int __memcmp8(const void *p, const void *q, int off)
+{
+	s64 tmp = be64_to_cpu(*(u64*)(p + off)) - be64_to_cpu(*(u64*)(q + off));
+	return tmp >> 32 ? : (int)tmp;
+}
+
+static inline int __memcmp_cst(const void *p,const void *q,__kernel_size_t size)
+{
+	if (size == 1)
+		return __memcmp1(p, q, 0);
+	if (size == 2)
+		return __memcmp2(p, q, 0);
+	if (size == 3)
+		return __memcmp2(p, q, 0) ? : __memcmp1(p, q, 2);
+	if (size == 4)
+		return __memcmp4(p, q, 0);
+	if (size == 5)
+		return __memcmp4(p, q, 0) ? : __memcmp1(p, q, 4);
+	if (size == 6)
+		return __memcmp4(p, q, 0) ? : __memcmp2(p, q, 4);
+	if (size == 7)
+		return __memcmp4(p, q, 0) ? : __memcmp2(p, q, 4) ? : __memcmp1(p, q, 6);
+	return __memcmp8(p, q, 0);
+}
+
 static inline int memcmp(const void *p,const void *q,__kernel_size_t size)
 {
 	if (unlikely(!size))
 		return 0;
+	if (__builtin_constant_p(size) && size <= 8)
+		return __memcmp_cst(p, q, size);
+	if (__builtin_constant_p(size) && size <= 16)
+		return __memcmp8(p, q, 0) ? : __memcmp_cst(p + 8, q + 8, size - 8);
 	return __memcmp(p, q, size);
 }
 
-- 
2.13.3


* Re: [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes
  2018-05-17 10:49 ` [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes Christophe Leroy
@ 2018-05-17 13:03   ` Mathieu Malaterre
  2018-05-17 13:21     ` Christophe LEROY
  2018-05-17 13:55   ` Segher Boessenkool
  1 sibling, 1 reply; 13+ messages in thread
From: Mathieu Malaterre @ 2018-05-17 13:03 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, LKML

On Thu, May 17, 2018 at 12:49 PM, Christophe Leroy
<christophe.leroy@c-s.fr> wrote:
> In my 8xx configuration, I get 208 calls to memcmp()
> Within those 208 calls, about half of them have constant sizes,
> 46 have a size of 8, 17 have a size of 16, only a few have a
> size over 16. Other fixed sizes are mostly 4, 6 and 10.
>
> This patch inlines calls to memcmp() when size
> is constant and lower than or equal to 16
>
> In my 8xx configuration, this reduces the number of calls
> to memcmp() from 208 to 123
>
> The following table shows the number of TB timeticks to perform
> a constant size memcmp() before and after the patch depending on
> the size
>
>         Before  After   Improvement
> 01:      7577    5682   25%
> 02:     41668    5682   86%
> 03:     51137   13258   74%
> 04:     45455    5682   87%
> 05:     58713   13258   77%
> 06:     58712   13258   77%
> 07:     68183   20834   70%
> 08:     56819   15153   73%
> 09:     70077   28411   60%
> 10:     70077   28411   60%
> 11:     79546   35986   55%
> 12:     68182   28411   58%
> 13:     81440   35986   55%
> 14:     81440   39774   51%
> 15:     94697   43562   54%
> 16:     79546   37881   52%
>
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  arch/powerpc/include/asm/string.h | 46 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 46 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
> index 35f1aaad9b50..80cf0f9605dd 100644
> --- a/arch/powerpc/include/asm/string.h
> +++ b/arch/powerpc/include/asm/string.h
> @@ -4,6 +4,8 @@
>
>  #ifdef __KERNEL__
>
> +#include <linux/kernel.h>
> +
>  #define __HAVE_ARCH_STRNCPY
>  #define __HAVE_ARCH_STRNCMP
>  #define __HAVE_ARCH_MEMSET
> @@ -51,10 +53,54 @@ static inline int strncmp(const char *p, const char *q, __kernel_size_t size)
>         return __strncmp(p, q, size);
>  }
>
> +static inline int __memcmp1(const void *p, const void *q, int off)

Does that change anything if you change void* to char* pointer ? I
find void* arithmetic hard to read.

> +{
> +       return *(u8*)(p + off) - *(u8*)(q + off);
> +}
> +
> +static inline int __memcmp2(const void *p, const void *q, int off)
> +{
> +       return be16_to_cpu(*(u16*)(p + off)) - be16_to_cpu(*(u16*)(q + off));
> +}
> +
> +static inline int __memcmp4(const void *p, const void *q, int off)
> +{
> +       return be32_to_cpu(*(u32*)(p + off)) - be32_to_cpu(*(u32*)(q + off));
> +}
> +
> +static inline int __memcmp8(const void *p, const void *q, int off)
> +{
> +       s64 tmp = be64_to_cpu(*(u64*)(p + off)) - be64_to_cpu(*(u64*)(q + off));

I always assumed 64bits unaligned access would trigger an exception.
Is this correct ?

> +       return tmp >> 32 ? : (int)tmp;
> +}
> +
> +static inline int __memcmp_cst(const void *p,const void *q,__kernel_size_t size)
> +{
> +       if (size == 1)
> +               return __memcmp1(p, q, 0);
> +       if (size == 2)
> +               return __memcmp2(p, q, 0);
> +       if (size == 3)
> +               return __memcmp2(p, q, 0) ? : __memcmp1(p, q, 2);
> +       if (size == 4)
> +               return __memcmp4(p, q, 0);
> +       if (size == 5)
> +               return __memcmp4(p, q, 0) ? : __memcmp1(p, q, 4);
> +       if (size == 6)
> +               return __memcmp4(p, q, 0) ? : __memcmp2(p, q, 4);
> +       if (size == 7)
> +               return __memcmp4(p, q, 0) ? : __memcmp2(p, q, 4) ? : __memcmp1(p, q, 6);
> +       return __memcmp8(p, q, 0);
> +}
> +
>  static inline int memcmp(const void *p,const void *q,__kernel_size_t size)
>  {
>         if (unlikely(!size))
>                 return 0;
> +       if (__builtin_constant_p(size) && size <= 8)
> +               return __memcmp_cst(p, q, size);
> +       if (__builtin_constant_p(size) && size <= 16)
> +               return __memcmp8(p, q, 0) ? : __memcmp_cst(p + 8, q + 8, size - 8);
>         return __memcmp(p, q, size);
>  }
>
> --
> 2.13.3
>


* Re: [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes
  2018-05-17 13:03   ` Mathieu Malaterre
@ 2018-05-17 13:21     ` Christophe LEROY
  2018-05-17 13:44       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 13+ messages in thread
From: Christophe LEROY @ 2018-05-17 13:21 UTC (permalink / raw)
  To: Mathieu Malaterre
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, LKML



On 17/05/2018 at 15:03, Mathieu Malaterre wrote:
> On Thu, May 17, 2018 at 12:49 PM, Christophe Leroy
> <christophe.leroy@c-s.fr> wrote:
>> In my 8xx configuration, I get 208 calls to memcmp()
>> Within those 208 calls, about half of them have constant sizes,
>> 46 have a size of 8, 17 have a size of 16, only a few have a
>> size over 16. Other fixed sizes are mostly 4, 6 and 10.
>>
>> This patch inlines calls to memcmp() when size
>> is constant and lower than or equal to 16
>>
>> In my 8xx configuration, this reduces the number of calls
>> to memcmp() from 208 to 123
>>
>> The following table shows the number of TB timeticks to perform
>> a constant size memcmp() before and after the patch depending on
>> the size
>>
>>          Before  After   Improvement
>> 01:      7577    5682   25%
>> 02:     41668    5682   86%
>> 03:     51137   13258   74%
>> 04:     45455    5682   87%
>> 05:     58713   13258   77%
>> 06:     58712   13258   77%
>> 07:     68183   20834   70%
>> 08:     56819   15153   73%
>> 09:     70077   28411   60%
>> 10:     70077   28411   60%
>> 11:     79546   35986   55%
>> 12:     68182   28411   58%
>> 13:     81440   35986   55%
>> 14:     81440   39774   51%
>> 15:     94697   43562   54%
>> 16:     79546   37881   52%
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>>   arch/powerpc/include/asm/string.h | 46 +++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 46 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
>> index 35f1aaad9b50..80cf0f9605dd 100644
>> --- a/arch/powerpc/include/asm/string.h
>> +++ b/arch/powerpc/include/asm/string.h
>> @@ -4,6 +4,8 @@
>>
>>   #ifdef __KERNEL__
>>
>> +#include <linux/kernel.h>
>> +
>>   #define __HAVE_ARCH_STRNCPY
>>   #define __HAVE_ARCH_STRNCMP
>>   #define __HAVE_ARCH_MEMSET
>> @@ -51,10 +53,54 @@ static inline int strncmp(const char *p, const char *q, __kernel_size_t size)
>>          return __strncmp(p, q, size);
>>   }
>>
>> +static inline int __memcmp1(const void *p, const void *q, int off)
> 
> Does that change anything if you change void* to char* pointer ? I
> find void* arithmetic hard to read.

Yes that's not the same

void* means you can use any pointer, for instance pointers to two 
structs you want to compare.

char* will force users to cast their pointers to char*

> 
>> +{
>> +       return *(u8*)(p + off) - *(u8*)(q + off);
>> +}
>> +
>> +static inline int __memcmp2(const void *p, const void *q, int off)
>> +{
>> +       return be16_to_cpu(*(u16*)(p + off)) - be16_to_cpu(*(u16*)(q + off));
>> +}
>> +
>> +static inline int __memcmp4(const void *p, const void *q, int off)
>> +{
>> +       return be32_to_cpu(*(u32*)(p + off)) - be32_to_cpu(*(u32*)(q + off));
>> +}
>> +
>> +static inline int __memcmp8(const void *p, const void *q, int off)
>> +{
>> +       s64 tmp = be64_to_cpu(*(u64*)(p + off)) - be64_to_cpu(*(u64*)(q + off));
> 
> I always assumed 64bits unaligned access would trigger an exception.
> Is this correct ?

As far as I know, an unaligned access will only occur when the operand 
of lmw, stmw, lwarx, or stwcx. is not aligned.

Maybe that's different for PPC64 ?

Christophe

> 
>> +       return tmp >> 32 ? : (int)tmp;
>> +}
>> +
>> +static inline int __memcmp_cst(const void *p,const void *q,__kernel_size_t size)
>> +{
>> +       if (size == 1)
>> +               return __memcmp1(p, q, 0);
>> +       if (size == 2)
>> +               return __memcmp2(p, q, 0);
>> +       if (size == 3)
>> +               return __memcmp2(p, q, 0) ? : __memcmp1(p, q, 2);
>> +       if (size == 4)
>> +               return __memcmp4(p, q, 0);
>> +       if (size == 5)
>> +               return __memcmp4(p, q, 0) ? : __memcmp1(p, q, 4);
>> +       if (size == 6)
>> +               return __memcmp4(p, q, 0) ? : __memcmp2(p, q, 4);
>> +       if (size == 7)
>> +               return __memcmp4(p, q, 0) ? : __memcmp2(p, q, 4) ? : __memcmp1(p, q, 6);
>> +       return __memcmp8(p, q, 0);
>> +}
>> +
>>   static inline int memcmp(const void *p,const void *q,__kernel_size_t size)
>>   {
>>          if (unlikely(!size))
>>                  return 0;
>> +       if (__builtin_constant_p(size) && size <= 8)
>> +               return __memcmp_cst(p, q, size);
>> +       if (__builtin_constant_p(size) && size <= 16)
>> +               return __memcmp8(p, q, 0) ? : __memcmp_cst(p + 8, q + 8, size - 8);
>>          return __memcmp(p, q, size);
>>   }
>>
>> --
>> 2.13.3
>>


* Re: [PATCH v2 1/5] powerpc/lib: move PPC32 specific functions out of string.S
  2018-05-17 10:49 ` [PATCH v2 1/5] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
@ 2018-05-17 13:33   ` Segher Boessenkool
  0 siblings, 0 replies; 13+ messages in thread
From: Segher Boessenkool @ 2018-05-17 13:33 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

On Thu, May 17, 2018 at 12:49:50PM +0200, Christophe Leroy wrote:
> In preparation of optimisation patches, move PPC32 specific
> memcmp() and __clear_user() into string_32.S

> --- /dev/null
> +++ b/arch/powerpc/lib/string_32.S
> @@ -0,0 +1,74 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * String handling functions for PowerPC32
> + *
> + * Copyright (C) 2018 CS Systemes d'Information
> + *
> + * Author: Christophe Leroy <christophe.leroy@c-s.fr>
> + *
> + */

That is wrong; that code is a plain copy, from 1996 already.


Segher


* Re: [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes
  2018-05-17 13:21     ` Christophe LEROY
@ 2018-05-17 13:44       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2018-05-17 13:44 UTC (permalink / raw)
  To: Christophe LEROY, Mathieu Malaterre
  Cc: Paul Mackerras, Michael Ellerman, linuxppc-dev, LKML

On Thu, 2018-05-17 at 15:21 +0200, Christophe LEROY wrote:
> > > +static inline int __memcmp8(const void *p, const void *q, int off)
> > > +{
> > > +       s64 tmp = be64_to_cpu(*(u64*)(p + off)) - be64_to_cpu(*(u64*)(q + off));
> > 
> > I always assumed 64bits unaligned access would trigger an exception.
> > Is this correct ?
> 
> As far as I know, an unaligned access will only occur when the operand 
> of lmw, stmw, lwarx, or stwcx. is not aligned.
> 
> Maybe that's different for PPC64 ?

It's very implementation specific.

Recent ppc64 chips generally don't trap (unless it's cache inhibited
space). Earlier variants might trap on page boundaries or segment
boundaries. Some embedded parts are less forgiving... some earlier
POWER chips will trap on unaligned in LE mode...

I wouldn't worry too much about it though. I think if 8xx shows an
improvement then it's probably fine everywhere else :-)

Cheers,
Ben.


* Re: [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes
  2018-05-17 10:49 ` [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes Christophe Leroy
  2018-05-17 13:03   ` Mathieu Malaterre
@ 2018-05-17 13:55   ` Segher Boessenkool
  2018-05-18 10:35     ` Christophe Leroy
  1 sibling, 1 reply; 13+ messages in thread
From: Segher Boessenkool @ 2018-05-17 13:55 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

On Thu, May 17, 2018 at 12:49:58PM +0200, Christophe Leroy wrote:
> In my 8xx configuration, I get 208 calls to memcmp()
> Within those 208 calls, about half of them have constant sizes,
> 46 have a size of 8, 17 have a size of 16, only a few have a
> size over 16. Other fixed sizes are mostly 4, 6 and 10.
> 
> This patch inlines calls to memcmp() when size
> is constant and lower than or equal to 16
> 
> In my 8xx configuration, this reduces the number of calls
> to memcmp() from 208 to 123
> 
> The following table shows the number of TB timeticks to perform
> a constant size memcmp() before and after the patch depending on
> the size
> 
> 	Before	After	Improvement
> 01:	 7577	 5682	25%
> 02:	41668	 5682	86%
> 03:	51137	13258	74%
> 04:	45455	 5682	87%
> 05:	58713	13258	77%
> 06:	58712	13258	77%
> 07:	68183	20834	70%
> 08:	56819	15153	73%
> 09:	70077	28411	60%
> 10:	70077	28411	60%
> 11:	79546	35986	55%
> 12:	68182	28411	58%
> 13:	81440	35986	55%
> 14:	81440	39774	51%
> 15:	94697	43562	54%
> 16:	79546	37881	52%

Could you show results with a more recent GCC?  What version was this?

What is this really measuring?  I doubt it takes 7577 (or 5682) timebase
ticks to do a 1-byte memcmp, which is just 3 instructions after all.


Segher


* Re: [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes
  2018-05-17 13:55   ` Segher Boessenkool
@ 2018-05-18 10:35     ` Christophe Leroy
  2018-05-18 15:20       ` Segher Boessenkool
  0 siblings, 1 reply; 13+ messages in thread
From: Christophe Leroy @ 2018-05-18 10:35 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel



On 05/17/2018 03:55 PM, Segher Boessenkool wrote:
> On Thu, May 17, 2018 at 12:49:58PM +0200, Christophe Leroy wrote:
>> In my 8xx configuration, I get 208 calls to memcmp()
>> Within those 208 calls, about half of them have constant sizes,
>> 46 have a size of 8, 17 have a size of 16, only a few have a
>> size over 16. Other fixed sizes are mostly 4, 6 and 10.
>>
>> This patch inlines calls to memcmp() when size
>> is constant and lower than or equal to 16
>>
>> In my 8xx configuration, this reduces the number of calls
>> to memcmp() from 208 to 123
>>
>> The following table shows the number of TB timeticks to perform
>> a constant size memcmp() before and after the patch depending on
>> the size
>>
>> 	Before	After	Improvement
>> 01:	 7577	 5682	25%
>> 02:	41668	 5682	86%
>> 03:	51137	13258	74%
>> 04:	45455	 5682	87%
>> 05:	58713	13258	77%
>> 06:	58712	13258	77%
>> 07:	68183	20834	70%
>> 08:	56819	15153	73%
>> 09:	70077	28411	60%
>> 10:	70077	28411	60%
>> 11:	79546	35986	55%
>> 12:	68182	28411	58%
>> 13:	81440	35986	55%
>> 14:	81440	39774	51%
>> 15:	94697	43562	54%
>> 16:	79546	37881	52%
> 
> Could you show results with a more recent GCC?  What version was this?

It was with the latest GCC version I have available in my environment,
that is GCC 5.4. Is that too old?

It seems that version inlines memcmp() when the length is 1. All other
lengths call memcmp().

> 
> What is this really measuring?  I doubt it takes 7577 (or 5682) timebase
> ticks to do a 1-byte memcmp, which is just 3 instructions after all.

Well, I looked again at my tests and it seems some results were wrong; I
can't remember why, I probably did something wrong when I ran the tests.

Anyway, the principle is to call a function tstcmpX() 100000 times from
a loop, reading the timebase (mftb) before and after the loop.
Then we subtract from the elapsed time the time spent calling
tstcmp0(), which is only a blr.
That way we really get only the time spent in the comparison.
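
In C terms, the harness looks roughly like this (a simplified sketch with a
hypothetical read_timebase() wrapping mftb; the real test calls each
tstcmpX() directly rather than through a pointer):

extern unsigned long read_timebase(void);	/* hypothetical mftb wrapper */

static unsigned long bench(int (*fn)(const void *, const void *),
			   const void *a, const void *b)
{
	unsigned long t0, t1;
	int i;

	t0 = read_timebase();
	for (i = 0; i < 100000; i++)
		fn(a, b);
	t1 = read_timebase();

	return t1 - t0;
}

/* reported figure = bench(tstcmpX, ...) - bench(tstcmp0, ...), so only
 * the time spent in the comparison itself is counted */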

Here is the loop:

c06243b0:	7f ac 42 e6 	mftb    r29
c06243b4:	3f 60 00 01 	lis     r27,1
c06243b8:	63 7b 86 a0 	ori     r27,r27,34464
c06243bc:	38 a0 00 02 	li      r5,2
c06243c0:	7f c4 f3 78 	mr      r4,r30
c06243c4:	7f 83 e3 78 	mr      r3,r28
c06243c8:	4b 9e 8c 09 	bl      c000cfd0 <tstcmp2>
c06243cc:	2c 1b 00 01 	cmpwi   r27,1
c06243d0:	3b 7b ff ff 	addi    r27,r27,-1
c06243d4:	40 82 ff e8 	bne     c06243bc <setup_arch+0x294>
c06243d8:	7c ac 42 e6 	mftb    r5
c06243dc:	7c bd 28 50 	subf    r5,r29,r5
c06243e0:	7c bf 28 50 	subf    r5,r31,r5
c06243e4:	38 80 00 02 	li      r4,2
c06243e8:	7f 43 d3 78 	mr      r3,r26
c06243ec:	4b a2 e4 45 	bl      c0052830 <printk>

Before the patch:
c000cfc4 <tstcmp0>:
c000cfc4:	4e 80 00 20 	blr

c000cfc8 <tstcmp1>:
c000cfc8:	38 a0 00 01 	li      r5,1
c000cfcc:	48 00 72 08 	b       c00141d4 <__memcmp>

c000cfd0 <tstcmp2>:
c000cfd0:	38 a0 00 02 	li      r5,2
c000cfd4:	48 00 72 00 	b       c00141d4 <__memcmp>

c000cfd8 <tstcmp3>:
c000cfd8:	38 a0 00 03 	li      r5,3
c000cfdc:	48 00 71 f8 	b       c00141d4 <__memcmp>

After the patch:
c000cfc4 <tstcmp0>:
c000cfc4:	4e 80 00 20 	blr

c000cfd8 <tstcmp1>:
c000cfd8:	88 64 00 00 	lbz     r3,0(r4)
c000cfdc:	89 25 00 00 	lbz     r9,0(r5)
c000cfe0:	7c 69 18 50 	subf    r3,r9,r3
c000cfe4:	4e 80 00 20 	blr

c000cfe8 <tstcmp2>:
c000cfe8:	a0 64 00 00 	lhz     r3,0(r4)
c000cfec:	a1 25 00 00 	lhz     r9,0(r5)
c000cff0:	7c 69 18 50 	subf    r3,r9,r3
c000cff4:	4e 80 00 20 	blr

c000cff8 <tstcmp3>:
c000cff8:	a1 24 00 00 	lhz     r9,0(r4)
c000cffc:	a0 65 00 00 	lhz     r3,0(r5)
c000d000:	7c 63 48 51 	subf.   r3,r3,r9
c000d004:	4c 82 00 20 	bnelr
c000d008:	88 64 00 02 	lbz     r3,2(r4)
c000d00c:	89 25 00 02 	lbz     r9,2(r5)
c000d010:	7c 69 18 50 	subf    r3,r9,r3
c000d014:	4e 80 00 20 	blr

c000d018 <tstcmp4>:
c000d018:	80 64 00 00 	lwz     r3,0(r4)
c000d01c:	81 25 00 00 	lwz     r9,0(r5)
c000d020:	7c 69 18 50 	subf    r3,r9,r3
c000d024:	4e 80 00 20 	blr

c000d028 <tstcmp5>:
c000d028:	81 24 00 00 	lwz     r9,0(r4)
c000d02c:	80 65 00 00 	lwz     r3,0(r5)
c000d030:	7c 63 48 51 	subf.   r3,r3,r9
c000d034:	4c 82 00 20 	bnelr
c000d038:	88 64 00 04 	lbz     r3,4(r4)
c000d03c:	89 25 00 04 	lbz     r9,4(r5)
c000d040:	7c 69 18 50 	subf    r3,r9,r3
c000d044:	4e 80 00 20 	blr

c000d048 <tstcmp6>:
c000d048:	81 24 00 00 	lwz     r9,0(r4)
c000d04c:	80 65 00 00 	lwz     r3,0(r5)
c000d050:	7c 63 48 51 	subf.   r3,r3,r9
c000d054:	4c 82 00 20 	bnelr
c000d058:	a0 64 00 04 	lhz     r3,4(r4)
c000d05c:	a1 25 00 04 	lhz     r9,4(r5)
c000d060:	7c 69 18 50 	subf    r3,r9,r3
c000d064:	4e 80 00 20 	blr

c000d068 <tstcmp7>:
c000d068:	81 24 00 00 	lwz     r9,0(r4)
c000d06c:	80 65 00 00 	lwz     r3,0(r5)
c000d070:	7d 23 48 51 	subf.   r9,r3,r9
c000d074:	40 82 00 20 	bne     c000d094 <tstcmp7+0x2c>
c000d078:	a0 64 00 04 	lhz     r3,4(r4)
c000d07c:	a1 25 00 04 	lhz     r9,4(r5)
c000d080:	7d 29 18 51 	subf.   r9,r9,r3
c000d084:	40 82 00 10 	bne     c000d094 <tstcmp7+0x2c>
c000d088:	88 64 00 06 	lbz     r3,6(r4)
c000d08c:	89 25 00 06 	lbz     r9,6(r5)
c000d090:	7d 29 18 50 	subf    r9,r9,r3
c000d094:	7d 23 4b 78 	mr      r3,r9
c000d098:	4e 80 00 20 	blr

c000d09c <tstcmp8>:
c000d09c:	81 25 00 04 	lwz     r9,4(r5)
c000d0a0:	80 64 00 04 	lwz     r3,4(r4)
c000d0a4:	81 04 00 00 	lwz     r8,0(r4)
c000d0a8:	81 45 00 00 	lwz     r10,0(r5)
c000d0ac:	7c 69 18 10 	subfc   r3,r9,r3
c000d0b0:	7d 2a 41 10 	subfe   r9,r10,r8
c000d0b4:	7d 2a fe 70 	srawi   r10,r9,31
c000d0b8:	7d 48 4b 79 	or.     r8,r10,r9
c000d0bc:	4d a2 00 20 	bclr+   12,eq
c000d0c0:	7d 23 4b 78 	mr      r3,r9
c000d0c4:	4e 80 00 20 	blr

c000d0c8 <tstcmp9>:
c000d0c8:	81 25 00 04 	lwz     r9,4(r5)
c000d0cc:	80 64 00 04 	lwz     r3,4(r4)
c000d0d0:	81 04 00 00 	lwz     r8,0(r4)
c000d0d4:	81 45 00 00 	lwz     r10,0(r5)
c000d0d8:	7c 69 18 10 	subfc   r3,r9,r3
c000d0dc:	7d 2a 41 10 	subfe   r9,r10,r8
c000d0e0:	7d 2a fe 70 	srawi   r10,r9,31
c000d0e4:	7d 48 4b 79 	or.     r8,r10,r9
c000d0e8:	41 82 00 08 	beq     c000d0f0 <tstcmp9+0x28>
c000d0ec:	7d 23 4b 78 	mr      r3,r9
c000d0f0:	2f 83 00 00 	cmpwi   cr7,r3,0
c000d0f4:	4c 9e 00 20 	bnelr   cr7
c000d0f8:	88 64 00 08 	lbz     r3,8(r4)
c000d0fc:	89 25 00 08 	lbz     r9,8(r5)
c000d100:	7c 69 18 50 	subf    r3,r9,r3
c000d104:	4e 80 00 20 	blr

This shows that on PPC32 the 8-byte comparison is not optimal; I will
improve it.

We also see in tstcmp7() that GCC is a bit stupid: it should use r3 as
the result of the sub as it does in all the previous ones, then do bnelr
instead of bne+mr+blr.

Below are the results of the measurement redone today:

     Before After  Improvement
01  24621   5681  77%
02  24621   5681  77%
03  34091  13257  61%
04  28409   5681  80%
05  41667  13257  68%
06  41668  13257  68%
07  51138  22727  56%
08  39772  15151  62%
09  53031  28409  46%
10  53031  28409  46%
11  62501  35986  42%
12  51137  28410  44%
13  64395  35985  44%
14  68182  39774  42%
15  73865  43560  41%
16  62500  37879  39%

We also see here that 08 is not optimal: it should have given the same
results as 05 and 06. I will keep it as is for PPC64 but will rewrite it
as two 04 comparisons for PPC32.

Christophe



> 
> 
> Segher
> 


* Re: [PATCH v2 5/5] powerpc/lib: inline memcmp() for small constant sizes
  2018-05-18 10:35     ` Christophe Leroy
@ 2018-05-18 15:20       ` Segher Boessenkool
  0 siblings, 0 replies; 13+ messages in thread
From: Segher Boessenkool @ 2018-05-18 15:20 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

On Fri, May 18, 2018 at 12:35:48PM +0200, Christophe Leroy wrote:
> On 05/17/2018 03:55 PM, Segher Boessenkool wrote:
> >On Thu, May 17, 2018 at 12:49:58PM +0200, Christophe Leroy wrote:
> >>In my 8xx configuration, I get 208 calls to memcmp()
> >Could you show results with a more recent GCC?  What version was this?
> 
> It was with the latest GCC version I have available in my environment, 
> that is GCC 5.4. Is that too old ?

Since GCC 7 the compiler knows how to do this, for powerpc; in GCC 8
it has improved still.

> It seems that version inlines memcmp() when length is 1. All other 
> lengths call memcmp()

Yup.

> c000d018 <tstcmp4>:
> c000d018:	80 64 00 00 	lwz     r3,0(r4)
> c000d01c:	81 25 00 00 	lwz     r9,0(r5)
> c000d020:	7c 69 18 50 	subf    r3,r9,r3
> c000d024:	4e 80 00 20 	blr

This is incorrect, it does not get the sign of the result correct.
Say when comparing 0xff 0xff 0xff 0xff to 0 0 0 0.  This should return
positive, but it returns negative.
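
For instance, a minimal user-space illustration of the problem (assuming
32-bit int):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t x = 0xffffffffu;	/* bytes ff ff ff ff */
	uint32_t y = 0x00000000u;	/* bytes 00 00 00 00 */

	/* what the inline __memcmp4() computes: the 32-bit subtraction
	 * truncated to int */
	int r = (int)(x - y);

	printf("%d\n", r);	/* prints -1, but memcmp() must be positive here */
	return 0;
}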

For Power9 GCC does

        lwz 3,0(3)
        lwz 9,0(4)
        cmpld 7,3,9
        setb 3,7

and for Power7/Power8,

        lwz 9,0(3)
        lwz 3,0(4)
        subfc 3,3,9
        popcntd 3,3
        subfe 9,9,9
        or 3,3,9

(and it gives up for earlier CPUs, there is no nice simple code sequence
as far as we know.  Code size matters when generating inline code).

(Generating code for -m32 it is the same, just w instead of d in a few
places).

> c000d09c <tstcmp8>:
> c000d09c:	81 25 00 04 	lwz     r9,4(r5)
> c000d0a0:	80 64 00 04 	lwz     r3,4(r4)
> c000d0a4:	81 04 00 00 	lwz     r8,0(r4)
> c000d0a8:	81 45 00 00 	lwz     r10,0(r5)
> c000d0ac:	7c 69 18 10 	subfc   r3,r9,r3
> c000d0b0:	7d 2a 41 10 	subfe   r9,r10,r8
> c000d0b4:	7d 2a fe 70 	srawi   r10,r9,31
> c000d0b8:	7d 48 4b 79 	or.     r8,r10,r9
> c000d0bc:	4d a2 00 20 	bclr+   12,eq
> c000d0c0:	7d 23 4b 78 	mr      r3,r9
> c000d0c4:	4e 80 00 20 	blr

> This shows that on PPC32, the 8 bytes comparison is not optimal, I will 
> improve it.

It's not correct either (same problem as with length 4).


Segher


