* [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S
From: Christophe Leroy @ 2018-04-17  7:38 UTC
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

In preparation for optimisation patches, move the PPC32-specific
memcmp() and __clear_user() into string_32.S

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/Makefile    |  5 +--
 arch/powerpc/lib/string.S    | 61 -------------------------------------
 arch/powerpc/lib/string_32.S | 72 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 63 deletions(-)
 create mode 100644 arch/powerpc/lib/string_32.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..2c9b8c0adf22 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
 			       memcpy_power7.o
 
 obj64-y	+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-	   string_64.o memcpy_64.o memcmp_64.o pmem.o
+	   memcpy_64.o memcmp_64.o pmem.o
 
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
 
-obj-y			+= checksum_$(BITS).o checksum_wrappers.o
+obj-y			+= checksum_$(BITS).o checksum_wrappers.o \
+			   string_$(BITS).o
 
 obj-y			+= sstep.o ldstfp.o quad.o
 obj64-y			+= quad.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index a787776822d8..cbb90fdc672d 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -55,23 +55,6 @@ _GLOBAL(strncmp)
 	blr
 EXPORT_SYMBOL(strncmp)
 
-#ifdef CONFIG_PPC32
-_GLOBAL(memcmp)
-	PPC_LCMPI 0,r5,0
-	beq-	2f
-	mtctr	r5
-	addi	r6,r3,-1
-	addi	r4,r4,-1
-1:	lbzu	r3,1(r6)
-	lbzu	r0,1(r4)
-	subf.	r3,r0,r3
-	bdnzt	2,1b
-	blr
-2:	li	r3,0
-	blr
-EXPORT_SYMBOL(memcmp)
-#endif
-
 _GLOBAL(memchr)
 	PPC_LCMPI 0,r5,0
 	beq-	2f
@@ -85,47 +68,3 @@ _GLOBAL(memchr)
 2:	li	r3,0
 	blr
 EXPORT_SYMBOL(memchr)
-
-#ifdef CONFIG_PPC32
-_GLOBAL(__clear_user)
-	addi	r6,r3,-4
-	li	r3,0
-	li	r5,0
-	cmplwi	0,r4,4
-	blt	7f
-	/* clear a single word */
-11:	stwu	r5,4(r6)
-	beqlr
-	/* clear word sized chunks */
-	andi.	r0,r6,3
-	add	r4,r0,r4
-	subf	r6,r0,r6
-	srwi	r0,r4,2
-	andi.	r4,r4,3
-	mtctr	r0
-	bdz	7f
-1:	stwu	r5,4(r6)
-	bdnz	1b
-	/* clear byte sized chunks */
-7:	cmpwi	0,r4,0
-	beqlr
-	mtctr	r4
-	addi	r6,r6,3
-8:	stbu	r5,1(r6)
-	bdnz	8b
-	blr
-90:	mr	r3,r4
-	blr
-91:	mfctr	r3
-	slwi	r3,r3,2
-	add	r3,r3,r4
-	blr
-92:	mfctr	r3
-	blr
-
-	EX_TABLE(11b, 90b)
-	EX_TABLE(1b, 91b)
-	EX_TABLE(8b, 92b)
-
-EXPORT_SYMBOL(__clear_user)
-#endif
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
new file mode 100644
index 000000000000..2519f8bd09e3
--- /dev/null
+++ b/arch/powerpc/lib/string_32.S
@@ -0,0 +1,72 @@
+/*
+ * String handling functions for PowerPC32
+ *
+ * Copyright (C) 2018 CS Systemes d'Information
+ *
+ * Author: Christophe Leroy <christophe.leroy@c-s.fr>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+
+	.text
+
+_GLOBAL(memcmp)
+	PPC_LCMPI 0,r5,0
+	beq-	2f
+	mtctr	r5
+	addi	r6,r3,-1
+	addi	r4,r4,-1
+1:	lbzu	r3,1(r6)
+	lbzu	r0,1(r4)
+	subf.	r3,r0,r3
+	bdnzt	2,1b
+	blr
+2:	li	r3,0
+	blr
+EXPORT_SYMBOL(memcmp)
+
+_GLOBAL(__clear_user)
+	addi	r6,r3,-4
+	li	r3,0
+	li	r5,0
+	cmplwi	0,r4,4
+	blt	7f
+	/* clear a single word */
+11:	stwu	r5,4(r6)
+	beqlr
+	/* clear word sized chunks */
+	andi.	r0,r6,3
+	add	r4,r0,r4
+	subf	r6,r0,r6
+	srwi	r0,r4,2
+	andi.	r4,r4,3
+	mtctr	r0
+	bdz	7f
+1:	stwu	r5,4(r6)
+	bdnz	1b
+	/* clear byte sized chunks */
+7:	cmpwi	0,r4,0
+	beqlr
+	mtctr	r4
+	addi	r6,r6,3
+8:	stbu	r5,1(r6)
+	bdnz	8b
+	blr
+90:	mr	r3,r4
+	blr
+91:	mfctr	r3
+	slwi	r3,r3,2
+	add	r3,r3,r4
+	blr
+92:	mfctr	r3
+	blr
+
+	EX_TABLE(11b, 90b)
+	EX_TABLE(1b, 91b)
+	EX_TABLE(8b, 92b)
+
+EXPORT_SYMBOL(__clear_user)
-- 
2.13.3


* [PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification
From: Christophe Leroy @ 2018-04-17  7:38 UTC
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

Many calls to memcmp() are made with a constant size.
This patch gives GCC a chance to optimise out
the NUL (zero) size verification.
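
With a constant, non-zero size the check folds away entirely; a
minimal sketch of a hypothetical caller:

	#include <linux/string.h>
	#include <linux/types.h>

	/* Hypothetical caller: the size is the compile-time constant 8,
	 * so GCC eliminates the unlikely(!size) branch in the __memcmp()
	 * wrapper below and calls the assembly memcmp() directly, with
	 * no runtime zero-size test.
	 */
	static bool tag_matches(const void *buf, const void *tag)
	{
		return memcmp(buf, tag, 8) == 0;
	}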

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/string.h | 10 ++++++++++
 arch/powerpc/lib/memcmp_64.S      |  4 ++++
 arch/powerpc/lib/string_32.S      |  4 ++++
 3 files changed, 18 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..cf6f495134c3 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -27,6 +27,16 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
+#ifndef CONFIG_FORTIFY_SOURCE
+static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
+{
+	if (unlikely(!size))
+		return 0;
+	return memcmp(p, q, size);
+}
+#define memcmp __memcmp
+#endif
+
 #ifdef CONFIG_PPC64
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..f6822fabf254 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -30,7 +30,9 @@
 #endif
 
 _GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
 	cmpdi	cr1,r5,0
+#endif
 
 	/* Use the short loop if both strings are not 8B aligned */
 	or	r6,r3,r4
@@ -39,7 +41,9 @@ _GLOBAL(memcmp)
 	/* Use the short loop if length is less than 32B */
 	cmpdi	cr6,r5,31
 
+#ifdef CONFIG_FORTIFY_SOURCE
 	beq	cr1,.Lzero
+#endif
 	bne	.Lshort
 	bgt	cr6,.Llong
 
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 2519f8bd09e3..94e9c9bc31c3 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -15,8 +15,10 @@
 	.text
 
 _GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beq-	2f
+#endif
 	mtctr	r5
 	addi	r6,r3,-1
 	addi	r4,r4,-1
@@ -25,8 +27,10 @@ _GLOBAL(memcmp)
 	subf.	r3,r0,r3
 	bdnzt	2,1b
 	blr
+#ifdef CONFIG_FORTIFY_SOURCE
 2:	li	r3,0
 	blr
+#endif
 EXPORT_SYMBOL(memcmp)
 
 _GLOBAL(__clear_user)
-- 
2.13.3


* [PATCH 3/7] powerpc/lib: optimise PPC32 memcmp
From: Christophe Leroy @ 2018-04-17  7:38 UTC
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

Currently, memcmp() compares two chunks of memory
byte by byte.

This patch optimises the comparison by comparing word by word.

A small benchmark performed on an 8xx, comparing two chunks
of 512 bytes 100000 times, gives:

Before: 5852274 TB ticks
After:  1488638 TB ticks

This is almost 4 times faster.
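
Expressed as a rough C sketch (illustrative only, the real
implementation below is hand-written assembly), the new algorithm
is:

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/* Rough C equivalent of the word-by-word compare implemented in
	 * assembly below.  Words are loaded in big-endian byte order
	 * (lwbrx on little-endian, lwzx on big-endian) so that a word
	 * compare gives the same ordering as a byte-wise memcmp().
	 */
	static int memcmp_sketch(const void *p, const void *q, size_t n)
	{
		const u8 *a = p, *b = q;
		size_t i = 0;

		for (; i + 4 <= n; i += 4) {		/* 4-byte chunks */
			u32 x = get_unaligned_be32(a + i);
			u32 y = get_unaligned_be32(b + i);

			if (x != y)
				return x < y ? -1 : 1;
		}
		if (i + 2 <= n) {			/* 2-byte tail */
			u16 x = get_unaligned_be16(a + i);
			u16 y = get_unaligned_be16(b + i);

			if (x != y)
				return (int)x - (int)y;
			i += 2;
		}
		if (i < n)				/* final byte */
			return a[i] - b[i];
		return 0;
	}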

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/string_32.S | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 94e9c9bc31c3..5b2a73fb07be 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -19,13 +19,41 @@ _GLOBAL(memcmp)
 	PPC_LCMPI 0,r5,0
 	beq-	2f
 #endif
-	mtctr	r5
-	addi	r6,r3,-1
-	addi	r4,r4,-1
-1:	lbzu	r3,1(r6)
-	lbzu	r0,1(r4)
-	subf.	r3,r0,r3
-	bdnzt	2,1b
+	srawi.	r7, r5, 2		/* Divide len by 4 */
+	mr	r6, r3
+	beq-	3f
+	mtctr	r7
+	li	r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	r3, r6, r7
+	lwbrx	r0, r4, r7
+#else
+	lwzx	r3, r6, r7
+	lwzx	r0, r4, r7
+#endif
+	addi	r7, r7, 4
+	subf.	r3, r0, r3
+	bdnzt	eq, 1b
+	bnelr
+	andi.	r5, r5, 3
+	beqlr
+3:	cmplwi	cr1, r5, 2
+	blt-	cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+	lhbrx	r3, r6, r7
+	lhbrx	r0, r4, r7
+#else
+	lhzx	r3, r6, r7
+	lhzx	r0, r4, r7
+#endif
+	addi	r7, r7, 2
+	subf.	r3, r0, r3
+	beqlr	cr1
+	bnelr
+4:	lbzx	r3, r6, r7
+	lbzx	r0, r4, r7
+	subf.	r3, r0, r3
 	blr
 #ifdef CONFIG_FORTIFY_SOURCE
 2:	li	r3,0
-- 
2.13.3


* [PATCH 4/7] powerpc/lib: inline memcmp() for small constant sizes
From: Christophe Leroy @ 2018-04-17  7:38 UTC
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

In my 8xx configuration, I get 208 calls to memcmp().
Of those 208 calls, about half have constant sizes:
46 have a size of 8, 17 have a size of 16, and only a few have
a size over 16. The other fixed sizes are mostly 4, 6 and 10.

This patch inlines calls to memcmp() when the size
is constant and less than or equal to 16.

In my 8xx configuration, this reduces the number of calls
to memcmp() from 208 to 123.

The following table shows the number of TB ticks needed to
perform a constant-size memcmp() before and after the patch,
depending on the size:

	Before	After	Improvement
01:	 7577	 5682	25%
02:	41668	 5682	86%
03:	51137	13258	74%
04:	45455	 5682	87%
05:	58713	13258	77%
06:	58712	13258	77%
07:	68183	20834	70%
08:	56819	15153	73%
09:	70077	28411	60%
10:	70077	28411	60%
11:	79546	35986	55%
12:	68182	28411	58%
13:	81440	35986	55%
14:	81440	39774	51%
15:	94697	43562	54%
16:	79546	37881	52%
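
For instance, a comparison with a constant size of 8, as in the
hypothetical example below, now expands inline:

	#include <linux/string.h>
	#include <linux/types.h>

	struct hdr {
		u8 magic[8];	/* hypothetical structure */
	};

	/* With this patch the constant-size memcmp() expands inline to
	 * a single 64-bit big-endian compare through ___memcmp(),
	 * instead of a call to the assembly memcmp().
	 */
	static bool hdr_valid(const struct hdr *h, const struct hdr *ref)
	{
		return memcmp(h->magic, ref->magic, 8) == 0;
	}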

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/string.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index cf6f495134c3..196ac5d587fb 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -4,6 +4,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/kernel.h>
+
 #define __HAVE_ARCH_STRNCPY
 #define __HAVE_ARCH_STRNCMP
 #define __HAVE_ARCH_MEMSET
@@ -28,10 +30,45 @@ extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
 #ifndef CONFIG_FORTIFY_SOURCE
+static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, int offset)
+{
+	int dif;
+
+	BUILD_BUG_ON(!size || size > 8);
+
+	p += offset, q += offset;
+	if (size == 1)
+		return *(u8*)p - *(u8*)q;
+	if (size == 2)
+		return be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q);
+	if (size == 3) {
+		dif = be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q);
+		if (dif)
+			return dif;
+		return *(u8*)(p + 2) - *(u8*)(q + 2);
+	}
+	if (size == 8) {
+		s64 tmp = be64_to_cpu(*(u64*)p) - be64_to_cpu(*(u64*)q);
+		return tmp >> 32 ? : (int)tmp;
+	}
+
+	dif = be32_to_cpu(*(u32*)p) - be32_to_cpu(*(u32*)q);
+	if (size == 4 || dif)
+		return dif;
+
+	return ___memcmp(p, q, size - 4, 4);
+}
+
 static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
 {
 	if (unlikely(!size))
 		return 0;
+	if (__builtin_constant_p(size) && size <= 16) {
+		int dif = ___memcmp(p, q, size < 8 ? size : 8, 0);
+		if (size <= 8 || dif)
+			return dif;
+		return ___memcmp(p, q, size - 8, 8);
+	}
 	return memcmp(p, q, size);
 }
 #define memcmp __memcmp
-- 
2.13.3


* [PATCH 5/7] powerpc/lib: optimise 32 bits __clear_user()
From: Christophe Leroy @ 2018-04-17  7:38 UTC
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

Rewrite __clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.

This code is a copy/paste of memset(), with some modifications in
order to compute the remaining number of bytes to be cleared, as
that count must be returned in case of error.
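
The strategy is roughly the following, sketched in C for clarity
(illustrative only; dcbz_line() is a hypothetical wrapper around the
dcbz instruction, and word stores are assumed to be naturally
aligned):

	#include <linux/cache.h>
	#include <linux/types.h>

	/* Hypothetical wrapper: zero one full cache line at addr */
	static inline void dcbz_line(void *addr)
	{
		__asm__ __volatile__("dcbz 0, %0" : : "r" (addr) : "memory");
	}

	static void clear_sketch(void *dst, size_t n)
	{
		u8 *p = dst;

		/* store zero words until the first cache line boundary */
		while (n >= 4 && ((unsigned long)p & (L1_CACHE_BYTES - 1))) {
			*(u32 *)p = 0;
			p += 4;
			n -= 4;
		}
		/* zero whole cache lines with dcbz */
		while (n >= L1_CACHE_BYTES) {
			dcbz_line(p);
			p += L1_CACHE_BYTES;
			n -= L1_CACHE_BYTES;
		}
		/* word-sized then byte-sized tail */
		while (n >= 4) {
			*(u32 *)p = 0;
			p += 4;
			n -= 4;
		}
		while (n--)
			*p++ = 0;
	}

The real assembly below additionally uses exception table entries so
that, when a store faults, the number of bytes left to clear can be
computed and returned.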

On an MPC885, throughput is almost doubled:

Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s

After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s

On an MPC8321, throughput is multiplied by 2.12:

Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s

After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 5b2a73fb07be..31fc92b0aae6 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -11,6 +11,7 @@
 #include <asm/errno.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/cache.h>
 
 	.text
 
@@ -61,44 +62,78 @@ _GLOBAL(memcmp)
 #endif
 EXPORT_SYMBOL(memcmp)
 
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
 _GLOBAL(__clear_user)
-	addi	r6,r3,-4
-	li	r3,0
-	li	r5,0
-	cmplwi	0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero.  This requires that the destination
+ * area is cacheable.
+ */
+	cmplwi	cr0, r4, 4
+	mr	r10, r3
+	li	r3, 0
 	blt	7f
-	/* clear a single word */
-11:	stwu	r5,4(r6)
+
+11:	stw	r3, 0(r10)
 	beqlr
-	/* clear word sized chunks */
-	andi.	r0,r6,3
-	add	r4,r0,r4
-	subf	r6,r0,r6
-	srwi	r0,r4,2
-	andi.	r4,r4,3
+	andi.	r0, r10, 3
+	add	r11, r0, r4
+	subf	r6, r0, r10
+
+	clrlwi	r7, r6, 32 - LG_CACHELINE_BYTES
+	add	r8, r7, r11
+	srwi	r9, r8, LG_CACHELINE_BYTES
+	addic.	r9, r9, -1	/* total number of complete cachelines */
+	ble	2f
+	xori	r0, r7, CACHELINE_MASK & ~3
+	srwi.	r0, r0, 2
+	beq	3f
+	mtctr	r0
+4:	stwu	r3, 4(r6)
+	bdnz	4b
+3:	mtctr	r9
+	li	r7, 4
+10:	dcbz	r7, r6
+	addi	r6, r6, CACHELINE_BYTES
+	bdnz	10b
+	clrlwi	r11, r8, 32 - LG_CACHELINE_BYTES
+	addi	r11, r11, 4
+
+2:	srwi	r0, r11, 2
 	mtctr	r0
-	bdz	7f
-1:	stwu	r5,4(r6)
+	bdz	6f
+1:	stwu	r3, 4(r6)
 	bdnz	1b
-	/* clear byte sized chunks */
-7:	cmpwi	0,r4,0
+6:	andi.	r11, r11, 3
 	beqlr
-	mtctr	r4
-	addi	r6,r6,3
-8:	stbu	r5,1(r6)
+	mtctr	r11
+	addi	r6, r6, 3
+8:	stbu	r3, 1(r6)
 	bdnz	8b
 	blr
-90:	mr	r3,r4
+
+7:	cmpwi	cr0, r4, 0
+	beqlr
+	mtctr	r4
+	addi	r6, r10, -1
+9:	stbu	r3, 1(r6)
+	bdnz	9b
 	blr
-91:	mfctr	r3
-	slwi	r3,r3,2
-	add	r3,r3,r4
+
+90:	mr	r3, r4
 	blr
-92:	mfctr	r3
+91:	add	r3, r10, r4
+	subf	r3, r6, r3
 	blr
 
 	EX_TABLE(11b, 90b)
+	EX_TABLE(4b, 91b)
+	EX_TABLE(10b, 91b)
 	EX_TABLE(1b, 91b)
-	EX_TABLE(8b, 92b)
+	EX_TABLE(8b, 91b)
+	EX_TABLE(9b, 91b)
 
 EXPORT_SYMBOL(__clear_user)
-- 
2.13.3


* [PATCH 6/7] powerpc/lib: inline more NUL size verifications
From: Christophe Leroy @ 2018-04-17  7:38 UTC
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

strncmp(), strncpy() and memchr() are often called with a constant
size.

This patch gives GCC a chance to optimise the NUL (zero) size
verification out.
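
As already done for memcmp(), a constant size lets the check fold
away at compile time; a minimal sketch of a hypothetical caller:

	#include <linux/string.h>
	#include <linux/types.h>

	/* Hypothetical helper: the size is the compile-time constant 5,
	 * so GCC removes the unlikely(!size) test in the __strncmp()
	 * wrapper below and calls the assembly strncmp() directly.
	 */
	static bool opt_is_debug(const char *opt)
	{
		return !strncmp(opt, "debug", 5);
	}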

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/string.h | 24 ++++++++++++++++++++++++
 arch/powerpc/lib/string.S         |  8 ++++++++
 2 files changed, 32 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 196ac5d587fb..1465d5629ef2 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -30,6 +30,22 @@ extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
 #ifndef CONFIG_FORTIFY_SOURCE
+static inline char *__strncpy(char *p, const char *q, __kernel_size_t size)
+{
+	if (unlikely(!size))
+		return p;
+	return strncpy(p, q, size);
+}
+#define strncpy __strncpy
+
+static inline int __strncmp(const char *p, const char *q, __kernel_size_t size)
+{
+	if (unlikely(!size))
+		return 0;
+	return strncmp(p, q, size);
+}
+#define strncmp __strncmp
+
 static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, int offset)
 {
 	int dif;
@@ -72,6 +88,14 @@ static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
 	return memcmp(p, q, size);
 }
 #define memcmp __memcmp
+
+static inline void *__memchr(const void *p, int c, __kernel_size_t size)
+{
+	if (unlikely(!size))
+		return NULL;
+	return memchr(p, c, size);
+}
+#define memchr __memchr
 #endif
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index cbb90fdc672d..89af53b08b4a 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -18,8 +18,10 @@
 /* This clears out any unused part of the destination buffer,
    just as the libc version does.  -- paulus */
 _GLOBAL(strncpy)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beqlr
+#endif
 	mtctr	r5
 	addi	r6,r3,-1
 	addi	r4,r4,-1
@@ -38,8 +40,10 @@ _GLOBAL(strncpy)
 EXPORT_SYMBOL(strncpy)
 
 _GLOBAL(strncmp)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beq-	2f
+#endif
 	mtctr	r5
 	addi	r5,r3,-1
 	addi	r4,r4,-1
@@ -51,13 +55,17 @@ _GLOBAL(strncmp)
 	beqlr	1
 	bdnzt	eq,1b
 	blr
+#ifdef CONFIG_FORTIFY_SOURCE
 2:	li	r3,0
 	blr
+#endif
 EXPORT_SYMBOL(strncmp)
 
 _GLOBAL(memchr)
+#ifdef CONFIG_FORTIFY_SOURCE
 	PPC_LCMPI 0,r5,0
 	beq-	2f
+#endif
 	mtctr	r5
 	addi	r3,r3,-1
 	.balign 16
-- 
2.13.3


* [PATCH 7/7] powerpc/lib: Remove .balign inside string functions for PPC32
From: Christophe Leroy @ 2018-04-17  7:38 UTC
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

commit 87a156fb18fe1 ("Align hot loops of some string functions")
degraded the performance of string functions on PPC32 by adding
useless nops.

A simple benchmark on an 8xx calling 100000x a memchr() that
matches the first byte runs in 41668 TB ticks before this patch
and in 35986 TB ticks after this patch, an improvement of
approximately 10%.

Another benchmark doing the same with a memchr() matching the 128th
byte runs in 1011365 TB ticks before this patch and 1005682 TB ticks
after this patch; so regardless of the number of iterations, removing
those useless nops saves 5683 TB ticks on this test.

Fixes: 87a156fb18fe1 ("Align hot loops of some string functions")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/string.S | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 89af53b08b4a..9e96f1c102c6 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -25,7 +25,9 @@ _GLOBAL(strncpy)
 	mtctr	r5
 	addi	r6,r3,-1
 	addi	r4,r4,-1
+#ifdef CONFIG_PPC64
 	.balign 16
+#endif
 1:	lbzu	r0,1(r4)
 	cmpwi	0,r0,0
 	stbu	r0,1(r6)
@@ -47,7 +49,9 @@ _GLOBAL(strncmp)
 	mtctr	r5
 	addi	r5,r3,-1
 	addi	r4,r4,-1
+#ifdef CONFIG_PPC64
 	.balign 16
+#endif
 1:	lbzu	r3,1(r5)
 	cmpwi	1,r3,0
 	lbzu	r0,1(r4)
@@ -68,7 +72,9 @@ _GLOBAL(memchr)
 #endif
 	mtctr	r5
 	addi	r3,r3,-1
+#ifdef CONFIG_PPC64
 	.balign 16
+#endif
 1:	lbzu	r0,1(r3)
 	cmpw	0,r0,r4
 	bdnzf	2,1b
-- 
2.13.3
