* [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S
@ 2018-04-17 7:38 Christophe Leroy
2018-04-17 7:38 ` [PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification Christophe Leroy
` (5 more replies)
0 siblings, 6 replies; 7+ messages in thread
From: Christophe Leroy @ 2018-04-17 7:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linux-kernel, linuxppc-dev
In preparation of optimisation patches, move PPC32 specific
memcmp() and __clear_user() into string_32.S
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/Makefile | 5 +--
arch/powerpc/lib/string.S | 61 -------------------------------------
arch/powerpc/lib/string_32.S | 72 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 75 insertions(+), 63 deletions(-)
create mode 100644 arch/powerpc/lib/string_32.S
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..2c9b8c0adf22 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
memcpy_power7.o
obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
- string_64.o memcpy_64.o memcmp_64.o pmem.o
+ memcpy_64.o memcmp_64.o pmem.o
obj64-$(CONFIG_SMP) += locks.o
obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
-obj-y += checksum_$(BITS).o checksum_wrappers.o
+obj-y += checksum_$(BITS).o checksum_wrappers.o \
+ string_$(BITS).o
obj-y += sstep.o ldstfp.o quad.o
obj64-y += quad.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index a787776822d8..cbb90fdc672d 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -55,23 +55,6 @@ _GLOBAL(strncmp)
blr
EXPORT_SYMBOL(strncmp)
-#ifdef CONFIG_PPC32
-_GLOBAL(memcmp)
- PPC_LCMPI 0,r5,0
- beq- 2f
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
- blr
-2: li r3,0
- blr
-EXPORT_SYMBOL(memcmp)
-#endif
-
_GLOBAL(memchr)
PPC_LCMPI 0,r5,0
beq- 2f
@@ -85,47 +68,3 @@ _GLOBAL(memchr)
2: li r3,0
blr
EXPORT_SYMBOL(memchr)
-
-#ifdef CONFIG_PPC32
-_GLOBAL(__clear_user)
- addi r6,r3,-4
- li r3,0
- li r5,0
- cmplwi 0,r4,4
- blt 7f
- /* clear a single word */
-11: stwu r5,4(r6)
- beqlr
- /* clear word sized chunks */
- andi. r0,r6,3
- add r4,r0,r4
- subf r6,r0,r6
- srwi r0,r4,2
- andi. r4,r4,3
- mtctr r0
- bdz 7f
-1: stwu r5,4(r6)
- bdnz 1b
- /* clear byte sized chunks */
-7: cmpwi 0,r4,0
- beqlr
- mtctr r4
- addi r6,r6,3
-8: stbu r5,1(r6)
- bdnz 8b
- blr
-90: mr r3,r4
- blr
-91: mfctr r3
- slwi r3,r3,2
- add r3,r3,r4
- blr
-92: mfctr r3
- blr
-
- EX_TABLE(11b, 90b)
- EX_TABLE(1b, 91b)
- EX_TABLE(8b, 92b)
-
-EXPORT_SYMBOL(__clear_user)
-#endif
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
new file mode 100644
index 000000000000..2519f8bd09e3
--- /dev/null
+++ b/arch/powerpc/lib/string_32.S
@@ -0,0 +1,72 @@
+/*
+ * String handling functions for PowerPC32
+ *
+ * Copyright (C) 2018 CS Systemes d'Information
+ *
+ * Author: Christophe Leroy <christophe.leroy@c-s.fr>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+
+ .text
+
+_GLOBAL(memcmp)
+ PPC_LCMPI 0,r5,0
+ beq- 2f
+ mtctr r5
+ addi r6,r3,-1
+ addi r4,r4,-1
+1: lbzu r3,1(r6)
+ lbzu r0,1(r4)
+ subf. r3,r0,r3
+ bdnzt 2,1b
+ blr
+2: li r3,0
+ blr
+EXPORT_SYMBOL(memcmp)
+
+_GLOBAL(__clear_user)
+ addi r6,r3,-4
+ li r3,0
+ li r5,0
+ cmplwi 0,r4,4
+ blt 7f
+ /* clear a single word */
+11: stwu r5,4(r6)
+ beqlr
+ /* clear word sized chunks */
+ andi. r0,r6,3
+ add r4,r0,r4
+ subf r6,r0,r6
+ srwi r0,r4,2
+ andi. r4,r4,3
+ mtctr r0
+ bdz 7f
+1: stwu r5,4(r6)
+ bdnz 1b
+ /* clear byte sized chunks */
+7: cmpwi 0,r4,0
+ beqlr
+ mtctr r4
+ addi r6,r6,3
+8: stbu r5,1(r6)
+ bdnz 8b
+ blr
+90: mr r3,r4
+ blr
+91: mfctr r3
+ slwi r3,r3,2
+ add r3,r3,r4
+ blr
+92: mfctr r3
+ blr
+
+ EX_TABLE(11b, 90b)
+ EX_TABLE(1b, 91b)
+ EX_TABLE(8b, 92b)
+
+EXPORT_SYMBOL(__clear_user)
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification
2018-04-17 7:38 [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
@ 2018-04-17 7:38 ` Christophe Leroy
2018-04-17 7:38 ` [PATCH 3/7] powerpc/lib: optimise PPC32 memcmp Christophe Leroy
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Christophe Leroy @ 2018-04-17 7:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linux-kernel, linuxppc-dev
Many calls to memcmp() are done with constant size.
This patch gives GCC a chance to optimise out
the NUL size verification.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/include/asm/string.h | 10 ++++++++++
arch/powerpc/lib/memcmp_64.S | 4 ++++
arch/powerpc/lib/string_32.S | 4 ++++
3 files changed, 18 insertions(+)
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..cf6f495134c3 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -27,6 +27,16 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
extern void * memchr(const void *,int,__kernel_size_t);
extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
+#ifndef CONFIG_FORTIFY_SOURCE
+static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
+{
+ if (unlikely(!size))
+ return 0;
+ return memcmp(p, q, size);
+}
+#define memcmp __memcmp
+#endif
+
#ifdef CONFIG_PPC64
#define __HAVE_ARCH_MEMSET32
#define __HAVE_ARCH_MEMSET64
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..f6822fabf254 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -30,7 +30,9 @@
#endif
_GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
cmpdi cr1,r5,0
+#endif
/* Use the short loop if both strings are not 8B aligned */
or r6,r3,r4
@@ -39,7 +41,9 @@ _GLOBAL(memcmp)
/* Use the short loop if length is less than 32B */
cmpdi cr6,r5,31
+#ifdef CONFIG_FORTIFY_SOURCE
beq cr1,.Lzero
+#endif
bne .Lshort
bgt cr6,.Llong
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 2519f8bd09e3..94e9c9bc31c3 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -15,8 +15,10 @@
.text
_GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beq- 2f
+#endif
mtctr r5
addi r6,r3,-1
addi r4,r4,-1
@@ -25,8 +27,10 @@ _GLOBAL(memcmp)
subf. r3,r0,r3
bdnzt 2,1b
blr
+#ifdef CONFIG_FORTIFY_SOURCE
2: li r3,0
blr
+#endif
EXPORT_SYMBOL(memcmp)
_GLOBAL(__clear_user)
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 3/7] powerpc/lib: optimise PPC32 memcmp
2018-04-17 7:38 [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
2018-04-17 7:38 ` [PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification Christophe Leroy
@ 2018-04-17 7:38 ` Christophe Leroy
2018-04-17 7:38 ` [PATCH 4/7] powerpc/lib: inline memcmp() for small constant sizes Christophe Leroy
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Christophe Leroy @ 2018-04-17 7:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linux-kernel, linuxppc-dev
At the time being, memcmp() compares two chunks of memory
byte per byte.
This patch optimises the comparison by comparing word by word.
A small benchmark performed on an 8xx based on the comparison
of two chunks of 512 bytes performed 100000 times gives:
Before : 5852274 TB ticks
After: 1488638 TB ticks
This is almost 4 times faster
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/string_32.S | 42 +++++++++++++++++++++++++++++++++++-------
1 file changed, 35 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 94e9c9bc31c3..5b2a73fb07be 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -19,13 +19,41 @@ _GLOBAL(memcmp)
PPC_LCMPI 0,r5,0
beq- 2f
#endif
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
+ srawi. r7, r5, 2 /* Divide len by 4 */
+ mr r6, r3
+ beq- 3f
+ mtctr r7
+ li r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+ lwbrx r3, r6, r7
+ lwbrx r0, r4, r7
+#else
+ lwzx r3, r6, r7
+ lwzx r0, r4, r7
+#endif
+ addi r7, r7, 4
+ subf. r3, r0, r3
+ bdnzt eq, 1b
+ bnelr
+ andi. r5, r5, 3
+ beqlr
+3: cmplwi cr1, r5, 2
+ blt- cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+ lhbrx r3, r6, r7
+ lhbrx r0, r4, r7
+#else
+ lhzx r3, r6, r7
+ lhzx r0, r4, r7
+#endif
+ addi r7, r7, 2
+ subf. r3, r0, r3
+ beqlr cr1
+ bnelr
+4: lbzx r3, r6, r7
+ lbzx r0, r4, r7
+ subf. r3, r0, r3
blr
#ifdef CONFIG_FORTIFY_SOURCE
2: li r3,0
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 4/7] powerpc/lib: inline memcmp() for small constant sizes
2018-04-17 7:38 [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
2018-04-17 7:38 ` [PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification Christophe Leroy
2018-04-17 7:38 ` [PATCH 3/7] powerpc/lib: optimise PPC32 memcmp Christophe Leroy
@ 2018-04-17 7:38 ` Christophe Leroy
2018-04-17 7:38 ` [PATCH 5/7] powerpc/lib: optimise 32 bits __clear_user() Christophe Leroy
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Christophe Leroy @ 2018-04-17 7:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linux-kernel, linuxppc-dev
In my 8xx configuration, I get 208 calls to memcmp()
Within those 208 calls, about half of them have constant sizes,
46 have a size of 8, 17 have a size of 16, only a few have a
size over 16. Other fixed sizes are mostly 4, 6 and 10.
This patch inlines calls to memcmp() when size
is constant and lower than or equal to 16
In my 8xx configuration, this reduces the number of calls
to memcmp() from 208 to 123
The following table shows the number of TB timeticks to perform
a constant size memcmp() before and after the patch depending on
the size
Before After Improvement
01: 7577 5682 25%
02: 41668 5682 86%
03: 51137 13258 74%
04: 45455 5682 87%
05: 58713 13258 77%
06: 58712 13258 77%
07: 68183 20834 70%
08: 56819 15153 73%
09: 70077 28411 60%
10: 70077 28411 60%
11: 79546 35986 55%
12: 68182 28411 58%
13: 81440 35986 55%
14: 81440 39774 51%
15: 94697 43562 54%
16: 79546 37881 52%
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/include/asm/string.h | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index cf6f495134c3..196ac5d587fb 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -4,6 +4,8 @@
#ifdef __KERNEL__
+#include <linux/kernel.h>
+
#define __HAVE_ARCH_STRNCPY
#define __HAVE_ARCH_STRNCMP
#define __HAVE_ARCH_MEMSET
@@ -28,10 +30,45 @@ extern void * memchr(const void *,int,__kernel_size_t);
extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
#ifndef CONFIG_FORTIFY_SOURCE
+static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, int offset)
+{
+ int dif;
+
+ BUILD_BUG_ON(!size || size > 8);
+
+ p += offset, q += offset;
+ if (size == 1)
+ return *(u8*)p - *(u8*)q;
+ if (size == 2)
+ return be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q);
+ if (size == 3) {
+ dif = be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q);
+ if (dif)
+ return dif;
+ return *(u8*)(p + 2) - *(u8*)(q + 2);
+ }
+ if (size == 8) {
+ s64 tmp = be64_to_cpu(*(u64*)p) - be64_to_cpu(*(u64*)q);
+ return tmp >> 32 ? : (int)tmp;
+ }
+
+ dif = be32_to_cpu(*(u32*)p) - be32_to_cpu(*(u32*)q);
+ if (size == 4 || dif)
+ return dif;
+
+ return ___memcmp(p, q, size - 4, 4);
+}
+
static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
{
if (unlikely(!size))
return 0;
+ if (__builtin_constant_p(size) && size <= 16) {
+ int dif = ___memcmp(p, q, size < 8 ? size : 8, 0);
+ if (size <= 8 || dif)
+ return dif;
+ return ___memcmp(p, q, size - 8, 8);
+ }
return memcmp(p, q, size);
}
#define memcmp __memcmp
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 5/7] powerpc/lib: optimise 32 bits __clear_user()
2018-04-17 7:38 [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
` (2 preceding siblings ...)
2018-04-17 7:38 ` [PATCH 4/7] powerpc/lib: inline memcmp() for small constant sizes Christophe Leroy
@ 2018-04-17 7:38 ` Christophe Leroy
2018-04-17 7:38 ` [PATCH 6/7] powerpc/lib: inline more NUL size verifications Christophe Leroy
2018-04-17 7:38 ` [PATCH 7/7] powerpc/lib: Remove .balign inside string functions for PPC32 Christophe Leroy
5 siblings, 0 replies; 7+ messages in thread
From: Christophe Leroy @ 2018-04-17 7:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linux-kernel, linuxppc-dev
Rewrite clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.
This code is a copy/paste of memset(), with some modifications
in order to retrieve remaining number of bytes to be cleared,
as it needs to be returned in case of error.
On a MPC885, throughput is almost doubled:
Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s
After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s
On a MPC8321, throughput is multiplied by 2.12:
Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s
After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++-------------
1 file changed, 60 insertions(+), 25 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 5b2a73fb07be..31fc92b0aae6 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -11,6 +11,7 @@
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/cache.h>
.text
@@ -61,44 +62,78 @@ _GLOBAL(memcmp)
#endif
EXPORT_SYMBOL(memcmp)
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
_GLOBAL(__clear_user)
- addi r6,r3,-4
- li r3,0
- li r5,0
- cmplwi 0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero. This requires that the destination
+ * area is cacheable.
+ */
+ cmplwi cr0, r4, 4
+ mr r10, r3
+ li r3, 0
blt 7f
- /* clear a single word */
-11: stwu r5,4(r6)
+
+11: stw r3, 0(r10)
beqlr
- /* clear word sized chunks */
- andi. r0,r6,3
- add r4,r0,r4
- subf r6,r0,r6
- srwi r0,r4,2
- andi. r4,r4,3
+ andi. r0, r10, 3
+ add r11, r0, r4
+ subf r6, r0, r10
+
+ clrlwi r7, r6, 32 - LG_CACHELINE_BYTES
+ add r8, r7, r11
+ srwi r9, r8, LG_CACHELINE_BYTES
+ addic. r9, r9, -1 /* total number of complete cachelines */
+ ble 2f
+ xori r0, r7, CACHELINE_MASK & ~3
+ srwi. r0, r0, 2
+ beq 3f
+ mtctr r0
+4: stwu r3, 4(r6)
+ bdnz 4b
+3: mtctr r9
+ li r7, 4
+10: dcbz r7, r6
+ addi r6, r6, CACHELINE_BYTES
+ bdnz 10b
+ clrlwi r11, r8, 32 - LG_CACHELINE_BYTES
+ addi r11, r11, 4
+
+2: srwi r0 ,r11 ,2
mtctr r0
- bdz 7f
-1: stwu r5,4(r6)
+ bdz 6f
+1: stwu r3, 4(r6)
bdnz 1b
- /* clear byte sized chunks */
-7: cmpwi 0,r4,0
+6: andi. r11, r11, 3
beqlr
- mtctr r4
- addi r6,r6,3
-8: stbu r5,1(r6)
+ mtctr r11
+ addi r6, r6, 3
+8: stbu r3, 1(r6)
bdnz 8b
blr
-90: mr r3,r4
+
+7: cmpwi cr0, r4, 0
+ beqlr
+ mtctr r4
+ addi r6, r10, -1
+9: stbu r3, 1(r6)
+ bdnz 9b
blr
-91: mfctr r3
- slwi r3,r3,2
- add r3,r3,r4
+
+90: mr r3, r4
blr
-92: mfctr r3
+91: add r3, r10, r4
+ subf r3, r6, r3
blr
EX_TABLE(11b, 90b)
+ EX_TABLE(4b, 91b)
+ EX_TABLE(10b, 91b)
EX_TABLE(1b, 91b)
- EX_TABLE(8b, 92b)
+ EX_TABLE(8b, 91b)
+ EX_TABLE(9b, 91b)
EXPORT_SYMBOL(__clear_user)
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 6/7] powerpc/lib: inline more NUL size verifications
2018-04-17 7:38 [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
` (3 preceding siblings ...)
2018-04-17 7:38 ` [PATCH 5/7] powerpc/lib: optimise 32 bits __clear_user() Christophe Leroy
@ 2018-04-17 7:38 ` Christophe Leroy
2018-04-17 7:38 ` [PATCH 7/7] powerpc/lib: Remove .balign inside string functions for PPC32 Christophe Leroy
5 siblings, 0 replies; 7+ messages in thread
From: Christophe Leroy @ 2018-04-17 7:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linux-kernel, linuxppc-dev
strncmp(), strncpy(), memchr() are often called with constant
size.
This patch gives GCC a chance to optimise out the NUL size verification.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/include/asm/string.h | 24 ++++++++++++++++++++++++
arch/powerpc/lib/string.S | 8 ++++++++
2 files changed, 32 insertions(+)
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 196ac5d587fb..1465d5629ef2 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -30,6 +30,22 @@ extern void * memchr(const void *,int,__kernel_size_t);
extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
#ifndef CONFIG_FORTIFY_SOURCE
+static inline char *__strncpy(char *p, const char *q, __kernel_size_t size)
+{
+ if (unlikely(!size))
+ return p;
+ return strncpy(p, q, size);
+}
+#define strncpy __strncpy
+
+static inline int __strncmp(const char *p, const char *q, __kernel_size_t size)
+{
+ if (unlikely(!size))
+ return 0;
+ return strncmp(p, q, size);
+}
+#define strncmp __strncmp
+
static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, int offset)
{
int dif;
@@ -72,6 +88,14 @@ static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
return memcmp(p, q, size);
}
#define memcmp __memcmp
+
+static inline void *__memchr(const void *p, int c, __kernel_size_t size)
+{
+ if (unlikely(!size))
+ return NULL;
+ return memchr(p, c, size);
+}
+#define memchr __memchr
#endif
#ifdef CONFIG_PPC64
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index cbb90fdc672d..89af53b08b4a 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -18,8 +18,10 @@
/* This clears out any unused part of the destination buffer,
just as the libc version does. -- paulus */
_GLOBAL(strncpy)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beqlr
+#endif
mtctr r5
addi r6,r3,-1
addi r4,r4,-1
@@ -38,8 +40,10 @@ _GLOBAL(strncpy)
EXPORT_SYMBOL(strncpy)
_GLOBAL(strncmp)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beq- 2f
+#endif
mtctr r5
addi r5,r3,-1
addi r4,r4,-1
@@ -51,13 +55,17 @@ _GLOBAL(strncmp)
beqlr 1
bdnzt eq,1b
blr
+#ifdef CONFIG_FORTIFY_SOURCE
2: li r3,0
blr
+#endif
EXPORT_SYMBOL(strncmp)
_GLOBAL(memchr)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beq- 2f
+#endif
mtctr r5
addi r3,r3,-1
.balign 16
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 7/7] powerpc/lib: Remove .balign inside string functions for PPC32
2018-04-17 7:38 [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
` (4 preceding siblings ...)
2018-04-17 7:38 ` [PATCH 6/7] powerpc/lib: inline more NUL size verifications Christophe Leroy
@ 2018-04-17 7:38 ` Christophe Leroy
5 siblings, 0 replies; 7+ messages in thread
From: Christophe Leroy @ 2018-04-17 7:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linux-kernel, linuxppc-dev
commit 87a156fb18fe1 ("Align hot loops of some string functions")
degraded the performance of string functions by adding useless
nops.
A simple benchmark on an 8xx calling 100000x a memchr() that
matches the first byte runs in 41668 TB ticks before this patch
and in 35986 TB ticks after this patch. So this gives an
improvement of approx 10%
Another benchmark doing the same with a memchr() matching the 128th
byte runs in 1011365 TB ticks before this patch and 1005682 TB ticks
after this patch, so regardless on the number of loops, removing
those useless nops improves the test by 5683 TB ticks.
Fixes: 87a156fb18fe1 ("Align hot loops of some string functions")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/string.S | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 89af53b08b4a..9e96f1c102c6 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -25,7 +25,9 @@ _GLOBAL(strncpy)
mtctr r5
addi r6,r3,-1
addi r4,r4,-1
+#ifdef CONFIG_PPC64
.balign 16
+#endif
1: lbzu r0,1(r4)
cmpwi 0,r0,0
stbu r0,1(r6)
@@ -47,7 +49,9 @@ _GLOBAL(strncmp)
mtctr r5
addi r5,r3,-1
addi r4,r4,-1
+#ifdef CONFIG_PPC64
.balign 16
+#endif
1: lbzu r3,1(r5)
cmpwi 1,r3,0
lbzu r0,1(r4)
@@ -68,7 +72,9 @@ _GLOBAL(memchr)
#endif
mtctr r5
addi r3,r3,-1
+#ifdef CONFIG_PPC64
.balign 16
+#endif
1: lbzu r0,1(r3)
cmpw 0,r0,r4
bdnzf 2,1b
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
end of thread, other threads:[~2018-04-17 7:38 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-17 7:38 [PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S Christophe Leroy
2018-04-17 7:38 ` [PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification Christophe Leroy
2018-04-17 7:38 ` [PATCH 3/7] powerpc/lib: optimise PPC32 memcmp Christophe Leroy
2018-04-17 7:38 ` [PATCH 4/7] powerpc/lib: inline memcmp() for small constant sizes Christophe Leroy
2018-04-17 7:38 ` [PATCH 5/7] powerpc/lib: optimise 32 bits __clear_user() Christophe Leroy
2018-04-17 7:38 ` [PATCH 6/7] powerpc/lib: inline more NUL size verifications Christophe Leroy
2018-04-17 7:38 ` [PATCH 7/7] powerpc/lib: Remove .balign inside string functions for PPC32 Christophe Leroy
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).