linux-arch.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] riscv: use the generic string routines
@ 2021-07-19 11:43 Matteo Croce
  2021-08-03 16:54 ` Matteo Croce
  0 siblings, 1 reply; 10+ messages in thread
From: Matteo Croce @ 2021-07-19 11:43 UTC (permalink / raw)
  To: linux-riscv
  Cc: linux-kernel, linux-arch, Paul Walmsley, Palmer Dabbelt,
	Albert Ou, Atish Patra, Emil Renner Berthing, Akira Tsukamoto,
	Drew Fustini, Bin Meng, David Laight, Guo Ren, Christoph Hellwig

From: Matteo Croce <mcroce@microsoft.com>

Use the generic routines which handle alignment properly.

This is the performance measured on a BeagleV machine for a
32 MB buffer:

memcpy:
original aligned:	 75 Mb/s
original unaligned:	 75 Mb/s
new aligned:		114 Mb/s
new unaligned:		107 Mb/s

memset:
original aligned:	140 Mb/s
original unaligned:	140 Mb/s
new aligned:		241 Mb/s
new unaligned:		241 Mb/s

TCP throughput with iperf3 gives a similar improvement as well.

This is the binary size increase according to bloat-o-meter:

add/remove: 0/0 grow/shrink: 4/2 up/down: 432/-36 (396)
Function                                     old     new   delta
memcpy                                        36     324    +288
memset                                        32     148    +116
strlcpy                                      116     132     +16
strscpy_pad                                   84      96     +12
strlcat                                      176     164     -12
memmove                                       76      52     -24
Total: Before=1225371, After=1225767, chg +0.03%

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
---
 arch/riscv/include/asm/Kbuild   |   1 +
 arch/riscv/include/asm/string.h |  32 ---------
 arch/riscv/kernel/Makefile      |   1 -
 arch/riscv/kernel/riscv_ksyms.c |  17 -----
 arch/riscv/lib/Makefile         |   3 -
 arch/riscv/lib/memcpy.S         | 108 ------------------------------
 arch/riscv/lib/memmove.S        |  64 ------------------
 arch/riscv/lib/memset.S         | 113 --------------------------------
 8 files changed, 1 insertion(+), 338 deletions(-)
 delete mode 100644 arch/riscv/include/asm/string.h
 delete mode 100644 arch/riscv/kernel/riscv_ksyms.c
 delete mode 100644 arch/riscv/lib/memcpy.S
 delete mode 100644 arch/riscv/lib/memmove.S
 delete mode 100644 arch/riscv/lib/memset.S

diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 445ccc97305a..6d699af41320 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -3,5 +3,6 @@ generic-y += early_ioremap.h
 generic-y += extable.h
 generic-y += flat.h
 generic-y += kvm_para.h
+generic-y += string.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
deleted file mode 100644
index 909049366555..000000000000
--- a/arch/riscv/include/asm/string.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-#ifndef _ASM_RISCV_STRING_H
-#define _ASM_RISCV_STRING_H
-
-#include <linux/types.h>
-#include <linux/linkage.h>
-
-#define __HAVE_ARCH_MEMSET
-extern asmlinkage void *memset(void *, int, size_t);
-extern asmlinkage void *__memset(void *, int, size_t);
-#define __HAVE_ARCH_MEMCPY
-extern asmlinkage void *memcpy(void *, const void *, size_t);
-extern asmlinkage void *__memcpy(void *, const void *, size_t);
-#define __HAVE_ARCH_MEMMOVE
-extern asmlinkage void *memmove(void *, const void *, size_t);
-extern asmlinkage void *__memmove(void *, const void *, size_t);
-/* For those files which don't want to check by kasan. */
-#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
-#define memcpy(dst, src, len) __memcpy(dst, src, len)
-#define memset(s, c, n) __memset(s, c, n)
-#define memmove(dst, src, len) __memmove(dst, src, len)
-
-#ifndef __NO_FORTIFY
-#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
-#endif
-
-#endif
-#endif /* _ASM_RISCV_STRING_H */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index d3081e4d9600..e635ce1e5645 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -31,7 +31,6 @@ obj-y	+= syscall_table.o
 obj-y	+= sys_riscv.o
 obj-y	+= time.o
 obj-y	+= traps.o
-obj-y	+= riscv_ksyms.o
 obj-y	+= stacktrace.o
 obj-y	+= cacheinfo.o
 obj-y	+= patch.o
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
deleted file mode 100644
index 5ab1c7e1a6ed..000000000000
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ /dev/null
@@ -1,17 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2017 Zihao Yu
- */
-
-#include <linux/export.h>
-#include <linux/uaccess.h>
-
-/*
- * Assembly functions that may be used (directly or indirectly) by modules
- */
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(memmove);
-EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(__memcpy);
-EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..d2659bd5e9ef 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,8 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y			+= delay.o
-lib-y			+= memcpy.o
-lib-y			+= memset.o
-lib-y			+= memmove.o
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
deleted file mode 100644
index 51ab716253fa..000000000000
--- a/arch/riscv/lib/memcpy.S
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-/* void *memcpy(void *, const void *, size_t) */
-ENTRY(__memcpy)
-WEAK(memcpy)
-	move t6, a0  /* Preserve return value */
-
-	/* Defer to byte-oriented copy for small sizes */
-	sltiu a3, a2, 128
-	bnez a3, 4f
-	/* Use word-oriented copy only if low-order bits match */
-	andi a3, t6, SZREG-1
-	andi a4, a1, SZREG-1
-	bne a3, a4, 4f
-
-	beqz a3, 2f  /* Skip if already aligned */
-	/*
-	 * Round to nearest double word-aligned address
-	 * greater than or equal to start address
-	 */
-	andi a3, a1, ~(SZREG-1)
-	addi a3, a3, SZREG
-	/* Handle initial misalignment */
-	sub a4, a3, a1
-1:
-	lb a5, 0(a1)
-	addi a1, a1, 1
-	sb a5, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 1b
-	sub a2, a2, a4  /* Update count */
-
-2:
-	andi a4, a2, ~((16*SZREG)-1)
-	beqz a4, 4f
-	add a3, a1, a4
-3:
-	REG_L a4,       0(a1)
-	REG_L a5,   SZREG(a1)
-	REG_L a6, 2*SZREG(a1)
-	REG_L a7, 3*SZREG(a1)
-	REG_L t0, 4*SZREG(a1)
-	REG_L t1, 5*SZREG(a1)
-	REG_L t2, 6*SZREG(a1)
-	REG_L t3, 7*SZREG(a1)
-	REG_L t4, 8*SZREG(a1)
-	REG_L t5, 9*SZREG(a1)
-	REG_S a4,       0(t6)
-	REG_S a5,   SZREG(t6)
-	REG_S a6, 2*SZREG(t6)
-	REG_S a7, 3*SZREG(t6)
-	REG_S t0, 4*SZREG(t6)
-	REG_S t1, 5*SZREG(t6)
-	REG_S t2, 6*SZREG(t6)
-	REG_S t3, 7*SZREG(t6)
-	REG_S t4, 8*SZREG(t6)
-	REG_S t5, 9*SZREG(t6)
-	REG_L a4, 10*SZREG(a1)
-	REG_L a5, 11*SZREG(a1)
-	REG_L a6, 12*SZREG(a1)
-	REG_L a7, 13*SZREG(a1)
-	REG_L t0, 14*SZREG(a1)
-	REG_L t1, 15*SZREG(a1)
-	addi a1, a1, 16*SZREG
-	REG_S a4, 10*SZREG(t6)
-	REG_S a5, 11*SZREG(t6)
-	REG_S a6, 12*SZREG(t6)
-	REG_S a7, 13*SZREG(t6)
-	REG_S t0, 14*SZREG(t6)
-	REG_S t1, 15*SZREG(t6)
-	addi t6, t6, 16*SZREG
-	bltu a1, a3, 3b
-	andi a2, a2, (16*SZREG)-1  /* Update count */
-
-4:
-	/* Handle trailing misalignment */
-	beqz a2, 6f
-	add a3, a1, a2
-
-	/* Use word-oriented copy if co-aligned to word boundary */
-	or a5, a1, t6
-	or a5, a5, a3
-	andi a5, a5, 3
-	bnez a5, 5f
-7:
-	lw a4, 0(a1)
-	addi a1, a1, 4
-	sw a4, 0(t6)
-	addi t6, t6, 4
-	bltu a1, a3, 7b
-
-	ret
-
-5:
-	lb a4, 0(a1)
-	addi a1, a1, 1
-	sb a4, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 5b
-6:
-	ret
-END(__memcpy)
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
deleted file mode 100644
index 07d1d2152ba5..000000000000
--- a/arch/riscv/lib/memmove.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-ENTRY(__memmove)
-WEAK(memmove)
-        move    t0, a0
-        move    t1, a1
-
-        beq     a0, a1, exit_memcpy
-        beqz    a2, exit_memcpy
-        srli    t2, a2, 0x2
-
-        slt     t3, a0, a1
-        beqz    t3, do_reverse
-
-        andi    a2, a2, 0x3
-        li      t4, 1
-        beqz    t2, byte_copy
-
-word_copy:
-        lw      t3, 0(a1)
-        addi    t2, t2, -1
-        addi    a1, a1, 4
-        sw      t3, 0(a0)
-        addi    a0, a0, 4
-        bnez    t2, word_copy
-        beqz    a2, exit_memcpy
-        j       byte_copy
-
-do_reverse:
-        add     a0, a0, a2
-        add     a1, a1, a2
-        andi    a2, a2, 0x3
-        li      t4, -1
-        beqz    t2, reverse_byte_copy
-
-reverse_word_copy:
-        addi    a1, a1, -4
-        addi    t2, t2, -1
-        lw      t3, 0(a1)
-        addi    a0, a0, -4
-        sw      t3, 0(a0)
-        bnez    t2, reverse_word_copy
-        beqz    a2, exit_memcpy
-
-reverse_byte_copy:
-        addi    a0, a0, -1
-        addi    a1, a1, -1
-
-byte_copy:
-        lb      t3, 0(a1)
-        addi    a2, a2, -1
-        sb      t3, 0(a0)
-        add     a1, a1, t4
-        add     a0, a0, t4
-        bnez    a2, byte_copy
-
-exit_memcpy:
-        move a0, t0
-        move a1, t1
-        ret
-END(__memmove)
diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
deleted file mode 100644
index 34c5360c6705..000000000000
--- a/arch/riscv/lib/memset.S
+++ /dev/null
@@ -1,113 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-/* void *memset(void *, int, size_t) */
-ENTRY(__memset)
-WEAK(memset)
-	move t0, a0  /* Preserve return value */
-
-	/* Defer to byte-oriented fill for small sizes */
-	sltiu a3, a2, 16
-	bnez a3, 4f
-
-	/*
-	 * Round to nearest XLEN-aligned address
-	 * greater than or equal to start address
-	 */
-	addi a3, t0, SZREG-1
-	andi a3, a3, ~(SZREG-1)
-	beq a3, t0, 2f  /* Skip if already aligned */
-	/* Handle initial misalignment */
-	sub a4, a3, t0
-1:
-	sb a1, 0(t0)
-	addi t0, t0, 1
-	bltu t0, a3, 1b
-	sub a2, a2, a4  /* Update count */
-
-2: /* Duff's device with 32 XLEN stores per iteration */
-	/* Broadcast value into all bytes */
-	andi a1, a1, 0xff
-	slli a3, a1, 8
-	or a1, a3, a1
-	slli a3, a1, 16
-	or a1, a3, a1
-#ifdef CONFIG_64BIT
-	slli a3, a1, 32
-	or a1, a3, a1
-#endif
-
-	/* Calculate end address */
-	andi a4, a2, ~(SZREG-1)
-	add a3, t0, a4
-
-	andi a4, a4, 31*SZREG  /* Calculate remainder */
-	beqz a4, 3f            /* Shortcut if no remainder */
-	neg a4, a4
-	addi a4, a4, 32*SZREG  /* Calculate initial offset */
-
-	/* Adjust start address with offset */
-	sub t0, t0, a4
-
-	/* Jump into loop body */
-	/* Assumes 32-bit instruction lengths */
-	la a5, 3f
-#ifdef CONFIG_64BIT
-	srli a4, a4, 1
-#endif
-	add a5, a5, a4
-	jr a5
-3:
-	REG_S a1,        0(t0)
-	REG_S a1,    SZREG(t0)
-	REG_S a1,  2*SZREG(t0)
-	REG_S a1,  3*SZREG(t0)
-	REG_S a1,  4*SZREG(t0)
-	REG_S a1,  5*SZREG(t0)
-	REG_S a1,  6*SZREG(t0)
-	REG_S a1,  7*SZREG(t0)
-	REG_S a1,  8*SZREG(t0)
-	REG_S a1,  9*SZREG(t0)
-	REG_S a1, 10*SZREG(t0)
-	REG_S a1, 11*SZREG(t0)
-	REG_S a1, 12*SZREG(t0)
-	REG_S a1, 13*SZREG(t0)
-	REG_S a1, 14*SZREG(t0)
-	REG_S a1, 15*SZREG(t0)
-	REG_S a1, 16*SZREG(t0)
-	REG_S a1, 17*SZREG(t0)
-	REG_S a1, 18*SZREG(t0)
-	REG_S a1, 19*SZREG(t0)
-	REG_S a1, 20*SZREG(t0)
-	REG_S a1, 21*SZREG(t0)
-	REG_S a1, 22*SZREG(t0)
-	REG_S a1, 23*SZREG(t0)
-	REG_S a1, 24*SZREG(t0)
-	REG_S a1, 25*SZREG(t0)
-	REG_S a1, 26*SZREG(t0)
-	REG_S a1, 27*SZREG(t0)
-	REG_S a1, 28*SZREG(t0)
-	REG_S a1, 29*SZREG(t0)
-	REG_S a1, 30*SZREG(t0)
-	REG_S a1, 31*SZREG(t0)
-	addi t0, t0, 32*SZREG
-	bltu t0, a3, 3b
-	andi a2, a2, SZREG-1  /* Update count */
-
-4:
-	/* Handle trailing misalignment */
-	beqz a2, 6f
-	add a3, t0, a2
-5:
-	sb a1, 0(t0)
-	addi t0, t0, 1
-	bltu t0, a3, 5b
-6:
-	ret
-END(__memset)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] riscv: use the generic string routines
  2021-07-19 11:43 [PATCH] riscv: use the generic string routines Matteo Croce
@ 2021-08-03 16:54 ` Matteo Croce
  2021-08-04 20:40   ` Palmer Dabbelt
  0 siblings, 1 reply; 10+ messages in thread
From: Matteo Croce @ 2021-08-03 16:54 UTC (permalink / raw)
  To: linux-riscv
  Cc: Linux Kernel Mailing List, linux-arch, Paul Walmsley,
	Palmer Dabbelt, Albert Ou, Atish Patra, Emil Renner Berthing,
	Akira Tsukamoto, Drew Fustini, Bin Meng, David Laight, Guo Ren,
	Christoph Hellwig

On Mon, Jul 19, 2021 at 1:44 PM Matteo Croce <mcroce@linux.microsoft.com> wrote:
>
> From: Matteo Croce <mcroce@microsoft.com>
>
> Use the generic routines which handle alignment properly.
>
> These are the performances measured on a BeagleV machine for a
> 32 mbyte buffer:
>
> memcpy:
> original aligned:        75 Mb/s
> original unaligned:      75 Mb/s
> new aligned:            114 Mb/s
> new unaligned:          107 Mb/s
>
> memset:
> original aligned:       140 Mb/s
> original unaligned:     140 Mb/s
> new aligned:            241 Mb/s
> new unaligned:          241 Mb/s
>
> TCP throughput with iperf3 gives a similar improvement as well.
>
> This is the binary size increase according to bloat-o-meter:
>
> add/remove: 0/0 grow/shrink: 4/2 up/down: 432/-36 (396)
> Function                                     old     new   delta
> memcpy                                        36     324    +288
> memset                                        32     148    +116
> strlcpy                                      116     132     +16
> strscpy_pad                                   84      96     +12
> strlcat                                      176     164     -12
> memmove                                       76      52     -24
> Total: Before=1225371, After=1225767, chg +0.03%
>
> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
> ---

Hi,

can someone have a look at this change and share opinions?

Regards,
-- 
per aspera ad upstream

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] riscv: use the generic string routines
  2021-08-03 16:54 ` Matteo Croce
@ 2021-08-04 20:40   ` Palmer Dabbelt
  2021-08-05  8:20     ` David Laight
  2021-08-05 10:31     ` Matteo Croce
  0 siblings, 2 replies; 10+ messages in thread
From: Palmer Dabbelt @ 2021-08-04 20:40 UTC (permalink / raw)
  To: mcroce, mcroce
  Cc: linux-riscv, linux-kernel, linux-arch, Paul Walmsley, aou,
	Atish Patra, kernel, akira.tsukamoto, drew, bmeng.cn,
	David.Laight, guoren, Christoph Hellwig

On Tue, 03 Aug 2021 09:54:34 PDT (-0700), mcroce@linux.microsoft.com wrote:
> On Mon, Jul 19, 2021 at 1:44 PM Matteo Croce <mcroce@linux.microsoft.com> wrote:
>>
>> From: Matteo Croce <mcroce@microsoft.com>
>>
>> Use the generic routines which handle alignment properly.
>>
>> These are the performances measured on a BeagleV machine for a
>> 32 mbyte buffer:
>>
>> memcpy:
>> original aligned:        75 Mb/s
>> original unaligned:      75 Mb/s
>> new aligned:            114 Mb/s
>> new unaligned:          107 Mb/s
>>
>> memset:
>> original aligned:       140 Mb/s
>> original unaligned:     140 Mb/s
>> new aligned:            241 Mb/s
>> new unaligned:          241 Mb/s
>>
>> TCP throughput with iperf3 gives a similar improvement as well.
>>
>> This is the binary size increase according to bloat-o-meter:
>>
>> add/remove: 0/0 grow/shrink: 4/2 up/down: 432/-36 (396)
>> Function                                     old     new   delta
>> memcpy                                        36     324    +288
>> memset                                        32     148    +116
>> strlcpy                                      116     132     +16
>> strscpy_pad                                   84      96     +12
>> strlcat                                      176     164     -12
>> memmove                                       76      52     -24
>> Total: Before=1225371, After=1225767, chg +0.03%
>>
>> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
>> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
>> ---
>
> Hi,
>
> can someone have a look at this change and share opinions?

This LGTM.  How are the generic string routines landing?  I'm happy to 
take this into my for-next, but IIUC we need the optimized generic 
versions first so we don't have a performance regression falling back to 
the trivial ones for a bit.  Is there a shared tag I can pull in?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] riscv: use the generic string routines
  2021-08-04 20:40   ` Palmer Dabbelt
@ 2021-08-05  8:20     ` David Laight
  2021-08-05 10:31     ` Matteo Croce
  1 sibling, 0 replies; 10+ messages in thread
From: David Laight @ 2021-08-05  8:20 UTC (permalink / raw)
  To: 'Palmer Dabbelt', mcroce, mcroce
  Cc: linux-riscv, linux-kernel, linux-arch, Paul Walmsley, aou,
	Atish Patra, kernel, akira.tsukamoto, drew, bmeng.cn, guoren,
	Christoph Hellwig

From: Palmer Dabbelt
> Sent: 04 August 2021 21:40
> 
> On Tue, 03 Aug 2021 09:54:34 PDT (-0700), mcroce@linux.microsoft.com wrote:
> > On Mon, Jul 19, 2021 at 1:44 PM Matteo Croce <mcroce@linux.microsoft.com> wrote:
> >>
> >> From: Matteo Croce <mcroce@microsoft.com>
> >>
> >> Use the generic routines which handle alignment properly.
> >>
> >> These are the performances measured on a BeagleV machine for a
> >> 32 mbyte buffer:
> >>
> >> memcpy:
> >> original aligned:        75 Mb/s
> >> original unaligned:      75 Mb/s
> >> new aligned:            114 Mb/s
> >> new unaligned:          107 Mb/s
> >>
> >> memset:
> >> original aligned:       140 Mb/s
> >> original unaligned:     140 Mb/s
> >> new aligned:            241 Mb/s
> >> new unaligned:          241 Mb/s
> >>
> >> TCP throughput with iperf3 gives a similar improvement as well.
> >>
> >> This is the binary size increase according to bloat-o-meter:
> >>
> >> add/remove: 0/0 grow/shrink: 4/2 up/down: 432/-36 (396)
> >> Function                                     old     new   delta
> >> memcpy                                        36     324    +288
> >> memset                                        32     148    +116
> >> strlcpy                                      116     132     +16
> >> strscpy_pad                                   84      96     +12
> >> strlcat                                      176     164     -12
> >> memmove                                       76      52     -24
> >> Total: Before=1225371, After=1225767, chg +0.03%
> >>
> >> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
> >> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
> >> ---
> >
> > Hi,
> >
> > can someone have a look at this change and share opinions?
> 
> This LGTM.  How are the generic string routines landing?  I'm happy to
> take this into my for-next, but IIUC we need the optimized generic
> versions first so we don't have a performance regression falling back to
> the trivial ones for a bit.  Is there a shared tag I can pull in?

I thought the actual problem was that the asm copy functions were
doing misaligned transfers and faulting.

There is no way that the simple C loop should be as fast as
the asm function given the delay cycles reading from memory.

You definitely need to test much smaller copies where the
buffers are resident in the L1 data cache.
Anything else is completely dominated by the cache line fills/spills.

You also need to test on the much faster riscv implementations
not just on the beaglev board.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] riscv: use the generic string routines
  2021-08-04 20:40   ` Palmer Dabbelt
  2021-08-05  8:20     ` David Laight
@ 2021-08-05 10:31     ` Matteo Croce
  2021-09-11  3:49       ` Palmer Dabbelt
  1 sibling, 1 reply; 10+ messages in thread
From: Matteo Croce @ 2021-08-05 10:31 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: linux-riscv, Linux Kernel Mailing List, linux-arch,
	Paul Walmsley, Albert Ou, Atish Patra, Emil Renner Berthing,
	Akira Tsukamoto, Drew Fustini, Bin Meng, David Laight, Guo Ren,
	Christoph Hellwig

On Wed, Aug 4, 2021 at 10:40 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Tue, 03 Aug 2021 09:54:34 PDT (-0700), mcroce@linux.microsoft.com wrote:
> > On Mon, Jul 19, 2021 at 1:44 PM Matteo Croce <mcroce@linux.microsoft.com> wrote:
> >>
> >> From: Matteo Croce <mcroce@microsoft.com>
> >>
> >> Use the generic routines which handle alignment properly.
> >>
> >> These are the performances measured on a BeagleV machine for a
> >> 32 mbyte buffer:
> >>
> >> memcpy:
> >> original aligned:        75 Mb/s
> >> original unaligned:      75 Mb/s
> >> new aligned:            114 Mb/s
> >> new unaligned:          107 Mb/s
> >>
> >> memset:
> >> original aligned:       140 Mb/s
> >> original unaligned:     140 Mb/s
> >> new aligned:            241 Mb/s
> >> new unaligned:          241 Mb/s
> >>
> >> TCP throughput with iperf3 gives a similar improvement as well.
> >>
> >> This is the binary size increase according to bloat-o-meter:
> >>
> >> add/remove: 0/0 grow/shrink: 4/2 up/down: 432/-36 (396)
> >> Function                                     old     new   delta
> >> memcpy                                        36     324    +288
> >> memset                                        32     148    +116
> >> strlcpy                                      116     132     +16
> >> strscpy_pad                                   84      96     +12
> >> strlcat                                      176     164     -12
> >> memmove                                       76      52     -24
> >> Total: Before=1225371, After=1225767, chg +0.03%
> >>
> >> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
> >> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
> >> ---
> >
> > Hi,
> >
> > can someone have a look at this change and share opinions?
>
> This LGTM.  How are the generic string routines landing?  I'm happy to
> take this into my for-next, but IIUC we need the optimized generic
> versions first so we don't have a performance regression falling back to
> the trivial ones for a bit.  Is there a shared tag I can pull in?

Hi,

I see them only in linux-next for now.

-- 
per aspera ad upstream

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] riscv: use the generic string routines
  2021-08-05 10:31     ` Matteo Croce
@ 2021-09-11  3:49       ` Palmer Dabbelt
  2021-09-11 17:26         ` David Laight
  2021-09-12  0:10         ` Guo Ren
  0 siblings, 2 replies; 10+ messages in thread
From: Palmer Dabbelt @ 2021-09-11  3:49 UTC (permalink / raw)
  To: mcroce
  Cc: linux-riscv, linux-kernel, linux-arch, Paul Walmsley, aou,
	Atish Patra, kernel, akira.tsukamoto, drew, bmeng.cn,
	David.Laight, guoren, Christoph Hellwig

On Thu, 05 Aug 2021 03:31:04 PDT (-0700), mcroce@linux.microsoft.com wrote:
> On Wed, Aug 4, 2021 at 10:40 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>
>> On Tue, 03 Aug 2021 09:54:34 PDT (-0700), mcroce@linux.microsoft.com wrote:
>> > On Mon, Jul 19, 2021 at 1:44 PM Matteo Croce <mcroce@linux.microsoft.com> wrote:
>> >>
>> >> From: Matteo Croce <mcroce@microsoft.com>
>> >>
>> >> Use the generic routines which handle alignment properly.
>> >>
>> >> These are the performances measured on a BeagleV machine for a
>> >> 32 mbyte buffer:
>> >>
>> >> memcpy:
>> >> original aligned:        75 Mb/s
>> >> original unaligned:      75 Mb/s
>> >> new aligned:            114 Mb/s
>> >> new unaligned:          107 Mb/s
>> >>
>> >> memset:
>> >> original aligned:       140 Mb/s
>> >> original unaligned:     140 Mb/s
>> >> new aligned:            241 Mb/s
>> >> new unaligned:          241 Mb/s
>> >>
>> >> TCP throughput with iperf3 gives a similar improvement as well.
>> >>
>> >> This is the binary size increase according to bloat-o-meter:
>> >>
>> >> add/remove: 0/0 grow/shrink: 4/2 up/down: 432/-36 (396)
>> >> Function                                     old     new   delta
>> >> memcpy                                        36     324    +288
>> >> memset                                        32     148    +116
>> >> strlcpy                                      116     132     +16
>> >> strscpy_pad                                   84      96     +12
>> >> strlcat                                      176     164     -12
>> >> memmove                                       76      52     -24
>> >> Total: Before=1225371, After=1225767, chg +0.03%
>> >>
>> >> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
>> >> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
>> >> ---
>> >
>> > Hi,
>> >
>> > can someone have a look at this change and share opinions?
>>
>> This LGTM.  How are the generic string routines landing?  I'm happy to
>> take this into my for-next, but IIUC we need the optimized generic
>> versions first so we don't have a performance regression falling back to
>> the trivial ones for a bit.  Is there a shared tag I can pull in?
>
> Hi,
>
> I see them only in linux-next by now.

These ended up getting rejected by Linus, so I'm going to hold off on 
this for now.  If they're really out of lib/ then I'll take the C 
routines in arch/riscv, but either way it's an issue for the next 
release.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] riscv: use the generic string routines
  2021-09-11  3:49       ` Palmer Dabbelt
@ 2021-09-11 17:26         ` David Laight
  2021-09-12  0:10         ` Guo Ren
  1 sibling, 0 replies; 10+ messages in thread
From: David Laight @ 2021-09-11 17:26 UTC (permalink / raw)
  To: 'Palmer Dabbelt', mcroce
  Cc: linux-riscv, linux-kernel, linux-arch, Paul Walmsley, aou,
	Atish Patra, kernel, akira.tsukamoto, drew, bmeng.cn, guoren,
	Christoph Hellwig

..
> These ended up getting rejected by Linus, so I'm going to hold off on
> this for now.  If they're really out of lib/ then I'll take the C
> routines in arch/riscv, but either way it's an issue for the next
> release.

I've been half following this.
I've not seen any comparisons between the C functions proposed
here and the riscv asm ones that had the fix for misaligned
transfers applied.

IIRC there is a comment in the asm ones that the unrolled
'read lots' - 'write lots' loop is faster than the older
(asm) read-write loop.

But I've not seen any architectural discussions at all.

A simple in-order single-issue cpu will execute the
unrolled loop faster just because it has fewer instructions.
The read-lots - write-lots almost certainly helps
avoid read-latency delaying things if multiple reads
can be pipelined.
The writes are almost certainly 'posted' and pipelined,
But a simple cpu could easily require all writes finish
before doing a read.

A superscalar (multi-issue) cpu gives you the ability
to get the loop control instructions 'for free' with
carefully written assembler.
At which point a copy for 'life cache' data should be
limited only by the cpu's cache memory bandwidth.

If reads and writes can interleave then a loop that
alternates reads and writes (read each register
just after writing it) may mean that you always
keep the cpu-cache interface busy.
This would be especially true if the cpu can execute
both a cache read and write in the same cycle.
(Which many moderate-performance cpus can.)

None of this requires out-of-order execution, just
execution to continue while a read is in progress.

I'm also guessing that any performance testing has been
done with the (relatively) cheap boards that are readily
available.

But I've also seen references in the press to much faster
riscv cpu that are definitely multi-issue and may have
some simple out-of-order execution.
Any changes ought to be tested on these faster systems.

I also recall that some of the performance measurements
were made with long buffers - they will be dominated by the
cache to DRAM (and maybe TLB lookup) timings, not the copy
loop.

For a simple cpu you ought to be able to measure the
number of cpu cycles used for a copy - and account for
all of them.
For something like x86 you can show that the copy is
being limited by the cpu-cache bandwidth.
(FWIW measurements of the inet checksum code on x86
show it runs at half the expected speed on a lot of
Intel cpu - no one ever measured it.)

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] riscv: use the generic string routines
  2021-09-11  3:49       ` Palmer Dabbelt
  2021-09-11 17:26         ` David Laight
@ 2021-09-12  0:10         ` Guo Ren
  2021-09-13 11:35           ` David Laight
  1 sibling, 1 reply; 10+ messages in thread
From: Guo Ren @ 2021-09-12  0:10 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: Matteo Croce, linux-riscv, Linux Kernel Mailing List, linux-arch,
	Paul Walmsley, Albert Ou, Atish Patra, Emil Renner Berthing,
	Akira Tsukamoto, Drew Fustini, Bin Meng, David Laight,
	Christoph Hellwig

On Sat, Sep 11, 2021 at 11:49 AM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Thu, 05 Aug 2021 03:31:04 PDT (-0700), mcroce@linux.microsoft.com wrote:
> > On Wed, Aug 4, 2021 at 10:40 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> >>
> >> On Tue, 03 Aug 2021 09:54:34 PDT (-0700), mcroce@linux.microsoft.com wrote:
> >> > On Mon, Jul 19, 2021 at 1:44 PM Matteo Croce <mcroce@linux.microsoft.com> wrote:
> >> >>
> >> >> From: Matteo Croce <mcroce@microsoft.com>
> >> >>
> >> >> Use the generic routines which handle alignment properly.
> >> >>
> >> >> These are the performances measured on a BeagleV machine for a
> >> >> 32 mbyte buffer:
> >> >>
> >> >> memcpy:
> >> >> original aligned:        75 Mb/s
> >> >> original unaligned:      75 Mb/s
> >> >> new aligned:            114 Mb/s
> >> >> new unaligned:          107 Mb/s
> >> >>
> >> >> memset:
> >> >> original aligned:       140 Mb/s
> >> >> original unaligned:     140 Mb/s
> >> >> new aligned:            241 Mb/s
> >> >> new unaligned:          241 Mb/s
> >> >>
> >> >> TCP throughput with iperf3 gives a similar improvement as well.
> >> >>
> >> >> This is the binary size increase according to bloat-o-meter:
> >> >>
> >> >> add/remove: 0/0 grow/shrink: 4/2 up/down: 432/-36 (396)
> >> >> Function                                     old     new   delta
> >> >> memcpy                                        36     324    +288
> >> >> memset                                        32     148    +116
> >> >> strlcpy                                      116     132     +16
> >> >> strscpy_pad                                   84      96     +12
> >> >> strlcat                                      176     164     -12
> >> >> memmove                                       76      52     -24
> >> >> Total: Before=1225371, After=1225767, chg +0.03%
> >> >>
> >> >> Signed-off-by: Matteo Croce <mcroce@microsoft.com>
> >> >> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
> >> >> ---
> >> >
> >> > Hi,
> >> >
> >> > can someone have a look at this change and share opinions?
> >>
> >> This LGTM.  How are the generic string routines landing?  I'm happy to
> >> take this into my for-next, but IIUC we need the optimized generic
> >> versions first so we don't have a performance regression falling back to
> >> the trivial ones for a bit.  Is there a shared tag I can pull in?
> >
> > Hi,
> >
> > I see them only in linux-next by now.
>
> These ended up getting rejected by Linus, so I'm going to hold off on
> this for now.  If they're really out of lib/ then I'll take the C
> routines in arch/riscv, but either way it's an issue for the next
> release.
Agreed, we should take the C routines in arch/riscv as the common
implementation. If any vendor wants a custom implementation, they could
use the alternative framework in errata for string operations.

-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] riscv: use the generic string routines
  2021-09-12  0:10         ` Guo Ren
@ 2021-09-13 11:35           ` David Laight
  2021-09-19 19:13             ` Matteo Croce
  0 siblings, 1 reply; 10+ messages in thread
From: David Laight @ 2021-09-13 11:35 UTC (permalink / raw)
  To: 'Guo Ren', Palmer Dabbelt
  Cc: Matteo Croce, linux-riscv, Linux Kernel Mailing List, linux-arch,
	Paul Walmsley, Albert Ou, Atish Patra, Emil Renner Berthing,
	Akira Tsukamoto, Drew Fustini, Bin Meng, Christoph Hellwig

> > These ended up getting rejected by Linus, so I'm going to hold off on
> > this for now.  If they're really out of lib/ then I'll take the C
> > routines in arch/riscv, but either way it's an issue for the next
> > release.
> Agree, we should take the C routine in arch/riscv for common
> implementation. If any vendor what custom implementation they could
> use the alternative framework in errata for string operations.

I thought the asm ones were significantly faster because
they were less affected by read latency.

(But they were horribly broken for misaligned transfers.)

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] riscv: use the generic string routines
  2021-09-13 11:35           ` David Laight
@ 2021-09-19 19:13             ` Matteo Croce
  0 siblings, 0 replies; 10+ messages in thread
From: Matteo Croce @ 2021-09-19 19:13 UTC (permalink / raw)
  To: David Laight
  Cc: Guo Ren, Palmer Dabbelt, linux-riscv, Linux Kernel Mailing List,
	linux-arch, Paul Walmsley, Albert Ou, Atish Patra,
	Emil Renner Berthing, Akira Tsukamoto, Drew Fustini, Bin Meng,
	Christoph Hellwig

On Mon, Sep 13, 2021 at 1:35 PM David Laight <David.Laight@aculab.com> wrote:
>
> > > These ended up getting rejected by Linus, so I'm going to hold off on
> > > this for now.  If they're really out of lib/ then I'll take the C
> > > routines in arch/riscv, but either way it's an issue for the next
> > > release.
> > Agree, we should take the C routine in arch/riscv for common
> > implementation. If any vendor what custom implementation they could
> > use the alternative framework in errata for string operations.
>
> I though the asm ones were significantly faster because
> they were less affected by read latency.
>
> (But they were horribly broken for misaligned transfers.)
>

I can get the same exact performance (and a very similar machine code)
in C with this on top of the C memset implementation:

--- a/arch/riscv/lib/string.c
+++ b/arch/riscv/lib/string.c
@@ -112,9 +112,12 @@ EXPORT_SYMBOL(__memmove);
 void *memmove(void *dest, const void *src, size_t count) __weak
__alias(__memmove);
 EXPORT_SYMBOL(memmove);

+#define BATCH 4
+
 void *__memset(void *s, int c, size_t count)
 {
  union types dest = { .as_u8 = s };
+ int i;

  if (count >= MIN_THRESHOLD) {
  unsigned long cu = (unsigned long)c;
@@ -138,8 +141,12 @@ void *__memset(void *s, int c, size_t count)
  }

  /* Copy using the largest size allowed */
- for (; count >= BYTES_LONG; count -= BYTES_LONG)
- *dest.as_ulong++ = cu;
+ for (; count >= BYTES_LONG * BATCH; count -= BYTES_LONG * BATCH) {
+#pragma GCC unroll 4
+     for (i = 0; i < BATCH; i++)
+         dest.as_ulong[i] = cu;
+     dest.as_ulong += BATCH;
+ }
  }

On the BeagleV the memset speeds with the different batch sizes are:

1 (stock): 267 Mb/s
2: 272 Mb/s
4: 276 Mb/s
8: 276 Mb/s

The problem with a bigger batch size is that it will fall back to a
single byte copy if the buffers are too small.

Regards,
-- 
per aspera ad upstream

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-09-19 19:14 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-19 11:43 [PATCH] riscv: use the generic string routines Matteo Croce
2021-08-03 16:54 ` Matteo Croce
2021-08-04 20:40   ` Palmer Dabbelt
2021-08-05  8:20     ` David Laight
2021-08-05 10:31     ` Matteo Croce
2021-09-11  3:49       ` Palmer Dabbelt
2021-09-11 17:26         ` David Laight
2021-09-12  0:10         ` Guo Ren
2021-09-13 11:35           ` David Laight
2021-09-19 19:13             ` Matteo Croce

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).