* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux
@ 2011-01-24 15:56 Matthias Weisser
  2011-01-24 16:13 ` Wolfgang Denk
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Matthias Weisser @ 2011-01-24 15:56 UTC (permalink / raw)
  To: u-boot

Using optimized versions of memset and memcpy from linux brings a quite
noticeable speed (x2 or better) improvement for these two functions.

Size impact:

C version:
   text    data     bss     dec     hex filename
 202862   18912  266456  488230   77326 u-boot

ASM version:
   text    data     bss     dec     hex filename
 203798   18912  266288  488998   77626 u-boot

Signed-off-by: Matthias Weisser <weisserm@arcor.de>
---
 arch/arm/include/asm/assembler.h |   62 ++++++++++
 arch/arm/include/asm/string.h    |    4 +-
 arch/arm/lib/Makefile            |    2 +
 arch/arm/lib/memcpy.S            |  241 ++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/memset.S            |  126 ++++++++++++++++++++
 5 files changed, 433 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/include/asm/assembler.h
 create mode 100644 arch/arm/lib/memcpy.S
 create mode 100644 arch/arm/lib/memset.S

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
new file mode 100644
index 0000000..231b1ae
--- /dev/null
+++ b/arch/arm/include/asm/assembler.h
@@ -0,0 +1,62 @@
+/*
+ *  arch/arm/include/asm/assembler.h
+ *
+ *  Copyright (C) 1996-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This file contains arm architecture specific defines
+ *  for the different processors.
+ *
+ *  Do not include any C declarations in this file - it is included by
+ *  assembler source.
+ */
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#define get_byte_0      lsl #0
+#define get_byte_1	lsr #8
+#define get_byte_2	lsr #16
+#define get_byte_3	lsr #24
+#define put_byte_0      lsl #0
+#define put_byte_1	lsl #8
+#define put_byte_2	lsl #16
+#define put_byte_3	lsl #24
+#else
+#define pull            lsl
+#define push            lsr
+#define get_byte_0	lsr #24
+#define get_byte_1	lsr #16
+#define get_byte_2	lsr #8
+#define get_byte_3      lsl #0
+#define put_byte_0	lsl #24
+#define put_byte_1	lsl #16
+#define put_byte_2	lsl #8
+#define put_byte_3      lsl #0
+#endif
+
+/*
+ * Data preload for architectures that support it
+ */
+#if defined(__ARM_ARCH_5TE__)
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * This can be used to enable code to cacheline align the destination
+ * pointer when bulk writing to memory.  Experiments on StrongARM and
+ * XScale didn't show this a worthwhile thing to do when the cache is not
+ * set to write-allocate (this would need further testing on XScale when WA
+ * is used).
+ *
+ * On Feroceon there is much to gain however, regardless of cache mode.
+ */
+#define CALGN(code...) code
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index c3ea582..a939571 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -12,7 +12,7 @@ extern char * strrchr(const char * s, int c);
 #undef __HAVE_ARCH_STRCHR
 extern char * strchr(const char * s, int c);
 
-#undef __HAVE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMMOVE
@@ -22,7 +22,7 @@ extern void * memmove(void *, const void *, __kernel_size_t);
 extern void * memchr(const void *, int, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMZERO
-#undef __HAVE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMSET
 extern void * memset(void *, int, __kernel_size_t);
 
 #if 0
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 454440c..575a919 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -44,6 +44,8 @@ COBJS-y	+= cache-cp15.o
 endif
 COBJS-y	+= interrupts.o
 COBJS-y	+= reset.o
+SOBJS-y	+= memset.o
+SOBJS-y	+= memcpy.o
 
 SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
 	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
new file mode 100644
index 0000000..40db90e
--- /dev/null
+++ b/arch/arm/lib/memcpy.S
@@ -0,0 +1,241 @@
+/*
+ *  linux/arch/arm/lib/memcpy.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <asm/assembler.h>
+
+#define W(instr)	instr
+
+#define LDR1W_SHIFT	0
+#define STR1W_SHIFT	0
+
+	.macro ldr1w ptr reg abort
+	W(ldr) \reg, [\ptr], #4
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro str1w ptr reg abort
+	W(str) \reg, [\ptr], #4
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro enter reg1 reg2
+	stmdb sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.macro exit reg1 reg2
+	ldmfd sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.text
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+.globl memcpy
+memcpy:
+
+		enter	r4, lr
+
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #0]		)
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	r3, ip, #32		)
+	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #0]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+3:	PLD(	pld	[r1, #124]		)
+4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #32
+		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+#if LDR1W_SHIFT > 0
+		lsl	ip, ip, #LDR1W_SHIFT
+#endif
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:
+		.rept	(1 << LDR1W_SHIFT)
+		W(nop)
+		.endr
+		ldr1w	r1, r3, abort=20f
+		ldr1w	r1, r4, abort=20f
+		ldr1w	r1, r5, abort=20f
+		ldr1w	r1, r6, abort=20f
+		ldr1w	r1, r7, abort=20f
+		ldr1w	r1, r8, abort=20f
+		ldr1w	r1, lr, abort=20f
+
+#if LDR1W_SHIFT < STR1W_SHIFT
+		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
+#elif LDR1W_SHIFT > STR1W_SHIFT
+		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
+#endif
+		add	pc, pc, ip
+		nop
+		.rept	(1 << STR1W_SHIFT)
+		W(nop)
+		.endr
+		str1w	r0, r3, abort=20f
+		str1w	r0, r4, abort=20f
+		str1w	r0, r5, abort=20f
+		str1w	r0, r6, abort=20f
+		str1w	r0, r7, abort=20f
+		str1w	r0, r8, abort=20f
+		str1w	r0, lr, abort=20f
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldr1b	r1, r3, ne, abort=21f
+		ldr1b	r1, r4, cs, abort=21f
+		ldr1b	r1, ip, cs, abort=21f
+		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r4, cs, abort=21f
+		str1b	r0, ip, cs, abort=21f
+
+		exit	r4, pc
+
+9:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldr1b	r1, r3, gt, abort=21f
+		ldr1b	r1, r4, ge, abort=21f
+		ldr1b	r1, lr, abort=21f
+		str1b	r0, r3, gt, abort=21f
+		str1b	r0, r4, ge, abort=21f
+		subs	r2, r2, ip
+		str1b	r0, lr, abort=21f
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr1w	r1, lr, abort=21f
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28
+		blt	14f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #0]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+12:	PLD(	pld	[r1, #124]		)
+13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
+		mov	r3, lr, pull #\pull
+		subs	r2, r2, #32
+		ldr4w	r1, r8, r9, ip, lr, abort=19f
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr1w	r1, lr, abort=21f
+		subs	ip, ip, #4
+		orr	r3, r3, lr, push #\push
+		str1w	r0, r3, abort=21f
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		sub	r1, r1, #(\push / 8)
+		b	8b
+
+		.endm
+
+
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
new file mode 100644
index 0000000..0cdf895
--- /dev/null
+++ b/arch/arm/lib/memset.S
@@ -0,0 +1,126 @@
+/*
+ *  linux/arch/arm/lib/memset.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <asm/assembler.h>
+
+	.text
+	.align	5
+	.word	0
+
+1:	subs	r2, r2, #4		@ 1 do we have enough
+	blt	5f			@ 1 bytes to align with?
+	cmp	r3, #2			@ 1
+	strltb	r1, [r0], #1		@ 1
+	strleb	r1, [r0], #1		@ 1
+	strb	r1, [r0], #1		@ 1
+	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
+/*
+ * The pointer is now aligned and the length is adjusted.  Try doing the
+ * memset again.
+ */
+
+.globl memset
+memset:
+	ands	r3, r0, #3		@ 1 unaligned?
+	bne	1b			@ 1
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+	orr	r1, r1, r1, lsl #8
+	orr	r1, r1, r1, lsl #16
+	mov	r3, r1
+	cmp	r2, #16
+	blt	4f
+
+#if ! CALGN(1)+0
+
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+	str	lr, [sp, #-4]!
+	mov	ip, r1
+	mov	lr, r1
+
+2:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
+	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	r0!, {r1, r3, ip, lr}
+	bgt	2b
+	ldmeqfd	sp!, {pc}		@ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+	tst	r2, #32
+	stmneia	r0!, {r1, r3, ip, lr}
+	stmneia	r0!, {r1, r3, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r1, r3, ip, lr}
+	ldr	lr, [sp], #4
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r1
+	mov	r5, r1
+	mov	r6, r1
+	mov	r7, r1
+	mov	ip, r1
+	mov	lr, r1
+
+	cmp	r2, #96
+	tstgt	r0, #31
+	ble	3f
+
+	and	ip, r0, #31
+	rsb	ip, ip, #32
+	sub	r2, r2, ip
+	movs	ip, ip, lsl #(32 - 4)
+	stmcsia	r0!, {r4, r5, r6, r7}
+	stmmiia	r0!, {r4, r5}
+	tst	ip, #(1 << 30)
+	mov	ip, r1
+	strne	r1, [r0], #4
+
+3:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r2, #32
+	stmneia	r0!, {r1, r3-r7, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
+4:	tst	r2, #8
+	stmneia	r0!, {r1, r3}
+	tst	r2, #4
+	strne	r1, [r0], #4
+/*
+ * When we get here, we've got less than 4 bytes to zero.  We
+ * may have an unaligned pointer as well.
+ */
+5:	tst	r2, #2
+	strneb	r1, [r0], #1
+	strneb	r1, [r0], #1
+	tst	r2, #1
+	strneb	r1, [r0], #1
+	mov	pc, lr
-- 
1.7.0.4

* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux
  2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser
@ 2011-01-24 16:13 ` Wolfgang Denk
  2011-01-24 19:24   ` Matthias Weißer
  2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser
  2011-03-11  7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser
  2 siblings, 1 reply; 15+ messages in thread
From: Wolfgang Denk @ 2011-01-24 16:13 UTC (permalink / raw)
  To: u-boot

Dear Matthias Weisser,

In message <1295884607-9044-1-git-send-email-weisserm@arcor.de> you wrote:
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.
> 
> Size impact:
> 
> C version:
>    text    data     bss     dec     hex filename
>  202862   18912  266456  488230   77326 u-boot
> 
> ASM version:
>    text    data     bss     dec     hex filename
>  203798   18912  266288  488998   77626 u-boot

How exactly did you measure the speed improvement?

Best regards,

Wolfgang Denk

-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de
Real programmers can write assembly code in any language.   :-)
                      - Larry Wall in  <8571@jpl-devvax.JPL.NASA.GOV>

* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux
  2011-01-24 16:13 ` Wolfgang Denk
@ 2011-01-24 19:24   ` Matthias Weißer
  2011-01-24 20:07     ` Wolfgang Denk
  0 siblings, 1 reply; 15+ messages in thread
From: Matthias Weißer @ 2011-01-24 19:24 UTC (permalink / raw)
  To: u-boot

On 24.01.2011 17:13, Wolfgang Denk wrote:
> Dear Matthias Weisser,
> 
> In message <1295884607-9044-1-git-send-email-weisserm@arcor.de> you wrote:
>> Using optimized versions of memset and memcpy from linux brings a quite
>> noticeable speed (x2 or better) improvement for these two functions.
>>
>> Size impact:
>>
>> C version:
>>    text    data     bss     dec     hex filename
>>  202862   18912  266456  488230   77326 u-boot
>>
>> ASM version:
>>    text    data     bss     dec     hex filename
>>  203798   18912  266288  488998   77626 u-boot
> 
> How exactly did you measure the speed improvement?

I inserted a printf before and after calls to these functions with sizes
of 1MB or more each. I then measured the times between these printfs
using grabserial (http://elinux.org/Grabserial). In both cases caches
were enabled.

To be precise: As memset test case I used the memset(.., 0, ..) of the
malloc pool (which was 4MB in my case) and, as memcpy test case, a copy of
about 2.2MB from flash to RAM which I inserted in cmd_bootm.c (see RFC patch
http://patchwork.ozlabs.org/patch/79480/ for the exact location of the memcpy).
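
In sketch form the instrumentation looked roughly like this (illustrative
only, not the exact code I used; time_memset and its arguments are
placeholders for the real call sites named above):

/* Hypothetical timing wrapper: grabserial timestamps the two printf
 * lines on the host side, so no target-side timer is needed. */
#include <common.h>

static void time_memset(void *buf, size_t len)
{
	printf("memset start\n");
	memset(buf, 0, len);
	printf("memset done\n");
}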

Do you think a factor of 2 is not possible against the C version? Maybe
I have done something wrong while measuring these times. From my point
of view it should be possible to get such improvements as the code takes
cache alignment into account and also uses the PLD instruction.

I can do some additional measurements tomorrow on two systems (jadecpu
with 32-bit DDR2 memory at 166MHz and an imx25-based board with 16-bit
LPDDR at 133MHz) and come up with some exact numbers. Maybe you can give
some more hints on what and how the improvements of this patch can be
measured.

Matthias Weißer

* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux
  2011-01-24 19:24   ` Matthias Weißer
@ 2011-01-24 20:07     ` Wolfgang Denk
  2011-01-25 10:55       ` Matthias Weißer
  0 siblings, 1 reply; 15+ messages in thread
From: Wolfgang Denk @ 2011-01-24 20:07 UTC (permalink / raw)
  To: u-boot

Dear Matthias Weißer,

In message <4D3DD1EC.7010506@arcor.de> you wrote:
>
> >> C version:
> >>    text    data     bss     dec     hex filename
> >>  202862   18912  266456  488230   77326 u-boot
> >>
> >> ASM version:
> >>    text    data     bss     dec     hex filename
> >>  203798   18912  266288  488998   77626 u-boot
> > 
> > How exactly did you measure the speed improvement?
> 
> I inserted a printf before and after calls to these functions with sizes
> of 1MB or more each. I then measured the times between these printfs
> using grabserial (http://elinux.org/Grabserial). In both cases caches
> were enabled.
> 
> To be precise: As memset test case I used the memset(.., 0, ..) of the
> malloc pool (which was 4MB in my case) and, as memcpy test case, a copy of
> about 2.2MB from flash to RAM which I inserted in cmd_bootm.c (see RFC patch
> http://patchwork.ozlabs.org/patch/79480/ for the exact location of the memcpy).

OK - so which results do you see in real-life use, say when loading and
booting an OS? How much boot time can be saved?

> Do you think a factor of 2 is not possible against the C version? Maybe
> I have done something wrong while measuring these times. From my point
> of view it should be possible to get such improvements as the code takes
> cache alignment into account and also uses the PLD instruction.

I don't doubt your measurements.  But this being an optimization
approach, it seems appropriate to check whether we are really
optimizing a hot spot, and if the measured results can be generalized.

I guess the speed improvement you see for a few large copy operations
is just one side - probably there will be slower execution (due to the
effort to set up the operations) for the (many more frequent) small
operations.  In addition, there is an increase of the memory footprint
of nearly 1 kB.

I think additional measurements need to be done - for example, we
should check how the execution times change for typical operations
like TFTP download, reading from NAND flash and MMC/SDcard, booting a
Linux kernel etc.

Also, it should be possible to enable this feature conditionally, so
users can decide whether speed or size is more important in their
configurations.
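
For example (purely illustrative, the option names are only a suggestion),
a board that prefers speed could opt in from its configuration header,
while boards that care more about size simply leave the options undefined
and keep the small generic C routines:

/* include/configs/<board>.h - hypothetical example */
#define CONFIG_USE_ARCH_MEMCPY	/* use the assembler memcpy */
#define CONFIG_USE_ARCH_MEMSET	/* use the assembler memset */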

> I can do some additional measurements tomorrow on two systems (jadecpu
> with 32-bit DDR2 memory at 166MHz and an imx25-based board with 16-bit
> LPDDR at 133MHz) and come up with some exact numbers. Maybe you can give
> some more hints on what and how the improvements of this patch can be
> measured.

See above.

Thanks.

Wolfgang Denk

-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de
Egotist: A person of low taste, more interested in  himself  than  in
me.                                                  - Ambrose Bierce

* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux
  2011-01-24 20:07     ` Wolfgang Denk
@ 2011-01-25 10:55       ` Matthias Weißer
  2011-01-25 20:05         ` Wolfgang Denk
  0 siblings, 1 reply; 15+ messages in thread
From: Matthias Weißer @ 2011-01-25 10:55 UTC (permalink / raw)
  To: u-boot

On 24.01.2011 21:07, Wolfgang Denk wrote:
> OK - so which results do you see in reallife use, say when loading and
> booting an OS? How much boot time can be saved?

All tests are done with jadecpu

                        | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
                        |        | +patch |        | +patch |
-----------------------+--------+--------+--------+--------+
Reset to prompt        |  438ms |  330ms |  228ms |  120ms |
                        |        |        |        |        |
TFTP a 3MB img         | 4782ms | 3428ms | 3245ms | 2820ms |
                        |        |        |        |        |
FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ |
                        |        |        |        |        |
BOOTM LZO img in RAM   | 3473ms | 3168ms |  592ms |  592ms |
  where CRC is          |  615ms |  615ms |   54ms |   54ms |
  uncompress            | 2460ms | 2462ms |  450ms |  451ms |
  final boot_elf        |  376ms |   68ms |   65ms |   65ms |
                        |        |        |        |        |
BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms |
  where CRC is          |  600ms |  600ms |  135ms |  135ms |
  uncompress            | 2209ms | 2211ms |  828ms |  828ms |
  final boot_elf        |  376ms |   68ms |   65ms |   65ms |

(1) No dcache
(2) dcache enabled in board_init
*Does not work when dcache is on

I think we can see that there seems to be no negative impact of these
patches when only execution speed is taken into consideration. The gain
is noticeable when caching is not used or not activated. For pure RAM to
RAM copy when caching is activated the patch didn't change anything.

Here are some additional numbers for copying a 1.4MB image from NOR to RAM:

HEAD                  : 134ms
HEAD + patch          : 72ms
HEAD + dcache         : 120ms
HEAD + dcache + patch : 70ms

So, for copy actions from flash to RAM there is also an improvement. As
boot times are a bit critical for us, every improvement > 10ms is
interesting.

> I guess the speed improvement you see for a few large copy operations
> is just one side - probably there will be slower execution (due to the
> effort to set up the operations) for the (many more frequent) small
> operations.  In addition, there is an increase of the memory footprint
> of nearly 1 kB.
>
> I think additional measurements need to be done - for example, we
> should check how the execution times change for typical operations
> like TFTP download, reading from NAND flash and MMC/SDcard, booting a
> Linux kernel etc.

As the tests above show, there is no negative performance impact with the
test cases I have done. As we don't use Linux here I can't test this.
Maybe someone else can jump in here.

> Also, it should be possible to enable this feature conditionally, so
> users can decide whether speed or size is more important in their
> configurations.

Would it be an option to use the CONFIG entries CONFIG_USE_ARCH_MEMCPY 
and CONFIG_USE_ARCH_MEMSET to enable that feature? If that is OK I can 
send a new version of the patch. The only problem I see with this 
approach is that there are architectures which already have their own 
implementations which are then not affected by these config options.
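
As a sketch (the new version of the patch would contain the real change),
the options would simply map onto the existing __HAVE_ARCH_MEMCPY /
__HAVE_ARCH_MEMSET convention, so the generic C routines in lib/string.c
are only built when the assembler versions are not selected:

/* arch/arm/include/asm/string.h - sketch of the gating, not the final patch */
#ifdef CONFIG_USE_ARCH_MEMCPY
#define __HAVE_ARCH_MEMCPY	/* lib/string.c then omits its C memcpy */
#endif
extern void *memcpy(void *, const void *, __kernel_size_t);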


Regards
Matthias

* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux
  2011-01-25 10:55       ` Matthias Weißer
@ 2011-01-25 20:05         ` Wolfgang Denk
  0 siblings, 0 replies; 15+ messages in thread
From: Wolfgang Denk @ 2011-01-25 20:05 UTC (permalink / raw)
  To: u-boot

Dear Matthias Weißer,

In message <4D3EAC1A.5030707@arcor.de> you wrote:
>
>                         | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
>                         |        | +patch |        | +patch |
> -----------------------+--------+--------+--------+--------+
> Reset to prompt        |  438ms |  330ms |  228ms |  120ms |
>                         |        |        |        |        |
> TFTP a 3MB img         | 4782ms | 3428ms | 3245ms | 2820ms |
>                         |        |        |        |        |
> FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ |
>                         |        |        |        |        |
> BOOTM LZO img in RAM   | 3473ms | 3168ms |  592ms |  592ms |
>   where CRC is          |  615ms |  615ms |   54ms |   54ms |
>   uncompress            | 2460ms | 2462ms |  450ms |  451ms |
>   final boot_elf        |  376ms |   68ms |   65ms |   65ms |
>                         |        |        |        |        |
> BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms |
>   where CRC is          |  600ms |  600ms |  135ms |  135ms |
>   uncompress            | 2209ms | 2211ms |  828ms |  828ms |
>   final boot_elf        |  376ms |   68ms |   65ms |   65ms |
> 
> (1) No dcache
> (2) dcache enabled in board_init
> *Does not work when dcache is on
> 
> I think we can see that there seems to be no negative impact of these
> patches when only execution speed is taken into consideration. The gain
> is noticeable when caching is not used or not activated. For pure RAM to
> RAM copy when caching is activated the patch didn't change anything.
> 
> Here are some additional numbers for copying a 1.4MB image from NOR to RAM:
> 
> HEAD                  : 134ms
> HEAD + patch          : 72ms
> HEAD + dcache         : 120ms
> HEAD + dcache + patch : 70ms

This is quite interesting information for developers who have to
decide whether they want to accept the increased memory footprint.  Can you
please add this to the commit message?

> Would it be an option to use the CONFIG entries CONFIG_USE_ARCH_MEMCPY 
> and CONFIG_USE_ARCH_MEMSET to enable that feature? If that is OK I can 

Makes sense to me.

> send a new version of the patch. The only problem I see with this 
> approach is that there are architectures which already have their own 
> implementations which are then not affected by these config options.

If you are aware of any, it might make sense to put the respective
maintainers on Cc: to trigger them to adapt / clean up their code.

Thanks.

Best regards,

Wolfgang Denk

-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de
Its always easier short term to pee in the pond
than install a toilet - it's just not a good long term plan.
          - Alan Cox in <20100101145701.6432e7b7@lxorguk.ukuu.org.uk>

* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
  2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser
  2011-01-24 16:13 ` Wolfgang Denk
@ 2011-01-26 10:45 ` Matthias Weisser
  2011-01-26 12:07   ` Albert ARIBAUD
                     ` (2 more replies)
  2011-03-11  7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser
  2 siblings, 3 replies; 15+ messages in thread
From: Matthias Weisser @ 2011-01-26 10:45 UTC (permalink / raw)
  To: u-boot

Using optimized versions of memset and memcpy from linux brings a quite
noticeable speed (x2 or better) improvement for these two functions.

Here are some numbers for test done with jadecpu

                           | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
                           |        | +patch |        | +patch |
---------------------------+--------+--------+--------+--------+
Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
                           |        |        |        |        |
TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
                           |        |        |        |        |
FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
                           |        |        |        |        |
BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
 where CRC is              |  615ms |  615ms |   54ms |   54ms |
 uncompress                | 2460ms | 2462ms |  450ms |  451ms |
 final boot_elf            |  376ms |   68ms |   65ms |   65ms |
                           |        |        |        |        |
BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
 where CRC is              |  600ms |  600ms |  135ms |  135ms |
 uncompress                | 2209ms | 2211ms |  828ms |  828ms |
                           |        |        |        |        |
Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |

(1) No dcache
(2) dcache enabled in board_init
*Does not work when dcache is on

Size impact:

C version:
   text    data     bss     dec     hex filename
 202862   18912  266456  488230   77326 u-boot

ASM version:
   text    data     bss     dec     hex filename
 203798   18912  266288  488998   77626 u-boot
222712  u-boot.bin

Changes since V1:
  - Made the usage of these functions optional by CONFIG_USE_ARCH_MEM
  - Usage of PLD instruction on all architectures supporting it
  - Added a README entry
  - Minor style fixes

Signed-off-by: Matthias Weisser <weisserm@arcor.de>
---
 README                           |    6 +
 arch/arm/include/asm/assembler.h |   60 ++++++++++
 arch/arm/include/asm/string.h    |   10 ++-
 arch/arm/lib/Makefile            |    2 +
 arch/arm/lib/memcpy.S            |  241 ++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/memset.S            |  126 ++++++++++++++++++++
 6 files changed, 443 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/include/asm/assembler.h
 create mode 100644 arch/arm/lib/memcpy.S
 create mode 100644 arch/arm/lib/memset.S

diff --git a/README b/README
index 755d17c..5c610f2 100644
--- a/README
+++ b/README
@@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options:
 		that is executed before the actual U-Boot. E.g. when
 		compiling a NAND SPL.
 
+- CONFIG_USE_ARCH_MEMCPY
+  CONFIG_USE_ARCH_MEMSET
+		If these options are used, an optimized version of memcpy/memset will
+		be used if available. These functions may be faster under some
+		conditions but may increase the binary size.
+
 Building the Software:
 ======================
 
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
new file mode 100644
index 0000000..418ee94
--- /dev/null
+++ b/arch/arm/include/asm/assembler.h
@@ -0,0 +1,60 @@
+/*
+ *  arch/arm/include/asm/assembler.h
+ *
+ *  Copyright (C) 1996-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This file contains arm architecture specific defines
+ *  for the different processors.
+ *
+ *  Do not include any C declarations in this file - it is included by
+ *  assembler source.
+ */
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull		lsr
+#define push		lsl
+#define get_byte_0	lsl #0
+#define get_byte_1	lsr #8
+#define get_byte_2	lsr #16
+#define get_byte_3	lsr #24
+#define put_byte_0	lsl #0
+#define put_byte_1	lsl #8
+#define put_byte_2	lsl #16
+#define put_byte_3	lsl #24
+#else
+#define pull		lsl
+#define push		lsr
+#define get_byte_0	lsr #24
+#define get_byte_1	lsr #16
+#define get_byte_2	lsr #8
+#define get_byte_3      lsl #0
+#define put_byte_0	lsl #24
+#define put_byte_1	lsl #16
+#define put_byte_2	lsl #8
+#define put_byte_3      lsl #0
+#endif
+
+/*
+ * Data preload for architectures that support it
+ */
+#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
+	defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+	defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \
+	defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \
+	defined(__ARM_ARCH_7R__)
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * Cache aligned
+ */
+#define CALGN(code...) code
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index c3ea582..c6dfb25 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_ARM_STRING_H
 #define __ASM_ARM_STRING_H
 
+#include <config.h>
+
 /*
  * We don't do inline string functions, since the
  * optimised inline asm versions are not small.
@@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c);
 #undef __HAVE_ARCH_STRCHR
 extern char * strchr(const char * s, int c);
 
-#undef __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_USE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMCPY
+#endif
 extern void * memcpy(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMMOVE
@@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t);
 extern void * memchr(const void *, int, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMZERO
-#undef __HAVE_ARCH_MEMSET
+#ifdef CONFIG_USE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMSET
+#endif
 extern void * memset(void *, int, __kernel_size_t);
 
 #if 0
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 454440c..03b1b5e 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -44,6 +44,8 @@ COBJS-y	+= cache-cp15.o
 endif
 COBJS-y	+= interrupts.o
 COBJS-y	+= reset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o
 
 SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
 	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
new file mode 100644
index 0000000..40db90e
--- /dev/null
+++ b/arch/arm/lib/memcpy.S
@@ -0,0 +1,241 @@
+/*
+ *  linux/arch/arm/lib/memcpy.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <asm/assembler.h>
+
+#define W(instr)	instr
+
+#define LDR1W_SHIFT	0
+#define STR1W_SHIFT	0
+
+	.macro ldr1w ptr reg abort
+	W(ldr) \reg, [\ptr], #4
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro str1w ptr reg abort
+	W(str) \reg, [\ptr], #4
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro enter reg1 reg2
+	stmdb sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.macro exit reg1 reg2
+	ldmfd sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.text
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+.globl memcpy
+memcpy:
+
+		enter	r4, lr
+
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #0]		)
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	r3, ip, #32		)
+	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #0]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+3:	PLD(	pld	[r1, #124]		)
+4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #32
+		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+#if LDR1W_SHIFT > 0
+		lsl	ip, ip, #LDR1W_SHIFT
+#endif
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:
+		.rept	(1 << LDR1W_SHIFT)
+		W(nop)
+		.endr
+		ldr1w	r1, r3, abort=20f
+		ldr1w	r1, r4, abort=20f
+		ldr1w	r1, r5, abort=20f
+		ldr1w	r1, r6, abort=20f
+		ldr1w	r1, r7, abort=20f
+		ldr1w	r1, r8, abort=20f
+		ldr1w	r1, lr, abort=20f
+
+#if LDR1W_SHIFT < STR1W_SHIFT
+		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
+#elif LDR1W_SHIFT > STR1W_SHIFT
+		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
+#endif
+		add	pc, pc, ip
+		nop
+		.rept	(1 << STR1W_SHIFT)
+		W(nop)
+		.endr
+		str1w	r0, r3, abort=20f
+		str1w	r0, r4, abort=20f
+		str1w	r0, r5, abort=20f
+		str1w	r0, r6, abort=20f
+		str1w	r0, r7, abort=20f
+		str1w	r0, r8, abort=20f
+		str1w	r0, lr, abort=20f
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldr1b	r1, r3, ne, abort=21f
+		ldr1b	r1, r4, cs, abort=21f
+		ldr1b	r1, ip, cs, abort=21f
+		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r4, cs, abort=21f
+		str1b	r0, ip, cs, abort=21f
+
+		exit	r4, pc
+
+9:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldr1b	r1, r3, gt, abort=21f
+		ldr1b	r1, r4, ge, abort=21f
+		ldr1b	r1, lr, abort=21f
+		str1b	r0, r3, gt, abort=21f
+		str1b	r0, r4, ge, abort=21f
+		subs	r2, r2, ip
+		str1b	r0, lr, abort=21f
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr1w	r1, lr, abort=21f
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28
+		blt	14f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #0]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+12:	PLD(	pld	[r1, #124]		)
+13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
+		mov	r3, lr, pull #\pull
+		subs	r2, r2, #32
+		ldr4w	r1, r8, r9, ip, lr, abort=19f
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr1w	r1, lr, abort=21f
+		subs	ip, ip, #4
+		orr	r3, r3, lr, push #\push
+		str1w	r0, r3, abort=21f
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		sub	r1, r1, #(\push / 8)
+		b	8b
+
+		.endm
+
+
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
new file mode 100644
index 0000000..0cdf895
--- /dev/null
+++ b/arch/arm/lib/memset.S
@@ -0,0 +1,126 @@
+/*
+ *  linux/arch/arm/lib/memset.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <asm/assembler.h>
+
+	.text
+	.align	5
+	.word	0
+
+1:	subs	r2, r2, #4		@ 1 do we have enough
+	blt	5f			@ 1 bytes to align with?
+	cmp	r3, #2			@ 1
+	strltb	r1, [r0], #1		@ 1
+	strleb	r1, [r0], #1		@ 1
+	strb	r1, [r0], #1		@ 1
+	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
+/*
+ * The pointer is now aligned and the length is adjusted.  Try doing the
+ * memset again.
+ */
+
+.globl memset
+memset:
+	ands	r3, r0, #3		@ 1 unaligned?
+	bne	1b			@ 1
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+	orr	r1, r1, r1, lsl #8
+	orr	r1, r1, r1, lsl #16
+	mov	r3, r1
+	cmp	r2, #16
+	blt	4f
+
+#if ! CALGN(1)+0
+
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+	str	lr, [sp, #-4]!
+	mov	ip, r1
+	mov	lr, r1
+
+2:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
+	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	r0!, {r1, r3, ip, lr}
+	bgt	2b
+	ldmeqfd	sp!, {pc}		@ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+	tst	r2, #32
+	stmneia	r0!, {r1, r3, ip, lr}
+	stmneia	r0!, {r1, r3, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r1, r3, ip, lr}
+	ldr	lr, [sp], #4
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r1
+	mov	r5, r1
+	mov	r6, r1
+	mov	r7, r1
+	mov	ip, r1
+	mov	lr, r1
+
+	cmp	r2, #96
+	tstgt	r0, #31
+	ble	3f
+
+	and	ip, r0, #31
+	rsb	ip, ip, #32
+	sub	r2, r2, ip
+	movs	ip, ip, lsl #(32 - 4)
+	stmcsia	r0!, {r4, r5, r6, r7}
+	stmmiia	r0!, {r4, r5}
+	tst	ip, #(1 << 30)
+	mov	ip, r1
+	strne	r1, [r0], #4
+
+3:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r2, #32
+	stmneia	r0!, {r1, r3-r7, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
+4:	tst	r2, #8
+	stmneia	r0!, {r1, r3}
+	tst	r2, #4
+	strne	r1, [r0], #4
+/*
+ * When we get here, we've got less than 4 bytes to zero.  We
+ * may have an unaligned pointer as well.
+ */
+5:	tst	r2, #2
+	strneb	r1, [r0], #1
+	strneb	r1, [r0], #1
+	tst	r2, #1
+	strneb	r1, [r0], #1
+	mov	pc, lr
-- 
1.7.0.4

* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
  2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser
@ 2011-01-26 12:07   ` Albert ARIBAUD
  2011-01-26 12:50     ` Matthias Weißer
  2011-02-20 19:35   ` Alexander Holler
  2011-03-03  7:07   ` Albert ARIBAUD
  2 siblings, 1 reply; 15+ messages in thread
From: Albert ARIBAUD @ 2011-01-26 12:07 UTC (permalink / raw)
  To: u-boot

Hi Matthias,

On 26/01/2011 11:45, Matthias Weisser wrote:
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.
>
> Here are some numbers for test done with jadecpu
>
>                             | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
>                             |        | +patch |        | +patch |
> ---------------------------+--------+--------+--------+--------+
> Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
>                             |        |        |        |        |
> TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
>                             |        |        |        |        |
> FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
>                             |        |        |        |        |
> BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
>   where CRC is              |  615ms |  615ms |   54ms |   54ms |
>   uncompress                | 2460ms | 2462ms |  450ms |  451ms |
>   final boot_elf            |  376ms |   68ms |   65ms |   65ms |
>                             |        |        |        |        |
> BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
>   where CRC is              |  600ms |  600ms |  135ms |  135ms |
>   uncompress                | 2209ms | 2211ms |  828ms |  828ms |
>                             |        |        |        |        |
> Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |
>
> (1) No dcache
> (2) dcache enabled in board_init
> *Does not work when dcache is on
>
> Size impact:
>
> C version:
>     text    data     bss     dec     hex filename
>   202862   18912  266456  488230   77326 u-boot
>
> ASM version:
>     text    data     bss     dec     hex filename
>   203798   18912  266288  488998   77626 u-boot
> 222712  u-boot.bin
>
> Changes since V1:
>    - Made the usage of these functions optional by CONFIG_USE_ARCH_MEM
>    - Usage of PLD instruction on all architectures supporting it
>    - Added a README entry
>    - Minor style fixes
>
> Signed-off-by: Matthias Weisser<weisserm@arcor.de>
> ---

IIRC, the '---' line separates patch commit message (above) from 
freeform comments and history (below). Here, at least the version 
history should move below the '---' line.

Also, I think that above the line, /some/ indication of performance 
enhancement and drawbacks should be given, but not a full ASCII table of 
numbers -- that can go below the line.
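
Schematically (illustrative layout only, not your actual patch):

	arm: Use optimized memcpy and memset from linux

	<commit message: what/why, plus a short note on the speed gain
	 and the size cost>

	Signed-off-by: ...
	---
	Changes since V1:
	  - ...

	 README                           |    6 +
	 ...

	<diff>

git am uses only the text above the '---' for the commit message, which is
exactly why the version history belongs below it.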

>   README                           |    6 +
>   arch/arm/include/asm/assembler.h |   60 ++++++++++
>   arch/arm/include/asm/string.h    |   10 ++-
>   arch/arm/lib/Makefile            |    2 +
>   arch/arm/lib/memcpy.S            |  241 ++++++++++++++++++++++++++++++++++++++
>   arch/arm/lib/memset.S            |  126 ++++++++++++++++++++
>   6 files changed, 443 insertions(+), 2 deletions(-)
>   create mode 100644 arch/arm/include/asm/assembler.h
>   create mode 100644 arch/arm/lib/memcpy.S
>   create mode 100644 arch/arm/lib/memset.S
>
> diff --git a/README b/README
> index 755d17c..5c610f2 100644
> --- a/README
> +++ b/README
> @@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options:
>   		that is executed before the actual U-Boot. E.g. when
>   		compiling a NAND SPL.
>
> +- CONFIG_USE_ARCH_MEMCPY
> +  CONFIG_USE_ARCH_MEMSET
> +		If these options are used, an optimized version of memcpy/memset will
> +		be used if available. These functions may be faster under some
> +		conditions but may increase the binary size.
> +

The name of the options is not self-explanatory to me. If the difference 
is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx 
would be a better name?

>   Building the Software:
>   ======================
>
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> new file mode 100644
> index 0000000..418ee94
> --- /dev/null
> +++ b/arch/arm/include/asm/assembler.h
> @@ -0,0 +1,60 @@
> +/*
> + *  arch/arm/include/asm/assembler.h
> + *
> + *  Copyright (C) 1996-2000 Russell King
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + *  This file contains arm architecture specific defines
> + *  for the different processors.
> + *
> + *  Do not include any C declarations in this file - it is included by
> + *  assembler source.
> + */
> +
> +/*
> + * Endian independent macros for shifting bytes within registers.
> + */
> +#ifndef __ARMEB__
> +#define pull		lsr
> +#define push		lsl
> +#define get_byte_0	lsl #0
> +#define get_byte_1	lsr #8
> +#define get_byte_2	lsr #16
> +#define get_byte_3	lsr #24
> +#define put_byte_0	lsl #0
> +#define put_byte_1	lsl #8
> +#define put_byte_2	lsl #16
> +#define put_byte_3	lsl #24
> +#else
> +#define pull		lsl
> +#define push		lsr
> +#define get_byte_0	lsr #24
> +#define get_byte_1	lsr #16
> +#define get_byte_2	lsr #8
> +#define get_byte_3      lsl #0
> +#define put_byte_0	lsl #24
> +#define put_byte_1	lsl #16
> +#define put_byte_2	lsl #8
> +#define put_byte_3      lsl #0
> +#endif
> +
> +/*
> + * Data preload for architectures that support it
> + */
> +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
> +	defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
> +	defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \
> +	defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \
> +	defined(__ARM_ARCH_7R__)
> +#define PLD(code...)	code
> +#else
> +#define PLD(code...)
> +#endif
> +
> +/*
> + * Cache aligned
> + */
> +#define CALGN(code...) code
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index c3ea582..c6dfb25 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -1,6 +1,8 @@
>   #ifndef __ASM_ARM_STRING_H
>   #define __ASM_ARM_STRING_H
>
> +#include<config.h>
> +
>   /*
>    * We don't do inline string functions, since the
>    * optimised inline asm versions are not small.
> @@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c);
>   #undef __HAVE_ARCH_STRCHR
>   extern char * strchr(const char * s, int c);
>
> -#undef __HAVE_ARCH_MEMCPY
> +#ifdef CONFIG_USE_ARCH_MEMCPY
> +#define __HAVE_ARCH_MEMCPY
> +#endif
>   extern void * memcpy(void *, const void *, __kernel_size_t);
>
>   #undef __HAVE_ARCH_MEMMOVE
> @@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t);
>   extern void * memchr(const void *, int, __kernel_size_t);
>
>   #undef __HAVE_ARCH_MEMZERO
> -#undef __HAVE_ARCH_MEMSET
> +#ifdef CONFIG_USE_ARCH_MEMSET
> +#define __HAVE_ARCH_MEMSET
> +#endif
>   extern void * memset(void *, int, __kernel_size_t);
>
>   #if 0
> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index 454440c..03b1b5e 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -44,6 +44,8 @@ COBJS-y	+= cache-cp15.o
>   endif
>   COBJS-y	+= interrupts.o
>   COBJS-y	+= reset.o
> +SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o
> +SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o
>
>   SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
>   	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
> diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
> new file mode 100644
> index 0000000..40db90e
> --- /dev/null
> +++ b/arch/arm/lib/memcpy.S
> @@ -0,0 +1,241 @@
> +/*
> + *  linux/arch/arm/lib/memcpy.S
> + *
> + *  Author:	Nicolas Pitre
> + *  Created:	Sep 28, 2005
> + *  Copyright:	MontaVista Software, Inc.
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License version 2 as
> + *  published by the Free Software Foundation.
> + */
> +
> +#include<asm/assembler.h>
> +
> +#define W(instr)	instr
> +
> +#define LDR1W_SHIFT	0
> +#define STR1W_SHIFT	0
> +
> +	.macro ldr1w ptr reg abort
> +	W(ldr) \reg, [\ptr], #4
> +	.endm
> +
> +	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
> +	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
> +	.endm
> +
> +	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> +	.endm
> +
> +	.macro ldr1b ptr reg cond=al abort
> +	ldr\cond\()b \reg, [\ptr], #1
> +	.endm
> +
> +	.macro str1w ptr reg abort
> +	W(str) \reg, [\ptr], #4
> +	.endm
> +
> +	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> +	.endm
> +
> +	.macro str1b ptr reg cond=al abort
> +	str\cond\()b \reg, [\ptr], #1
> +	.endm
> +
> +	.macro enter reg1 reg2
> +	stmdb sp!, {r0, \reg1, \reg2}
> +	.endm
> +
> +	.macro exit reg1 reg2
> +	ldmfd sp!, {r0, \reg1, \reg2}
> +	.endm
> +
> +	.text
> +
> +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
> +
> +.globl memcpy
> +memcpy:
> +
> +		enter	r4, lr
> +
> +		subs	r2, r2, #4
> +		blt	8f
> +		ands	ip, r0, #3
> +	PLD(	pld	[r1, #0]		)
> +		bne	9f
> +		ands	ip, r1, #3
> +		bne	10f
> +
> +1:		subs	r2, r2, #(28)
> +		stmfd	sp!, {r5 - r8}
> +		blt	5f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	rsb	r3, ip, #32		)
> +	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
> +	CALGN(	bcs	2f			)
> +	CALGN(	adr	r4, 6f			)
> +	CALGN(	subs	r2, r2, r3		)  @ C gets set
> +	CALGN(	add	pc, r4, ip		)
> +
> +	PLD(	pld	[r1, #0]		)
> +2:	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #28]		)
> +	PLD(	blt	4f			)
> +	PLD(	pld	[r1, #60]		)
> +	PLD(	pld	[r1, #92]		)
> +
> +3:	PLD(	pld	[r1, #124]		)
> +4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		subs	r2, r2, #32
> +		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		bge	3b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	4b			)
> +
> +5:		ands	ip, r2, #28
> +		rsb	ip, ip, #32
> +#if LDR1W_SHIFT>  0
> +		lsl	ip, ip, #LDR1W_SHIFT
> +#endif
> +		addne	pc, pc, ip		@ C is always clear here
> +		b	7f
> +6:
> +		.rept	(1<<  LDR1W_SHIFT)
> +		W(nop)
> +		.endr
> +		ldr1w	r1, r3, abort=20f
> +		ldr1w	r1, r4, abort=20f
> +		ldr1w	r1, r5, abort=20f
> +		ldr1w	r1, r6, abort=20f
> +		ldr1w	r1, r7, abort=20f
> +		ldr1w	r1, r8, abort=20f
> +		ldr1w	r1, lr, abort=20f
> +
> +#if LDR1W_SHIFT<  STR1W_SHIFT
> +		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
> +#elif LDR1W_SHIFT>  STR1W_SHIFT
> +		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
> +#endif
> +		add	pc, pc, ip
> +		nop
> +		.rept	(1<<  STR1W_SHIFT)
> +		W(nop)
> +		.endr
> +		str1w	r0, r3, abort=20f
> +		str1w	r0, r4, abort=20f
> +		str1w	r0, r5, abort=20f
> +		str1w	r0, r6, abort=20f
> +		str1w	r0, r7, abort=20f
> +		str1w	r0, r8, abort=20f
> +		str1w	r0, lr, abort=20f
> +
> +	CALGN(	bcs	2b			)
> +
> +7:		ldmfd	sp!, {r5 - r8}
> +
> +8:		movs	r2, r2, lsl #31
> +		ldr1b	r1, r3, ne, abort=21f
> +		ldr1b	r1, r4, cs, abort=21f
> +		ldr1b	r1, ip, cs, abort=21f
> +		str1b	r0, r3, ne, abort=21f
> +		str1b	r0, r4, cs, abort=21f
> +		str1b	r0, ip, cs, abort=21f
> +
> +		exit	r4, pc
> +
> +9:		rsb	ip, ip, #4
> +		cmp	ip, #2
> +		ldr1b	r1, r3, gt, abort=21f
> +		ldr1b	r1, r4, ge, abort=21f
> +		ldr1b	r1, lr, abort=21f
> +		str1b	r0, r3, gt, abort=21f
> +		str1b	r0, r4, ge, abort=21f
> +		subs	r2, r2, ip
> +		str1b	r0, lr, abort=21f
> +		blt	8b
> +		ands	ip, r1, #3
> +		beq	1b
> +
> +10:		bic	r1, r1, #3
> +		cmp	ip, #2
> +		ldr1w	r1, lr, abort=21f
> +		beq	17f
> +		bgt	18f
> +
> +
> +		.macro	forward_copy_shift pull push
> +
> +		subs	r2, r2, #28
> +		blt	14f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	rsb	ip, ip, #32		)
> +	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
> +	CALGN(	subcc	r2, r2, ip		)
> +	CALGN(	bcc	15f			)
> +
> +11:		stmfd	sp!, {r5 - r9}
> +
> +	PLD(	pld	[r1, #0]		)
> +	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #28]		)
> +	PLD(	blt	13f			)
> +	PLD(	pld	[r1, #60]		)
> +	PLD(	pld	[r1, #92]		)
> +
> +12:	PLD(	pld	[r1, #124]		)
> +13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
> +		mov	r3, lr, pull #\pull
> +		subs	r2, r2, #32
> +		ldr4w	r1, r8, r9, ip, lr, abort=19f
> +		orr	r3, r3, r4, push #\push
> +		mov	r4, r4, pull #\pull
> +		orr	r4, r4, r5, push #\push
> +		mov	r5, r5, pull #\pull
> +		orr	r5, r5, r6, push #\push
> +		mov	r6, r6, pull #\pull
> +		orr	r6, r6, r7, push #\push
> +		mov	r7, r7, pull #\pull
> +		orr	r7, r7, r8, push #\push
> +		mov	r8, r8, pull #\pull
> +		orr	r8, r8, r9, push #\push
> +		mov	r9, r9, pull #\pull
> +		orr	r9, r9, ip, push #\push
> +		mov	ip, ip, pull #\pull
> +		orr	ip, ip, lr, push #\push
> +		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
> +		bge	12b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	13b			)
> +
> +		ldmfd	sp!, {r5 - r9}
> +
> +14:		ands	ip, r2, #28
> +		beq	16f
> +
> +15:		mov	r3, lr, pull #\pull
> +		ldr1w	r1, lr, abort=21f
> +		subs	ip, ip, #4
> +		orr	r3, r3, lr, push #\push
> +		str1w	r0, r3, abort=21f
> +		bgt	15b
> +	CALGN(	cmp	r2, #0			)
> +	CALGN(	bge	11b			)
> +
> +16:		sub	r1, r1, #(\push / 8)
> +		b	8b
> +
> +		.endm
> +
> +
> +		forward_copy_shift	pull=8	push=24
> +
> +17:		forward_copy_shift	pull=16	push=16
> +
> +18:		forward_copy_shift	pull=24	push=8
> +
> diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
> new file mode 100644
> index 0000000..0cdf895
> --- /dev/null
> +++ b/arch/arm/lib/memset.S
> @@ -0,0 +1,126 @@
> +/*
> + *  linux/arch/arm/lib/memset.S
> + *
> + *  Copyright (C) 1995-2000 Russell King
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + *  ASM optimised string functions
> + */
> +#include<asm/assembler.h>
> +
> +	.text
> +	.align	5
> +	.word	0
> +
> +1:	subs	r2, r2, #4		@ 1 do we have enough
> +	blt	5f			@ 1 bytes to align with?
> +	cmp	r3, #2			@ 1
> +	strltb	r1, [r0], #1		@ 1
> +	strleb	r1, [r0], #1		@ 1
> +	strb	r1, [r0], #1		@ 1
> +	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
> +/*
> + * The pointer is now aligned and the length is adjusted.  Try doing the
> + * memset again.
> + */
> +
> +.globl memset
> +memset:
> +	ands	r3, r0, #3		@ 1 unaligned?
> +	bne	1b			@ 1
> +/*
> + * we know that the pointer in r0 is aligned to a word boundary.
> + */
> +	orr	r1, r1, r1, lsl #8
> +	orr	r1, r1, r1, lsl #16
> +	mov	r3, r1
> +	cmp	r2, #16
> +	blt	4f
> +
> +#if ! CALGN(1)+0
> +
> +/*
> + * We need an extra register for this loop - save the return address and
> + * use the LR
> + */
> +	str	lr, [sp, #-4]!
> +	mov	ip, r1
> +	mov	lr, r1
> +
> +2:	subs	r2, r2, #64
> +	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	bgt	2b
> +	ldmeqfd	sp!, {pc}		@ Now<64 bytes to go.
> +/*
> + * No need to correct the count; we're only testing bits from now on
> + */
> +	tst	r2, #32
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	tst	r2, #16
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	ldr	lr, [sp], #4
> +
> +#else
> +
> +/*
> + * This version aligns the destination pointer in order to write
> + * whole cache lines at once.
> + */
> +
> +	stmfd	sp!, {r4-r7, lr}
> +	mov	r4, r1
> +	mov	r5, r1
> +	mov	r6, r1
> +	mov	r7, r1
> +	mov	ip, r1
> +	mov	lr, r1
> +
> +	cmp	r2, #96
> +	tstgt	r0, #31
> +	ble	3f
> +
> +	and	ip, r0, #31
> +	rsb	ip, ip, #32
> +	sub	r2, r2, ip
> +	movs	ip, ip, lsl #(32 - 4)
> +	stmcsia	r0!, {r4, r5, r6, r7}
> +	stmmiia	r0!, {r4, r5}
> +	tst	ip, #(1 << 30)
> +	mov	ip, r1
> +	strne	r1, [r0], #4
> +
> +3:	subs	r2, r2, #64
> +	stmgeia	r0!, {r1, r3-r7, ip, lr}
> +	stmgeia	r0!, {r1, r3-r7, ip, lr}
> +	bgt	3b
> +	ldmeqfd	sp!, {r4-r7, pc}
> +
> +	tst	r2, #32
> +	stmneia	r0!, {r1, r3-r7, ip, lr}
> +	tst	r2, #16
> +	stmneia	r0!, {r4-r7}
> +	ldmfd	sp!, {r4-r7, lr}
> +
> +#endif
> +
> +4:	tst	r2, #8
> +	stmneia	r0!, {r1, r3}
> +	tst	r2, #4
> +	strne	r1, [r0], #4
> +/*
> + * When we get here, we've got less than 4 bytes to zero.  We
> + * may have an unaligned pointer as well.
> + */
> +5:	tst	r2, #2
> +	strneb	r1, [r0], #1
> +	strneb	r1, [r0], #1
> +	tst	r2, #1
> +	strneb	r1, [r0], #1
> +	mov	pc, lr

Best regards,
-- 
Albert.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
  2011-01-26 12:07   ` Albert ARIBAUD
@ 2011-01-26 12:50     ` Matthias Weißer
  2011-01-26 13:07       ` Wolfgang Denk
  0 siblings, 1 reply; 15+ messages in thread
From: Matthias Weißer @ 2011-01-26 12:50 UTC (permalink / raw)
  To: u-boot

On 26.01.2011 13:07, Albert ARIBAUD wrote:
>> ---
>
> IIRC, the '---' line separates patch commit message (above) from
> freeform comments and history (below). Here, at least the version
> history should move below the '---' line.

Wolfgang asked me to add the numbers to the commit message. For the
changelog I will look into which git commands handle this best, so that I
don't have to edit the patch file manually before sending it with git
send-email.
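
For reference, the layout under discussion looks roughly like this (only a
sketch with placeholder text; the diffstat line stands in for the real one).
git am takes everything above the '---' line as the commit message and
ignores what sits between '---' and the first diff, so history placed there
stays out of the commit:

  arm: Use optimized memcpy and memset from linux

  <commit message, benchmark and size numbers>

  Signed-off-by: Matthias Weisser <weisserm@arcor.de>
  ---
  Changes since V1:
    - ...

   arch/arm/lib/memcpy.S |  241 ++++++++++++++
  diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S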

>> +- CONFIG_USE_ARCH_MEMCPY
>> + CONFIG_USE_ARCH_MEMSET
>> + If these options are used, an optimized version of memcpy/memset will
>> + be used if available. These functions may be faster under some
>> + conditions but may increase the binary size.
>> +
>
>> The names of the options are not self-explanatory to me. If the difference
> is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx
> would be a better name?

Wolfgang didn't object to these names. If we use the OPTIMAL form it is
still not clear what "optimal" means. There may be a size-optimized version
and a speed-optimized version, so we would need
CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx, which I personally dislike a lot as
it is quite long. I also think that if there is an architecture-specific
function, it should already be clear that it is optimized in some way.

Thanks for review

Regards
Matthias

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
  2011-01-26 12:50     ` Matthias Weißer
@ 2011-01-26 13:07       ` Wolfgang Denk
  2011-01-27 18:39         ` Albert ARIBAUD
  0 siblings, 1 reply; 15+ messages in thread
From: Wolfgang Denk @ 2011-01-26 13:07 UTC (permalink / raw)
  To: u-boot

Dear Matthias Weißer,

In message <4D4018AD.7090001@arcor.de> you wrote:
>
> > IIRC, the '---' line separates patch commit message (above) from
> > freeform comments and history (below). Here, at least the version
> > history should move below the '---' line.
> 
> Wolfgang asked me to add the numbers to the commit message. For the
> changelog I will look into which git commands handle this best, so that I
> don't have to edit the patch file manually before sending it with git
> send-email.

Indeed I find that these numbers are information that should go into
the commit message so this data is available to users who have to
decide whether they want to trade the increased speed for the
increased memory footprint.
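
(With the numbers from this patch the trade-off is roughly +936 bytes of
text (203798 - 202862) and 168 bytes less bss, i.e. a net increase of 768
bytes, in exchange for the reported 2x or better speedup of memcpy and
memset.)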

> >> +- CONFIG_USE_ARCH_MEMCPY
> >> + CONFIG_USE_ARCH_MEMSET
> >> + If these options are used, an optimized version of memcpy/memset will
> >> + be used if available. These functions may be faster under some
> >> + conditions but may increase the binary size.
> >> +
> >
> > The names of the options are not self-explanatory to me. If the difference
> > is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx
> > would be a better name?
> 
> Wolfgang didn't object to these names. If we use the OPTIMAL form it is
> still not clear what "optimal" means. There may be a size-optimized version
> and a speed-optimized version, so we would need
> CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx, which I personally dislike a lot as
> it is quite long. I also think that if there is an architecture-specific
> function, it should already be clear that it is optimized in some way.

Well, "optimal" is not a good idea as I am pretty sure that some
clever person will still be able to spare some cycles here and there,
so his code would be even "more optimal" ;-)

I think the names CONFIG_USE_ARCH_MEMCPY etc. are actually pretty
good, because they are in line with the standard names
__HAVE_ARCH_MEMCPY etc. that are used in a lot of libraries.
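
To illustrate how the pieces fit together (a minimal sketch only; the board
header name is made up), the board configuration selects the CONFIG_ option
and asm/string.h turns it into the conventional __HAVE_ARCH_ define, as the
patch does:

	/* include/configs/myboard.h - hypothetical board configuration */
	#define CONFIG_USE_ARCH_MEMCPY
	#define CONFIG_USE_ARCH_MEMSET

	/* arch/arm/include/asm/string.h - as changed by the patch */
	#ifdef CONFIG_USE_ARCH_MEMCPY
	#define __HAVE_ARCH_MEMCPY	/* assembler memcpy from memcpy.S */
	#endif
	extern void * memcpy(void *, const void *, __kernel_size_t);

	#ifdef CONFIG_USE_ARCH_MEMSET
	#define __HAVE_ARCH_MEMSET	/* assembler memset from memset.S */
	#endif
	extern void * memset(void *, int, __kernel_size_t);

Boards that leave the options unset keep the generic C implementations, so
nothing changes for them.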

Best regards,

Wolfgang Denk

-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de
"355/113 -- Not the famous irrational number PI,  but  an  incredible
simulation!"

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
  2011-01-26 13:07       ` Wolfgang Denk
@ 2011-01-27 18:39         ` Albert ARIBAUD
  0 siblings, 0 replies; 15+ messages in thread
From: Albert ARIBAUD @ 2011-01-27 18:39 UTC (permalink / raw)
  To: u-boot

Hi Wolfgang,

On 26/01/2011 14:07, Wolfgang Denk wrote:
> Dear Matthias Weißer,
>
> In message <4D4018AD.7090001@arcor.de> you wrote:
>>
>>> IIRC, the '---' line separates patch commit message (above) from
>>> freeform comments and history (below). Here, at least the version
>>> history should move below the '---' line.
>>
>> Wolfgang asked me to add the numbers to the commit message. For the
>> changelog I will look into which git commands handle this best, so that I
>> don't have to edit the patch file manually before sending it with git
>> send-email.
>
> Indeed I find that these numbers are information that should go into
> the commit message so this data is available to users who have to
> decide whether they want to trade the increased speed for the
> increased memory footprint.

Can't we have these numbers in a more compact form then? That makes for a
really big commit message.

>>>> +- CONFIG_USE_ARCH_MEMCPY
>>>> + CONFIG_USE_ARCH_MEMSET
>>>> + If these options are used, an optimized version of memcpy/memset will
>>>> + be used if available. These functions may be faster under some
>>>> + conditions but may increase the binary size.
>>>> +
>>>
>>> The names of the options are not self-explanatory to me. If the difference
>>> is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx
>>> would be a better name?
>>
>> Wolfgang didn't object to these names. If we use the OPTIMAL form it is
>> still not clear what "optimal" means. There may be a size-optimized version
>> and a speed-optimized version, so we would need
>> CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx, which I personally dislike a lot as
>> it is quite long. I also think that if there is an architecture-specific
>> function, it should already be clear that it is optimized in some way.
>
> Well, "optimal" is not a good idea as I am pretty sure that some
> clever person will still be able to spare some cycles here and there,
> so his code would be even "more optimal" ;-)

Granted.

> I think the names CONFIG_USE_ARCH_MEMCPY etc. are actually pretty
> good, because they are in line with the standard names
> __HAVE_ARCH_MEMCPY etc. that are used in a lot of libraries.

All right.

> Best regards,
>
> Wolfgang Denk

Best regards,
-- 
Albert.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
  2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser
  2011-01-26 12:07   ` Albert ARIBAUD
@ 2011-02-20 19:35   ` Alexander Holler
  2011-03-03  7:07   ` Albert ARIBAUD
  2 siblings, 0 replies; 15+ messages in thread
From: Alexander Holler @ 2011-02-20 19:35 UTC (permalink / raw)
  To: u-boot

Hello,

On 26.01.2011 11:45, Matthias Weisser wrote:
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.

Thanks, tested on armv5 and armv7, therefore:

Tested-by: Alexander Holler <holler@ahsoftware.de>

Regards,

Alexander

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
  2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser
  2011-01-26 12:07   ` Albert ARIBAUD
  2011-02-20 19:35   ` Alexander Holler
@ 2011-03-03  7:07   ` Albert ARIBAUD
  2 siblings, 0 replies; 15+ messages in thread
From: Albert ARIBAUD @ 2011-03-03  7:07 UTC (permalink / raw)
  To: u-boot

Hi Matthias,

On 26/01/2011 11:45, Matthias Weisser wrote:
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.
>
> Here are some numbers for test done with jadecpu
>
>                             | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
>                             |        | +patch |        | +patch |
> ---------------------------+--------+--------+--------+--------+
> Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
>                             |        |        |        |        |
> TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
>                             |        |        |        |        |
> FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
>                             |        |        |        |        |
> BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
>   where CRC is              |  615ms |  615ms |   54ms |   54ms |
>   uncompress                | 2460ms | 2462ms |  450ms |  451ms |
>   final boot_elf            |  376ms |   68ms |   65ms |   65ms |
>                             |        |        |        |        |
> BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
>   where CRC is              |  600ms |  600ms |  135ms |  135ms |
>   uncompress                | 2209ms | 2211ms |  828ms |  828ms |
>                             |        |        |        |        |
> Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |
>
> (1) No dcache
> (2) dcache enabled in board_init
> *Does not work when dcache is on
>
> Size impact:
>
> C version:
>     text    data     bss     dec     hex filename
>   202862   18912  266456  488230   77326 u-boot
>
> ASM version:
>     text    data     bss     dec     hex filename
>   203798   18912  266288  488998   77626 u-boot
> 222712  u-boot.bin
>
> Changes since V1:
>    - Made the usage of these functions optional via CONFIG_USE_ARCH_MEM
>    - Usage of PLD instruction on all architectures supporting it
>    - Added a README entry
>    - Minor style fixes
>
> Signed-off-by: Matthias Weisser<weisserm@arcor.de>
> ---

Sorry for leaving this patch alone for so long.

Can you please just repost a (rebased) V3 with the history below the cut
line so that it does not appear in the commit message? Although it is not a
fix, it was tested on two different ARM archs, so I think I'll add it to
master rather than next so that it goes into mainline without delay.

Best regards,
-- 
Albert.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [U-Boot] [PATCH V3] arm: Use optimized memcpy and memset from linux
  2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser
  2011-01-24 16:13 ` Wolfgang Denk
  2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser
@ 2011-03-11  7:36 ` Matthias Weisser
  2011-03-24 14:34   ` Albert ARIBAUD
  2 siblings, 1 reply; 15+ messages in thread
From: Matthias Weisser @ 2011-03-11  7:36 UTC (permalink / raw)
  To: u-boot

Using optimized versions of memset and memcpy from linux brings a quite
noticeable speed (x2 or better) improvement for these two functions.

Here are some numbers for test done with jadecpu

                           | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
                           |        | +patch |        | +patch |
---------------------------+--------+--------+--------+--------+
Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
                           |        |        |        |        |
TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
                           |        |        |        |        |
FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
                           |        |        |        |        |
BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
 where CRC is              |  615ms |  615ms |   54ms |   54ms |
 uncompress                | 2460ms | 2462ms |  450ms |  451ms |
 final boot_elf            |  376ms |   68ms |   65ms |   65ms |
                           |        |        |        |        |
BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
 where CRC is              |  600ms |  600ms |  135ms |  135ms |
 uncompress                | 2209ms | 2211ms |  828ms |  828ms |
                           |        |        |        |        |
Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |

(1) No dcache
(2) dcache enabled in board_init
*Does not work when dcache is on

Size impact:

C version:
   text    data     bss     dec     hex filename
 202862   18912  266456  488230   77326 u-boot

ASM version:
   text    data     bss     dec     hex filename
 203798   18912  266288  488998   77626 u-boot
222712  u-boot.bin

Signed-off-by: Matthias Weisser <weisserm@arcor.de>
---

Changes since V2:
  - Moved the history below the --- line
  
Changes since V1:
  - Made the usage of these functions optional via CONFIG_USE_ARCH_MEM
  - Usage of PLD instruction on all architectures supporting it
  - Added a README entry
  - Minor style fixes

 README                           |    6 +
 arch/arm/include/asm/assembler.h |   60 ++++++++++
 arch/arm/include/asm/string.h    |   10 ++-
 arch/arm/lib/Makefile            |    2 +
 arch/arm/lib/memcpy.S            |  241 ++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/memset.S            |  126 ++++++++++++++++++++
 6 files changed, 443 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/include/asm/assembler.h
 create mode 100644 arch/arm/lib/memcpy.S
 create mode 100644 arch/arm/lib/memset.S

diff --git a/README b/README
index 755d17c..c2d82a5 100644
--- a/README
+++ b/README
@@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options:
 		that is executed before the actual U-Boot. E.g. when
 		compiling a NAND SPL.
 
+- CONFIG_USE_ARCH_MEMCPY
+  CONFIG_USE_ARCH_MEMSET
+		If these options are used, an optimized version of memcpy/memset will
+		be used if available. These functions may be faster under some
+		conditions but may increase the binary size.
+
 Building the Software:
 ======================
 
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
new file mode 100644
index 0000000..5e4789b
--- /dev/null
+++ b/arch/arm/include/asm/assembler.h
@@ -0,0 +1,60 @@
+/*
+ *  arch/arm/include/asm/assembler.h
+ *
+ *  Copyright (C) 1996-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This file contains arm architecture specific defines
+ *  for the different processors.
+ *
+ *  Do not include any C declarations in this file - it is included by
+ *  assembler source.
+ */
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull		lsr
+#define push		lsl
+#define get_byte_0	lsl #0
+#define get_byte_1	lsr #8
+#define get_byte_2	lsr #16
+#define get_byte_3	lsr #24
+#define put_byte_0	lsl #0
+#define put_byte_1	lsl #8
+#define put_byte_2	lsl #16
+#define put_byte_3	lsl #24
+#else
+#define pull		lsl
+#define push		lsr
+#define get_byte_0	lsr #24
+#define get_byte_1	lsr #16
+#define get_byte_2	lsr #8
+#define get_byte_3      lsl #0
+#define put_byte_0	lsl #24
+#define put_byte_1	lsl #16
+#define put_byte_2	lsl #8
+#define put_byte_3      lsl #0
+#endif
+
+/*
+ * Data preload for architectures that support it
+ */
+#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
+	defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+	defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \
+	defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \
+	defined(__ARM_ARCH_7R__)
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * Cache aligned
+ */
+#define CALGN(code...) code
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index c3ea582..c6dfb25 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_ARM_STRING_H
 #define __ASM_ARM_STRING_H
 
+#include <config.h>
+
 /*
  * We don't do inline string functions, since the
  * optimised inline asm versions are not small.
@@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c);
 #undef __HAVE_ARCH_STRCHR
 extern char * strchr(const char * s, int c);
 
-#undef __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_USE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMCPY
+#endif
 extern void * memcpy(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMMOVE
@@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t);
 extern void * memchr(const void *, int, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMZERO
-#undef __HAVE_ARCH_MEMSET
+#ifdef CONFIG_USE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMSET
+#endif
 extern void * memset(void *, int, __kernel_size_t);
 
 #if 0
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 454440c..03b1b5e 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -44,6 +44,8 @@ COBJS-y	+= cache-cp15.o
 endif
 COBJS-y	+= interrupts.o
 COBJS-y	+= reset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o
 
 SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
 	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
new file mode 100644
index 0000000..40db90e
--- /dev/null
+++ b/arch/arm/lib/memcpy.S
@@ -0,0 +1,241 @@
+/*
+ *  linux/arch/arm/lib/memcpy.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <asm/assembler.h>
+
+#define W(instr)	instr
+
+#define LDR1W_SHIFT	0
+#define STR1W_SHIFT	0
+
+	.macro ldr1w ptr reg abort
+	W(ldr) \reg, [\ptr], #4
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro str1w ptr reg abort
+	W(str) \reg, [\ptr], #4
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro enter reg1 reg2
+	stmdb sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.macro exit reg1 reg2
+	ldmfd sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.text
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+.globl memcpy
+memcpy:
+
+		enter	r4, lr
+
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #0]		)
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	r3, ip, #32		)
+	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #0]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+3:	PLD(	pld	[r1, #124]		)
+4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #32
+		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+#if LDR1W_SHIFT > 0
+		lsl	ip, ip, #LDR1W_SHIFT
+#endif
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:
+		.rept	(1 << LDR1W_SHIFT)
+		W(nop)
+		.endr
+		ldr1w	r1, r3, abort=20f
+		ldr1w	r1, r4, abort=20f
+		ldr1w	r1, r5, abort=20f
+		ldr1w	r1, r6, abort=20f
+		ldr1w	r1, r7, abort=20f
+		ldr1w	r1, r8, abort=20f
+		ldr1w	r1, lr, abort=20f
+
+#if LDR1W_SHIFT < STR1W_SHIFT
+		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
+#elif LDR1W_SHIFT > STR1W_SHIFT
+		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
+#endif
+		add	pc, pc, ip
+		nop
+		.rept	(1 << STR1W_SHIFT)
+		W(nop)
+		.endr
+		str1w	r0, r3, abort=20f
+		str1w	r0, r4, abort=20f
+		str1w	r0, r5, abort=20f
+		str1w	r0, r6, abort=20f
+		str1w	r0, r7, abort=20f
+		str1w	r0, r8, abort=20f
+		str1w	r0, lr, abort=20f
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldr1b	r1, r3, ne, abort=21f
+		ldr1b	r1, r4, cs, abort=21f
+		ldr1b	r1, ip, cs, abort=21f
+		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r4, cs, abort=21f
+		str1b	r0, ip, cs, abort=21f
+
+		exit	r4, pc
+
+9:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldr1b	r1, r3, gt, abort=21f
+		ldr1b	r1, r4, ge, abort=21f
+		ldr1b	r1, lr, abort=21f
+		str1b	r0, r3, gt, abort=21f
+		str1b	r0, r4, ge, abort=21f
+		subs	r2, r2, ip
+		str1b	r0, lr, abort=21f
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr1w	r1, lr, abort=21f
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28
+		blt	14f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #0]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+12:	PLD(	pld	[r1, #124]		)
+13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
+		mov	r3, lr, pull #\pull
+		subs	r2, r2, #32
+		ldr4w	r1, r8, r9, ip, lr, abort=19f
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr1w	r1, lr, abort=21f
+		subs	ip, ip, #4
+		orr	r3, r3, lr, push #\push
+		str1w	r0, r3, abort=21f
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		sub	r1, r1, #(\push / 8)
+		b	8b
+
+		.endm
+
+
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
new file mode 100644
index 0000000..0cdf895
--- /dev/null
+++ b/arch/arm/lib/memset.S
@@ -0,0 +1,126 @@
+/*
+ *  linux/arch/arm/lib/memset.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <asm/assembler.h>
+
+	.text
+	.align	5
+	.word	0
+
+1:	subs	r2, r2, #4		@ 1 do we have enough
+	blt	5f			@ 1 bytes to align with?
+	cmp	r3, #2			@ 1
+	strltb	r1, [r0], #1		@ 1
+	strleb	r1, [r0], #1		@ 1
+	strb	r1, [r0], #1		@ 1
+	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
+/*
+ * The pointer is now aligned and the length is adjusted.  Try doing the
+ * memset again.
+ */
+
+.globl memset
+memset:
+	ands	r3, r0, #3		@ 1 unaligned?
+	bne	1b			@ 1
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+	orr	r1, r1, r1, lsl #8
+	orr	r1, r1, r1, lsl #16
+	mov	r3, r1
+	cmp	r2, #16
+	blt	4f
+
+#if ! CALGN(1)+0
+
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+	str	lr, [sp, #-4]!
+	mov	ip, r1
+	mov	lr, r1
+
+2:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
+	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	r0!, {r1, r3, ip, lr}
+	bgt	2b
+	ldmeqfd	sp!, {pc}		@ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+	tst	r2, #32
+	stmneia	r0!, {r1, r3, ip, lr}
+	stmneia	r0!, {r1, r3, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r1, r3, ip, lr}
+	ldr	lr, [sp], #4
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r1
+	mov	r5, r1
+	mov	r6, r1
+	mov	r7, r1
+	mov	ip, r1
+	mov	lr, r1
+
+	cmp	r2, #96
+	tstgt	r0, #31
+	ble	3f
+
+	and	ip, r0, #31
+	rsb	ip, ip, #32
+	sub	r2, r2, ip
+	movs	ip, ip, lsl #(32 - 4)
+	stmcsia	r0!, {r4, r5, r6, r7}
+	stmmiia	r0!, {r4, r5}
+	tst	ip, #(1 << 30)
+	mov	ip, r1
+	strne	r1, [r0], #4
+
+3:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r2, #32
+	stmneia	r0!, {r1, r3-r7, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
+4:	tst	r2, #8
+	stmneia	r0!, {r1, r3}
+	tst	r2, #4
+	strne	r1, [r0], #4
+/*
+ * When we get here, we've got less than 4 bytes to zero.  We
+ * may have an unaligned pointer as well.
+ */
+5:	tst	r2, #2
+	strneb	r1, [r0], #1
+	strneb	r1, [r0], #1
+	tst	r2, #1
+	strneb	r1, [r0], #1
+	mov	pc, lr
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [U-Boot] [PATCH V3] arm: Use optimized memcpy and memset from linux
  2011-03-11  7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser
@ 2011-03-24 14:34   ` Albert ARIBAUD
  0 siblings, 0 replies; 15+ messages in thread
From: Albert ARIBAUD @ 2011-03-24 14:34 UTC (permalink / raw)
  To: u-boot

On 11/03/2011 08:36, Matthias Weisser wrote:
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.
>
> Here are some numbers for test done with jadecpu
>
>                             | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
>                             |        | +patch |        | +patch |
> ---------------------------+--------+--------+--------+--------+
> Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
>                             |        |        |        |        |
> TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
>                             |        |        |        |        |
> FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
>                             |        |        |        |        |
> BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
>   where CRC is              |  615ms |  615ms |   54ms |   54ms |
>   uncompress                | 2460ms | 2462ms |  450ms |  451ms |
>   final boot_elf            |  376ms |   68ms |   65ms |   65ms |
>                             |        |        |        |        |
> BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
>   where CRC is              |  600ms |  600ms |  135ms |  135ms |
>   uncompress                | 2209ms | 2211ms |  828ms |  828ms |
>                             |        |        |        |        |
> Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |
>
> (1) No dcache
> (2) dcache enabled in board_init
> *Does not work when dcache is on
>
> Size impact:
>
> C version:
>     text    data     bss     dec     hex filename
>   202862   18912  266456  488230   77326 u-boot
>
> ASM version:
>     text    data     bss     dec     hex filename
>   203798   18912  266288  488998   77626 u-boot
> 222712  u-boot.bin
>
> Signed-off-by: Matthias Weisser<weisserm@arcor.de>
> ---

Applied to u-boot-arm/master, thanks.

Best regards,
-- 
Albert.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2011-03-24 14:34 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser
2011-01-24 16:13 ` Wolfgang Denk
2011-01-24 19:24   ` Matthias Weißer
2011-01-24 20:07     ` Wolfgang Denk
2011-01-25 10:55       ` Matthias Weißer
2011-01-25 20:05         ` Wolfgang Denk
2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser
2011-01-26 12:07   ` Albert ARIBAUD
2011-01-26 12:50     ` Matthias Weißer
2011-01-26 13:07       ` Wolfgang Denk
2011-01-27 18:39         ` Albert ARIBAUD
2011-02-20 19:35   ` Alexander Holler
2011-03-03  7:07   ` Albert ARIBAUD
2011-03-11  7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser
2011-03-24 14:34   ` Albert ARIBAUD
