* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux @ 2011-01-24 15:56 Matthias Weisser 2011-01-24 16:13 ` Wolfgang Denk ` (2 more replies) 0 siblings, 3 replies; 15+ messages in thread From: Matthias Weisser @ 2011-01-24 15:56 UTC (permalink / raw) To: u-boot Using optimized versions of memset and memcpy from linux brings a quite noticeable speed (x2 or better) improvement for these two functions. Size impact: C version: text data bss dec hex filename 202862 18912 266456 488230 77326 u-boot ASM version: text data bss dec hex filename 203798 18912 266288 488998 77626 u-boot Signed-off-by: Matthias Weisser <weisserm@arcor.de> --- arch/arm/include/asm/assembler.h | 62 ++++++++++ arch/arm/include/asm/string.h | 4 +- arch/arm/lib/Makefile | 2 + arch/arm/lib/memcpy.S | 241 ++++++++++++++++++++++++++++++++++++++ arch/arm/lib/memset.S | 126 ++++++++++++++++++++ 5 files changed, 433 insertions(+), 2 deletions(-) create mode 100644 arch/arm/include/asm/assembler.h create mode 100644 arch/arm/lib/memcpy.S create mode 100644 arch/arm/lib/memset.S diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h new file mode 100644 index 0000000..231b1ae --- /dev/null +++ b/arch/arm/include/asm/assembler.h @@ -0,0 +1,62 @@ +/* + * arch/arm/include/asm/assembler.h + * + * Copyright (C) 1996-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This file contains arm architecture specific defines + * for the different processors. + * + * Do not include any C declarations in this file - it is included by + * assembler source. + */ + +/* + * Endian independent macros for shifting bytes within registers. 
+ */ +#ifndef __ARMEB__ +#define pull lsr +#define push lsl +#define get_byte_0 lsl #0 +#define get_byte_1 lsr #8 +#define get_byte_2 lsr #16 +#define get_byte_3 lsr #24 +#define put_byte_0 lsl #0 +#define put_byte_1 lsl #8 +#define put_byte_2 lsl #16 +#define put_byte_3 lsl #24 +#else +#define pull lsl +#define push lsr +#define get_byte_0 lsr #24 +#define get_byte_1 lsr #16 +#define get_byte_2 lsr #8 +#define get_byte_3 lsl #0 +#define put_byte_0 lsl #24 +#define put_byte_1 lsl #16 +#define put_byte_2 lsl #8 +#define put_byte_3 lsl #0 +#endif + +/* + * Data preload for architectures that support it + */ +#if defined(__ARM_ARCH_5TE__) +#define PLD(code...) code +#else +#define PLD(code...) +#endif + +/* + * This can be used to enable code to cacheline align the destination + * pointer when bulk writing to memory. Experiments on StrongARM and + * XScale didn't show this a worthwhile thing to do when the cache is not + * set to write-allocate (this would need further testing on XScale when WA + * is used). + * + * On Feroceon there is much to gain however, regardless of cache mode. + */ +#define CALGN(code...) 
code diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h index c3ea582..a939571 100644 --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -12,7 +12,7 @@ extern char * strrchr(const char * s, int c); #undef __HAVE_ARCH_STRCHR extern char * strchr(const char * s, int c); -#undef __HAVE_ARCH_MEMCPY +#define __HAVE_ARCH_MEMCPY extern void * memcpy(void *, const void *, __kernel_size_t); #undef __HAVE_ARCH_MEMMOVE @@ -22,7 +22,7 @@ extern void * memmove(void *, const void *, __kernel_size_t); extern void * memchr(const void *, int, __kernel_size_t); #undef __HAVE_ARCH_MEMZERO -#undef __HAVE_ARCH_MEMSET +#define __HAVE_ARCH_MEMSET extern void * memset(void *, int, __kernel_size_t); #if 0 diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 454440c..575a919 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -44,6 +44,8 @@ COBJS-y += cache-cp15.o endif COBJS-y += interrupts.o COBJS-y += reset.o +SOBJS-y += memset.o +SOBJS-y += memcpy.o SRCS := $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \ $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c) diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S new file mode 100644 index 0000000..40db90e --- /dev/null +++ b/arch/arm/lib/memcpy.S @@ -0,0 +1,241 @@ +/* + * linux/arch/arm/lib/memcpy.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <asm/assembler.h> + +#define W(instr) instr + +#define LDR1W_SHIFT 0 +#define STR1W_SHIFT 0 + + .macro ldr1w ptr reg abort + W(ldr) \reg, [\ptr], #4 + .endm + + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} + .endm + + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro ldr1b ptr reg cond=al abort + ldr\cond\()b \reg, [\ptr], #1 + .endm + + .macro str1w ptr reg abort + W(str) \reg, [\ptr], #4 + .endm + + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro str1b ptr reg cond=al abort + str\cond\()b \reg, [\ptr], #1 + .endm + + .macro enter reg1 reg2 + stmdb sp!, {r0, \reg1, \reg2} + .endm + + .macro exit reg1 reg2 + ldmfd sp!, {r0, \reg1, \reg2} + .endm + + .text + +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ + +.globl memcpy +memcpy: + + enter r4, lr + + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #0] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb r3, ip, #32 ) + CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, r3 ) @ C gets set + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #0] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 4f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +3: PLD( pld [r1, #124] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #32 + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 +#if LDR1W_SHIFT > 0 + lsl ip, ip, #LDR1W_SHIFT +#endif + addne pc, pc, ip @ C is always clear here + b 7f +6: + .rept (1 << LDR1W_SHIFT) + W(nop) + .endr + ldr1w r1, r3, abort=20f + ldr1w 
r1, r4, abort=20f + ldr1w r1, r5, abort=20f + ldr1w r1, r6, abort=20f + ldr1w r1, r7, abort=20f + ldr1w r1, r8, abort=20f + ldr1w r1, lr, abort=20f + +#if LDR1W_SHIFT < STR1W_SHIFT + lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT +#elif LDR1W_SHIFT > STR1W_SHIFT + lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT +#endif + add pc, pc, ip + nop + .rept (1 << STR1W_SHIFT) + W(nop) + .endr + str1w r0, r3, abort=20f + str1w r0, r4, abort=20f + str1w r0, r5, abort=20f + str1w r0, r6, abort=20f + str1w r0, r7, abort=20f + str1w r0, r8, abort=20f + str1w r0, lr, abort=20f + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldr1b r1, r3, ne, abort=21f + ldr1b r1, r4, cs, abort=21f + ldr1b r1, ip, cs, abort=21f + str1b r0, r3, ne, abort=21f + str1b r0, r4, cs, abort=21f + str1b r0, ip, cs, abort=21f + + exit r4, pc + +9: rsb ip, ip, #4 + cmp ip, #2 + ldr1b r1, r3, gt, abort=21f + ldr1b r1, r4, ge, abort=21f + ldr1b r1, lr, abort=21f + str1b r0, r3, gt, abort=21f + str1b r0, r4, ge, abort=21f + subs r2, r2, ip + str1b r0, lr, abort=21f + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr1w r1, lr, abort=21f + beq 17f + bgt 18f + + + .macro forward_copy_shift pull push + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb ip, ip, #32 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #0] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 13f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +12: PLD( pld [r1, #124] ) +13: ldr4w r1, r4, r5, r6, r7, abort=19f + mov r3, lr, pull #\pull + subs r2, r2, #32 + ldr4w r1, r8, r9, ip, lr, abort=19f + orr r3, r3, r4, push #\push + mov r4, r4, pull #\pull + orr r4, r4, r5, push #\push + mov r5, r5, pull #\pull + orr r5, r5, r6, push #\push + mov r6, r6, pull #\pull + orr r6, r6, r7, push #\push + mov r7, r7, pull #\pull + orr r7, r7, r8, push #\push + mov r8, r8, pull #\pull + orr r8, 
r8, r9, push #\push + mov r9, r9, pull #\pull + orr r9, r9, ip, push #\push + mov ip, ip, pull #\pull + orr ip, ip, lr, push #\push + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov r3, lr, pull #\pull + ldr1w r1, lr, abort=21f + subs ip, ip, #4 + orr r3, r3, lr, push #\push + str1w r0, r3, abort=21f + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: sub r1, r1, #(\push / 8) + b 8b + + .endm + + + forward_copy_shift pull=8 push=24 + +17: forward_copy_shift pull=16 push=16 + +18: forward_copy_shift pull=24 push=8 + diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S new file mode 100644 index 0000000..0cdf895 --- /dev/null +++ b/arch/arm/lib/memset.S @@ -0,0 +1,126 @@ +/* + * linux/arch/arm/lib/memset.S + * + * Copyright (C) 1995-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ASM optimised string functions + */ +#include <asm/assembler.h> + + .text + .align 5 + .word 0 + +1: subs r2, r2, #4 @ 1 do we have enough + blt 5f @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [r0], #1 @ 1 + strleb r1, [r0], #1 @ 1 + strb r1, [r0], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) +/* + * The pointer is now aligned and the length is adjusted. Try doing the + * memset again. + */ + +.globl memset +memset: + ands r3, r0, #3 @ 1 unaligned? + bne 1b @ 1 +/* + * we know that the pointer in r0 is aligned to a word boundary. + */ + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + cmp r2, #16 + blt 4f + +#if ! CALGN(1)+0 + +/* + * We need an extra register for this loop - save the return address and + * use the LR + */ + str lr, [sp, #-4]! + mov ip, r1 + mov lr, r1 + +2: subs r2, r2, #64 + stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. 
+ stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + bgt 2b + ldmeqfd sp!, {pc} @ Now <64 bytes to go. +/* + * No need to correct the count; we're only testing bits from now on + */ + tst r2, #32 + stmneia r0!, {r1, r3, ip, lr} + stmneia r0!, {r1, r3, ip, lr} + tst r2, #16 + stmneia r0!, {r1, r3, ip, lr} + ldr lr, [sp], #4 + +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines@once. + */ + + stmfd sp!, {r4-r7, lr} + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov ip, r1 + mov lr, r1 + + cmp r2, #96 + tstgt r0, #31 + ble 3f + + and ip, r0, #31 + rsb ip, ip, #32 + sub r2, r2, ip + movs ip, ip, lsl #(32 - 4) + stmcsia r0!, {r4, r5, r6, r7} + stmmiia r0!, {r4, r5} + tst ip, #(1 << 30) + mov ip, r1 + strne r1, [r0], #4 + +3: subs r2, r2, #64 + stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia r0!, {r1, r3-r7, ip, lr} + bgt 3b + ldmeqfd sp!, {r4-r7, pc} + + tst r2, #32 + stmneia r0!, {r1, r3-r7, ip, lr} + tst r2, #16 + stmneia r0!, {r4-r7} + ldmfd sp!, {r4-r7, lr} + +#endif + +4: tst r2, #8 + stmneia r0!, {r1, r3} + tst r2, #4 + strne r1, [r0], #4 +/* + * When we get here, we've got less than 4 bytes to zero. We + * may have an unaligned pointer as well. + */ +5: tst r2, #2 + strneb r1, [r0], #1 + strneb r1, [r0], #1 + tst r2, #1 + strneb r1, [r0], #1 + mov pc, lr -- 1.7.0.4 ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux 2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser @ 2011-01-24 16:13 ` Wolfgang Denk 2011-01-24 19:24 ` Matthias Weißer 2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser 2011-03-11 7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser 2 siblings, 1 reply; 15+ messages in thread From: Wolfgang Denk @ 2011-01-24 16:13 UTC (permalink / raw) To: u-boot Dear Matthias Weisser, In message <1295884607-9044-1-git-send-email-weisserm@arcor.de> you wrote: > Using optimized versions of memset and memcpy from linux brings a quite > noticeable speed (x2 or better) improvement for these two functions. > > Size impact: > > C version: > text data bss dec hex filename > 202862 18912 266456 488230 77326 u-boot > > ASM version: > text data bss dec hex filename > 203798 18912 266288 488998 77626 u-boot How exactly did you measure the speed improvement? Best regards, Wolfgang Denk -- DENX Software Engineering GmbH, MD: Wolfgang Denk & Detlev Zundel HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de Real programmers can write assembly code in any language. :-) - Larry Wall in <8571@jpl-devvax.JPL.NASA.GOV> ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux 2011-01-24 16:13 ` Wolfgang Denk @ 2011-01-24 19:24 ` Matthias Weißer 2011-01-24 20:07 ` Wolfgang Denk 0 siblings, 1 reply; 15+ messages in thread From: Matthias Weißer @ 2011-01-24 19:24 UTC (permalink / raw) To: u-boot Am 24.01.2011 17:13, schrieb Wolfgang Denk: > Dear Matthias Weisser, > > In message <1295884607-9044-1-git-send-email-weisserm@arcor.de> you wrote: >> Using optimized versions of memset and memcpy from linux brings a quite >> noticeable speed (x2 or better) improvement for these two functions. >> >> Size impact: >> >> C version: >> text data bss dec hex filename >> 202862 18912 266456 488230 77326 u-boot >> >> ASM version: >> text data bss dec hex filename >> 203798 18912 266288 488998 77626 u-boot > > How exactly did you measure the speed improvement? I inserted a printf before and after calls to these functions with sizes of 1MB or more each. I then measured the times between these printfs using grabserial (http://elinux.org/Grabserial). In both cases caches were enabled. To be precise: As memset test case I used the memset(.., 0, ..) of the malloc pool (which was 4MB in my case) and a memcpy from flash to RAM which I inserted in cmd_bootm.c of about 2.2MB (see RFC patch http://patchwork.ozlabs.org/patch/79480/ for exact location of the memcpy). Do you think a factor of 2 is not possible against the C version? Maybe I have done something wrong while measuring these times. From my point of view it should be possible to get such improvements as the code takes cache alignment into account and also uses the PLD instruction. I can do some additional measurements tomorrow on two systems (jadecpu with a 32Bit at 166MHz DDR2 memory and an imx25 based one with 16Bit at 133MHz LPDDR) and come up with some exact numbers. Maybe you can give some more hints what and how the improvements of this patch can be measured. Matthias Weißer ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux 2011-01-24 19:24 ` Matthias Weißer @ 2011-01-24 20:07 ` Wolfgang Denk 2011-01-25 10:55 ` Matthias Weißer 0 siblings, 1 reply; 15+ messages in thread From: Wolfgang Denk @ 2011-01-24 20:07 UTC (permalink / raw) To: u-boot Dear =?ISO-8859-1?Q?Matthias_Wei=DFer?=, In message <4D3DD1EC.7010506@arcor.de> you wrote: > > >> C version: > >> text data bss dec hex filename > >> 202862 18912 266456 488230 77326 u-boot > >> > >> ASM version: > >> text data bss dec hex filename > >> 203798 18912 266288 488998 77626 u-boot > > > > How exactly did you measure the speed improvement? > > I inserted a printf before and after calls to these functions with sizes > of 1MB or more each. I then measured the times between these printfs > using grabserial (http://elinux.org/Grabserial). In both cases caches > where enabled. > > To be precise: As memset test case I used the memset(.., 0, ..) of the > malloc pool (which was 4MB in my case) and a memcpy from flash to RAM > which I inserted in cmd_bootm.c of about 2.2MB (see RFC patch > http://patchwork.ozlabs.org/patch/79480/ for exact location of the memcpy). OK - so which results do you see in real-life use, say when loading and booting an OS? How much boot time can be saved? > Do you think a factor of 2 is not possible against the C version? Maybe > I have done something wrong while measuring theses times. From my point > of view it should be possible to get such improvements as the code takes > cache alignment into account and also uses the PLD instruction. I don't doubt your measurements. But this being an optimization approach, it seems appropriate to check whether we are really optimizing a hot spot, and if the measured results can be generalized. I guess the speed improvement you see for a few large copy operations is just one side - probably there will be slower execution (due to the effort to set up the operations) for the (many more frequent) small operations. 
In addition, there is an increase of the memory footprint of nearly 1 kB. I think additional measurements need to be done - for example, we should check how the execution times change for typical operations like TFTP download, reading from NAND flash and MMC/SDcard, booting a Linux kernel etc. Also, it should be possible to enable this feature conditionally, so users can decide whether speed or size is more important in their configurations. > I can do some additional measurements tomorrow on two systems (jadecpu > with a 32Bit at 166MHz DDR2 memory and a imx25 based on with 16Bit at 133MHz > LPDDR) and come up with some exact numbers. Maybe you can give some more > hints what and how the improvements of this patch can be measured. See above. Thanks. Wolfgang Denk -- DENX Software Engineering GmbH, MD: Wolfgang Denk & Detlev Zundel HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de Egotist: A person of low taste, more interested in himself than in me. - Ambrose Bierce ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux 2011-01-24 20:07 ` Wolfgang Denk @ 2011-01-25 10:55 ` Matthias Weißer 2011-01-25 20:05 ` Wolfgang Denk 0 siblings, 1 reply; 15+ messages in thread From: Matthias Weißer @ 2011-01-25 10:55 UTC (permalink / raw) To: u-boot Am 24.01.2011 21:07, schrieb Wolfgang Denk: > OK - so which results do you see in reallife use, say when loading and > booting an OS? How much boot time can be saved? All tests are done with jadecpu | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| | | +patch | | +patch | -----------------------+--------+--------+--------+--------+ Reset to prompt | 438ms | 330ms | 228ms | 120ms | | | | | | TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | | | | | | FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | | | | | | BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | where CRC is | 615ms | 615ms | 54ms | 54ms | uncompress | 2460ms | 2462ms | 450ms | 451ms | final boot_elf | 376ms | 68ms | 65ms | 65ms | | | | | | BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | where CRC is | 600ms | 600ms | 135ms | 135ms | uncompress | 2209ms | 2211ms | 828ms | 828ms | final boot_elf | 376ms | 68ms | 65ms | 65ms | (1) No dcache (2) dcache enabled in board_init *Does not work when dcache is on I think we can see that there seems to be no negative impact of these patches when only execution speed is taken into consideration. The gain is noticeable when caching is not used or not activated. For pure RAM to RAM copy when caching is activated the patch didn't change anything. Here are some additional numbers for copying a 1.4MB image from NOR to RAM: HEAD : 134ms HEAD + patch : 72ms HEAD + dcache : 120ms HEAD + dcache + patch : 70ms So, for copy actions from flash to RAM there is also an improvement. As boot times are a bit critical for us every improvement > 10ms is interesting for us. 
> I guess the speed improvemnt you see for a few large copy operations > is just one side - probably there will be slower excution (due to the > effort to set up the operations) for the (many more frequent) small > operations. In addition, there is an increase of the memory footprint > of nearly 1 kB. > > I think additional measuremnts need to be done - for example, we > should check how the execution times change for typical operations > like TFTP download, reading from NAND flash and MMC/SDcard, booting a > Linux kernel etc. As the test above show there is no negative performance impact with the test cases I have done. As we don't use Linux here I can't test this. Maybe someone other can jump in here. > Also, it should be possible to enable this feature consditionally, so > users can decide wether speed or size is more important in their > configurations. Would it be an option to use the CONFIG entries CONFIG_USE_ARCH_MEMCPY and CONFIG_USE_ARCH_MEMSET to enable that feature? If that is OK I can send a new version of the patch. The only problem I see with this approach is that there are architectures which already have their own implementations which are then not affected by these config options. Regards Matthias ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux 2011-01-25 10:55 ` Matthias Weißer @ 2011-01-25 20:05 ` Wolfgang Denk 0 siblings, 0 replies; 15+ messages in thread From: Wolfgang Denk @ 2011-01-25 20:05 UTC (permalink / raw) To: u-boot Dear =?ISO-8859-1?Q?Matthias_Wei=DFer?=, In message <4D3EAC1A.5030707@arcor.de> you wrote: > > | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| > | | +patch | | +patch | > -----------------------+--------+--------+--------+--------+ > Reset to prompt | 438ms | 330ms | 228ms | 120ms | > | | | | | > TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | > | | | | | > FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | > | | | | | > BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | > where CRC is | 615ms | 615ms | 54ms | 54ms | > uncompress | 2460ms | 2462ms | 450ms | 451ms | > final boot_elf | 376ms | 68ms | 65ms | 65ms | > | | | | | > BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | > where CRC is | 600ms | 600ms | 135ms | 135ms | > uncompress | 2209ms | 2211ms | 828ms | 828ms | > final boot_elf | 376ms | 68ms | 65ms | 65ms | > > (1) No dcache > (2) dcache enabled in board_init > *Does not work when dcache is on > > I think we can see that there seems to be no negativ impact of theses > patches when only execution speed is taken into consideration. The gain > is noticable when caching is not used or not activated. For pure RAM to > RAM copy when caching is activated the patch didn't change anything. > > Here are some additional numbers for copying a 1.4MB image from NOR to RAM: > > HEAD : 134ms > HEAD + patch : 72ms > HEAD + dcache : 120ms > HEAD + dcache + patch : 70ms This is pretty much interesting information for developers who have to decide if they want to accept the increased memory footprint. Can you please add this to the commit message? > Would it be an option to use the CONFIG entries CONFIG_USE_ARCH_MEMCPY > and CONFIG_USE_ARCH_MEMSET to enable that feature? 
If that is OK I can Makes sense to me. > send a new version of the patch. The only problem I see with this > approach is that there are architectures which already have their own > implementations which are then not affected by these config options. If you are aware of any, it might make sense to put the respective maintainers on Cc: to trigger them to adapt / clean up their code. Thanks. Best regards, Wolfgang Denk -- DENX Software Engineering GmbH, MD: Wolfgang Denk & Detlev Zundel HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de Its always easier short term to pee in the pond than install a toilet - it's just not a good long term plan. - Alan Cox in <20100101145701.6432e7b7@lxorguk.ukuu.org.uk> ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux 2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser 2011-01-24 16:13 ` Wolfgang Denk @ 2011-01-26 10:45 ` Matthias Weisser 2011-01-26 12:07 ` Albert ARIBAUD ` (2 more replies) 2011-03-11 7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser 2 siblings, 3 replies; 15+ messages in thread From: Matthias Weisser @ 2011-01-26 10:45 UTC (permalink / raw) To: u-boot Using optimized versions of memset and memcpy from linux brings a quite noticeable speed (x2 or better) improvement for these two functions. Here are some numbers for test done with jadecpu | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| | | +patch | | +patch | ---------------------------+--------+--------+--------+--------+ Reset to prompt | 438ms | 330ms | 228ms | 120ms | | | | | | TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | | | | | | FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | | | | | | BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | where CRC is | 615ms | 615ms | 54ms | 54ms | uncompress | 2460ms | 2462ms | 450ms | 451ms | final boot_elf | 376ms | 68ms | 65ms | 65ms | | | | | | BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | where CRC is | 600ms | 600ms | 135ms | 135ms | uncompress | 2209ms | 2211ms | 828ms | 828ms | | | | | | Copy 1.4MB from NOR to RAM | 134ms | 72ms | 120ms | 70ms | (1) No dcache (2) dcache enabled in board_init *Does not work when dcache is on Size impact: C version: text data bss dec hex filename 202862 18912 266456 488230 77326 u-boot ASM version: text data bss dec hex filename 203798 18912 266288 488998 77626 u-boot 222712 u-boot.bin Changes since V1: - Made the usage of these functions optional be CONFIG_USE_ARCH_MEM - Usage of PLD instruction on all architectures supporting it - Added a README entry - Minor style fixes Signed-off-by: Matthias Weisser <weisserm@arcor.de> --- README | 6 + arch/arm/include/asm/assembler.h | 60 ++++++++++ 
arch/arm/include/asm/string.h | 10 ++- arch/arm/lib/Makefile | 2 + arch/arm/lib/memcpy.S | 241 ++++++++++++++++++++++++++++++++++++++ arch/arm/lib/memset.S | 126 ++++++++++++++++++++ 6 files changed, 443 insertions(+), 2 deletions(-) create mode 100644 arch/arm/include/asm/assembler.h create mode 100644 arch/arm/lib/memcpy.S create mode 100644 arch/arm/lib/memset.S diff --git a/README b/README index 755d17c..5c610f2 100644 --- a/README +++ b/README @@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options: that is executed before the actual U-Boot. E.g. when compiling a NAND SPL. +- CONFIG_USE_ARCH_MEMCPY + CONFIG_USE_ARCH_MEMSET + If these options are used a optimized version of memcpy/memset will + be used if available. These functions may be faster under some + conditions but may increase the binary size. + Building the Software: ====================== diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h new file mode 100644 index 0000000..418ee94 --- /dev/null +++ b/arch/arm/include/asm/assembler.h @@ -0,0 +1,60 @@ +/* + * arch/arm/include/asm/assembler.h + * + * Copyright (C) 1996-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This file contains arm architecture specific defines + * for the different processors. + * + * Do not include any C declarations in this file - it is included by + * assembler source. + */ + +/* + * Endian independent macros for shifting bytes within registers. 
+ */ +#ifndef __ARMEB__ +#define pull lsr +#define push lsl +#define get_byte_0 lsl #0 +#define get_byte_1 lsr #8 +#define get_byte_2 lsr #16 +#define get_byte_3 lsr #24 +#define put_byte_0 lsl #0 +#define put_byte_1 lsl #8 +#define put_byte_2 lsl #16 +#define put_byte_3 lsl #24 +#else +#define pull lsl +#define push lsr +#define get_byte_0 lsr #24 +#define get_byte_1 lsr #16 +#define get_byte_2 lsr #8 +#define get_byte_3 lsl #0 +#define put_byte_0 lsl #24 +#define put_byte_1 lsl #16 +#define put_byte_2 lsl #8 +#define put_byte_3 lsl #0 +#endif + +/* + * Data preload for architectures that support it + */ +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) +#define PLD(code...) code +#else +#define PLD(code...) +#endif + +/* + * Cache alligned + */ +#define CALGN(code...) code diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h index c3ea582..c6dfb25 100644 --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -1,6 +1,8 @@ #ifndef __ASM_ARM_STRING_H #define __ASM_ARM_STRING_H +#include <config.h> + /* * We don't do inline string functions, since the * optimised inline asm versions are not small. 
@@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c); #undef __HAVE_ARCH_STRCHR extern char * strchr(const char * s, int c); -#undef __HAVE_ARCH_MEMCPY +#ifdef CONFIG_USE_ARCH_MEMCPY +#define __HAVE_ARCH_MEMCPY +#endif extern void * memcpy(void *, const void *, __kernel_size_t); #undef __HAVE_ARCH_MEMMOVE @@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t); extern void * memchr(const void *, int, __kernel_size_t); #undef __HAVE_ARCH_MEMZERO -#undef __HAVE_ARCH_MEMSET +#ifdef CONFIG_USE_ARCH_MEMSET +#define __HAVE_ARCH_MEMSET +#endif extern void * memset(void *, int, __kernel_size_t); #if 0 diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 454440c..03b1b5e 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -44,6 +44,8 @@ COBJS-y += cache-cp15.o endif COBJS-y += interrupts.o COBJS-y += reset.o +SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o +SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o SRCS := $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \ $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c) diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S new file mode 100644 index 0000000..40db90e --- /dev/null +++ b/arch/arm/lib/memcpy.S @@ -0,0 +1,241 @@ +/* + * linux/arch/arm/lib/memcpy.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <asm/assembler.h> + +#define W(instr) instr + +#define LDR1W_SHIFT 0 +#define STR1W_SHIFT 0 + + .macro ldr1w ptr reg abort + W(ldr) \reg, [\ptr], #4 + .endm + + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} + .endm + + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro ldr1b ptr reg cond=al abort + ldr\cond\()b \reg, [\ptr], #1 + .endm + + .macro str1w ptr reg abort + W(str) \reg, [\ptr], #4 + .endm + + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro str1b ptr reg cond=al abort + str\cond\()b \reg, [\ptr], #1 + .endm + + .macro enter reg1 reg2 + stmdb sp!, {r0, \reg1, \reg2} + .endm + + .macro exit reg1 reg2 + ldmfd sp!, {r0, \reg1, \reg2} + .endm + + .text + +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ + +.globl memcpy +memcpy: + + enter r4, lr + + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #0] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb r3, ip, #32 ) + CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, r3 ) @ C gets set + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #0] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 4f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +3: PLD( pld [r1, #124] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #32 + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 +#if LDR1W_SHIFT > 0 + lsl ip, ip, #LDR1W_SHIFT +#endif + addne pc, pc, ip @ C is always clear here + b 7f +6: + .rept (1 << LDR1W_SHIFT) + W(nop) + .endr + ldr1w r1, r3, abort=20f + ldr1w 
r1, r4, abort=20f + ldr1w r1, r5, abort=20f + ldr1w r1, r6, abort=20f + ldr1w r1, r7, abort=20f + ldr1w r1, r8, abort=20f + ldr1w r1, lr, abort=20f + +#if LDR1W_SHIFT < STR1W_SHIFT + lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT +#elif LDR1W_SHIFT > STR1W_SHIFT + lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT +#endif + add pc, pc, ip + nop + .rept (1 << STR1W_SHIFT) + W(nop) + .endr + str1w r0, r3, abort=20f + str1w r0, r4, abort=20f + str1w r0, r5, abort=20f + str1w r0, r6, abort=20f + str1w r0, r7, abort=20f + str1w r0, r8, abort=20f + str1w r0, lr, abort=20f + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldr1b r1, r3, ne, abort=21f + ldr1b r1, r4, cs, abort=21f + ldr1b r1, ip, cs, abort=21f + str1b r0, r3, ne, abort=21f + str1b r0, r4, cs, abort=21f + str1b r0, ip, cs, abort=21f + + exit r4, pc + +9: rsb ip, ip, #4 + cmp ip, #2 + ldr1b r1, r3, gt, abort=21f + ldr1b r1, r4, ge, abort=21f + ldr1b r1, lr, abort=21f + str1b r0, r3, gt, abort=21f + str1b r0, r4, ge, abort=21f + subs r2, r2, ip + str1b r0, lr, abort=21f + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr1w r1, lr, abort=21f + beq 17f + bgt 18f + + + .macro forward_copy_shift pull push + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb ip, ip, #32 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #0] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 13f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +12: PLD( pld [r1, #124] ) +13: ldr4w r1, r4, r5, r6, r7, abort=19f + mov r3, lr, pull #\pull + subs r2, r2, #32 + ldr4w r1, r8, r9, ip, lr, abort=19f + orr r3, r3, r4, push #\push + mov r4, r4, pull #\pull + orr r4, r4, r5, push #\push + mov r5, r5, pull #\pull + orr r5, r5, r6, push #\push + mov r6, r6, pull #\pull + orr r6, r6, r7, push #\push + mov r7, r7, pull #\pull + orr r7, r7, r8, push #\push + mov r8, r8, pull #\pull + orr r8, 
r8, r9, push #\push + mov r9, r9, pull #\pull + orr r9, r9, ip, push #\push + mov ip, ip, pull #\pull + orr ip, ip, lr, push #\push + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov r3, lr, pull #\pull + ldr1w r1, lr, abort=21f + subs ip, ip, #4 + orr r3, r3, lr, push #\push + str1w r0, r3, abort=21f + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: sub r1, r1, #(\push / 8) + b 8b + + .endm + + + forward_copy_shift pull=8 push=24 + +17: forward_copy_shift pull=16 push=16 + +18: forward_copy_shift pull=24 push=8 + diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S new file mode 100644 index 0000000..0cdf895 --- /dev/null +++ b/arch/arm/lib/memset.S @@ -0,0 +1,126 @@ +/* + * linux/arch/arm/lib/memset.S + * + * Copyright (C) 1995-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ASM optimised string functions + */ +#include <asm/assembler.h> + + .text + .align 5 + .word 0 + +1: subs r2, r2, #4 @ 1 do we have enough + blt 5f @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [r0], #1 @ 1 + strleb r1, [r0], #1 @ 1 + strb r1, [r0], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) +/* + * The pointer is now aligned and the length is adjusted. Try doing the + * memset again. + */ + +.globl memset +memset: + ands r3, r0, #3 @ 1 unaligned? + bne 1b @ 1 +/* + * we know that the pointer in r0 is aligned to a word boundary. + */ + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + cmp r2, #16 + blt 4f + +#if ! CALGN(1)+0 + +/* + * We need an extra register for this loop - save the return address and + * use the LR + */ + str lr, [sp, #-4]! + mov ip, r1 + mov lr, r1 + +2: subs r2, r2, #64 + stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. 
+ stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + bgt 2b + ldmeqfd sp!, {pc} @ Now <64 bytes to go. +/* + * No need to correct the count; we're only testing bits from now on + */ + tst r2, #32 + stmneia r0!, {r1, r3, ip, lr} + stmneia r0!, {r1, r3, ip, lr} + tst r2, #16 + stmneia r0!, {r1, r3, ip, lr} + ldr lr, [sp], #4 + +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines@once. + */ + + stmfd sp!, {r4-r7, lr} + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov ip, r1 + mov lr, r1 + + cmp r2, #96 + tstgt r0, #31 + ble 3f + + and ip, r0, #31 + rsb ip, ip, #32 + sub r2, r2, ip + movs ip, ip, lsl #(32 - 4) + stmcsia r0!, {r4, r5, r6, r7} + stmmiia r0!, {r4, r5} + tst ip, #(1 << 30) + mov ip, r1 + strne r1, [r0], #4 + +3: subs r2, r2, #64 + stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia r0!, {r1, r3-r7, ip, lr} + bgt 3b + ldmeqfd sp!, {r4-r7, pc} + + tst r2, #32 + stmneia r0!, {r1, r3-r7, ip, lr} + tst r2, #16 + stmneia r0!, {r4-r7} + ldmfd sp!, {r4-r7, lr} + +#endif + +4: tst r2, #8 + stmneia r0!, {r1, r3} + tst r2, #4 + strne r1, [r0], #4 +/* + * When we get here, we've got less than 4 bytes to zero. We + * may have an unaligned pointer as well. + */ +5: tst r2, #2 + strneb r1, [r0], #1 + strneb r1, [r0], #1 + tst r2, #1 + strneb r1, [r0], #1 + mov pc, lr -- 1.7.0.4 ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux 2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser @ 2011-01-26 12:07 ` Albert ARIBAUD 2011-01-26 12:50 ` Matthias Weißer 2011-02-20 19:35 ` Alexander Holler 2011-03-03 7:07 ` Albert ARIBAUD 2 siblings, 1 reply; 15+ messages in thread From: Albert ARIBAUD @ 2011-01-26 12:07 UTC (permalink / raw) To: u-boot Hi Matthias, Le 26/01/2011 11:45, Matthias Weisser a écrit : > Using optimized versions of memset and memcpy from linux brings a quite > noticeable speed (x2 or better) improvement for these two functions. > > Here are some numbers for test done with jadecpu > > | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| > | | +patch | | +patch | > ---------------------------+--------+--------+--------+--------+ > Reset to prompt | 438ms | 330ms | 228ms | 120ms | > | | | | | > TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | > | | | | | > FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | > | | | | | > BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | > where CRC is | 615ms | 615ms | 54ms | 54ms | > uncompress | 2460ms | 2462ms | 450ms | 451ms | > final boot_elf | 376ms | 68ms | 65ms | 65ms | > | | | | | > BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | > where CRC is | 600ms | 600ms | 135ms | 135ms | > uncompress | 2209ms | 2211ms | 828ms | 828ms | > | | | | | > Copy 1.4MB from NOR to RAM | 134ms | 72ms | 120ms | 70ms | > > (1) No dcache > (2) dcache enabled in board_init > *Does not work when dcache is on > > Size impact: > > C version: > text data bss dec hex filename > 202862 18912 266456 488230 77326 u-boot > > ASM version: > text data bss dec hex filename > 203798 18912 266288 488998 77626 u-boot > 222712 u-boot.bin > > Changes since V1: > - Made the usage of these functions optional be CONFIG_USE_ARCH_MEM > - Usage of PLD instruction on all architectures supporting it > - Added a README entry > - Minor style fixes > > Signed-off-by: Matthias Weisser<weisserm@arcor.de> 
> --- IIRC, the '---' line separates patch commit message (above) from freeform comments and history (below). Here, at least the version history should move below the '---' line. Also, I think that above the line, /some/ indication of performance enhancement and drawbacks should be given, but not a full ASCII table of numbers -- that can go below the line. > README | 6 + > arch/arm/include/asm/assembler.h | 60 ++++++++++ > arch/arm/include/asm/string.h | 10 ++- > arch/arm/lib/Makefile | 2 + > arch/arm/lib/memcpy.S | 241 ++++++++++++++++++++++++++++++++++++++ > arch/arm/lib/memset.S | 126 ++++++++++++++++++++ > 6 files changed, 443 insertions(+), 2 deletions(-) > create mode 100644 arch/arm/include/asm/assembler.h > create mode 100644 arch/arm/lib/memcpy.S > create mode 100644 arch/arm/lib/memset.S > > diff --git a/README b/README > index 755d17c..5c610f2 100644 > --- a/README > +++ b/README > @@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options: > that is executed before the actual U-Boot. E.g. when > compiling a NAND SPL. > > +- CONFIG_USE_ARCH_MEMCPY > + CONFIG_USE_ARCH_MEMSET > + If these options are used a optimized version of memcpy/memset will > + be used if available. These functions may be faster under some > + conditions but may increase the binary size. > + The name of the options is not self-explaining to me. If the difference is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx would be a better name? 
> Building the Software: > ====================== > > diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h > new file mode 100644 > index 0000000..418ee94 > --- /dev/null > +++ b/arch/arm/include/asm/assembler.h > @@ -0,0 +1,60 @@ > +/* > + * arch/arm/include/asm/assembler.h > + * > + * Copyright (C) 1996-2000 Russell King > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + * > + * This file contains arm architecture specific defines > + * for the different processors. > + * > + * Do not include any C declarations in this file - it is included by > + * assembler source. > + */ > + > +/* > + * Endian independent macros for shifting bytes within registers. > + */ > +#ifndef __ARMEB__ > +#define pull lsr > +#define push lsl > +#define get_byte_0 lsl #0 > +#define get_byte_1 lsr #8 > +#define get_byte_2 lsr #16 > +#define get_byte_3 lsr #24 > +#define put_byte_0 lsl #0 > +#define put_byte_1 lsl #8 > +#define put_byte_2 lsl #16 > +#define put_byte_3 lsl #24 > +#else > +#define pull lsl > +#define push lsr > +#define get_byte_0 lsr #24 > +#define get_byte_1 lsr #16 > +#define get_byte_2 lsr #8 > +#define get_byte_3 lsl #0 > +#define put_byte_0 lsl #24 > +#define put_byte_1 lsl #16 > +#define put_byte_2 lsl #8 > +#define put_byte_3 lsl #0 > +#endif > + > +/* > + * Data preload for architectures that support it > + */ > +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ > + defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ > + defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \ > + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \ > + defined(__ARM_ARCH_7R__) > +#define PLD(code...) code > +#else > +#define PLD(code...) > +#endif > + > +/* > + * Cache alligned > + */ > +#define CALGN(code...) 
code > diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h > index c3ea582..c6dfb25 100644 > --- a/arch/arm/include/asm/string.h > +++ b/arch/arm/include/asm/string.h > @@ -1,6 +1,8 @@ > #ifndef __ASM_ARM_STRING_H > #define __ASM_ARM_STRING_H > > +#include<config.h> > + > /* > * We don't do inline string functions, since the > * optimised inline asm versions are not small. > @@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c); > #undef __HAVE_ARCH_STRCHR > extern char * strchr(const char * s, int c); > > -#undef __HAVE_ARCH_MEMCPY > +#ifdef CONFIG_USE_ARCH_MEMCPY > +#define __HAVE_ARCH_MEMCPY > +#endif > extern void * memcpy(void *, const void *, __kernel_size_t); > > #undef __HAVE_ARCH_MEMMOVE > @@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t); > extern void * memchr(const void *, int, __kernel_size_t); > > #undef __HAVE_ARCH_MEMZERO > -#undef __HAVE_ARCH_MEMSET > +#ifdef CONFIG_USE_ARCH_MEMSET > +#define __HAVE_ARCH_MEMSET > +#endif > extern void * memset(void *, int, __kernel_size_t); > > #if 0 > diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile > index 454440c..03b1b5e 100644 > --- a/arch/arm/lib/Makefile > +++ b/arch/arm/lib/Makefile > @@ -44,6 +44,8 @@ COBJS-y += cache-cp15.o > endif > COBJS-y += interrupts.o > COBJS-y += reset.o > +SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o > +SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o > > SRCS := $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \ > $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c) > diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S > new file mode 100644 > index 0000000..40db90e > --- /dev/null > +++ b/arch/arm/lib/memcpy.S > @@ -0,0 +1,241 @@ > +/* > + * linux/arch/arm/lib/memcpy.S > + * > + * Author: Nicolas Pitre > + * Created: Sep 28, 2005 > + * Copyright: MontaVista Software, Inc. 
> + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include<asm/assembler.h> > + > +#define W(instr) instr > + > +#define LDR1W_SHIFT 0 > +#define STR1W_SHIFT 0 > + > + .macro ldr1w ptr reg abort > + W(ldr) \reg, [\ptr], #4 > + .endm > + > + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort > + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} > + .endm > + > + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > + .endm > + > + .macro ldr1b ptr reg cond=al abort > + ldr\cond\()b \reg, [\ptr], #1 > + .endm > + > + .macro str1w ptr reg abort > + W(str) \reg, [\ptr], #4 > + .endm > + > + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort > + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} > + .endm > + > + .macro str1b ptr reg cond=al abort > + str\cond\()b \reg, [\ptr], #1 > + .endm > + > + .macro enter reg1 reg2 > + stmdb sp!, {r0, \reg1, \reg2} > + .endm > + > + .macro exit reg1 reg2 > + ldmfd sp!, {r0, \reg1, \reg2} > + .endm > + > + .text > + > +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ > + > +.globl memcpy > +memcpy: > + > + enter r4, lr > + > + subs r2, r2, #4 > + blt 8f > + ands ip, r0, #3 > + PLD( pld [r1, #0] ) > + bne 9f > + ands ip, r1, #3 > + bne 10f > + > +1: subs r2, r2, #(28) > + stmfd sp!, {r5 - r8} > + blt 5f > + > + CALGN( ands ip, r0, #31 ) > + CALGN( rsb r3, ip, #32 ) > + CALGN( sbcnes r4, r3, r2 ) @ C is always set here > + CALGN( bcs 2f ) > + CALGN( adr r4, 6f ) > + CALGN( subs r2, r2, r3 ) @ C gets set > + CALGN( add pc, r4, ip ) > + > + PLD( pld [r1, #0] ) > +2: PLD( subs r2, r2, #96 ) > + PLD( pld [r1, #28] ) > + PLD( blt 4f ) > + PLD( pld [r1, #60] ) > + PLD( pld [r1, #92] ) > + > +3: PLD( pld [r1, #124] ) > +4: ldr8w r1, r3, r4, r5, r6, r7, r8, 
ip, lr, abort=20f > + subs r2, r2, #32 > + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f > + bge 3b > + PLD( cmn r2, #96 ) > + PLD( bge 4b ) > + > +5: ands ip, r2, #28 > + rsb ip, ip, #32 > +#if LDR1W_SHIFT> 0 > + lsl ip, ip, #LDR1W_SHIFT > +#endif > + addne pc, pc, ip @ C is always clear here > + b 7f > +6: > + .rept (1<< LDR1W_SHIFT) > + W(nop) > + .endr > + ldr1w r1, r3, abort=20f > + ldr1w r1, r4, abort=20f > + ldr1w r1, r5, abort=20f > + ldr1w r1, r6, abort=20f > + ldr1w r1, r7, abort=20f > + ldr1w r1, r8, abort=20f > + ldr1w r1, lr, abort=20f > + > +#if LDR1W_SHIFT< STR1W_SHIFT > + lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT > +#elif LDR1W_SHIFT> STR1W_SHIFT > + lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT > +#endif > + add pc, pc, ip > + nop > + .rept (1<< STR1W_SHIFT) > + W(nop) > + .endr > + str1w r0, r3, abort=20f > + str1w r0, r4, abort=20f > + str1w r0, r5, abort=20f > + str1w r0, r6, abort=20f > + str1w r0, r7, abort=20f > + str1w r0, r8, abort=20f > + str1w r0, lr, abort=20f > + > + CALGN( bcs 2b ) > + > +7: ldmfd sp!, {r5 - r8} > + > +8: movs r2, r2, lsl #31 > + ldr1b r1, r3, ne, abort=21f > + ldr1b r1, r4, cs, abort=21f > + ldr1b r1, ip, cs, abort=21f > + str1b r0, r3, ne, abort=21f > + str1b r0, r4, cs, abort=21f > + str1b r0, ip, cs, abort=21f > + > + exit r4, pc > + > +9: rsb ip, ip, #4 > + cmp ip, #2 > + ldr1b r1, r3, gt, abort=21f > + ldr1b r1, r4, ge, abort=21f > + ldr1b r1, lr, abort=21f > + str1b r0, r3, gt, abort=21f > + str1b r0, r4, ge, abort=21f > + subs r2, r2, ip > + str1b r0, lr, abort=21f > + blt 8b > + ands ip, r1, #3 > + beq 1b > + > +10: bic r1, r1, #3 > + cmp ip, #2 > + ldr1w r1, lr, abort=21f > + beq 17f > + bgt 18f > + > + > + .macro forward_copy_shift pull push > + > + subs r2, r2, #28 > + blt 14f > + > + CALGN( ands ip, r0, #31 ) > + CALGN( rsb ip, ip, #32 ) > + CALGN( sbcnes r4, ip, r2 ) @ C is always set here > + CALGN( subcc r2, r2, ip ) > + CALGN( bcc 15f ) > + > +11: stmfd sp!, {r5 - r9} > + > + PLD( pld [r1, #0] ) > + PLD( 
subs r2, r2, #96 ) > + PLD( pld [r1, #28] ) > + PLD( blt 13f ) > + PLD( pld [r1, #60] ) > + PLD( pld [r1, #92] ) > + > +12: PLD( pld [r1, #124] ) > +13: ldr4w r1, r4, r5, r6, r7, abort=19f > + mov r3, lr, pull #\pull > + subs r2, r2, #32 > + ldr4w r1, r8, r9, ip, lr, abort=19f > + orr r3, r3, r4, push #\push > + mov r4, r4, pull #\pull > + orr r4, r4, r5, push #\push > + mov r5, r5, pull #\pull > + orr r5, r5, r6, push #\push > + mov r6, r6, pull #\pull > + orr r6, r6, r7, push #\push > + mov r7, r7, pull #\pull > + orr r7, r7, r8, push #\push > + mov r8, r8, pull #\pull > + orr r8, r8, r9, push #\push > + mov r9, r9, pull #\pull > + orr r9, r9, ip, push #\push > + mov ip, ip, pull #\pull > + orr ip, ip, lr, push #\push > + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f > + bge 12b > + PLD( cmn r2, #96 ) > + PLD( bge 13b ) > + > + ldmfd sp!, {r5 - r9} > + > +14: ands ip, r2, #28 > + beq 16f > + > +15: mov r3, lr, pull #\pull > + ldr1w r1, lr, abort=21f > + subs ip, ip, #4 > + orr r3, r3, lr, push #\push > + str1w r0, r3, abort=21f > + bgt 15b > + CALGN( cmp r2, #0 ) > + CALGN( bge 11b ) > + > +16: sub r1, r1, #(\push / 8) > + b 8b > + > + .endm > + > + > + forward_copy_shift pull=8 push=24 > + > +17: forward_copy_shift pull=16 push=16 > + > +18: forward_copy_shift pull=24 push=8 > + > diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S > new file mode 100644 > index 0000000..0cdf895 > --- /dev/null > +++ b/arch/arm/lib/memset.S > @@ -0,0 +1,126 @@ > +/* > + * linux/arch/arm/lib/memset.S > + * > + * Copyright (C) 1995-2000 Russell King > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + * > + * ASM optimised string functions > + */ > +#include<asm/assembler.h> > + > + .text > + .align 5 > + .word 0 > + > +1: subs r2, r2, #4 @ 1 do we have enough > + blt 5f @ 1 bytes to align with? 
> + cmp r3, #2 @ 1 > + strltb r1, [r0], #1 @ 1 > + strleb r1, [r0], #1 @ 1 > + strb r1, [r0], #1 @ 1 > + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) > +/* > + * The pointer is now aligned and the length is adjusted. Try doing the > + * memset again. > + */ > + > +.globl memset > +memset: > + ands r3, r0, #3 @ 1 unaligned? > + bne 1b @ 1 > +/* > + * we know that the pointer in r0 is aligned to a word boundary. > + */ > + orr r1, r1, r1, lsl #8 > + orr r1, r1, r1, lsl #16 > + mov r3, r1 > + cmp r2, #16 > + blt 4f > + > +#if ! CALGN(1)+0 > + > +/* > + * We need an extra register for this loop - save the return address and > + * use the LR > + */ > + str lr, [sp, #-4]! > + mov ip, r1 > + mov lr, r1 > + > +2: subs r2, r2, #64 > + stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. > + stmgeia r0!, {r1, r3, ip, lr} > + stmgeia r0!, {r1, r3, ip, lr} > + stmgeia r0!, {r1, r3, ip, lr} > + bgt 2b > + ldmeqfd sp!, {pc} @ Now<64 bytes to go. > +/* > + * No need to correct the count; we're only testing bits from now on > + */ > + tst r2, #32 > + stmneia r0!, {r1, r3, ip, lr} > + stmneia r0!, {r1, r3, ip, lr} > + tst r2, #16 > + stmneia r0!, {r1, r3, ip, lr} > + ldr lr, [sp], #4 > + > +#else > + > +/* > + * This version aligns the destination pointer in order to write > + * whole cache lines at once. 
> + */ > + > + stmfd sp!, {r4-r7, lr} > + mov r4, r1 > + mov r5, r1 > + mov r6, r1 > + mov r7, r1 > + mov ip, r1 > + mov lr, r1 > + > + cmp r2, #96 > + tstgt r0, #31 > + ble 3f > + > + and ip, r0, #31 > + rsb ip, ip, #32 > + sub r2, r2, ip > + movs ip, ip, lsl #(32 - 4) > + stmcsia r0!, {r4, r5, r6, r7} > + stmmiia r0!, {r4, r5} > + tst ip, #(1<< 30) > + mov ip, r1 > + strne r1, [r0], #4 > + > +3: subs r2, r2, #64 > + stmgeia r0!, {r1, r3-r7, ip, lr} > + stmgeia r0!, {r1, r3-r7, ip, lr} > + bgt 3b > + ldmeqfd sp!, {r4-r7, pc} > + > + tst r2, #32 > + stmneia r0!, {r1, r3-r7, ip, lr} > + tst r2, #16 > + stmneia r0!, {r4-r7} > + ldmfd sp!, {r4-r7, lr} > + > +#endif > + > +4: tst r2, #8 > + stmneia r0!, {r1, r3} > + tst r2, #4 > + strne r1, [r0], #4 > +/* > + * When we get here, we've got less than 4 bytes to zero. We > + * may have an unaligned pointer as well. > + */ > +5: tst r2, #2 > + strneb r1, [r0], #1 > + strneb r1, [r0], #1 > + tst r2, #1 > + strneb r1, [r0], #1 > + mov pc, lr Amicalement, -- Albert. ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux 2011-01-26 12:07 ` Albert ARIBAUD @ 2011-01-26 12:50 ` Matthias Weißer 2011-01-26 13:07 ` Wolfgang Denk 0 siblings, 1 reply; 15+ messages in thread From: Matthias Weißer @ 2011-01-26 12:50 UTC (permalink / raw) To: u-boot Am 26.01.2011 13:07, schrieb Albert ARIBAUD: >> --- > > IIRC, the '---' line separates patch commit message (above) from > freeform comments and history (below). Here, at least the version > history should move below the '---' line. Wolfgang asked me that I add the numbers to the commit message. For the changelog I will investigate the git commands on how to do that best without manually editing the patch file before git send-email them. >> +- CONFIG_USE_ARCH_MEMCPY >> + CONFIG_USE_ARCH_MEMSET >> + If these options are used a optimized version of memcpy/memset will >> + be used if available. These functions may be faster under some >> + conditions but may increase the binary size. >> + > > The name of the options is not self-explaining to me. If the difference > is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx > would be a better name? Wolfgang didn't object on these names. If we use the OPTIMAL form it is still not clear what optimal mean. There may be a size optimized version and a speed optimized version. So we would need CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx which I personally dislike a lot as it is quite long. I also think that if there is an architecture specific function that it should be clear that this is optimal in some way. Thanks for review Regards Matthias ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux 2011-01-26 12:50 ` Matthias Weißer @ 2011-01-26 13:07 ` Wolfgang Denk 2011-01-27 18:39 ` Albert ARIBAUD 0 siblings, 1 reply; 15+ messages in thread From: Wolfgang Denk @ 2011-01-26 13:07 UTC (permalink / raw) To: u-boot Dear Matthias Weißer, In message <4D4018AD.7090001@arcor.de> you wrote: > > > IIRC, the '---' line separates patch commit message (above) from > > freeform comments and history (below). Here, at least the version > > history should move below the '---' line. > > Wolfgang asked me that I add the numbers to the commit message. For the > changelog I will investigate the git commands on how to do that best > without manually editing the patch file before git send-email them. Indeed I find that these numbers are information that should go into the commit message so this data is available to users who have to decide whether they want to trade the increased speed for the increased memory footprint. > >> +- CONFIG_USE_ARCH_MEMCPY > >> + CONFIG_USE_ARCH_MEMSET > >> + If these options are used a optimized version of memcpy/memset will > >> + be used if available. These functions may be faster under some > >> + conditions but may increase the binary size. > >> + > > > > The name of the options is not self-explaining to me. If the difference > > is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx > > would be a better name? > > Wolfgang didn't object on these names. If we use the OPTIMAL form it is > still not clear what optimal mean. There may be a size optimized version > and a speed optimized version. So we would need > CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx which I personally dislike a lot as > it is quite long. I also think that if there is an architecture specific > function that it should be clear that this is optimal in some way. 
Well, "optimal" is not a good idea as I am pretty sure that some clever person will still be able to spare some cycles here and there, so his code would be even "more optimal" ;-) I think the names CONFIG_USE_ARCH_MEMCPY etc. are actually pretty good, because they are in line with the standard names __HAVE_ARCH_MEMCPY etc. that are used in a lot of libraries. Best regards, Wolfgang Denk -- DENX Software Engineering GmbH, MD: Wolfgang Denk & Detlev Zundel HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd at denx.de "355/113 -- Not the famous irrational number PI, but an incredible simulation!" ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux 2011-01-26 13:07 ` Wolfgang Denk @ 2011-01-27 18:39 ` Albert ARIBAUD 0 siblings, 0 replies; 15+ messages in thread From: Albert ARIBAUD @ 2011-01-27 18:39 UTC (permalink / raw) To: u-boot Hi Wolfgang, Le 26/01/2011 14:07, Wolfgang Denk a écrit : > Dear Matthias Weißer, > > In message<4D4018AD.7090001@arcor.de> you wrote: >> >>> IIRC, the '---' line separates patch commit message (above) from >>> freeform comments and history (below). Here, at least the version >>> history should move below the '---' line. >> >> Wolfgang asked me that I add the numbers to the commit message. For the >> changelog I will investigate the git commands on how to do that best >> without manually editing the patch file before git send-email them. > > Indeed I find that these numbers are information that should go into > the commit message so this data is available to users who have to > decide whether they want to trade the increased speed for the > increased memory footprint. Can't we have these numbers in a more compact form then? That makes a really big commit message. >>>> +- CONFIG_USE_ARCH_MEMCPY >>>> + CONFIG_USE_ARCH_MEMSET >>>> + If these options are used a optimized version of memcpy/memset will >>>> + be used if available. These functions may be faster under some >>>> + conditions but may increase the binary size. >>>> + >>> >>> The name of the options is not self-explaining to me. If the difference >>> is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx >>> would be a better name? >> >> Wolfgang didn't object on these names. If we use the OPTIMAL form it is >> still not clear what optimal mean. There may be a size optimized version >> and a speed optimized version. So we would need >> CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx which I personally dislike a lot as >> it is quite long. 
I also think that if there is an architecture specific >> function that it should be clear that this is optimal in some way. > > Well, "optimal" is not a good idea as I am pretty sure that some > clever person will still be able to spare some cycles here and there, > so his code would be even "more optimal" ;-) Granted. > I think the names CONFIG_USE_ARCH_MEMCPY etc. are actually pretty > good, because they are in line with the standard names > __HAVE_ARCH_MEMCPY etc. that are used in a lot of libraries. All right. > Best regards, > > Wolfgang Denk Amicalement, -- Albert. ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux 2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser 2011-01-26 12:07 ` Albert ARIBAUD @ 2011-02-20 19:35 ` Alexander Holler 2011-03-03 7:07 ` Albert ARIBAUD 2 siblings, 0 replies; 15+ messages in thread From: Alexander Holler @ 2011-02-20 19:35 UTC (permalink / raw) To: u-boot Hello, Am 26.01.2011 11:45, schrieb Matthias Weisser: > Using optimized versions of memset and memcpy from linux brings a quite > noticeable speed (x2 or better) improvement for these two functions. Thanks, tested on armv5 and armv7, therefor Tested-by: Alexander Holler <holler@ahsoftware.de> Regards, Alexander ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux 2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser 2011-01-26 12:07 ` Albert ARIBAUD 2011-02-20 19:35 ` Alexander Holler @ 2011-03-03 7:07 ` Albert ARIBAUD 2 siblings, 0 replies; 15+ messages in thread From: Albert ARIBAUD @ 2011-03-03 7:07 UTC (permalink / raw) To: u-boot Hi Matthias, Le 26/01/2011 11:45, Matthias Weisser a ?crit : > Using optimized versions of memset and memcpy from linux brings a quite > noticeable speed (x2 or better) improvement for these two functions. > > Here are some numbers for test done with jadecpu > > | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| > | | +patch | | +patch | > ---------------------------+--------+--------+--------+--------+ > Reset to prompt | 438ms | 330ms | 228ms | 120ms | > | | | | | > TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | > | | | | | > FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | > | | | | | > BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | > where CRC is | 615ms | 615ms | 54ms | 54ms | > uncompress | 2460ms | 2462ms | 450ms | 451ms | > final boot_elf | 376ms | 68ms | 65ms | 65ms | > | | | | | > BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | > where CRC is | 600ms | 600ms | 135ms | 135ms | > uncompress | 2209ms | 2211ms | 828ms | 828ms | > | | | | | > Copy 1.4MB from NOR to RAM | 134ms | 72ms | 120ms | 70ms | > > (1) No dcache > (2) dcache enabled in board_init > *Does not work when dcache is on > > Size impact: > > C version: > text data bss dec hex filename > 202862 18912 266456 488230 77326 u-boot > > ASM version: > text data bss dec hex filename > 203798 18912 266288 488998 77626 u-boot > 222712 u-boot.bin > > Changes since V1: > - Made the usage of these functions optional be CONFIG_USE_ARCH_MEM > - Usage of PLD instruction on all architectures supporting it > - Added a README entry > - Minor style fixes > > Signed-off-by: Matthias Weisser<weisserm@arcor.de> > --- Sorry for leaving this patch 
alone for so long. Can you please just repost a (rebased) V3 with the history below the cut line so that it does not appear in the commit message? As it is not a fix but was tested on two different ARM archs, I think I'll add it to master rather than next so that it goes into mainline without delay. Amicalement, -- Albert. ^ permalink raw reply [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V3] arm: Use optimized memcpy and memset from linux 2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser 2011-01-24 16:13 ` Wolfgang Denk 2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser @ 2011-03-11 7:36 ` Matthias Weisser 2011-03-24 14:34 ` Albert ARIBAUD 2 siblings, 1 reply; 15+ messages in thread From: Matthias Weisser @ 2011-03-11 7:36 UTC (permalink / raw) To: u-boot Using optimized versions of memset and memcpy from linux brings a quite noticeable speed (x2 or better) improvement for these two functions. Here are some numbers for test done with jadecpu | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| | | +patch | | +patch | ---------------------------+--------+--------+--------+--------+ Reset to prompt | 438ms | 330ms | 228ms | 120ms | | | | | | TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | | | | | | FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | | | | | | BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | where CRC is | 615ms | 615ms | 54ms | 54ms | uncompress | 2460ms | 2462ms | 450ms | 451ms | final boot_elf | 376ms | 68ms | 65ms | 65ms | | | | | | BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | where CRC is | 600ms | 600ms | 135ms | 135ms | uncompress | 2209ms | 2211ms | 828ms | 828ms | | | | | | Copy 1.4MB from NOR to RAM | 134ms | 72ms | 120ms | 70ms | (1) No dcache (2) dcache enabled in board_init *Does not work when dcache is on Size impact: C version: text data bss dec hex filename 202862 18912 266456 488230 77326 u-boot ASM version: text data bss dec hex filename 203798 18912 266288 488998 77626 u-boot 222712 u-boot.bin Signed-off-by: Matthias Weisser <weisserm@arcor.de> --- Changes since V2: - Moved the history below the --- line Changes since V1: - Made the usage of these functions optional be CONFIG_USE_ARCH_MEM - Usage of PLD instruction on all architectures supporting it - Added a README entry - Minor style fixes README | 6 + 
arch/arm/include/asm/assembler.h | 60 ++++++++++ arch/arm/include/asm/string.h | 10 ++- arch/arm/lib/Makefile | 2 + arch/arm/lib/memcpy.S | 241 ++++++++++++++++++++++++++++++++++++++ arch/arm/lib/memset.S | 126 ++++++++++++++++++++ 6 files changed, 443 insertions(+), 2 deletions(-) create mode 100644 arch/arm/include/asm/assembler.h create mode 100644 arch/arm/lib/memcpy.S create mode 100644 arch/arm/lib/memset.S diff --git a/README b/README index 755d17c..c2d82a5 100644 --- a/README +++ b/README @@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options: that is executed before the actual U-Boot. E.g. when compiling a NAND SPL. +- CONFIG_USE_ARCH_MEMCPY + CONFIG_USE_ARCH_MEMSET + If these options are used a optimized version of memcpy/memset will + be used if available. These functions may be faster under some + conditions but may increase the binary size. + Building the Software: ====================== diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h new file mode 100644 index 0000000..5e4789b --- /dev/null +++ b/arch/arm/include/asm/assembler.h @@ -0,0 +1,60 @@ +/* + * arch/arm/include/asm/assembler.h + * + * Copyright (C) 1996-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This file contains arm architecture specific defines + * for the different processors. + * + * Do not include any C declarations in this file - it is included by + * assembler source. + */ + +/* + * Endian independent macros for shifting bytes within registers. 
+ */ +#ifndef __ARMEB__ +#define pull lsr +#define push lsl +#define get_byte_0 lsl #0 +#define get_byte_1 lsr #8 +#define get_byte_2 lsr #16 +#define get_byte_3 lsr #24 +#define put_byte_0 lsl #0 +#define put_byte_1 lsl #8 +#define put_byte_2 lsl #16 +#define put_byte_3 lsl #24 +#else +#define pull lsl +#define push lsr +#define get_byte_0 lsr #24 +#define get_byte_1 lsr #16 +#define get_byte_2 lsr #8 +#define get_byte_3 lsl #0 +#define put_byte_0 lsl #24 +#define put_byte_1 lsl #16 +#define put_byte_2 lsl #8 +#define put_byte_3 lsl #0 +#endif + +/* + * Data preload for architectures that support it + */ +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) +#define PLD(code...) code +#else +#define PLD(code...) +#endif + +/* + * Cache alligned + */ +#define CALGN(code...) code diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h index c3ea582..c6dfb25 100644 --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -1,6 +1,8 @@ #ifndef __ASM_ARM_STRING_H #define __ASM_ARM_STRING_H +#include <config.h> + /* * We don't do inline string functions, since the * optimised inline asm versions are not small. 
@@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c); #undef __HAVE_ARCH_STRCHR extern char * strchr(const char * s, int c); -#undef __HAVE_ARCH_MEMCPY +#ifdef CONFIG_USE_ARCH_MEMCPY +#define __HAVE_ARCH_MEMCPY +#endif extern void * memcpy(void *, const void *, __kernel_size_t); #undef __HAVE_ARCH_MEMMOVE @@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t); extern void * memchr(const void *, int, __kernel_size_t); #undef __HAVE_ARCH_MEMZERO -#undef __HAVE_ARCH_MEMSET +#ifdef CONFIG_USE_ARCH_MEMSET +#define __HAVE_ARCH_MEMSET +#endif extern void * memset(void *, int, __kernel_size_t); #if 0 diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 454440c..03b1b5e 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -44,6 +44,8 @@ COBJS-y += cache-cp15.o endif COBJS-y += interrupts.o COBJS-y += reset.o +SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o +SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o SRCS := $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \ $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c) diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S new file mode 100644 index 0000000..40db90e --- /dev/null +++ b/arch/arm/lib/memcpy.S @@ -0,0 +1,241 @@ +/* + * linux/arch/arm/lib/memcpy.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <asm/assembler.h> + +#define W(instr) instr + +#define LDR1W_SHIFT 0 +#define STR1W_SHIFT 0 + + .macro ldr1w ptr reg abort + W(ldr) \reg, [\ptr], #4 + .endm + + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} + .endm + + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro ldr1b ptr reg cond=al abort + ldr\cond\()b \reg, [\ptr], #1 + .endm + + .macro str1w ptr reg abort + W(str) \reg, [\ptr], #4 + .endm + + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm + + .macro str1b ptr reg cond=al abort + str\cond\()b \reg, [\ptr], #1 + .endm + + .macro enter reg1 reg2 + stmdb sp!, {r0, \reg1, \reg2} + .endm + + .macro exit reg1 reg2 + ldmfd sp!, {r0, \reg1, \reg2} + .endm + + .text + +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ + +.globl memcpy +memcpy: + + enter r4, lr + + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #0] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb r3, ip, #32 ) + CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, r3 ) @ C gets set + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #0] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 4f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +3: PLD( pld [r1, #124] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #32 + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 +#if LDR1W_SHIFT > 0 + lsl ip, ip, #LDR1W_SHIFT +#endif + addne pc, pc, ip @ C is always clear here + b 7f +6: + .rept (1 << LDR1W_SHIFT) + W(nop) + .endr + ldr1w r1, r3, abort=20f + ldr1w 
r1, r4, abort=20f + ldr1w r1, r5, abort=20f + ldr1w r1, r6, abort=20f + ldr1w r1, r7, abort=20f + ldr1w r1, r8, abort=20f + ldr1w r1, lr, abort=20f + +#if LDR1W_SHIFT < STR1W_SHIFT + lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT +#elif LDR1W_SHIFT > STR1W_SHIFT + lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT +#endif + add pc, pc, ip + nop + .rept (1 << STR1W_SHIFT) + W(nop) + .endr + str1w r0, r3, abort=20f + str1w r0, r4, abort=20f + str1w r0, r5, abort=20f + str1w r0, r6, abort=20f + str1w r0, r7, abort=20f + str1w r0, r8, abort=20f + str1w r0, lr, abort=20f + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldr1b r1, r3, ne, abort=21f + ldr1b r1, r4, cs, abort=21f + ldr1b r1, ip, cs, abort=21f + str1b r0, r3, ne, abort=21f + str1b r0, r4, cs, abort=21f + str1b r0, ip, cs, abort=21f + + exit r4, pc + +9: rsb ip, ip, #4 + cmp ip, #2 + ldr1b r1, r3, gt, abort=21f + ldr1b r1, r4, ge, abort=21f + ldr1b r1, lr, abort=21f + str1b r0, r3, gt, abort=21f + str1b r0, r4, ge, abort=21f + subs r2, r2, ip + str1b r0, lr, abort=21f + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr1w r1, lr, abort=21f + beq 17f + bgt 18f + + + .macro forward_copy_shift pull push + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r0, #31 ) + CALGN( rsb ip, ip, #32 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #0] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 13f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +12: PLD( pld [r1, #124] ) +13: ldr4w r1, r4, r5, r6, r7, abort=19f + mov r3, lr, pull #\pull + subs r2, r2, #32 + ldr4w r1, r8, r9, ip, lr, abort=19f + orr r3, r3, r4, push #\push + mov r4, r4, pull #\pull + orr r4, r4, r5, push #\push + mov r5, r5, pull #\pull + orr r5, r5, r6, push #\push + mov r6, r6, pull #\pull + orr r6, r6, r7, push #\push + mov r7, r7, pull #\pull + orr r7, r7, r8, push #\push + mov r8, r8, pull #\pull + orr r8, 
r8, r9, push #\push + mov r9, r9, pull #\pull + orr r9, r9, ip, push #\push + mov ip, ip, pull #\pull + orr ip, ip, lr, push #\push + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov r3, lr, pull #\pull + ldr1w r1, lr, abort=21f + subs ip, ip, #4 + orr r3, r3, lr, push #\push + str1w r0, r3, abort=21f + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: sub r1, r1, #(\push / 8) + b 8b + + .endm + + + forward_copy_shift pull=8 push=24 + +17: forward_copy_shift pull=16 push=16 + +18: forward_copy_shift pull=24 push=8 + diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S new file mode 100644 index 0000000..0cdf895 --- /dev/null +++ b/arch/arm/lib/memset.S @@ -0,0 +1,126 @@ +/* + * linux/arch/arm/lib/memset.S + * + * Copyright (C) 1995-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ASM optimised string functions + */ +#include <asm/assembler.h> + + .text + .align 5 + .word 0 + +1: subs r2, r2, #4 @ 1 do we have enough + blt 5f @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [r0], #1 @ 1 + strleb r1, [r0], #1 @ 1 + strb r1, [r0], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) +/* + * The pointer is now aligned and the length is adjusted. Try doing the + * memset again. + */ + +.globl memset +memset: + ands r3, r0, #3 @ 1 unaligned? + bne 1b @ 1 +/* + * we know that the pointer in r0 is aligned to a word boundary. + */ + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + cmp r2, #16 + blt 4f + +#if ! CALGN(1)+0 + +/* + * We need an extra register for this loop - save the return address and + * use the LR + */ + str lr, [sp, #-4]! + mov ip, r1 + mov lr, r1 + +2: subs r2, r2, #64 + stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. 
+ stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + stmgeia r0!, {r1, r3, ip, lr} + bgt 2b + ldmeqfd sp!, {pc} @ Now <64 bytes to go. +/* + * No need to correct the count; we're only testing bits from now on + */ + tst r2, #32 + stmneia r0!, {r1, r3, ip, lr} + stmneia r0!, {r1, r3, ip, lr} + tst r2, #16 + stmneia r0!, {r1, r3, ip, lr} + ldr lr, [sp], #4 + +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines@once. + */ + + stmfd sp!, {r4-r7, lr} + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov ip, r1 + mov lr, r1 + + cmp r2, #96 + tstgt r0, #31 + ble 3f + + and ip, r0, #31 + rsb ip, ip, #32 + sub r2, r2, ip + movs ip, ip, lsl #(32 - 4) + stmcsia r0!, {r4, r5, r6, r7} + stmmiia r0!, {r4, r5} + tst ip, #(1 << 30) + mov ip, r1 + strne r1, [r0], #4 + +3: subs r2, r2, #64 + stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia r0!, {r1, r3-r7, ip, lr} + bgt 3b + ldmeqfd sp!, {r4-r7, pc} + + tst r2, #32 + stmneia r0!, {r1, r3-r7, ip, lr} + tst r2, #16 + stmneia r0!, {r4-r7} + ldmfd sp!, {r4-r7, lr} + +#endif + +4: tst r2, #8 + stmneia r0!, {r1, r3} + tst r2, #4 + strne r1, [r0], #4 +/* + * When we get here, we've got less than 4 bytes to zero. We + * may have an unaligned pointer as well. + */ +5: tst r2, #2 + strneb r1, [r0], #1 + strneb r1, [r0], #1 + tst r2, #1 + strneb r1, [r0], #1 + mov pc, lr -- 1.7.0.4 ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [U-Boot] [PATCH V3] arm: Use optimized memcpy and memset from linux 2011-03-11 7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser @ 2011-03-24 14:34 ` Albert ARIBAUD 0 siblings, 0 replies; 15+ messages in thread From: Albert ARIBAUD @ 2011-03-24 14:34 UTC (permalink / raw) To: u-boot Le 11/03/2011 08:36, Matthias Weisser a ?crit : > Using optimized versions of memset and memcpy from linux brings a quite > noticeable speed (x2 or better) improvement for these two functions. > > Here are some numbers for test done with jadecpu > > | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)| > | | +patch | | +patch | > ---------------------------+--------+--------+--------+--------+ > Reset to prompt | 438ms | 330ms | 228ms | 120ms | > | | | | | > TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms | > | | | | | > FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ | > | | | | | > BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms | > where CRC is | 615ms | 615ms | 54ms | 54ms | > uncompress | 2460ms | 2462ms | 450ms | 451ms | > final boot_elf | 376ms | 68ms | 65ms | 65ms | > | | | | | > BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms | > where CRC is | 600ms | 600ms | 135ms | 135ms | > uncompress | 2209ms | 2211ms | 828ms | 828ms | > | | | | | > Copy 1.4MB from NOR to RAM | 134ms | 72ms | 120ms | 70ms | > > (1) No dcache > (2) dcache enabled in board_init > *Does not work when dcache is on > > Size impact: > > C version: > text data bss dec hex filename > 202862 18912 266456 488230 77326 u-boot > > ASM version: > text data bss dec hex filename > 203798 18912 266288 488998 77626 u-boot > 222712 u-boot.bin > > Signed-off-by: Matthias Weisser<weisserm@arcor.de> > --- Applied to u-boot-arm/master, thanks. Amicalement, -- Albert. ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2011-03-24 14:34 UTC | newest] Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2011-01-24 15:56 [U-Boot] [PATCH] arm: Use optimized memcpy and memset from linux Matthias Weisser 2011-01-24 16:13 ` Wolfgang Denk 2011-01-24 19:24 ` Matthias Weißer 2011-01-24 20:07 ` Wolfgang Denk 2011-01-25 10:55 ` Matthias Weißer 2011-01-25 20:05 ` Wolfgang Denk 2011-01-26 10:45 ` [U-Boot] [PATCH V2] " Matthias Weisser 2011-01-26 12:07 ` Albert ARIBAUD 2011-01-26 12:50 ` Matthias Weißer 2011-01-26 13:07 ` Wolfgang Denk 2011-01-27 18:39 ` Albert ARIBAUD 2011-02-20 19:35 ` Alexander Holler 2011-03-03 7:07 ` Albert ARIBAUD 2011-03-11 7:36 ` [U-Boot] [PATCH V3] " Matthias Weisser 2011-03-24 14:34 ` Albert ARIBAUD
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.