From: Matthew Wilcox <mawilcox@microsoft.com> A relatively common idiom we're missing is a function to fill an area of memory with a pattern which is larger than a single byte. I first noticed this with a zram patch which wanted to fill a page with an 'unsigned long' value. There turn out to be quite a few places in the kernel which can benefit from using an optimised function rather than a loop; sometimes text size, sometimes speed, and sometimes both. The optimised PowerPC version (not included here) improves performance by about 30% on POWER8 on just the raw memset_l(). Most of the extra lines of code come from the three testcases I added. Matthew Wilcox (8): Add multibyte memset functions Add testcases for memset16/32/64 x86: Implement memset16, memset32 & memset64 ARM: Implement memset32 & memset64 alpha: Add support for memset16 zram: Convert to using memset_l sym53c8xx_2: Convert to use memset32 vga: Optimise console scrolling arch/alpha/include/asm/string.h | 15 +-- arch/alpha/include/asm/vga.h | 2 +- arch/alpha/lib/memset.S | 10 +- arch/arm/include/asm/string.h | 14 +++ arch/arm/kernel/armksyms.c | 2 + arch/arm/lib/memset.S | 24 +++-- arch/mips/include/asm/vga.h | 7 ++ arch/powerpc/include/asm/vga.h | 8 ++ arch/sparc/include/asm/vga.h | 25 +++++ arch/x86/include/asm/string_32.h | 24 +++++ arch/x86/include/asm/string_64.h | 36 +++++++ drivers/block/zram/zram_drv.c | 13 +-- drivers/scsi/sym53c8xx_2/sym_hipd.c | 11 +- include/linux/string.h | 30 ++++++ include/linux/vt_buffer.h | 12 +++ lib/Kconfig | 3 + lib/string.c | 196 ++++++++++++++++++++++++++++++++++++ 17 files changed, 394 insertions(+), 38 deletions(-) -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> memset16(), memset32() and memset64() are like memset(), but allow the caller to fill the destination with a value larger than a single byte. memset_l() and memset_p() allow the caller to use unsigned long and pointer values respectively. Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> --- include/linux/string.h | 30 +++++++++++++++++++++++ lib/string.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index a467e617eeb0..c8bdafffd2f0 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -99,6 +99,36 @@ extern __kernel_size_t strcspn(const char *,const char *); #ifndef __HAVE_ARCH_MEMSET extern void * memset(void *,int,__kernel_size_t); #endif + +#ifndef __HAVE_ARCH_MEMSET16 +extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); +#endif + +#ifndef __HAVE_ARCH_MEMSET32 +extern void *memset32(uint32_t *, uint32_t, __kernel_size_t); +#endif + +#ifndef __HAVE_ARCH_MEMSET64 +extern void *memset64(uint64_t *, uint64_t, __kernel_size_t); +#endif + +static inline void *memset_l(unsigned long *p, unsigned long v, + __kernel_size_t n) +{ + if (BITS_PER_LONG == 32) + return memset32((uint32_t *)p, v, n); + else + return memset64((uint64_t *)p, v, n); +} + +static inline void *memset_p(void **p, void *v, __kernel_size_t n) +{ + if (BITS_PER_LONG == 32) + return memset32((uint32_t *)p, (uintptr_t)v, n); + else + return memset64((uint64_t *)p, (uintptr_t)v, n); +} + #ifndef __HAVE_ARCH_MEMCPY extern void * memcpy(void *,const void *,__kernel_size_t); #endif diff --git a/lib/string.c b/lib/string.c index ebbb99c775bd..198148bb61fd 100644 --- a/lib/string.c +++ b/lib/string.c @@ -723,6 +723,72 @@ void memzero_explicit(void *s, size_t count) } EXPORT_SYMBOL(memzero_explicit); +#ifndef __HAVE_ARCH_MEMSET16 +/** + * memset16() - Fill a memory area with a uint16_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint16_t instead + * of a byte. Remember that @count is the number of uint16_ts to + * store, not the number of bytes. + */ +void *memset16(uint16_t *s, uint16_t v, size_t count) +{ + uint16_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset16); +#endif + +#ifndef __HAVE_ARCH_MEMSET32 +/** + * memset32() - Fill a memory area with a uint32_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint32_t instead + * of a byte. Remember that @count is the number of uint32_ts to + * store, not the number of bytes. + */ +void *memset32(uint32_t *s, uint32_t v, size_t count) +{ + uint32_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset32); +#endif + +#ifndef __HAVE_ARCH_MEMSET64 +/** + * memset64() - Fill a memory area with a uint64_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint64_t instead + * of a byte. Remember that @count is the number of uint64_ts to + * store, not the number of bytes. + */ +void *memset64(uint64_t *s, uint64_t v, size_t count) +{ + uint64_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset64); +#endif + #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> --- lib/Kconfig | 3 ++ lib/string.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/lib/Kconfig b/lib/Kconfig index 6762529ad9e4..40b114a11d7c 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -575,4 +575,7 @@ config PARMAN config PRIME_NUMBERS tristate +config STRING_SELFTEST + bool "Test string functions" + endmenu diff --git a/lib/string.c b/lib/string.c index 198148bb61fd..5af2211c3633 100644 --- a/lib/string.c +++ b/lib/string.c @@ -1051,3 +1051,133 @@ void fortify_panic(const char *name) BUG(); } EXPORT_SYMBOL(fortify_panic); + +#ifdef CONFIG_STRING_SELFTEST +#include <linux/slab.h> +#include <linux/module.h> + +static __init int memset16_selftest(void) +{ + unsigned i, j, k; + u16 v, *p = kmalloc(256 * 2 * 2, GFP_KERNEL); + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset16(p + i, 0xb1b2, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2) + goto fail; + } else { + if (v != 0xa1a1) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int memset32_selftest(void) +{ + unsigned i, j, k; + u32 v, *p = kmalloc(256 * 2 * 4, GFP_KERNEL); + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset32(p + i, 0xb1b2b3b4, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1a1a1) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2b3b4) + goto fail; + } else { + if (v != 0xa1a1a1a1) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int memset64_selftest(void) +{ + unsigned i, j, k; + u64 v, *p = kmalloc(256 * 2 * 8, GFP_KERNEL); + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1a1a1a1a1a1a1ULL) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2b3b4b5b6b7b8ULL) + goto fail; + } else { + if (v != 0xa1a1a1a1a1a1a1a1ULL) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; + return 0; +} + +static __init int string_selftest_init(void) +{ + int test, subtest; + + test = 1; + subtest = memset16_selftest(); + if (subtest) + goto fail; + + test = 2; + subtest = memset32_selftest(); + if (subtest) + goto fail; + + test = 3; + subtest = memset64_selftest(); + if (subtest) + goto fail; + + pr_info("String selftests succeeded\n"); + return 0; +fail: + pr_crit("String selftest failure %d.%08x\n", test, subtest); + return 0; +} + +module_init(string_selftest_init); +#endif -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> These are single instructions on x86. There's no 64-bit instruction for x86-32, but we don't yet have any user for memset64() on 32-bit architectures, so don't bother to implement it. Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> --- arch/x86/include/asm/string_32.h | 24 ++++++++++++++++++++++++ arch/x86/include/asm/string_64.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index e9ee84873de5..e371e7229042 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -340,6 +340,30 @@ extern void *memset(void *, int, size_t); #endif #endif /* !CONFIG_FORTIFY_SOURCE */ +#define __HAVE_ARCH_MEMSET16 +static inline void *memset16(uint16_t *s, uint16_t v, size_t n) +{ + int d0, d1; + asm volatile("rep\n\t" + "stosw" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + +#define __HAVE_ARCH_MEMSET32 +static inline void *memset32(uint32_t *s, uint32_t v, size_t n) +{ + int d0, d1; + asm volatile("rep\n\t" + "stosl" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + /* * find the first occurrence of byte 'c', or 1 past the area if none */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 2a8c822de1fc..f372a70a523f 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -58,6 +58,42 @@ extern void *__memcpy(void *to, const void *from, size_t len); void *memset(void *s, int c, size_t n); void *__memset(void *s, int c, size_t n); +#define __HAVE_ARCH_MEMSET16 +static inline void *memset16(uint16_t *s, uint16_t v, size_t n) +{ + long d0, d1; + asm volatile("rep\n\t" + "stosw" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + +#define __HAVE_ARCH_MEMSET32 +static inline void *memset32(uint32_t *s, uint32_t v, size_t n) +{ + long d0, d1; + asm volatile("rep\n\t" + "stosl" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + +#define __HAVE_ARCH_MEMSET64 +static inline void *memset64(uint64_t *s, uint64_t v, size_t n) +{ + long d0, d1; + asm volatile("rep\n\t" + "stosq" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); void *__memmove(void *dest, const void *src, size_t count); -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> Reuse the existing optimised memset implementation to implement an optimised memset32 and memset64. Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk> --- arch/arm/include/asm/string.h | 14 ++++++++++++++ arch/arm/kernel/armksyms.c | 2 ++ arch/arm/lib/memset.S | 24 ++++++++++++++++++------ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h index cf4f3aad0fc1..fe1c6af3a1b1 100644 --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -24,6 +24,20 @@ extern void * memchr(const void *, int, __kernel_size_t); #define __HAVE_ARCH_MEMSET extern void * memset(void *, int, __kernel_size_t); +#define __HAVE_ARCH_MEMSET32 +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); +static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n) +{ + return __memset32(p, v, n * 4); +} + +#define __HAVE_ARCH_MEMSET64 +extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi); +static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) +{ + return __memset64(p, v, n * 8, v >> 32); +} + extern void __memzero(void *ptr, __kernel_size_t n); #define memset(p,v,n) \ diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 8e8d20cdbce7..5266fd9ad6b4 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -87,6 +87,8 @@ EXPORT_SYMBOL(__raw_writesl); EXPORT_SYMBOL(strchr); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(__memset32); +EXPORT_SYMBOL(__memset64); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memchr); diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index 3c65e3bd790f..ed6d35d9cdb5 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -28,7 +28,7 @@ UNWIND( .fnstart ) 1: orr r1, r1, r1, lsl #8 orr r1, r1, r1, lsl #16 mov r3, r1 - cmp r2, #16 +7: cmp r2, #16 blt 4f #if ! CALGN(1)+0 @@ -41,7 +41,7 @@ UNWIND( .fnend ) UNWIND( .fnstart ) UNWIND( .save {r8, lr} ) mov r8, r1 - mov lr, r1 + mov lr, r3 2: subs r2, r2, #64 stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time. @@ -73,11 +73,11 @@ UNWIND( .fnend ) UNWIND( .fnstart ) UNWIND( .save {r4-r8, lr} ) mov r4, r1 - mov r5, r1 + mov r5, r3 mov r6, r1 - mov r7, r1 + mov r7, r3 mov r8, r1 - mov lr, r1 + mov lr, r3 cmp r2, #96 tstgt ip, #31 @@ -114,7 +114,7 @@ UNWIND( .fnstart ) tst r2, #4 strne r1, [ip], #4 /* - * When we get here, we've got less than 4 bytes to zero. We + * When we get here, we've got less than 4 bytes to set. We * may have an unaligned pointer as well. */ 5: tst r2, #2 @@ -135,3 +135,15 @@ UNWIND( .fnstart ) UNWIND( .fnend ) ENDPROC(memset) ENDPROC(mmioset) + +ENTRY(__memset32) +UNWIND( .fnstart ) + mov r3, r1 @ copy r1 to r3 and fall into memset64 +UNWIND( .fnend ) +ENDPROC(__memset32) +ENTRY(__memset64) +UNWIND( .fnstart ) + mov ip, r0 @ preserve r0 as return value + b 7b @ jump into the middle of memset +UNWIND( .fnend ) +ENDPROC(__memset64) -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> Alpha already had an optimised fill-memory-with-16-bit-quantity assembler routine called memsetw(). It has a slightly different calling convention from memset16() in that it takes a byte count, not a count of words. That's the same convention used by ARM's __memset routines, so rename Alpha's routine to match and add a memset16() wrapper around it. Then convert Alpha's scr_memsetw() to call memset16() instead of memsetw(). Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> --- arch/alpha/include/asm/string.h | 15 ++++++++------- arch/alpha/include/asm/vga.h | 2 +- arch/alpha/lib/memset.S | 10 +++++----- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/alpha/include/asm/string.h b/arch/alpha/include/asm/string.h index c2911f591704..9eb9933d845f 100644 --- a/arch/alpha/include/asm/string.h +++ b/arch/alpha/include/asm/string.h @@ -65,13 +65,14 @@ extern void * memchr(const void *, int, size_t); aligned values. The DEST and COUNT parameters must be even for correct operation. */ -#define __HAVE_ARCH_MEMSETW -extern void * __memsetw(void *dest, unsigned short, size_t count); - -#define memsetw(s, c, n) \ -(__builtin_constant_p(c) \ - ? __constant_c_memset((s),0x0001000100010001UL*(unsigned short)(c),(n)) \ - : __memsetw((s),(c),(n))) +#define __HAVE_ARCH_MEMSET16 +extern void * __memset16(void *dest, unsigned short, size_t count); +static inline void *memset16(uint16_t *p, uint16_t v, size_t n) +{ + if (__builtin_constant_p(v)) + return __constant_c_memset(p, 0x0001000100010001UL * v, n * 2); + return __memset16(p, v, n * 2); +} #endif /* __KERNEL__ */ diff --git a/arch/alpha/include/asm/vga.h b/arch/alpha/include/asm/vga.h index c00106bac521..3c1c2b6128e7 100644 --- a/arch/alpha/include/asm/vga.h +++ b/arch/alpha/include/asm/vga.h @@ -34,7 +34,7 @@ static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) if (__is_ioaddr(s)) memsetw_io((u16 __iomem *) s, c, count); else - memsetw(s, c, count); + memset16(s, c, count / 2); } /* Do not trust that the usage will be correct; analyze the arguments. */ diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S index 89a26f5e89de..f824969e9e77 100644 --- a/arch/alpha/lib/memset.S +++ b/arch/alpha/lib/memset.S @@ -20,7 +20,7 @@ .globl memset .globl __memset .globl ___memset - .globl __memsetw + .globl __memset16 .globl __constant_c_memset .ent ___memset @@ -110,8 +110,8 @@ EXPORT_SYMBOL(___memset) EXPORT_SYMBOL(__constant_c_memset) .align 5 - .ent __memsetw -__memsetw: + .ent __memset16 +__memset16: .prologue 0 inswl $17,0,$1 /* E0 */ @@ -123,8 +123,8 @@ __memsetw: or $1,$4,$17 /* E0 */ br __constant_c_memset /* .. E1 */ - .end __memsetw -EXPORT_SYMBOL(__memsetw) + .end __memset16 +EXPORT_SYMBOL(__memset16) memset = ___memset __memset = ___memset -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> zram was the motivation for creating memset_l(). Minchan Kim sees a 7% performance improvement on x86 with 100MB of non-zero deduplicatable data: perf stat -r 10 dd if=/dev/zram0 of=/dev/null vanilla: 0.232050465 seconds time elapsed ( +- 0.51% ) memset_l: 0.217219387 seconds time elapsed ( +- 0.07% ) Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> Tested-by: Minchan Kim <minchan@kernel.org> --- drivers/block/zram/zram_drv.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 856d5dc02451..2df50d82dc29 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -175,20 +175,11 @@ static inline void update_used_max(struct zram *zram, } while (old_max != cur_max); } -static inline void zram_fill_page(char *ptr, unsigned long len, +static inline void zram_fill_page(void *ptr, unsigned long len, unsigned long value) { - int i; - unsigned long *page = (unsigned long *)ptr; - WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long))); - - if (likely(value == 0)) { - memset(ptr, 0, len); - } else { - for (i = 0; i < len / sizeof(*page); i++) - page[i] = value; - } + memset_l(ptr, value, len / sizeof(unsigned long)); } static bool page_same_filled(void *ptr, unsigned long *element) -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> memset32() can be used to initialise these three arrays. Minor code footprint reduction. Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> --- drivers/scsi/sym53c8xx_2/sym_hipd.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c index 6b349e301869..b886b10e3499 100644 --- a/drivers/scsi/sym53c8xx_2/sym_hipd.c +++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c @@ -4985,13 +4985,10 @@ struct sym_lcb *sym_alloc_lcb (struct sym_hcb *np, u_char tn, u_char ln) * Compute the bus address of this table. */ if (ln && !tp->luntbl) { - int i; - tp->luntbl = sym_calloc_dma(256, "LUNTBL"); if (!tp->luntbl) goto fail; - for (i = 0 ; i < 64 ; i++) - tp->luntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa)); + memset32(tp->luntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64); tp->head.luntbl_sa = cpu_to_scr(vtobus(tp->luntbl)); } @@ -5077,8 +5074,7 @@ static void sym_alloc_lcb_tags (struct sym_hcb *np, u_char tn, u_char ln) /* * Initialize the task table with invalid entries. */ - for (i = 0 ; i < SYM_CONF_MAX_TASK ; i++) - lp->itlq_tbl[i] = cpu_to_scr(np->notask_ba); + memset32(lp->itlq_tbl, cpu_to_scr(np->notask_ba), SYM_CONF_MAX_TASK); /* * Fill up the tag buffer with tag numbers. @@ -5764,8 +5760,7 @@ int sym_hcb_attach(struct Scsi_Host *shost, struct sym_fw *fw, struct sym_nvram goto attach_failed; np->badlun_sa = cpu_to_scr(SCRIPTB_BA(np, resel_bad_lun)); - for (i = 0 ; i < 64 ; i++) /* 64 luns/target, no less */ - np->badluntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa)); + memset32(np->badluntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64); /* * Prepare the bus address array that contains the bus -- 2.13.2
From: Matthew Wilcox <mawilcox@microsoft.com> Where possible, call memset16(), memmove() or memcpy() instead of using open-coded loops. I don't like the calling convention that uses a byte count instead of a count of u16s, but it's a little late to change that. Reduces code size of fbcon.o by almost 400 bytes on my laptop build. Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> --- arch/mips/include/asm/vga.h | 7 +++++++ arch/powerpc/include/asm/vga.h | 8 ++++++++ arch/sparc/include/asm/vga.h | 25 +++++++++++++++++++++++++ include/linux/vt_buffer.h | 12 ++++++++++++ 4 files changed, 52 insertions(+) diff --git a/arch/mips/include/asm/vga.h b/arch/mips/include/asm/vga.h index f82c83749a08..975ff51f80c4 100644 --- a/arch/mips/include/asm/vga.h +++ b/arch/mips/include/asm/vga.h @@ -6,6 +6,7 @@ #ifndef _ASM_VGA_H #define _ASM_VGA_H +#include <linux/string.h> #include <asm/addrspace.h> #include <asm/byteorder.h> @@ -40,9 +41,15 @@ static inline u16 scr_readw(volatile const u16 *addr) return le16_to_cpu(*addr); } +static inline void scr_memsetw(u16 *s, u16 v, unsigned int count) +{ + memset16(s, cpu_to_le16(v), count / 2); +} + #define scr_memcpyw(d, s, c) memcpy(d, s, c) #define scr_memmovew(d, s, c) memmove(d, s, c) #define VT_BUF_HAVE_MEMCPYW #define VT_BUF_HAVE_MEMMOVEW +#define VT_BUF_HAVE_MEMSETW #endif /* _ASM_VGA_H */ diff --git a/arch/powerpc/include/asm/vga.h b/arch/powerpc/include/asm/vga.h index ab3acd2f2786..7a7b541b7493 100644 --- a/arch/powerpc/include/asm/vga.h +++ b/arch/powerpc/include/asm/vga.h @@ -33,8 +33,16 @@ static inline u16 scr_readw(volatile const u16 *addr) return le16_to_cpu(*addr); } +#define VT_BUF_HAVE_MEMSETW +static inline void scr_memsetw(u16 *s, u16 v, unsigned int n) +{ + memset16(s, cpu_to_le16(v), n / 2); +} + #define VT_BUF_HAVE_MEMCPYW +#define VT_BUF_HAVE_MEMMOVEW #define scr_memcpyw memcpy +#define scr_memmovew memmove #endif /* !CONFIG_VGA_CONSOLE && !CONFIG_MDA_CONSOLE */ diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h index ec0e9967d93d..f54e8b6fb197 100644 --- a/arch/sparc/include/asm/vga.h +++ b/arch/sparc/include/asm/vga.h @@ -8,9 +8,13 @@ #define _LINUX_ASM_VGA_H_ #include <linux/bug.h> +#include <linux/string.h> #include <asm/types.h> #define VT_BUF_HAVE_RW +#define VT_BUF_HAVE_MEMSETW +#define VT_BUF_HAVE_MEMCPYW +#define VT_BUF_HAVE_MEMMOVEW #undef scr_writew #undef scr_readw @@ -29,6 +33,27 @@ static inline u16 scr_readw(const u16 *addr) return *addr; } +static inline void scr_memsetw(u16 *p, u16 v, unsigned int n) +{ + BUG_ON((long) p >= 0); + + memset16(p, cpu_to_le16(v), n / 2); +} + +static inline void scr_memcpyw(u16 *d, u16 *s, unsigned int n) +{ + BUG_ON((long) d >= 0); + + memcpy(d, s, n); +} + +static inline void scr_memmovew(u16 *d, u16 *s, unsigned int n) +{ + BUG_ON((long) d >= 0); + + memmove(d, s, n); +} + #define VGA_MAP_MEM(x,s) (x) #endif diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h index f38c10ba3ff5..31b92fcd8f03 100644 --- a/include/linux/vt_buffer.h +++ b/include/linux/vt_buffer.h @@ -26,24 +26,33 @@ #ifndef VT_BUF_HAVE_MEMSETW static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) { +#ifdef VT_BUF_HAVE_RW count /= 2; while (count--) scr_writew(c, s++); +#else + memset16(s, c, count / 2); +#endif } #endif #ifndef VT_BUF_HAVE_MEMCPYW static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count) { +#ifdef VT_BUF_HAVE_RW count /= 2; while (count--) scr_writew(scr_readw(s++), d++); +#else + memcpy(d, s, count); +#endif } #endif #ifndef VT_BUF_HAVE_MEMMOVEW static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count) { +#ifdef VT_BUF_HAVE_RW if (d < s) scr_memcpyw(d, s, count); else { @@ -53,6 +62,9 @@ static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count) while (count--) scr_writew(scr_readw(--s), --d); } +#else + memmove(d, s, count); +#endif } #endif -- 2.13.2
Matthew Wilcox <willy@infradead.org> writes:
> From: Matthew Wilcox <mawilcox@microsoft.com>
>
> A relatively common idiom we're missing is a function to fill an area
> of memory with a pattern which is larger than a single byte. I first
> noticed this with a zram patch which wanted to fill a page with an
> 'unsigned long' value. There turn out to be quite a few places in
> the kernel which can benefit from using an optimised function rather
> than a loop; sometimes text size, sometimes speed, and sometimes both.
> The optimised PowerPC version (not included here) improves performance
> by about 30% on POWER8 on just the raw memset_l().
Is the plan that Andrew will merge this series, or are you planning to
put them in a tree of yours?
cheers
On Tue, Jul 25, 2017 at 03:27:38PM +1000, Michael Ellerman wrote:
> Matthew Wilcox <willy@infradead.org> writes:
>
> > From: Matthew Wilcox <mawilcox@microsoft.com>
> >
> > A relatively common idiom we're missing is a function to fill an area
> > of memory with a pattern which is larger than a single byte. I first
> > noticed this with a zram patch which wanted to fill a page with an
> > 'unsigned long' value. There turn out to be quite a few places in
> > the kernel which can benefit from using an optimised function rather
> > than a loop; sometimes text size, sometimes speed, and sometimes both.
> > The optimised PowerPC version (not included here) improves performance
> > by about 30% on POWER8 on just the raw memset_l().
>
> Is the plan that Andrew will merge this series, or are you planning to
> put them in a tree of yours?
I'm hoping Andrew will take it, but I can put it in my own tree if he
doesn't want to take it.