* [PATCH v4 1/8] Add multibyte memset functions
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 2/8] Add testcases for memset16/32/64 Matthew Wilcox
` (7 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
memset16(), memset32() and memset64() are like memset(), but allow the
caller to fill the destination with a value larger than a single byte.
memset_l() and memset_p() allow the caller to use unsigned long and
pointer values respectively.
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
include/linux/string.h | 30 +++++++++++++++++++++++
lib/string.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 96 insertions(+)
diff --git a/include/linux/string.h b/include/linux/string.h
index a467e617eeb0..c8bdafffd2f0 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -99,6 +99,36 @@ extern __kernel_size_t strcspn(const char *,const char *);
#ifndef __HAVE_ARCH_MEMSET
extern void * memset(void *,int,__kernel_size_t);
#endif
+
+#ifndef __HAVE_ARCH_MEMSET16
+extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET32
+extern void *memset32(uint32_t *, uint32_t, __kernel_size_t);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET64
+extern void *memset64(uint64_t *, uint64_t, __kernel_size_t);
+#endif
+
+static inline void *memset_l(unsigned long *p, unsigned long v,
+ __kernel_size_t n)
+{
+ if (BITS_PER_LONG == 32)
+ return memset32((uint32_t *)p, v, n);
+ else
+ return memset64((uint64_t *)p, v, n);
+}
+
+static inline void *memset_p(void **p, void *v, __kernel_size_t n)
+{
+ if (BITS_PER_LONG == 32)
+ return memset32((uint32_t *)p, (uintptr_t)v, n);
+ else
+ return memset64((uint64_t *)p, (uintptr_t)v, n);
+}
+
#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
diff --git a/lib/string.c b/lib/string.c
index ebbb99c775bd..198148bb61fd 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -723,6 +723,72 @@ void memzero_explicit(void *s, size_t count)
}
EXPORT_SYMBOL(memzero_explicit);
+#ifndef __HAVE_ARCH_MEMSET16
+/**
+ * memset16() - Fill a memory area with a uint16_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint16_t instead
+ * of a byte. Remember that @count is the number of uint16_ts to
+ * store, not the number of bytes.
+ */
+void *memset16(uint16_t *s, uint16_t v, size_t count)
+{
+ uint16_t *xs = s;
+
+ while (count--)
+ *xs++ = v;
+ return s;
+}
+EXPORT_SYMBOL(memset16);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET32
+/**
+ * memset32() - Fill a memory area with a uint32_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint32_t instead
+ * of a byte. Remember that @count is the number of uint32_ts to
+ * store, not the number of bytes.
+ */
+void *memset32(uint32_t *s, uint32_t v, size_t count)
+{
+ uint32_t *xs = s;
+
+ while (count--)
+ *xs++ = v;
+ return s;
+}
+EXPORT_SYMBOL(memset32);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET64
+/**
+ * memset64() - Fill a memory area with a uint64_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint64_t instead
+ * of a byte. Remember that @count is the number of uint64_ts to
+ * store, not the number of bytes.
+ */
+void *memset64(uint64_t *s, uint64_t v, size_t count)
+{
+ uint64_t *xs = s;
+
+ while (count--)
+ *xs++ = v;
+ return s;
+}
+EXPORT_SYMBOL(memset64);
+#endif
+
#ifndef __HAVE_ARCH_MEMCPY
/**
* memcpy - Copy one area of memory to another
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 2/8] Add testcases for memset16/32/64
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 1/8] Add multibyte memset functions Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 3/8] x86: Implement memset16, memset32 & memset64 Matthew Wilcox
` (6 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
lib/Kconfig | 3 ++
lib/string.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 133 insertions(+)
diff --git a/lib/Kconfig b/lib/Kconfig
index 6762529ad9e4..40b114a11d7c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -575,4 +575,7 @@ config PARMAN
config PRIME_NUMBERS
tristate
+config STRING_SELFTEST
+ bool "Test string functions"
+
endmenu
diff --git a/lib/string.c b/lib/string.c
index 198148bb61fd..5af2211c3633 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -1051,3 +1051,133 @@ void fortify_panic(const char *name)
BUG();
}
EXPORT_SYMBOL(fortify_panic);
+
+#ifdef CONFIG_STRING_SELFTEST
+#include <linux/slab.h>
+#include <linux/module.h>
+
+static __init int memset16_selftest(void)
+{
+ unsigned i, j, k;
+ u16 v, *p = kmalloc(256 * 2 * 2, GFP_KERNEL);
+
+ for (i = 0; i < 256; i++) {
+ for (j = 0; j < 256; j++) {
+ memset(p, 0xa1, 256 * 2 * sizeof(v));
+ memset16(p + i, 0xb1b2, j);
+ for (k = 0; k < 512; k++) {
+ v = p[k];
+ if (k < i) {
+ if (v != 0xa1a1)
+ goto fail;
+ } else if (k < i + j) {
+ if (v != 0xb1b2)
+ goto fail;
+ } else {
+ if (v != 0xa1a1)
+ goto fail;
+ }
+ }
+ }
+ }
+
+fail:
+ kfree(p);
+ if (i < 256)
+ return (i << 24) | (j << 16) | k;
+ return 0;
+}
+
+static __init int memset32_selftest(void)
+{
+ unsigned i, j, k;
+ u32 v, *p = kmalloc(256 * 2 * 4, GFP_KERNEL);
+
+ for (i = 0; i < 256; i++) {
+ for (j = 0; j < 256; j++) {
+ memset(p, 0xa1, 256 * 2 * sizeof(v));
+ memset32(p + i, 0xb1b2b3b4, j);
+ for (k = 0; k < 512; k++) {
+ v = p[k];
+ if (k < i) {
+ if (v != 0xa1a1a1a1)
+ goto fail;
+ } else if (k < i + j) {
+ if (v != 0xb1b2b3b4)
+ goto fail;
+ } else {
+ if (v != 0xa1a1a1a1)
+ goto fail;
+ }
+ }
+ }
+ }
+
+fail:
+ kfree(p);
+ if (i < 256)
+ return (i << 24) | (j << 16) | k;
+ return 0;
+}
+
+static __init int memset64_selftest(void)
+{
+ unsigned i, j, k;
+ u64 v, *p = kmalloc(256 * 2 * 8, GFP_KERNEL);
+
+ for (i = 0; i < 256; i++) {
+ for (j = 0; j < 256; j++) {
+ memset(p, 0xa1, 256 * 2 * sizeof(v));
+ memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j);
+ for (k = 0; k < 512; k++) {
+ v = p[k];
+ if (k < i) {
+ if (v != 0xa1a1a1a1a1a1a1a1ULL)
+ goto fail;
+ } else if (k < i + j) {
+ if (v != 0xb1b2b3b4b5b6b7b8ULL)
+ goto fail;
+ } else {
+ if (v != 0xa1a1a1a1a1a1a1a1ULL)
+ goto fail;
+ }
+ }
+ }
+ }
+
+fail:
+ kfree(p);
+ if (i < 256)
+ return (i << 24) | (j << 16) | k;
+ return 0;
+ return 0;
+}
+
+static __init int string_selftest_init(void)
+{
+ int test, subtest;
+
+ test = 1;
+ subtest = memset16_selftest();
+ if (subtest)
+ goto fail;
+
+ test = 2;
+ subtest = memset32_selftest();
+ if (subtest)
+ goto fail;
+
+ test = 3;
+ subtest = memset64_selftest();
+ if (subtest)
+ goto fail;
+
+ pr_info("String selftests succeeded\n");
+ return 0;
+fail:
+ pr_crit("String selftest failure %d.%08x\n", test, subtest);
+ return 0;
+}
+
+module_init(string_selftest_init);
+#endif
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 3/8] x86: Implement memset16, memset32 & memset64
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 1/8] Add multibyte memset functions Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 2/8] Add testcases for memset16/32/64 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 4/8] ARM: Implement " Matthew Wilcox
` (5 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
These are single instructions on x86. There's no 64-bit instruction
for x86-32, but we don't yet have any user for memset64() on 32-bit
architectures, so don't bother to implement it.
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
arch/x86/include/asm/string_32.h | 24 ++++++++++++++++++++++++
arch/x86/include/asm/string_64.h | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 60 insertions(+)
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index e9ee84873de5..e371e7229042 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -340,6 +340,30 @@ extern void *memset(void *, int, size_t);
#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
/*
* find the first occurrence of byte 'c', or 1 past the area if none
*/
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2a8c822de1fc..f372a70a523f 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -58,6 +58,42 @@ extern void *__memcpy(void *to, const void *from, size_t len);
void *memset(void *s, int c, size_t n);
void *__memset(void *s, int c, size_t n);
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET64
+static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosq"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count);
void *__memmove(void *dest, const void *src, size_t count);
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 4/8] ARM: Implement memset32 & memset64
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
` (2 preceding siblings ...)
2017-07-20 18:45 ` [PATCH v4 3/8] x86: Implement memset16, memset32 & memset64 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 5/8] alpha: Add support for memset16 Matthew Wilcox
` (4 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
Reuse the existing optimised memset implementation to implement an
optimised memset32 and memset64.
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
---
arch/arm/include/asm/string.h | 14 ++++++++++++++
arch/arm/kernel/armksyms.c | 2 ++
arch/arm/lib/memset.S | 24 ++++++++++++++++++------
3 files changed, 34 insertions(+), 6 deletions(-)
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index cf4f3aad0fc1..fe1c6af3a1b1 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -24,6 +24,20 @@ extern void * memchr(const void *, int, __kernel_size_t);
#define __HAVE_ARCH_MEMSET
extern void * memset(void *, int, __kernel_size_t);
+#define __HAVE_ARCH_MEMSET32
+extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
+static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n)
+{
+ return __memset32(p, v, n * 4);
+}
+
+#define __HAVE_ARCH_MEMSET64
+extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi);
+static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
+{
+ return __memset64(p, v, n * 8, v >> 32);
+}
+
extern void __memzero(void *ptr, __kernel_size_t n);
#define memset(p,v,n) \
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 8e8d20cdbce7..5266fd9ad6b4 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -87,6 +87,8 @@ EXPORT_SYMBOL(__raw_writesl);
EXPORT_SYMBOL(strchr);
EXPORT_SYMBOL(strrchr);
EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(__memset32);
+EXPORT_SYMBOL(__memset64);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(memchr);
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 3c65e3bd790f..ed6d35d9cdb5 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -28,7 +28,7 @@ UNWIND( .fnstart )
1: orr r1, r1, r1, lsl #8
orr r1, r1, r1, lsl #16
mov r3, r1
- cmp r2, #16
+7: cmp r2, #16
blt 4f
#if ! CALGN(1)+0
@@ -41,7 +41,7 @@ UNWIND( .fnend )
UNWIND( .fnstart )
UNWIND( .save {r8, lr} )
mov r8, r1
- mov lr, r1
+ mov lr, r3
2: subs r2, r2, #64
stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
@@ -73,11 +73,11 @@ UNWIND( .fnend )
UNWIND( .fnstart )
UNWIND( .save {r4-r8, lr} )
mov r4, r1
- mov r5, r1
+ mov r5, r3
mov r6, r1
- mov r7, r1
+ mov r7, r3
mov r8, r1
- mov lr, r1
+ mov lr, r3
cmp r2, #96
tstgt ip, #31
@@ -114,7 +114,7 @@ UNWIND( .fnstart )
tst r2, #4
strne r1, [ip], #4
/*
- * When we get here, we've got less than 4 bytes to zero. We
+ * When we get here, we've got less than 4 bytes to set. We
* may have an unaligned pointer as well.
*/
5: tst r2, #2
@@ -135,3 +135,15 @@ UNWIND( .fnstart )
UNWIND( .fnend )
ENDPROC(memset)
ENDPROC(mmioset)
+
+ENTRY(__memset32)
+UNWIND( .fnstart )
+ mov r3, r1 @ copy r1 to r3 and fall into memset64
+UNWIND( .fnend )
+ENDPROC(__memset32)
+ENTRY(__memset64)
+UNWIND( .fnstart )
+ mov ip, r0 @ preserve r0 as return value
+ b 7b @ jump into the middle of memset
+UNWIND( .fnend )
+ENDPROC(__memset64)
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 5/8] alpha: Add support for memset16
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
` (3 preceding siblings ...)
2017-07-20 18:45 ` [PATCH v4 4/8] ARM: Implement " Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 6/8] zram: Convert to using memset_l Matthew Wilcox
` (3 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
Alpha already had an optimised fill-memory-with-16-bit-quantity assembler
routine called memsetw(). It has a slightly different calling convention
from memset16() in that it takes a byte count, not a count of words.
That's the same convention used by ARM's __memset routines, so rename
Alpha's routine to match and add a memset16() wrapper around it. Then
convert Alpha's scr_memsetw() to call memset16() instead of memsetw().
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
arch/alpha/include/asm/string.h | 15 ++++++++-------
arch/alpha/include/asm/vga.h | 2 +-
arch/alpha/lib/memset.S | 10 +++++-----
3 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/arch/alpha/include/asm/string.h b/arch/alpha/include/asm/string.h
index c2911f591704..9eb9933d845f 100644
--- a/arch/alpha/include/asm/string.h
+++ b/arch/alpha/include/asm/string.h
@@ -65,13 +65,14 @@ extern void * memchr(const void *, int, size_t);
aligned values. The DEST and COUNT parameters must be even for
correct operation. */
-#define __HAVE_ARCH_MEMSETW
-extern void * __memsetw(void *dest, unsigned short, size_t count);
-
-#define memsetw(s, c, n) \
-(__builtin_constant_p(c) \
- ? __constant_c_memset((s),0x0001000100010001UL*(unsigned short)(c),(n)) \
- : __memsetw((s),(c),(n)))
+#define __HAVE_ARCH_MEMSET16
+extern void * __memset16(void *dest, unsigned short, size_t count);
+static inline void *memset16(uint16_t *p, uint16_t v, size_t n)
+{
+ if (__builtin_constant_p(v))
+ return __constant_c_memset(p, 0x0001000100010001UL * v, n * 2);
+ return __memset16(p, v, n * 2);
+}
#endif /* __KERNEL__ */
diff --git a/arch/alpha/include/asm/vga.h b/arch/alpha/include/asm/vga.h
index c00106bac521..3c1c2b6128e7 100644
--- a/arch/alpha/include/asm/vga.h
+++ b/arch/alpha/include/asm/vga.h
@@ -34,7 +34,7 @@ static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
if (__is_ioaddr(s))
memsetw_io((u16 __iomem *) s, c, count);
else
- memsetw(s, c, count);
+ memset16(s, c, count / 2);
}
/* Do not trust that the usage will be correct; analyze the arguments. */
diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S
index 89a26f5e89de..f824969e9e77 100644
--- a/arch/alpha/lib/memset.S
+++ b/arch/alpha/lib/memset.S
@@ -20,7 +20,7 @@
.globl memset
.globl __memset
.globl ___memset
- .globl __memsetw
+ .globl __memset16
.globl __constant_c_memset
.ent ___memset
@@ -110,8 +110,8 @@ EXPORT_SYMBOL(___memset)
EXPORT_SYMBOL(__constant_c_memset)
.align 5
- .ent __memsetw
-__memsetw:
+ .ent __memset16
+__memset16:
.prologue 0
inswl $17,0,$1 /* E0 */
@@ -123,8 +123,8 @@ __memsetw:
or $1,$4,$17 /* E0 */
br __constant_c_memset /* .. E1 */
- .end __memsetw
-EXPORT_SYMBOL(__memsetw)
+ .end __memset16
+EXPORT_SYMBOL(__memset16)
memset = ___memset
__memset = ___memset
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 6/8] zram: Convert to using memset_l
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
` (4 preceding siblings ...)
2017-07-20 18:45 ` [PATCH v4 5/8] alpha: Add support for memset16 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 7/8] sym53c8xx_2: Convert to use memset32 Matthew Wilcox
` (2 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
zram was the motivation for creating memset_l(). Minchan Kim sees a 7%
performance improvement on x86 with 100MB of non-zero deduplicatable
data:
perf stat -r 10 dd if=/dev/zram0 of=/dev/null
vanilla: 0.232050465 seconds time elapsed ( +- 0.51% )
memset_l: 0.217219387 seconds time elapsed ( +- 0.07% )
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Tested-by: Minchan Kim <minchan@kernel.org>
---
drivers/block/zram/zram_drv.c | 13 ++-----------
1 file changed, 2 insertions(+), 11 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 856d5dc02451..2df50d82dc29 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -175,20 +175,11 @@ static inline void update_used_max(struct zram *zram,
} while (old_max != cur_max);
}
-static inline void zram_fill_page(char *ptr, unsigned long len,
+static inline void zram_fill_page(void *ptr, unsigned long len,
unsigned long value)
{
- int i;
- unsigned long *page = (unsigned long *)ptr;
-
WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
-
- if (likely(value == 0)) {
- memset(ptr, 0, len);
- } else {
- for (i = 0; i < len / sizeof(*page); i++)
- page[i] = value;
- }
+ memset_l(ptr, value, len / sizeof(unsigned long));
}
static bool page_same_filled(void *ptr, unsigned long *element)
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 7/8] sym53c8xx_2: Convert to use memset32
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
` (5 preceding siblings ...)
2017-07-20 18:45 ` [PATCH v4 6/8] zram: Convert to using memset_l Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-20 18:45 ` [PATCH v4 8/8] vga: Optimise console scrolling Matthew Wilcox
2017-07-25 5:27 ` [PATCH v4 0/8] Multibyte memset variations Michael Ellerman
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
memset32() can be used to initialise these three arrays. Minor code
footprint reduction.
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
drivers/scsi/sym53c8xx_2/sym_hipd.c | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c
index 6b349e301869..b886b10e3499 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.c
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c
@@ -4985,13 +4985,10 @@ struct sym_lcb *sym_alloc_lcb (struct sym_hcb *np, u_char tn, u_char ln)
* Compute the bus address of this table.
*/
if (ln && !tp->luntbl) {
- int i;
-
tp->luntbl = sym_calloc_dma(256, "LUNTBL");
if (!tp->luntbl)
goto fail;
- for (i = 0 ; i < 64 ; i++)
- tp->luntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa));
+ memset32(tp->luntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64);
tp->head.luntbl_sa = cpu_to_scr(vtobus(tp->luntbl));
}
@@ -5077,8 +5074,7 @@ static void sym_alloc_lcb_tags (struct sym_hcb *np, u_char tn, u_char ln)
/*
* Initialize the task table with invalid entries.
*/
- for (i = 0 ; i < SYM_CONF_MAX_TASK ; i++)
- lp->itlq_tbl[i] = cpu_to_scr(np->notask_ba);
+ memset32(lp->itlq_tbl, cpu_to_scr(np->notask_ba), SYM_CONF_MAX_TASK);
/*
* Fill up the tag buffer with tag numbers.
@@ -5764,8 +5760,7 @@ int sym_hcb_attach(struct Scsi_Host *shost, struct sym_fw *fw, struct sym_nvram
goto attach_failed;
np->badlun_sa = cpu_to_scr(SCRIPTB_BA(np, resel_bad_lun));
- for (i = 0 ; i < 64 ; i++) /* 64 luns/target, no less */
- np->badluntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa));
+ memset32(np->badluntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64);
/*
* Prepare the bus address array that contains the bus
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 8/8] vga: Optimise console scrolling
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
` (6 preceding siblings ...)
2017-07-20 18:45 ` [PATCH v4 7/8] sym53c8xx_2: Convert to use memset32 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
2017-07-25 5:27 ` [PATCH v4 0/8] Multibyte memset variations Michael Ellerman
8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC (permalink / raw)
To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe
From: Matthew Wilcox <mawilcox@microsoft.com>
Where possible, call memset16(), memmove() or memcpy() instead of using
open-coded loops. I don't like the calling convention that uses a byte
count instead of a count of u16s, but it's a little late to change that.
Reduces code size of fbcon.o by almost 400 bytes on my laptop build.
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
arch/mips/include/asm/vga.h | 7 +++++++
arch/powerpc/include/asm/vga.h | 8 ++++++++
arch/sparc/include/asm/vga.h | 25 +++++++++++++++++++++++++
include/linux/vt_buffer.h | 12 ++++++++++++
4 files changed, 52 insertions(+)
diff --git a/arch/mips/include/asm/vga.h b/arch/mips/include/asm/vga.h
index f82c83749a08..975ff51f80c4 100644
--- a/arch/mips/include/asm/vga.h
+++ b/arch/mips/include/asm/vga.h
@@ -6,6 +6,7 @@
#ifndef _ASM_VGA_H
#define _ASM_VGA_H
+#include <linux/string.h>
#include <asm/addrspace.h>
#include <asm/byteorder.h>
@@ -40,9 +41,15 @@ static inline u16 scr_readw(volatile const u16 *addr)
return le16_to_cpu(*addr);
}
+static inline void scr_memsetw(u16 *s, u16 v, unsigned int count)
+{
+ memset16(s, cpu_to_le16(v), count / 2);
+}
+
#define scr_memcpyw(d, s, c) memcpy(d, s, c)
#define scr_memmovew(d, s, c) memmove(d, s, c)
#define VT_BUF_HAVE_MEMCPYW
#define VT_BUF_HAVE_MEMMOVEW
+#define VT_BUF_HAVE_MEMSETW
#endif /* _ASM_VGA_H */
diff --git a/arch/powerpc/include/asm/vga.h b/arch/powerpc/include/asm/vga.h
index ab3acd2f2786..7a7b541b7493 100644
--- a/arch/powerpc/include/asm/vga.h
+++ b/arch/powerpc/include/asm/vga.h
@@ -33,8 +33,16 @@ static inline u16 scr_readw(volatile const u16 *addr)
return le16_to_cpu(*addr);
}
+#define VT_BUF_HAVE_MEMSETW
+static inline void scr_memsetw(u16 *s, u16 v, unsigned int n)
+{
+ memset16(s, cpu_to_le16(v), n / 2);
+}
+
#define VT_BUF_HAVE_MEMCPYW
+#define VT_BUF_HAVE_MEMMOVEW
#define scr_memcpyw memcpy
+#define scr_memmovew memmove
#endif /* !CONFIG_VGA_CONSOLE && !CONFIG_MDA_CONSOLE */
diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h
index ec0e9967d93d..f54e8b6fb197 100644
--- a/arch/sparc/include/asm/vga.h
+++ b/arch/sparc/include/asm/vga.h
@@ -8,9 +8,13 @@
#define _LINUX_ASM_VGA_H_
#include <linux/bug.h>
+#include <linux/string.h>
#include <asm/types.h>
#define VT_BUF_HAVE_RW
+#define VT_BUF_HAVE_MEMSETW
+#define VT_BUF_HAVE_MEMCPYW
+#define VT_BUF_HAVE_MEMMOVEW
#undef scr_writew
#undef scr_readw
@@ -29,6 +33,27 @@ static inline u16 scr_readw(const u16 *addr)
return *addr;
}
+static inline void scr_memsetw(u16 *p, u16 v, unsigned int n)
+{
+ BUG_ON((long) p >= 0);
+
+ memset16(p, cpu_to_le16(v), n / 2);
+}
+
+static inline void scr_memcpyw(u16 *d, u16 *s, unsigned int n)
+{
+ BUG_ON((long) d >= 0);
+
+ memcpy(d, s, n);
+}
+
+static inline void scr_memmovew(u16 *d, u16 *s, unsigned int n)
+{
+ BUG_ON((long) d >= 0);
+
+ memmove(d, s, n);
+}
+
#define VGA_MAP_MEM(x,s) (x)
#endif
diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index f38c10ba3ff5..31b92fcd8f03 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -26,24 +26,33 @@
#ifndef VT_BUF_HAVE_MEMSETW
static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
{
+#ifdef VT_BUF_HAVE_RW
count /= 2;
while (count--)
scr_writew(c, s++);
+#else
+ memset16(s, c, count / 2);
+#endif
}
#endif
#ifndef VT_BUF_HAVE_MEMCPYW
static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count)
{
+#ifdef VT_BUF_HAVE_RW
count /= 2;
while (count--)
scr_writew(scr_readw(s++), d++);
+#else
+ memcpy(d, s, count);
+#endif
}
#endif
#ifndef VT_BUF_HAVE_MEMMOVEW
static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count)
{
+#ifdef VT_BUF_HAVE_RW
if (d < s)
scr_memcpyw(d, s, count);
else {
@@ -53,6 +62,9 @@ static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count)
while (count--)
scr_writew(scr_readw(--s), --d);
}
+#else
+ memmove(d, s, count);
+#endif
}
#endif
--
2.13.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v4 0/8] Multibyte memset variations
2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
` (7 preceding siblings ...)
2017-07-20 18:45 ` [PATCH v4 8/8] vga: Optimise console scrolling Matthew Wilcox
@ 2017-07-25 5:27 ` Michael Ellerman
2017-07-25 13:08 ` Matthew Wilcox
8 siblings, 1 reply; 11+ messages in thread
From: Michael Ellerman @ 2017-07-25 5:27 UTC (permalink / raw)
To: Matthew Wilcox, linux-kernel; +Cc: minchan, Matthew Wilcox, akpm
Matthew Wilcox <willy@infradead.org> writes:
> From: Matthew Wilcox <mawilcox@microsoft.com>
>
> A relatively common idiom we're missing is a function to fill an area
> of memory with a pattern which is larger than a single byte. I first
> noticed this with a zram patch which wanted to fill a page with an
> 'unsigned long' value. There turn out to be quite a few places in
> the kernel which can benefit from using an optimised function rather
> than a loop; sometimes text size, sometimes speed, and sometimes both.
> The optimised PowerPC version (not included here) improves performance
> by about 30% on POWER8 on just the raw memset_l().
Is the plan that Andrew will merge this series, or are you planning to
put them in a tree of yours?
cheers
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v4 0/8] Multibyte memset variations
2017-07-25 5:27 ` [PATCH v4 0/8] Multibyte memset variations Michael Ellerman
@ 2017-07-25 13:08 ` Matthew Wilcox
0 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-25 13:08 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linux-kernel, minchan, Matthew Wilcox, akpm
On Tue, Jul 25, 2017 at 03:27:38PM +1000, Michael Ellerman wrote:
> Matthew Wilcox <willy@infradead.org> writes:
>
> > From: Matthew Wilcox <mawilcox@microsoft.com>
> >
> > A relatively common idiom we're missing is a function to fill an area
> > of memory with a pattern which is larger than a single byte. I first
> > noticed this with a zram patch which wanted to fill a page with an
> > 'unsigned long' value. There turn out to be quite a few places in
> > the kernel which can benefit from using an optimised function rather
> > than a loop; sometimes text size, sometimes speed, and sometimes both.
> > The optimised PowerPC version (not included here) improves performance
> > by about 30% on POWER8 on just the raw memset_l().
>
> Is the plan that Andrew will merge this series, or are you planning to
> put them in a tree of yours?
I'm hoping Andrew will take it, but I can put it in my own tree if he
doesn't want to take it.
^ permalink raw reply [flat|nested] 11+ messages in thread