* [PATCH v4 0/8] Multibyte memset variations
@ 2017-07-20 18:45 Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 1/8] Add multibyte memset functions Matthew Wilcox
                   ` (8 more replies)
  0 siblings, 9 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

A relatively common idiom we're missing is a function to fill an area
of memory with a pattern which is larger than a single byte.  I first
noticed this with a zram patch which wanted to fill a page with an
'unsigned long' value.  There turn out to be quite a few places in
the kernel which can benefit from using an optimised function rather
than a loop; sometimes text size, sometimes speed, and sometimes both.
The optimised PowerPC version (not included here) improves performance
by about 30% on POWER8 on just the raw memset_l().
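
As a rough illustration of the idiom (my sketch, not part of the series;
"page" and "val" are invented names), the conversions turn loops like this:

	/* before: open-coded fill of a page with an unsigned long */
	unsigned long *p = page_address(page);
	size_t i;

	for (i = 0; i < PAGE_SIZE / sizeof(*p); i++)
		p[i] = val;

into a single call to the new helper:

	/* after: the count is in longs, not bytes */
	memset_l(page_address(page), val, PAGE_SIZE / sizeof(unsigned long));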

Most of the extra lines of code come from the three testcases I added.

Matthew Wilcox (8):
  Add multibyte memset functions
  Add testcases for memset16/32/64
  x86: Implement memset16, memset32 & memset64
  ARM: Implement memset32 & memset64
  alpha: Add support for memset16
  zram: Convert to using memset_l
  sym53c8xx_2: Convert to use memset32
  vga: Optimise console scrolling

 arch/alpha/include/asm/string.h     |  15 +--
 arch/alpha/include/asm/vga.h        |   2 +-
 arch/alpha/lib/memset.S             |  10 +-
 arch/arm/include/asm/string.h       |  14 +++
 arch/arm/kernel/armksyms.c          |   2 +
 arch/arm/lib/memset.S               |  24 +++--
 arch/mips/include/asm/vga.h         |   7 ++
 arch/powerpc/include/asm/vga.h      |   8 ++
 arch/sparc/include/asm/vga.h        |  25 +++++
 arch/x86/include/asm/string_32.h    |  24 +++++
 arch/x86/include/asm/string_64.h    |  36 +++++++
 drivers/block/zram/zram_drv.c       |  13 +--
 drivers/scsi/sym53c8xx_2/sym_hipd.c |  11 +-
 include/linux/string.h              |  30 ++++++
 include/linux/vt_buffer.h           |  12 +++
 lib/Kconfig                         |   3 +
 lib/string.c                        | 196 ++++++++++++++++++++++++++++++++++++
 17 files changed, 394 insertions(+), 38 deletions(-)

-- 
2.13.2

* [PATCH v4 1/8] Add multibyte memset functions
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 2/8] Add testcases for memset16/32/64 Matthew Wilcox
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

memset16(), memset32() and memset64() are like memset(), but allow the
caller to fill the destination with a value larger than a single byte.
memset_l() and memset_p() allow the caller to use unsigned long and
pointer values respectively.
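
An illustrative usage sketch (not from the patch itself; the buffer names
and fill values are invented) -- note that the count is in elements, not
bytes:

	u16 line[80];
	void *table[64];

	/* fill 80 u16s (160 bytes) with the same 16-bit pattern */
	memset16(line, 0x0720, ARRAY_SIZE(line));

	/* fill 64 pointer slots with the same sentinel pointer */
	memset_p(table, ERR_PTR(-ENODEV), ARRAY_SIZE(table));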

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/string.h | 30 +++++++++++++++++++++++
 lib/string.c           | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/include/linux/string.h b/include/linux/string.h
index a467e617eeb0..c8bdafffd2f0 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -99,6 +99,36 @@ extern __kernel_size_t strcspn(const char *,const char *);
 #ifndef __HAVE_ARCH_MEMSET
 extern void * memset(void *,int,__kernel_size_t);
 #endif
+
+#ifndef __HAVE_ARCH_MEMSET16
+extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET32
+extern void *memset32(uint32_t *, uint32_t, __kernel_size_t);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET64
+extern void *memset64(uint64_t *, uint64_t, __kernel_size_t);
+#endif
+
+static inline void *memset_l(unsigned long *p, unsigned long v,
+		__kernel_size_t n)
+{
+	if (BITS_PER_LONG == 32)
+		return memset32((uint32_t *)p, v, n);
+	else
+		return memset64((uint64_t *)p, v, n);
+}
+
+static inline void *memset_p(void **p, void *v, __kernel_size_t n)
+{
+	if (BITS_PER_LONG == 32)
+		return memset32((uint32_t *)p, (uintptr_t)v, n);
+	else
+		return memset64((uint64_t *)p, (uintptr_t)v, n);
+}
+
 #ifndef __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *,const void *,__kernel_size_t);
 #endif
diff --git a/lib/string.c b/lib/string.c
index ebbb99c775bd..198148bb61fd 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -723,6 +723,72 @@ void memzero_explicit(void *s, size_t count)
 }
 EXPORT_SYMBOL(memzero_explicit);
 
+#ifndef __HAVE_ARCH_MEMSET16
+/**
+ * memset16() - Fill a memory area with a uint16_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint16_t instead
+ * of a byte.  Remember that @count is the number of uint16_ts to
+ * store, not the number of bytes.
+ */
+void *memset16(uint16_t *s, uint16_t v, size_t count)
+{
+	uint16_t *xs = s;
+
+	while (count--)
+		*xs++ = v;
+	return s;
+}
+EXPORT_SYMBOL(memset16);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET32
+/**
+ * memset32() - Fill a memory area with a uint32_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint32_t instead
+ * of a byte.  Remember that @count is the number of uint32_ts to
+ * store, not the number of bytes.
+ */
+void *memset32(uint32_t *s, uint32_t v, size_t count)
+{
+	uint32_t *xs = s;
+
+	while (count--)
+		*xs++ = v;
+	return s;
+}
+EXPORT_SYMBOL(memset32);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET64
+/**
+ * memset64() - Fill a memory area with a uint64_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint64_t instead
+ * of a byte.  Remember that @count is the number of uint64_ts to
+ * store, not the number of bytes.
+ */
+void *memset64(uint64_t *s, uint64_t v, size_t count)
+{
+	uint64_t *xs = s;
+
+	while (count--)
+		*xs++ = v;
+	return s;
+}
+EXPORT_SYMBOL(memset64);
+#endif
+
 #ifndef __HAVE_ARCH_MEMCPY
 /**
  * memcpy - Copy one area of memory to another
-- 
2.13.2

* [PATCH v4 2/8] Add testcases for memset16/32/64
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 1/8] Add multibyte memset functions Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 3/8] x86: Implement memset16, memset32 & memset64 Matthew Wilcox
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 lib/Kconfig  |   3 ++
 lib/string.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)

diff --git a/lib/Kconfig b/lib/Kconfig
index 6762529ad9e4..40b114a11d7c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -575,4 +575,7 @@ config PARMAN
 config PRIME_NUMBERS
 	tristate
 
+config STRING_SELFTEST
+	bool "Test string functions"
+
 endmenu
diff --git a/lib/string.c b/lib/string.c
index 198148bb61fd..5af2211c3633 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -1051,3 +1051,133 @@ void fortify_panic(const char *name)
 	BUG();
 }
 EXPORT_SYMBOL(fortify_panic);
+
+#ifdef CONFIG_STRING_SELFTEST
+#include <linux/slab.h>
+#include <linux/module.h>
+
+static __init int memset16_selftest(void)
+{
+	unsigned i, j, k;
+	u16 v, *p = kmalloc(256 * 2 * 2, GFP_KERNEL);
+
+	for (i = 0; i < 256; i++) {
+		for (j = 0; j < 256; j++) {
+			memset(p, 0xa1, 256 * 2 * sizeof(v));
+			memset16(p + i, 0xb1b2, j);
+			for (k = 0; k < 512; k++) {
+				v = p[k];
+				if (k < i) {
+					if (v != 0xa1a1)
+						goto fail;
+				} else if (k < i + j) {
+					if (v != 0xb1b2)
+						goto fail;
+				} else {
+					if (v != 0xa1a1)
+						goto fail;
+				}
+			}
+		}
+	}
+
+fail:
+	kfree(p);
+	if (i < 256)
+		return (i << 24) | (j << 16) | k;
+	return 0;
+}
+
+static __init int memset32_selftest(void)
+{
+	unsigned i, j, k;
+	u32 v, *p = kmalloc(256 * 2 * 4, GFP_KERNEL);
+
+	for (i = 0; i < 256; i++) {
+		for (j = 0; j < 256; j++) {
+			memset(p, 0xa1, 256 * 2 * sizeof(v));
+			memset32(p + i, 0xb1b2b3b4, j);
+			for (k = 0; k < 512; k++) {
+				v = p[k];
+				if (k < i) {
+					if (v != 0xa1a1a1a1)
+						goto fail;
+				} else if (k < i + j) {
+					if (v != 0xb1b2b3b4)
+						goto fail;
+				} else {
+					if (v != 0xa1a1a1a1)
+						goto fail;
+				}
+			}
+		}
+	}
+
+fail:
+	kfree(p);
+	if (i < 256)
+		return (i << 24) | (j << 16) | k;
+	return 0;
+}
+
+static __init int memset64_selftest(void)
+{
+	unsigned i, j, k;
+	u64 v, *p = kmalloc(256 * 2 * 8, GFP_KERNEL);
+
+	for (i = 0; i < 256; i++) {
+		for (j = 0; j < 256; j++) {
+			memset(p, 0xa1, 256 * 2 * sizeof(v));
+			memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j);
+			for (k = 0; k < 512; k++) {
+				v = p[k];
+				if (k < i) {
+					if (v != 0xa1a1a1a1a1a1a1a1ULL)
+						goto fail;
+				} else if (k < i + j) {
+					if (v != 0xb1b2b3b4b5b6b7b8ULL)
+						goto fail;
+				} else {
+					if (v != 0xa1a1a1a1a1a1a1a1ULL)
+						goto fail;
+				}
+			}
+		}
+	}
+
+fail:
+	kfree(p);
+	if (i < 256)
+		return (i << 24) | (j << 16) | k;
+	return 0;
+}
+
+static __init int string_selftest_init(void)
+{
+	int test, subtest;
+
+	test = 1;
+	subtest = memset16_selftest();
+	if (subtest)
+		goto fail;
+
+	test = 2;
+	subtest = memset32_selftest();
+	if (subtest)
+		goto fail;
+
+	test = 3;
+	subtest = memset64_selftest();
+	if (subtest)
+		goto fail;
+
+	pr_info("String selftests succeeded\n");
+	return 0;
+fail:
+	pr_crit("String selftest failure %d.%08x\n", test, subtest);
+	return 0;
+}
+
+module_init(string_selftest_init);
+#endif
-- 
2.13.2

* [PATCH v4 3/8] x86: Implement memset16, memset32 & memset64
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 1/8] Add multibyte memset functions Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 2/8] Add testcases for memset16/32/64 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 4/8] ARM: Implement " Matthew Wilcox
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

These are single instructions on x86 ('rep stosw', 'rep stosl' and
'rep stosq').  There's no 64-bit 'stosq' on x86-32, but we don't yet have
any users of memset64() on 32-bit architectures, so don't bother to
implement it there.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 arch/x86/include/asm/string_32.h | 24 ++++++++++++++++++++++++
 arch/x86/include/asm/string_64.h | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index e9ee84873de5..e371e7229042 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -340,6 +340,30 @@ extern void *memset(void *, int, size_t);
 #endif
 #endif /* !CONFIG_FORTIFY_SOURCE */
 
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+	int d0, d1;
+	asm volatile("rep\n\t"
+		     "stosw"
+		     : "=&c" (d0), "=&D" (d1)
+		     : "a" (v), "1" (s), "0" (n)
+		     : "memory");
+	return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+	int d0, d1;
+	asm volatile("rep\n\t"
+		     "stosl"
+		     : "=&c" (d0), "=&D" (d1)
+		     : "a" (v), "1" (s), "0" (n)
+		     : "memory");
+	return s;
+}
+
 /*
  * find the first occurrence of byte 'c', or 1 past the area if none
  */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2a8c822de1fc..f372a70a523f 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -58,6 +58,42 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 void *memset(void *s, int c, size_t n);
 void *__memset(void *s, int c, size_t n);
 
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+	long d0, d1;
+	asm volatile("rep\n\t"
+		     "stosw"
+		     : "=&c" (d0), "=&D" (d1)
+		     : "a" (v), "1" (s), "0" (n)
+		     : "memory");
+	return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+	long d0, d1;
+	asm volatile("rep\n\t"
+		     "stosl"
+		     : "=&c" (d0), "=&D" (d1)
+		     : "a" (v), "1" (s), "0" (n)
+		     : "memory");
+	return s;
+}
+
+#define __HAVE_ARCH_MEMSET64
+static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
+{
+	long d0, d1;
+	asm volatile("rep\n\t"
+		     "stosq"
+		     : "=&c" (d0), "=&D" (d1)
+		     : "a" (v), "1" (s), "0" (n)
+		     : "memory");
+	return s;
+}
+
 #define __HAVE_ARCH_MEMMOVE
 void *memmove(void *dest, const void *src, size_t count);
 void *__memmove(void *dest, const void *src, size_t count);
-- 
2.13.2

* [PATCH v4 4/8] ARM: Implement memset32 & memset64
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
                   ` (2 preceding siblings ...)
  2017-07-20 18:45 ` [PATCH v4 3/8] x86: Implement memset16, memset32 & memset64 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 5/8] alpha: Add support for memset16 Matthew Wilcox
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

Reuse the existing optimised memset implementation to provide optimised
memset32() and memset64() as well.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/include/asm/string.h | 14 ++++++++++++++
 arch/arm/kernel/armksyms.c    |  2 ++
 arch/arm/lib/memset.S         | 24 ++++++++++++++++++------
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index cf4f3aad0fc1..fe1c6af3a1b1 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -24,6 +24,20 @@ extern void * memchr(const void *, int, __kernel_size_t);
 #define __HAVE_ARCH_MEMSET
 extern void * memset(void *, int, __kernel_size_t);
 
+#define __HAVE_ARCH_MEMSET32
+extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
+static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n)
+{
+	return __memset32(p, v, n * 4);
+}
+
+#define __HAVE_ARCH_MEMSET64
+extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi);
+static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
+{
+	return __memset64(p, v, n * 8, v >> 32);
+}
+
 extern void __memzero(void *ptr, __kernel_size_t n);
 
 #define memset(p,v,n)							\
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 8e8d20cdbce7..5266fd9ad6b4 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -87,6 +87,8 @@ EXPORT_SYMBOL(__raw_writesl);
 EXPORT_SYMBOL(strchr);
 EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(__memset32);
+EXPORT_SYMBOL(__memset64);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memchr);
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 3c65e3bd790f..ed6d35d9cdb5 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -28,7 +28,7 @@ UNWIND( .fnstart         )
 1:	orr	r1, r1, r1, lsl #8
 	orr	r1, r1, r1, lsl #16
 	mov	r3, r1
-	cmp	r2, #16
+7:	cmp	r2, #16
 	blt	4f
 
 #if ! CALGN(1)+0
@@ -41,7 +41,7 @@ UNWIND( .fnend              )
 UNWIND( .fnstart            )
 UNWIND( .save {r8, lr}      )
 	mov	r8, r1
-	mov	lr, r1
+	mov	lr, r3
 
 2:	subs	r2, r2, #64
 	stmgeia	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
@@ -73,11 +73,11 @@ UNWIND( .fnend                 )
 UNWIND( .fnstart               )
 UNWIND( .save {r4-r8, lr}      )
 	mov	r4, r1
-	mov	r5, r1
+	mov	r5, r3
 	mov	r6, r1
-	mov	r7, r1
+	mov	r7, r3
 	mov	r8, r1
-	mov	lr, r1
+	mov	lr, r3
 
 	cmp	r2, #96
 	tstgt	ip, #31
@@ -114,7 +114,7 @@ UNWIND( .fnstart            )
 	tst	r2, #4
 	strne	r1, [ip], #4
 /*
- * When we get here, we've got less than 4 bytes to zero.  We
+ * When we get here, we've got less than 4 bytes to set.  We
  * may have an unaligned pointer as well.
  */
 5:	tst	r2, #2
@@ -135,3 +135,15 @@ UNWIND( .fnstart            )
 UNWIND( .fnend   )
 ENDPROC(memset)
 ENDPROC(mmioset)
+
+ENTRY(__memset32)
+UNWIND( .fnstart         )
+	mov	r3, r1			@ copy r1 to r3 and fall into memset64
+UNWIND( .fnend   )
+ENDPROC(__memset32)
+ENTRY(__memset64)
+UNWIND( .fnstart         )
+	mov	ip, r0			@ preserve r0 as return value
+	b	7b			@ jump into the middle of memset
+UNWIND( .fnend   )
+ENDPROC(__memset64)
-- 
2.13.2

* [PATCH v4 5/8] alpha: Add support for memset16
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
                   ` (3 preceding siblings ...)
  2017-07-20 18:45 ` [PATCH v4 4/8] ARM: Implement " Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 6/8] zram: Convert to using memset_l Matthew Wilcox
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

Alpha already had an optimised fill-memory-with-16-bit-quantity assembler
routine called memsetw().  It has a slightly different calling convention
from memset16() in that it takes a byte count, not a count of words.
That's the same convention used by ARM's __memset routines, so rename
Alpha's routine to match and add a memset16() wrapper around it.  Then
convert Alpha's scr_memsetw() to call memset16() instead of memsetw().

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 arch/alpha/include/asm/string.h | 15 ++++++++-------
 arch/alpha/include/asm/vga.h    |  2 +-
 arch/alpha/lib/memset.S         | 10 +++++-----
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/alpha/include/asm/string.h b/arch/alpha/include/asm/string.h
index c2911f591704..9eb9933d845f 100644
--- a/arch/alpha/include/asm/string.h
+++ b/arch/alpha/include/asm/string.h
@@ -65,13 +65,14 @@ extern void * memchr(const void *, int, size_t);
    aligned values.  The DEST and COUNT parameters must be even for 
    correct operation.  */
 
-#define __HAVE_ARCH_MEMSETW
-extern void * __memsetw(void *dest, unsigned short, size_t count);
-
-#define memsetw(s, c, n)						 \
-(__builtin_constant_p(c)						 \
- ? __constant_c_memset((s),0x0001000100010001UL*(unsigned short)(c),(n)) \
- : __memsetw((s),(c),(n)))
+#define __HAVE_ARCH_MEMSET16
+extern void * __memset16(void *dest, unsigned short, size_t count);
+static inline void *memset16(uint16_t *p, uint16_t v, size_t n)
+{
+	if (__builtin_constant_p(v))
+		return __constant_c_memset(p, 0x0001000100010001UL * v, n * 2);
+	return __memset16(p, v, n * 2);
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/alpha/include/asm/vga.h b/arch/alpha/include/asm/vga.h
index c00106bac521..3c1c2b6128e7 100644
--- a/arch/alpha/include/asm/vga.h
+++ b/arch/alpha/include/asm/vga.h
@@ -34,7 +34,7 @@ static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
 	if (__is_ioaddr(s))
 		memsetw_io((u16 __iomem *) s, c, count);
 	else
-		memsetw(s, c, count);
+		memset16(s, c, count / 2);
 }
 
 /* Do not trust that the usage will be correct; analyze the arguments.  */
diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S
index 89a26f5e89de..f824969e9e77 100644
--- a/arch/alpha/lib/memset.S
+++ b/arch/alpha/lib/memset.S
@@ -20,7 +20,7 @@
 	.globl memset
 	.globl __memset
 	.globl ___memset
-	.globl __memsetw
+	.globl __memset16
 	.globl __constant_c_memset
 
 	.ent ___memset
@@ -110,8 +110,8 @@ EXPORT_SYMBOL(___memset)
 EXPORT_SYMBOL(__constant_c_memset)
 
 	.align 5
-	.ent __memsetw
-__memsetw:
+	.ent __memset16
+__memset16:
 	.prologue 0
 
 	inswl $17,0,$1		/* E0 */
@@ -123,8 +123,8 @@ __memsetw:
 	or $1,$4,$17		/* E0 */
 	br __constant_c_memset	/* .. E1 */
 
-	.end __memsetw
-EXPORT_SYMBOL(__memsetw)
+	.end __memset16
+EXPORT_SYMBOL(__memset16)
 
 memset = ___memset
 __memset = ___memset
-- 
2.13.2

* [PATCH v4 6/8] zram: Convert to using memset_l
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
                   ` (4 preceding siblings ...)
  2017-07-20 18:45 ` [PATCH v4 5/8] alpha: Add support for memset16 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 7/8] sym53c8xx_2: Convert to use memset32 Matthew Wilcox
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

zram was the motivation for creating memset_l().  Minchan Kim sees a 7%
performance improvement on x86 with 100MB of non-zero deduplicatable
data:

        perf stat -r 10 dd if=/dev/zram0 of=/dev/null

vanilla:        0.232050465 seconds time elapsed ( +-  0.51% )
memset_l:	0.217219387 seconds time elapsed ( +-  0.07% )

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Tested-by: Minchan Kim <minchan@kernel.org>
---
 drivers/block/zram/zram_drv.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 856d5dc02451..2df50d82dc29 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -175,20 +175,11 @@ static inline void update_used_max(struct zram *zram,
 	} while (old_max != cur_max);
 }
 
-static inline void zram_fill_page(char *ptr, unsigned long len,
+static inline void zram_fill_page(void *ptr, unsigned long len,
 					unsigned long value)
 {
-	int i;
-	unsigned long *page = (unsigned long *)ptr;
-
 	WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
-
-	if (likely(value == 0)) {
-		memset(ptr, 0, len);
-	} else {
-		for (i = 0; i < len / sizeof(*page); i++)
-			page[i] = value;
-	}
+	memset_l(ptr, value, len / sizeof(unsigned long));
 }
 
 static bool page_same_filled(void *ptr, unsigned long *element)
-- 
2.13.2

* [PATCH v4 7/8] sym53c8xx_2: Convert to use memset32
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
                   ` (5 preceding siblings ...)
  2017-07-20 18:45 ` [PATCH v4 6/8] zram: Convert to using memset_l Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-20 18:45 ` [PATCH v4 8/8] vga: Optimise console scrolling Matthew Wilcox
  2017-07-25  5:27 ` [PATCH v4 0/8] Multibyte memset variations Michael Ellerman
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

memset32() can be used to initialise these three arrays.  Minor code
footprint reduction.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 drivers/scsi/sym53c8xx_2/sym_hipd.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c
index 6b349e301869..b886b10e3499 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.c
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c
@@ -4985,13 +4985,10 @@ struct sym_lcb *sym_alloc_lcb (struct sym_hcb *np, u_char tn, u_char ln)
 	 *  Compute the bus address of this table.
 	 */
 	if (ln && !tp->luntbl) {
-		int i;
-
 		tp->luntbl = sym_calloc_dma(256, "LUNTBL");
 		if (!tp->luntbl)
 			goto fail;
-		for (i = 0 ; i < 64 ; i++)
-			tp->luntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa));
+		memset32(tp->luntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64);
 		tp->head.luntbl_sa = cpu_to_scr(vtobus(tp->luntbl));
 	}
 
@@ -5077,8 +5074,7 @@ static void sym_alloc_lcb_tags (struct sym_hcb *np, u_char tn, u_char ln)
 	/*
 	 *  Initialize the task table with invalid entries.
 	 */
-	for (i = 0 ; i < SYM_CONF_MAX_TASK ; i++)
-		lp->itlq_tbl[i] = cpu_to_scr(np->notask_ba);
+	memset32(lp->itlq_tbl, cpu_to_scr(np->notask_ba), SYM_CONF_MAX_TASK);
 
 	/*
 	 *  Fill up the tag buffer with tag numbers.
@@ -5764,8 +5760,7 @@ int sym_hcb_attach(struct Scsi_Host *shost, struct sym_fw *fw, struct sym_nvram
 		goto attach_failed;
 
 	np->badlun_sa = cpu_to_scr(SCRIPTB_BA(np, resel_bad_lun));
-	for (i = 0 ; i < 64 ; i++)	/* 64 luns/target, no less */
-		np->badluntbl[i] = cpu_to_scr(vtobus(&np->badlun_sa));
+	memset32(np->badluntbl, cpu_to_scr(vtobus(&np->badlun_sa)), 64);
 
 	/*
 	 *  Prepare the bus address array that contains the bus 
-- 
2.13.2

* [PATCH v4 8/8] vga: Optimise console scrolling
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
                   ` (6 preceding siblings ...)
  2017-07-20 18:45 ` [PATCH v4 7/8] sym53c8xx_2: Convert to use memset32 Matthew Wilcox
@ 2017-07-20 18:45 ` Matthew Wilcox
  2017-07-25  5:27 ` [PATCH v4 0/8] Multibyte memset variations Michael Ellerman
  8 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-20 18:45 UTC
  To: linux-kernel; +Cc: minchan, Matthew Wilcox, akpm, mpe

From: Matthew Wilcox <mawilcox@microsoft.com>

Where possible, call memset16(), memmove() or memcpy() instead of using
open-coded loops.  I don't like the calling convention that uses a byte
count instead of a count of u16s, but it's a little late to change that.
Reduces code size of fbcon.o by almost 400 bytes on my laptop build.
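
To make the byte-count convention concrete, here is a small sketch (mine,
with invented names) of what a caller looks like versus what the generic
wrapper does with it:

	u16 buf[80];			/* one line of character cells */
	u16 erase = 0x0720;		/* illustrative erase character */

	/* scr_memsetw() takes a byte count: 80 cells = 160 bytes */
	scr_memsetw(buf, erase, sizeof(buf));

	/* ...which the generic scr_memsetw() turns into a count of u16s */
	memset16(buf, erase, sizeof(buf) / 2);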

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 arch/mips/include/asm/vga.h    |  7 +++++++
 arch/powerpc/include/asm/vga.h |  8 ++++++++
 arch/sparc/include/asm/vga.h   | 25 +++++++++++++++++++++++++
 include/linux/vt_buffer.h      | 12 ++++++++++++
 4 files changed, 52 insertions(+)

diff --git a/arch/mips/include/asm/vga.h b/arch/mips/include/asm/vga.h
index f82c83749a08..975ff51f80c4 100644
--- a/arch/mips/include/asm/vga.h
+++ b/arch/mips/include/asm/vga.h
@@ -6,6 +6,7 @@
 #ifndef _ASM_VGA_H
 #define _ASM_VGA_H
 
+#include <linux/string.h>
 #include <asm/addrspace.h>
 #include <asm/byteorder.h>
 
@@ -40,9 +41,15 @@ static inline u16 scr_readw(volatile const u16 *addr)
 	return le16_to_cpu(*addr);
 }
 
+static inline void scr_memsetw(u16 *s, u16 v, unsigned int count)
+{
+	memset16(s, cpu_to_le16(v), count / 2);
+}
+
 #define scr_memcpyw(d, s, c) memcpy(d, s, c)
 #define scr_memmovew(d, s, c) memmove(d, s, c)
 #define VT_BUF_HAVE_MEMCPYW
 #define VT_BUF_HAVE_MEMMOVEW
+#define VT_BUF_HAVE_MEMSETW
 
 #endif /* _ASM_VGA_H */
diff --git a/arch/powerpc/include/asm/vga.h b/arch/powerpc/include/asm/vga.h
index ab3acd2f2786..7a7b541b7493 100644
--- a/arch/powerpc/include/asm/vga.h
+++ b/arch/powerpc/include/asm/vga.h
@@ -33,8 +33,16 @@ static inline u16 scr_readw(volatile const u16 *addr)
 	return le16_to_cpu(*addr);
 }
 
+#define VT_BUF_HAVE_MEMSETW
+static inline void scr_memsetw(u16 *s, u16 v, unsigned int n)
+{
+	memset16(s, cpu_to_le16(v), n / 2);
+}
+
 #define VT_BUF_HAVE_MEMCPYW
+#define VT_BUF_HAVE_MEMMOVEW
 #define scr_memcpyw	memcpy
+#define scr_memmovew	memmove
 
 #endif /* !CONFIG_VGA_CONSOLE && !CONFIG_MDA_CONSOLE */
 
diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h
index ec0e9967d93d..f54e8b6fb197 100644
--- a/arch/sparc/include/asm/vga.h
+++ b/arch/sparc/include/asm/vga.h
@@ -8,9 +8,13 @@
 #define _LINUX_ASM_VGA_H_
 
 #include <linux/bug.h>
+#include <linux/string.h>
 #include <asm/types.h>
 
 #define VT_BUF_HAVE_RW
+#define VT_BUF_HAVE_MEMSETW
+#define VT_BUF_HAVE_MEMCPYW
+#define VT_BUF_HAVE_MEMMOVEW
 
 #undef scr_writew
 #undef scr_readw
@@ -29,6 +33,27 @@ static inline u16 scr_readw(const u16 *addr)
 	return *addr;
 }
 
+static inline void scr_memsetw(u16 *p, u16 v, unsigned int n)
+{
+	BUG_ON((long) p >= 0);
+
+	memset16(p, cpu_to_le16(v), n / 2);
+}
+
+static inline void scr_memcpyw(u16 *d, u16 *s, unsigned int n)
+{
+	BUG_ON((long) d >= 0);
+
+	memcpy(d, s, n);
+}
+
+static inline void scr_memmovew(u16 *d, u16 *s, unsigned int n)
+{
+	BUG_ON((long) d >= 0);
+
+	memmove(d, s, n);
+}
+
 #define VGA_MAP_MEM(x,s) (x)
 
 #endif
diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h
index f38c10ba3ff5..31b92fcd8f03 100644
--- a/include/linux/vt_buffer.h
+++ b/include/linux/vt_buffer.h
@@ -26,24 +26,33 @@
 #ifndef VT_BUF_HAVE_MEMSETW
 static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
 {
+#ifdef VT_BUF_HAVE_RW
 	count /= 2;
 	while (count--)
 		scr_writew(c, s++);
+#else
+	memset16(s, c, count / 2);
+#endif
 }
 #endif
 
 #ifndef VT_BUF_HAVE_MEMCPYW
 static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count)
 {
+#ifdef VT_BUF_HAVE_RW
 	count /= 2;
 	while (count--)
 		scr_writew(scr_readw(s++), d++);
+#else
+	memcpy(d, s, count);
+#endif
 }
 #endif
 
 #ifndef VT_BUF_HAVE_MEMMOVEW
 static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count)
 {
+#ifdef VT_BUF_HAVE_RW
 	if (d < s)
 		scr_memcpyw(d, s, count);
 	else {
@@ -53,6 +62,9 @@ static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count)
 		while (count--)
 			scr_writew(scr_readw(--s), --d);
 	}
+#else
+	memmove(d, s, count);
+#endif
 }
 #endif
 
-- 
2.13.2

* Re: [PATCH v4 0/8] Multibyte memset variations
  2017-07-20 18:45 [PATCH v4 0/8] Multibyte memset variations Matthew Wilcox
                   ` (7 preceding siblings ...)
  2017-07-20 18:45 ` [PATCH v4 8/8] vga: Optimise console scrolling Matthew Wilcox
@ 2017-07-25  5:27 ` Michael Ellerman
  2017-07-25 13:08   ` Matthew Wilcox
  8 siblings, 1 reply; 11+ messages in thread
From: Michael Ellerman @ 2017-07-25  5:27 UTC
  To: Matthew Wilcox, linux-kernel; +Cc: minchan, Matthew Wilcox, akpm

Matthew Wilcox <willy@infradead.org> writes:

> From: Matthew Wilcox <mawilcox@microsoft.com>
>
> A relatively common idiom we're missing is a function to fill an area
> of memory with a pattern which is larger than a single byte.  I first
> noticed this with a zram patch which wanted to fill a page with an
> 'unsigned long' value.  There turn out to be quite a few places in
> the kernel which can benefit from using an optimised function rather
> than a loop; sometimes text size, sometimes speed, and sometimes both.
> The optimised PowerPC version (not included here) improves performance
> by about 30% on POWER8 on just the raw memset_l().

Is the plan that Andrew will merge this series, or are you planning to
put them in a tree of yours?

cheers

* Re: [PATCH v4 0/8] Multibyte memset variations
  2017-07-25  5:27 ` [PATCH v4 0/8] Multibyte memset variations Michael Ellerman
@ 2017-07-25 13:08   ` Matthew Wilcox
  0 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2017-07-25 13:08 UTC
  To: Michael Ellerman; +Cc: linux-kernel, minchan, Matthew Wilcox, akpm

On Tue, Jul 25, 2017 at 03:27:38PM +1000, Michael Ellerman wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> 
> > From: Matthew Wilcox <mawilcox@microsoft.com>
> >
> > A relatively common idiom we're missing is a function to fill an area
> > of memory with a pattern which is larger than a single byte.  I first
> > noticed this with a zram patch which wanted to fill a page with an
> > 'unsigned long' value.  There turn out to be quite a few places in
> > the kernel which can benefit from using an optimised function rather
> > than a loop; sometimes text size, sometimes speed, and sometimes both.
> > The optimised PowerPC version (not included here) improves performance
> > by about 30% on POWER8 on just the raw memset_l().
> 
> Is the plan that Andrew will merge this series, or are you planning to
> put them in a tree of yours?

I'm hoping Andrew will take it, but I can put it in my own tree if he
doesn't want to take it.
