* [PATCH] x86_64: new and improved memset()
@ 2019-09-14 10:33 Alexey Dobriyan
  2019-09-14 11:37 ` Borislav Petkov
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Alexey Dobriyan @ 2019-09-14 10:33 UTC (permalink / raw)
  To: tglx, mingo, bp, hpa; +Cc: linux-kernel, x86, linux, torvalds

Current memset() implementation does several silly things:
* multiplication to get a register-wide constant:
	a waste of cycles if the filler is known at compile time
	(see the sketch after this list),

* REP STOSQ followed by REP STOSB:
	the REP STOSB setup overhead is very high relative to the
	trailing length, which is always short (< 8 bytes),

* suboptimal calling convention:
	REP STOSB/STOSQ favours (rdi, rcx), the ABI gives (rdi, rsi, rdx).
	While shuffling registers is essentially free, rcx and rdx are
	equivalent code-generation-wise.

* memset_orig():
	memset(..., 0, ...) can be done within 3 registers,
	memset(..., != 0, ...) -- within 4 registers; anything else is
	a waste. CPUs that required unrolling are hopefully gone by now.
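
For illustration, the register-wide filler in question is built like this
(a minimal C sketch; "widen_fill_byte" is a made-up name, not part of the
patch):

	#include <stdint.h>

	/*
	 * Replicate the fill byte into all 8 bytes of a 64-bit value:
	 * 0xab * 0x0101010101010101 == 0xabababababababab.
	 * When "c" is a compile-time constant the compiler folds this
	 * away; the current memset() pays for the multiply at runtime
	 * on every call.
	 */
	static inline uint64_t widen_fill_byte(int c)
	{
		return (uint8_t)c * 0x0101010101010101ULL;
	}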

The new implementation is based on the following observations:
* c == 0 is the most common form:
	the filler can be generated with "xor eax, eax" and pushed into
	memset() itself, saving 2 bytes per call site plus the
	multiplication,

* "len" divisible by 8 is the most common form:
	one pointer or unsigned long inside a structure is all it takes,
	so dispatch at compile time to code without those ugly "let's
	fill at most 7 bytes" tails (see the usage sketch below),

* the multiplication widening the filler value can be done at compile
  time for "c != 0", costing at most 1 insn/10 bytes and saving the
  runtime multiplication.

Note: "memset0" name is chosen because "bzero" is officially deprecated.

Note: the memset(,0,) form is interleaved into the memset(,c,) form to save space.
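
A usage sketch of the compile-time dispatch (the structure and function
names are made up for illustration):

	#include <string.h>	/* in-kernel: <linux/string.h> */

	struct sample {
		void *ptr;
		unsigned long a, b;	/* sizeof == 24, (24 & 7) == 0 */
	};

	static void sample_init(struct sample *s)
	{
		/*
		 * "n" is a compile-time constant divisible by 8, so this
		 * dispatches to the tail-free _memset0_* variants;
		 * constant sizes 0/1/2/4/6/8 collapse to plain stores
		 * (or nothing at all).
		 */
		memset(s, 0, sizeof(*s));
	}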

TODO:
	* CONFIG_FORTIFY_SOURCE is enabled by distros
	* inline "xor eax, eax; rep stosb" (sketched below)
	* benchmarks
	* testing
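
A hedged sketch of how the "inline rep stosb" TODO item might look (my
assumption, not code from this patch; inline asm is AT&T syntax as usual
for kernel C):

	static __always_inline void *memset0_inline(void *s, size_t n)
	{
		void *d = s;

		asm volatile("xor %%eax, %%eax\n\t"
			     "rep stosb"
			     : "+D" (d), "+c" (n)
			     : /* no plain inputs */
			     : "rax", "cc", "memory");
		return s;
	}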

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 arch/x86/boot/compressed/Makefile     |    1 
 arch/x86/include/asm/string_64.h      |  104 ++++++++++++++++++++++++++++++++++
 arch/x86/lib/Makefile                 |    1 
 arch/x86/lib/memset0_64.S             |   86 ++++++++++++++++++++++++++++
 drivers/firmware/efi/libstub/Makefile |    2 
 5 files changed, 193 insertions(+), 1 deletion(-)

--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -38,6 +38,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
 KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
 KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
 KBUILD_CFLAGS += -Wno-pointer-sign
+KBUILD_CFLAGS += -D_ARCH_X86_BOOT
 
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -15,7 +15,111 @@ extern void *memcpy(void *to, const void *from, size_t len);
 extern void *__memcpy(void *to, const void *from, size_t len);
 
 #define __HAVE_ARCH_MEMSET
+#if defined(_ARCH_X86_BOOT) || defined(CONFIG_FORTIFY_SOURCE)
 void *memset(void *s, int c, size_t n);
+#else
+#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
+
+/* Internal, do not use. */
+static __always_inline void memset0(void *s, size_t n)
+{
+	/* Internal, do not use. */
+	void _memset0_mov(void);
+	void _memset0_rep_stosq(void);
+	void memset0_mov(void);
+	void memset0_rep_stosq(void);
+	void memset0_rep_stosb(void);
+
+	if (__builtin_constant_p(n) && n == 0) {
+	} else if (__builtin_constant_p(n) && n == 1) {
+		*(uint8_t *)s = 0;
+	} else if (__builtin_constant_p(n) && n == 2) {
+		*(uint16_t *)s = 0;
+	} else if (__builtin_constant_p(n) && n == 4) {
+		*(uint32_t *)s = 0;
+	} else if (__builtin_constant_p(n) && n == 6) {
+		*(uint32_t *)s = 0;
+		*(uint16_t *)(s + 4) = 0;
+	} else if (__builtin_constant_p(n) && n == 8) {
+		*(uint64_t *)s = 0;
+	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
+		alternative_call_2(
+			_memset0_mov,
+			_memset0_rep_stosq, X86_FEATURE_REP_GOOD,
+			memset0_rep_stosb, X86_FEATURE_ERMS,
+			ASM_OUTPUT2("=D" (s), "=c" (n)),
+			"D" (s), "c" (n)
+			: "rax", "cc", "memory"
+		);
+	} else {
+		alternative_call_2(
+			memset0_mov,
+			memset0_rep_stosq, X86_FEATURE_REP_GOOD,
+			memset0_rep_stosb, X86_FEATURE_ERMS,
+			ASM_OUTPUT2("=D" (s), "=c" (n)),
+			"D" (s), "c" (n)
+			: "rax", "rsi", "cc", "memory"
+		);
+	}
+}
+
+/* Internal, do not use. */
+static __always_inline void memsetx(void *s, int c, size_t n)
+{
+	/* Internal, do not use. */
+	void _memsetx_mov(void);
+	void _memsetx_rep_stosq(void);
+	void memsetx_mov(void);
+	void memsetx_rep_stosq(void);
+	void memsetx_rep_stosb(void);
+
+	const uint64_t ccc = (uint8_t)c * 0x0101010101010101ULL;
+
+	if (__builtin_constant_p(n) && n == 0) {
+	} else if (__builtin_constant_p(n) && n == 1) {
+		*(uint8_t *)s = ccc;
+	} else if (__builtin_constant_p(n) && n == 2) {
+		*(uint16_t *)s = ccc;
+	} else if (__builtin_constant_p(n) && n == 4) {
+		*(uint32_t *)s = ccc;
+	} else if (__builtin_constant_p(n) && n == 8) {
+		*(uint64_t *)s = ccc;
+	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
+		alternative_call_2(
+			_memsetx_mov,
+			_memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
+			memsetx_rep_stosb, X86_FEATURE_ERMS,
+			ASM_OUTPUT2("=D" (s), "=c" (n)),
+			"D" (s), "c" (n), "a" (ccc)
+			: "cc", "memory"
+		);
+	} else {
+		alternative_call_2(
+			memsetx_mov,
+			memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
+			memsetx_rep_stosb, X86_FEATURE_ERMS,
+			ASM_OUTPUT2("=D" (s), "=c" (n)),
+			"D" (s), "c" (n), "a" (ccc)
+			: "rsi", "cc", "memory"
+		);
+	}
+}
+
+static __always_inline void *memset(void *s, int c, size_t n)
+{
+	if (__builtin_constant_p(c)) {
+		if (c == 0) {
+			memset0(s, n);
+		} else {
+			memsetx(s, c, n);
+		}
+		return s;
+	} else {
+		return __builtin_memset(s, c, n);
+	}
+}
+#endif
 void *__memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMSET16
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -58,6 +58,7 @@ else
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
+	lib-y += memset0_64.o
         lib-y += copy_user_64.o
 	lib-y += cmpxchg16b_emu.o
 endif
new file mode 100644
--- /dev/null
+++ b/arch/x86/lib/memset0_64.S
@@ -0,0 +1,86 @@
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+.intel_syntax noprefix
+
+ENTRY(memset0_rep_stosb)
+	xor	eax, eax
+.globl memsetx_rep_stosb
+memsetx_rep_stosb:
+	rep stosb
+	ret
+ENDPROC(memset0_rep_stosb)
+ENDPROC(memsetx_rep_stosb)
+EXPORT_SYMBOL(memset0_rep_stosb)
+EXPORT_SYMBOL(memsetx_rep_stosb)
+
+ENTRY(_memset0_rep_stosq)
+	xor	eax, eax
+.globl _memsetx_rep_stosq
+_memsetx_rep_stosq:
+	shr	rcx, 3
+	rep stosq
+	ret
+ENDPROC(_memset0_rep_stosq)
+ENDPROC(_memsetx_rep_stosq)
+EXPORT_SYMBOL(_memset0_rep_stosq)
+EXPORT_SYMBOL(_memsetx_rep_stosq)
+
+ENTRY(memset0_rep_stosq)
+	xor	eax, eax
+.globl memsetx_rep_stosq
+memsetx_rep_stosq:
+	lea	rsi, [rdi + rcx]
+	shr	rcx, 3
+	rep stosq
+	cmp	rdi, rsi
+	je	1f
+2:
+	mov	[rdi], al
+	add	rdi, 1
+	cmp	rdi, rsi
+	jne	2b
+1:
+	ret
+ENDPROC(memset0_rep_stosq)
+ENDPROC(memsetx_rep_stosq)
+EXPORT_SYMBOL(memset0_rep_stosq)
+EXPORT_SYMBOL(memsetx_rep_stosq)
+
+ENTRY(_memset0_mov)
+	xor	eax, eax
+.globl _memsetx_mov
+_memsetx_mov:
+	add	rcx, rdi
+	cmp	rdi, rcx
+	je	1f
+2:
+	mov	[rdi], rax
+	add	rdi, 8
+	cmp	rdi, rcx
+	jne	2b
+1:
+	ret
+ENDPROC(_memset0_mov)
+ENDPROC(_memsetx_mov)
+EXPORT_SYMBOL(_memset0_mov)
+EXPORT_SYMBOL(_memsetx_mov)
+
+ENTRY(memset0_mov)
+	xor	eax, eax
+.globl memsetx_mov
+memsetx_mov:
+	lea	rsi, [rdi + rcx]
+	cmp	rdi, rsi
+	je	1f
+2:
+	mov	[rdi], al
+	add	rdi, 1
+	cmp	rdi, rsi
+	jne	2b
+1:
+	ret
+ENDPROC(memset0_mov)
+ENDPROC(memsetx_mov)
+EXPORT_SYMBOL(memset0_mov)
+EXPORT_SYMBOL(memsetx_mov)
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@ KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
 				   -D__NO_FORTIFY \
 				   $(call cc-option,-ffreestanding) \
 				   $(call cc-option,-fno-stack-protector) \
-				   -D__DISABLE_EXPORTS
+				   -D__DISABLE_EXPORTS -D_ARCH_X86_BOOT
 
 GCOV_PROFILE			:= n
 KASAN_SANITIZE			:= n

* Re: [PATCH] x86_64: new and improved memset()
  2019-09-14 10:33 [PATCH] x86_64: new and improved memset() Alexey Dobriyan
@ 2019-09-14 11:37 ` Borislav Petkov
  2019-09-14 15:15   ` Alexey Dobriyan
  2019-09-16  7:54 ` kbuild test robot
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 6+ messages in thread
From: Borislav Petkov @ 2019-09-14 11:37 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: tglx, mingo, hpa, linux-kernel, x86, linux, torvalds

On Sat, Sep 14, 2019 at 01:33:45PM +0300, Alexey Dobriyan wrote:
> --- a/arch/x86/include/asm/string_64.h
> +++ b/arch/x86/include/asm/string_64.h
> @@ -15,7 +15,111 @@ extern void *memcpy(void *to, const void *from, size_t len);
>  extern void *__memcpy(void *to, const void *from, size_t len);
>  
>  #define __HAVE_ARCH_MEMSET
> +#if defined(_ARCH_X86_BOOT) || defined(CONFIG_FORTIFY_SOURCE)
>  void *memset(void *s, int c, size_t n);
> +#else
> +#include <asm/alternative.h>
> +#include <asm/cpufeatures.h>
> +
> +/* Internal, do not use. */
> +static __always_inline void memset0(void *s, size_t n)
> +{
> +	/* Internal, do not use. */
> +	void _memset0_mov(void);
> +	void _memset0_rep_stosq(void);
> +	void memset0_mov(void);
> +	void memset0_rep_stosq(void);
> +	void memset0_rep_stosb(void);
> +
> +	if (__builtin_constant_p(n) && n == 0) {
> +	} else if (__builtin_constant_p(n) && n == 1) {
> +		*(uint8_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && n == 2) {
> +		*(uint16_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && n == 4) {
> +		*(uint32_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && n == 6) {
> +		*(uint32_t *)s = 0;
> +		*(uint16_t *)(s + 4) = 0;
> +	} else if (__builtin_constant_p(n) && n == 8) {
> +		*(uint64_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
> +		alternative_call_2(
> +			_memset0_mov,
> +			_memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memset0_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n)
> +			: "rax", "cc", "memory"
> +		);
> +	} else {
> +		alternative_call_2(
> +			memset0_mov,
> +			memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memset0_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n)
> +			: "rax", "rsi", "cc", "memory"
> +		);
> +	}
> +}
> +
> +/* Internal, do not use. */
> +static __always_inline void memsetx(void *s, int c, size_t n)
> +{
> +	/* Internal, do not use. */
> +	void _memsetx_mov(void);
> +	void _memsetx_rep_stosq(void);
> +	void memsetx_mov(void);
> +	void memsetx_rep_stosq(void);
> +	void memsetx_rep_stosb(void);
> +
> +	const uint64_t ccc = (uint8_t)c * 0x0101010101010101ULL;
> +
> +	if (__builtin_constant_p(n) && n == 0) {
> +	} else if (__builtin_constant_p(n) && n == 1) {
> +		*(uint8_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && n == 2) {
> +		*(uint16_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && n == 4) {
> +		*(uint32_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && n == 8) {
> +		*(uint64_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
> +		alternative_call_2(
> +			_memsetx_mov,
> +			_memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memsetx_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n), "a" (ccc)
> +			: "cc", "memory"
> +		);
> +	} else {
> +		alternative_call_2(
> +			memsetx_mov,
> +			memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memsetx_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n), "a" (ccc)
> +			: "rsi", "cc", "memory"
> +		);
> +	}
> +}
> +
> +static __always_inline void *memset(void *s, int c, size_t n)
> +{
> +	if (__builtin_constant_p(c)) {
> +		if (c == 0) {
> +			memset0(s, n);
> +		} else {
> +			memsetx(s, c, n);
> +		}
> +		return s;
> +	} else {
> +		return __builtin_memset(s, c, n);
> +	}
> +}

I'm willing to take something like that only when such complexity is
justified by numbers. I.e., I'm much more inclined to cap it at 32- and
64-byte sizes and keep it simple.
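
A rough sketch of the capped approach being suggested, for concreteness
(illustrative only, not code from this thread; __memset is the existing
out-of-line version):

	/*
	 * Expand only small constant sizes inline and keep everything
	 * else out of line. The compiler typically turns the builtin
	 * into plain stores for small constant n.
	 */
	static __always_inline void *memset_capped(void *s, int c, size_t n)
	{
		if (__builtin_constant_p(n) && n <= 64)
			return __builtin_memset(s, c, n);
		return __memset(s, c, n);
	}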

...

> +ENTRY(_memset0_mov)
> +	xor	eax, eax
> +.globl _memsetx_mov
> +_memsetx_mov:
> +	add	rcx, rdi
> +	cmp	rdi, rcx
> +	je	1f
> +2:
> +	mov	[rdi], rax
> +	add	rdi, 8
> +	cmp	rdi, rcx
> +	jne	2b
> +1:
> +	ret
> +ENDPROC(_memset0_mov)
> +ENDPROC(_memsetx_mov)
> +EXPORT_SYMBOL(_memset0_mov)
> +EXPORT_SYMBOL(_memsetx_mov)
> +
> +ENTRY(memset0_mov)
> +	xor	eax, eax
> +.globl memsetx_mov
> +memsetx_mov:
> +	lea	rsi, [rdi + rcx]
> +	cmp	rdi, rsi
> +	je	1f
> +2:
> +	mov	[rdi], al
> +	add	rdi, 1
> +	cmp	rdi, rsi
> +	jne	2b
> +1:
> +	ret

Say what now? Intel syntax? You must be joking...

> +ENDPROC(memset0_mov)
> +ENDPROC(memsetx_mov)
> +EXPORT_SYMBOL(memset0_mov)
> +EXPORT_SYMBOL(memsetx_mov)

Too many exported symbols. Again, I'd much prefer a cleaner, smaller
solution over one where readability suffers greatly for the sake of
*maybe* getting a bit better performance.

> --- a/drivers/firmware/efi/libstub/Makefile
> +++ b/drivers/firmware/efi/libstub/Makefile
> @@ -28,7 +28,7 @@ KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>  				   -D__NO_FORTIFY \
>  				   $(call cc-option,-ffreestanding) \
>  				   $(call cc-option,-fno-stack-protector) \
> -				   -D__DISABLE_EXPORTS
> +				   -D__DISABLE_EXPORTS -D_ARCH_X86_BOOT

Yeah, something like that is inevitable, I've come to realize too. ;-\

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

* Re: [PATCH] x86_64: new and improved memset()
  2019-09-14 11:37 ` Borislav Petkov
@ 2019-09-14 15:15   ` Alexey Dobriyan
  0 siblings, 0 replies; 6+ messages in thread
From: Alexey Dobriyan @ 2019-09-14 15:15 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: tglx, mingo, hpa, linux-kernel, x86, linux, torvalds

On Sat, Sep 14, 2019 at 01:37:17PM +0200, Borislav Petkov wrote:
> On Sat, Sep 14, 2019 at 01:33:45PM +0300, Alexey Dobriyan wrote:
> > --- a/arch/x86/include/asm/string_64.h
> > +++ b/arch/x86/include/asm/string_64.h
> > @@ -15,7 +15,111 @@ extern void *memcpy(void *to, const void *from, size_t len);
> >  extern void *__memcpy(void *to, const void *from, size_t len);
> >  
> >  #define __HAVE_ARCH_MEMSET
> > +#if defined(_ARCH_X86_BOOT) || defined(CONFIG_FORTIFY_SOURCE)
> >  void *memset(void *s, int c, size_t n);
> > +#else
> > +#include <asm/alternative.h>
> > +#include <asm/cpufeatures.h>
> > +
> > +/* Internal, do not use. */
> > +static __always_inline void memset0(void *s, size_t n)
> > +{
> > +	/* Internal, do not use. */
> > +	void _memset0_mov(void);
> > +	void _memset0_rep_stosq(void);
> > +	void memset0_mov(void);
> > +	void memset0_rep_stosq(void);
> > +	void memset0_rep_stosb(void);
> > +
> > +	if (__builtin_constant_p(n) && n == 0) {
> > +	} else if (__builtin_constant_p(n) && n == 1) {
> > +		*(uint8_t *)s = 0;
> > +	} else if (__builtin_constant_p(n) && n == 2) {
> > +		*(uint16_t *)s = 0;
> > +	} else if (__builtin_constant_p(n) && n == 4) {
> > +		*(uint32_t *)s = 0;
> > +	} else if (__builtin_constant_p(n) && n == 6) {
> > +		*(uint32_t *)s = 0;
> > +		*(uint16_t *)(s + 4) = 0;
> > +	} else if (__builtin_constant_p(n) && n == 8) {
> > +		*(uint64_t *)s = 0;
> > +	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
> > +		alternative_call_2(
> > +			_memset0_mov,
> > +			_memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> > +			memset0_rep_stosb, X86_FEATURE_ERMS,
> > +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> > +			"D" (s), "c" (n)
> > +			: "rax", "cc", "memory"
> > +		);
> > +	} else {
> > +		alternative_call_2(
> > +			memset0_mov,
> > +			memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> > +			memset0_rep_stosb, X86_FEATURE_ERMS,
> > +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> > +			"D" (s), "c" (n)
> > +			: "rax", "rsi", "cc", "memory"
> > +		);
> > +	}
> > +}
> > +
> > +/* Internal, do not use. */
> > +static __always_inline void memsetx(void *s, int c, size_t n)
> > +{
> > +	/* Internal, do not use. */
> > +	void _memsetx_mov(void);
> > +	void _memsetx_rep_stosq(void);
> > +	void memsetx_mov(void);
> > +	void memsetx_rep_stosq(void);
> > +	void memsetx_rep_stosb(void);
> > +
> > +	const uint64_t ccc = (uint8_t)c * 0x0101010101010101ULL;
> > +
> > +	if (__builtin_constant_p(n) && n == 0) {
> > +	} else if (__builtin_constant_p(n) && n == 1) {
> > +		*(uint8_t *)s = ccc;
> > +	} else if (__builtin_constant_p(n) && n == 2) {
> > +		*(uint16_t *)s = ccc;
> > +	} else if (__builtin_constant_p(n) && n == 4) {
> > +		*(uint32_t *)s = ccc;
> > +	} else if (__builtin_constant_p(n) && n == 8) {
> > +		*(uint64_t *)s = ccc;
> > +	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
> > +		alternative_call_2(
> > +			_memsetx_mov,
> > +			_memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> > +			memsetx_rep_stosb, X86_FEATURE_ERMS,
> > +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> > +			"D" (s), "c" (n), "a" (ccc)
> > +			: "cc", "memory"
> > +		);
> > +	} else {
> > +		alternative_call_2(
> > +			memsetx_mov,
> > +			memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> > +			memsetx_rep_stosb, X86_FEATURE_ERMS,
> > +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> > +			"D" (s), "c" (n), "a" (ccc)
> > +			: "rsi", "cc", "memory"
> > +		);
> > +	}
> > +}
> > +
> > +static __always_inline void *memset(void *s, int c, size_t n)
> > +{
> > +	if (__builtin_constant_p(c)) {
> > +		if (c == 0) {
> > +			memset0(s, n);
> > +		} else {
> > +			memsetx(s, c, n);
> > +		}
> > +		return s;
> > +	} else {
> > +		return __builtin_memset(s, c, n);
> > +	}
> > +}
> 
> I'm willing to take something like that only when such complexity is
> justified by numbers. I.e., I'm much more inclined to cap it at 32- and
> 64-byte sizes and keep it simple.

OK. Those small lengths were indeed annoying.

> > +ENTRY(_memset0_mov)
> > +	xor	eax, eax
> > +.globl _memsetx_mov
> > +_memsetx_mov:
> > +	add	rcx, rdi
> > +	cmp	rdi, rcx
> > +	je	1f
> > +2:
> > +	mov	[rdi], rax
> > +	add	rdi, 8
> > +	cmp	rdi, rcx
> > +	jne	2b
> > +1:
> > +	ret
> > +ENDPROC(_memset0_mov)
> > +ENDPROC(_memsetx_mov)
> > +EXPORT_SYMBOL(_memset0_mov)
> > +EXPORT_SYMBOL(_memsetx_mov)
> > +
> > +ENTRY(memset0_mov)
> > +	xor	eax, eax
> > +.globl memsetx_mov
> > +memsetx_mov:
> > +	lea	rsi, [rdi + rcx]
> > +	cmp	rdi, rsi
> > +	je	1f
> > +2:
> > +	mov	[rdi], al
> > +	add	rdi, 1
> > +	cmp	rdi, rsi
> > +	jne	2b
> > +1:
> > +	ret
> 
> Say what now? Intel syntax? You must be joking...

It is the best thing in the x86 assembler universe.

> > +ENDPROC(memset0_mov)
> > +ENDPROC(memsetx_mov)
> > +EXPORT_SYMBOL(memset0_mov)
> > +EXPORT_SYMBOL(memsetx_mov)
> 
> Too many exported symbols.

Those are technical exports. memset() remains the only developer-visible
interface.

> Again, I'd much prefer a cleaner, smaller solution over one where
> readability suffers greatly for the sake of *maybe* getting a bit
> better performance.

Readability is a red herring; I, for one, find AT&T syntax unreadable.

* Re: [PATCH] x86_64: new and improved memset()
  2019-09-14 10:33 [PATCH] x86_64: new and improved memset() Alexey Dobriyan
  2019-09-14 11:37 ` Borislav Petkov
@ 2019-09-16  7:54 ` kbuild test robot
  2019-09-16  8:43 ` kbuild test robot
  2019-09-16 14:18 ` David Laight
  3 siblings, 0 replies; 6+ messages in thread
From: kbuild test robot @ 2019-09-16  7:54 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: kbuild-all, tglx, mingo, bp, hpa, linux-kernel, x86, linux, torvalds

Hi Alexey,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[cannot apply to v5.3 next-20190915]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Alexey-Dobriyan/x86_64-new-and-improved-memset/20190916-140315
config: x86_64-fedora-25 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-11) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   sound/core/pcm_iec958.c: In function 'create_iec958_consumer':
>> sound/core/pcm_iec958.c:77:9: warning: 'ws' may be used uninitialized in this function [-Wmaybe-uninitialized]
      cs[4] = ws;
      ~~~~~~^~~~

vim +/ws +77 sound/core/pcm_iec958.c

9203dd016a5d8f Russell King 2015-05-09  11  
4a4436573a6669 Jyri Sarha   2016-03-31  12  static int create_iec958_consumer(uint rate, uint sample_width,
4a4436573a6669 Jyri Sarha   2016-03-31  13  				  u8 *cs, size_t len)
9203dd016a5d8f Russell King 2015-05-09  14  {
9203dd016a5d8f Russell King 2015-05-09  15  	unsigned int fs, ws;
9203dd016a5d8f Russell King 2015-05-09  16  
9203dd016a5d8f Russell King 2015-05-09  17  	if (len < 4)
9203dd016a5d8f Russell King 2015-05-09  18  		return -EINVAL;
9203dd016a5d8f Russell King 2015-05-09  19  
4a4436573a6669 Jyri Sarha   2016-03-31  20  	switch (rate) {
9203dd016a5d8f Russell King 2015-05-09  21  	case 32000:
9203dd016a5d8f Russell King 2015-05-09  22  		fs = IEC958_AES3_CON_FS_32000;
9203dd016a5d8f Russell King 2015-05-09  23  		break;
9203dd016a5d8f Russell King 2015-05-09  24  	case 44100:
9203dd016a5d8f Russell King 2015-05-09  25  		fs = IEC958_AES3_CON_FS_44100;
9203dd016a5d8f Russell King 2015-05-09  26  		break;
9203dd016a5d8f Russell King 2015-05-09  27  	case 48000:
9203dd016a5d8f Russell King 2015-05-09  28  		fs = IEC958_AES3_CON_FS_48000;
9203dd016a5d8f Russell King 2015-05-09  29  		break;
9203dd016a5d8f Russell King 2015-05-09  30  	case 88200:
9203dd016a5d8f Russell King 2015-05-09  31  		fs = IEC958_AES3_CON_FS_88200;
9203dd016a5d8f Russell King 2015-05-09  32  		break;
9203dd016a5d8f Russell King 2015-05-09  33  	case 96000:
9203dd016a5d8f Russell King 2015-05-09  34  		fs = IEC958_AES3_CON_FS_96000;
9203dd016a5d8f Russell King 2015-05-09  35  		break;
9203dd016a5d8f Russell King 2015-05-09  36  	case 176400:
9203dd016a5d8f Russell King 2015-05-09  37  		fs = IEC958_AES3_CON_FS_176400;
9203dd016a5d8f Russell King 2015-05-09  38  		break;
9203dd016a5d8f Russell King 2015-05-09  39  	case 192000:
9203dd016a5d8f Russell King 2015-05-09  40  		fs = IEC958_AES3_CON_FS_192000;
9203dd016a5d8f Russell King 2015-05-09  41  		break;
9203dd016a5d8f Russell King 2015-05-09  42  	default:
9203dd016a5d8f Russell King 2015-05-09  43  		return -EINVAL;
9203dd016a5d8f Russell King 2015-05-09  44  	}
9203dd016a5d8f Russell King 2015-05-09  45  
9203dd016a5d8f Russell King 2015-05-09  46  	if (len > 4) {
4a4436573a6669 Jyri Sarha   2016-03-31  47  		switch (sample_width) {
9203dd016a5d8f Russell King 2015-05-09  48  		case 16:
9203dd016a5d8f Russell King 2015-05-09  49  			ws = IEC958_AES4_CON_WORDLEN_20_16;
9203dd016a5d8f Russell King 2015-05-09  50  			break;
9203dd016a5d8f Russell King 2015-05-09  51  		case 18:
9203dd016a5d8f Russell King 2015-05-09  52  			ws = IEC958_AES4_CON_WORDLEN_22_18;
9203dd016a5d8f Russell King 2015-05-09  53  			break;
9203dd016a5d8f Russell King 2015-05-09  54  		case 20:
9203dd016a5d8f Russell King 2015-05-09  55  			ws = IEC958_AES4_CON_WORDLEN_20_16 |
9203dd016a5d8f Russell King 2015-05-09  56  			     IEC958_AES4_CON_MAX_WORDLEN_24;
9203dd016a5d8f Russell King 2015-05-09  57  			break;
9203dd016a5d8f Russell King 2015-05-09  58  		case 24:
4a462ce084d5be Jyri Sarha   2016-03-31  59  		case 32: /* Assume 24-bit width for 32-bit samples. */
9203dd016a5d8f Russell King 2015-05-09  60  			ws = IEC958_AES4_CON_WORDLEN_24_20 |
9203dd016a5d8f Russell King 2015-05-09  61  			     IEC958_AES4_CON_MAX_WORDLEN_24;
9203dd016a5d8f Russell King 2015-05-09  62  			break;
9203dd016a5d8f Russell King 2015-05-09  63  
9203dd016a5d8f Russell King 2015-05-09  64  		default:
9203dd016a5d8f Russell King 2015-05-09  65  			return -EINVAL;
9203dd016a5d8f Russell King 2015-05-09  66  		}
9203dd016a5d8f Russell King 2015-05-09  67  	}
9203dd016a5d8f Russell King 2015-05-09  68  
9203dd016a5d8f Russell King 2015-05-09  69  	memset(cs, 0, len);
9203dd016a5d8f Russell King 2015-05-09  70  
9203dd016a5d8f Russell King 2015-05-09  71  	cs[0] = IEC958_AES0_CON_NOT_COPYRIGHT | IEC958_AES0_CON_EMPHASIS_NONE;
9203dd016a5d8f Russell King 2015-05-09  72  	cs[1] = IEC958_AES1_CON_GENERAL;
9203dd016a5d8f Russell King 2015-05-09  73  	cs[2] = IEC958_AES2_CON_SOURCE_UNSPEC | IEC958_AES2_CON_CHANNEL_UNSPEC;
9203dd016a5d8f Russell King 2015-05-09  74  	cs[3] = IEC958_AES3_CON_CLOCK_1000PPM | fs;
9203dd016a5d8f Russell King 2015-05-09  75  
9203dd016a5d8f Russell King 2015-05-09  76  	if (len > 4)
9203dd016a5d8f Russell King 2015-05-09 @77  		cs[4] = ws;
9203dd016a5d8f Russell King 2015-05-09  78  
9203dd016a5d8f Russell King 2015-05-09  79  	return len;
9203dd016a5d8f Russell King 2015-05-09  80  }
4a4436573a6669 Jyri Sarha   2016-03-31  81  

:::::: The code at line 77 was first introduced by commit
:::::: 9203dd016a5d8ffb2eb6acdca60cd0b5dfe38c2b ALSA: pcm: add IEC958 channel status helper

:::::: TO: Russell King <rmk+kernel@arm.linux.org.uk>
:::::: CC: Takashi Iwai <tiwai@suse.de>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation


* Re: [PATCH] x86_64: new and improved memset()
  2019-09-14 10:33 [PATCH] x86_64: new and improved memset() Alexey Dobriyan
  2019-09-14 11:37 ` Borislav Petkov
  2019-09-16  7:54 ` kbuild test robot
@ 2019-09-16  8:43 ` kbuild test robot
  2019-09-16 14:18 ` David Laight
  3 siblings, 0 replies; 6+ messages in thread
From: kbuild test robot @ 2019-09-16  8:43 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: kbuild-all, tglx, mingo, bp, hpa, linux-kernel, x86, linux, torvalds

Hi Alexey,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.3 next-20190915]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Alexey-Dobriyan/x86_64-new-and-improved-memset/20190916-140315
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-7 (Debian 7.4.0-11) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=um SUBARCH=x86_64

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   /usr/bin/ld: arch/um/kernel/mem.o: in function `pgd_alloc':
>> mem.c:(.text+0x4a): undefined reference to `memset0_mov'
   /usr/bin/ld: arch/um/kernel/mem.o: in function `pmd_alloc_one':
>> mem.c:(.text+0xd1): undefined reference to `_memset0_mov'
   /usr/bin/ld: arch/um/kernel/mem.o: in function `mem_init':
>> mem.c:(.init.text+0x20): undefined reference to `_memset0_mov'
>> /usr/bin/ld: arch/um/kernel/mem.o:(.altinstr_replacement+0x1): undefined reference to `_memset0_rep_stosq'
>> /usr/bin/ld: arch/um/kernel/mem.o:(.altinstr_replacement+0x6): undefined reference to `memset0_rep_stosb'
>> /usr/bin/ld: arch/um/kernel/mem.o:(.altinstr_replacement+0xb): undefined reference to `memset0_rep_stosq'
   /usr/bin/ld: arch/um/kernel/mem.o:(.altinstr_replacement+0x10): undefined reference to `memset0_rep_stosb'
   /usr/bin/ld: arch/um/kernel/mem.o:(.altinstr_replacement+0x15): undefined reference to `_memset0_rep_stosq'
   /usr/bin/ld: arch/um/kernel/mem.o:(.altinstr_replacement+0x1a): undefined reference to `memset0_rep_stosb'
   /usr/bin/ld: arch/um/kernel/process.o: in function `copy_from_user_proc':
>> process.c:(.text+0x665): undefined reference to `memset0_mov'
   /usr/bin/ld: arch/um/kernel/process.o: in function `sysemu_proc_write':
   process.c:(.text+0x76a): undefined reference to `memset0_mov'
>> /usr/bin/ld: arch/um/kernel/process.o:(.altinstr_replacement+0x1): undefined reference to `memset0_rep_stosq'
>> /usr/bin/ld: arch/um/kernel/process.o:(.altinstr_replacement+0x6): undefined reference to `memset0_rep_stosb'
   /usr/bin/ld: arch/um/kernel/process.o:(.altinstr_replacement+0xb): undefined reference to `memset0_rep_stosq'
   /usr/bin/ld: arch/um/kernel/process.o:(.altinstr_replacement+0x10): undefined reference to `memset0_rep_stosb'
   /usr/bin/ld: arch/um/drivers/net_kern.o: in function `eth_configure':
>> net_kern.c:(.text+0xbd0): undefined reference to `memset0_mov'
>> /usr/bin/ld: arch/um/drivers/net_kern.o:(.altinstr_replacement+0x1): undefined reference to `memset0_rep_stosq'
>> /usr/bin/ld: arch/um/drivers/net_kern.o:(.altinstr_replacement+0x6): undefined reference to `memset0_rep_stosb'
   /usr/bin/ld: arch/um/drivers/ubd_kern.o: in function `ubd_ioctl':
>> ubd_kern.c:(.text+0x984): undefined reference to `_memset0_mov'
>> /usr/bin/ld: ubd_kern.c:(.text+0xa5a): undefined reference to `memset0_mov'
   /usr/bin/ld: arch/um/drivers/ubd_kern.o: in function `io_thread':
>> ubd_kern.c:(.text+0x211f): undefined reference to `memset0_mov'
>> /usr/bin/ld: arch/um/drivers/ubd_kern.o:(.altinstr_replacement+0x1): undefined reference to `_memset0_rep_stosq'
>> /usr/bin/ld: arch/um/drivers/ubd_kern.o:(.altinstr_replacement+0x6): undefined reference to `memset0_rep_stosb'
>> /usr/bin/ld: arch/um/drivers/ubd_kern.o:(.altinstr_replacement+0xb): undefined reference to `memset0_rep_stosq'
   /usr/bin/ld: arch/um/drivers/ubd_kern.o:(.altinstr_replacement+0x10): undefined reference to `memset0_rep_stosb'
   /usr/bin/ld: arch/um/drivers/ubd_kern.o:(.altinstr_replacement+0x15): undefined reference to `memset0_rep_stosq'
   /usr/bin/ld: arch/um/drivers/ubd_kern.o:(.altinstr_replacement+0x1a): undefined reference to `memset0_rep_stosb'
   /usr/bin/ld: kernel/fork.o: in function `copy_clone_args_from_user':
>> fork.c:(.text+0x5c4): undefined reference to `memset0_mov'
   /usr/bin/ld: kernel/fork.o: in function `mm_init.isra.5':
>> fork.c:(.text+0x75c): undefined reference to `_memset0_mov'
   /usr/bin/ld: kernel/fork.o: in function `vm_area_alloc':
   fork.c:(.text+0xaa4): undefined reference to `_memset0_mov'
   /usr/bin/ld: kernel/fork.o: in function `mm_alloc':
   fork.c:(.text+0xb9f): undefined reference to `_memset0_mov'
   /usr/bin/ld: kernel/fork.o: in function `copy_process':
   fork.c:(.text+0x1e0c): undefined reference to `_memset0_mov'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation


* RE: [PATCH] x86_64: new and improved memset()
  2019-09-14 10:33 [PATCH] x86_64: new and improved memset() Alexey Dobriyan
                   ` (2 preceding siblings ...)
  2019-09-16  8:43 ` kbuild test robot
@ 2019-09-16 14:18 ` David Laight
  3 siblings, 0 replies; 6+ messages in thread
From: David Laight @ 2019-09-16 14:18 UTC (permalink / raw)
  To: 'Alexey Dobriyan', tglx, mingo, bp, hpa
  Cc: linux-kernel, x86, linux, torvalds

From: Alexey Dobriyan
> Sent: 14 September 2019 11:34
...
> +ENTRY(memset0_rep_stosq)
> +	xor	eax, eax
> +.globl memsetx_rep_stosq
> +memsetx_rep_stosq:
> +	lea	rsi, [rdi + rcx]
> +	shr	rcx, 3
> +	rep stosq
> +	cmp	rdi, rsi
> +	je	1f
> +2:
> +	mov	[rdi], al
> +	add	rdi, 1
> +	cmp	rdi, rsi
> +	jne	2b
> +1:
> +	ret

You can do the 'trailing bytes' first with a potentially misaligned store.
Something like (modulo asm syntax and argument ordering):
	lea	rsi, [rdi + rcx]
	shr	rcx, 3
	jrcxz	1f		# Short buffer (< 8 bytes)
	mov	[rsi - 8], rax	# possibly misaligned store of the last 8 bytes
	rep stosq
	ret
1:
	mov	[rdi], al
	add	rdi, 1
	cmp	rdi, rsi
	jne	1b
	ret

The final loop can be one instruction shorter by keeping a negative
offset in rdi ('rxx' standing for the buffer end pointer), so the 'add'
sets ZF once the offset reaches zero:
1:
	mov	[rdi+rxx], al
	add	rdi, 1
	jnz	1b
	ret

Last I looked, 'jrcxz' was OK on all recent AMD and Intel CPUs.
OTOH 'loop' is horrid on Intel ones.

The same applies to the other versions.

I suspect it isn't worth optimising to realign misaligned buffers;
they are unlikely to occur often enough to matter.

I also think that gcc's __builtin version already does some of the
short-buffer optimisations.
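
For instance, with a small constant size the builtin already collapses
to stores (illustrative fragment; exact codegen depends on compiler and
flags):

	char buf[16];
	__builtin_memset(buf, 0, sizeof(buf));
	/* typically two 8-byte stores or one 16-byte vector store,
	   with no call to memset() at all */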

	David

