From: Ankur Arora <ankur.a.arora@oracle.com>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org
Cc: mingo@kernel.org, bp@alien8.de, luto@kernel.org,
	akpm@linux-foundation.org, mike.kravetz@oracle.com,
	jon.grimm@amd.com, kvm@vger.kernel.org, konrad.wilk@oracle.com,
	boris.ostrovsky@oracle.com,
	Ankur Arora <ankur.a.arora@oracle.com>
Subject: [PATCH v2 02/14] perf bench: add memset_movnti()
Date: Wed, 20 Oct 2021 10:02:53 -0700	[thread overview]
Message-ID: <20211020170305.376118-3-ankur.a.arora@oracle.com> (raw)
In-Reply-To: <20211020170305.376118-1-ankur.a.arora@oracle.com>

Clone memset_movnti() from arch/x86/lib/memset_64.S: rename memset_orig
to memset_movq and wrap its body in a MEMSET_MOV assembler macro that is
parameterized on the store instruction (OP) and on whether a trailing
sfence is needed, so a single body expands into both the movq and the
movnti variants. The \@ pseudo-variable keeps the local labels unique
across the two expansions.
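
For reference, a minimal C sketch of the movnti idea (illustrative only,
not part of this patch; memset_nt() is a made-up name) built on the SSE2
non-temporal store intrinsics:

  #include <emmintrin.h>  /* _mm_stream_si64(), _mm_sfence() */
  #include <stddef.h>
  #include <stdint.h>

  /* Hypothetical sketch for x86-64: byte stores until dst is 8-byte
   * aligned, 8-byte movnti stores that bypass the cache for the bulk,
   * a byte tail, and an sfence to order the non-temporal stores.
   */
  static void *memset_nt(void *dst, int c, size_t n)
  {
          uint64_t v = 0x0101010101010101ULL * (uint8_t)c; /* expand byte */
          char *p = dst;

          while (((uintptr_t)p & 7) && n) {        /* align dst */
                  *p++ = (char)c;
                  n--;
          }
          for (; n >= 8; n -= 8, p += 8)           /* bulk NT stores */
                  _mm_stream_si64((long long *)p, (long long)v);
          while (n--)                              /* tail */
                  *p++ = (char)c;

          _mm_sfence();                            /* order NT stores */
          return dst;
  }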

perf bench mem memset -f x86-64-movnt on Intel Icelake-X and AMD Milan:

  # Intel Icelake-X

  $ for i in 8 32 128 512; do
         perf bench mem memset -f x86-64-movnt -s ${i}MB -l 5
     done

  # Output pruned.
  # Running 'mem/memset' benchmark:
  # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
  # Copying 8MB bytes ...
      12.896170 GB/sec
  # Copying 32MB bytes ...
      15.879065 GB/sec
  # Copying 128MB bytes ...
      20.813214 GB/sec
  # Copying 512MB bytes ...
      24.190817 GB/sec

  # AMD Milan

  $ for i in 8 32 128 512; do
         perf bench mem memset -f x86-64-movnt -s ${i}MB -l 5
     done

  # Output pruned.
  # Running 'mem/memset' benchmark:
  # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
  # Copying 8MB bytes ...
        22.372566 GB/sec
  # Copying 32MB bytes ...
        22.507923 GB/sec
  # Copying 128MB bytes ...
        22.492532 GB/sec
  # Copying 512MB bytes ...
        22.434603 GB/sec
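
Note: perf bench mem memset should also accept -f all to run every
registered variant in one pass, and -f help to list them (per the -f
option's help text; exact output varies with the perf version):

  $ perf bench mem memset -f help                 # list variants
  $ perf bench mem memset -f all -s 128MB -l 5    # compare them all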

Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 tools/arch/x86/lib/memset_64.S               | 68 +++++++++++---------
 tools/perf/bench/mem-memset-x86-64-asm-def.h |  6 +-
 2 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
index 9827ae267f96..ef2a091563d9 100644
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -25,7 +25,7 @@ SYM_FUNC_START(__memset)
 	 *
 	 * Otherwise, use original memset function.
 	 */
-	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+	ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
 		      "jmp memset_erms", X86_FEATURE_ERMS
 
 	movq %rdi,%r9
@@ -66,7 +66,8 @@ SYM_FUNC_START_LOCAL(memset_erms)
 	ret
 SYM_FUNC_END(memset_erms)
 
-SYM_FUNC_START_LOCAL(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START_LOCAL(memset_\OP)
 	movq %rdi,%r10
 
 	/* expand byte value  */
@@ -77,64 +78,71 @@ SYM_FUNC_START_LOCAL(memset_orig)
 	/* align dst */
 	movl  %edi,%r9d
 	andl  $7,%r9d
-	jnz  .Lbad_alignment
-.Lafter_bad_alignment:
+	jnz  .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:
 
 	movq  %rdx,%rcx
 	shrq  $6,%rcx
-	jz	 .Lhandle_tail
+	jz	 .Lhandle_tail_\@
 
 	.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
 	decq  %rcx
-	movq  %rax,(%rdi)
-	movq  %rax,8(%rdi)
-	movq  %rax,16(%rdi)
-	movq  %rax,24(%rdi)
-	movq  %rax,32(%rdi)
-	movq  %rax,40(%rdi)
-	movq  %rax,48(%rdi)
-	movq  %rax,56(%rdi)
+	\OP  %rax,(%rdi)
+	\OP  %rax,8(%rdi)
+	\OP  %rax,16(%rdi)
+	\OP  %rax,24(%rdi)
+	\OP  %rax,32(%rdi)
+	\OP  %rax,40(%rdi)
+	\OP  %rax,48(%rdi)
+	\OP  %rax,56(%rdi)
 	leaq  64(%rdi),%rdi
-	jnz    .Lloop_64
+	jnz    .Lloop_64_\@
 
 	/* Handle tail in loops. The loops should be faster than hard
 	   to predict jump tables. */
 	.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
 	movl	%edx,%ecx
 	andl    $63&(~7),%ecx
-	jz 		.Lhandle_7
+	jz 		.Lhandle_7_\@
 	shrl	$3,%ecx
 	.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
 	decl   %ecx
-	movq  %rax,(%rdi)
+	\OP  %rax,(%rdi)
 	leaq  8(%rdi),%rdi
-	jnz    .Lloop_8
+	jnz    .Lloop_8_\@
 
-.Lhandle_7:
+.Lhandle_7_\@:
 	andl	$7,%edx
-	jz      .Lende
+	jz      .Lende_\@
 	.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
 	decl    %edx
 	movb 	%al,(%rdi)
 	leaq	1(%rdi),%rdi
-	jnz     .Lloop_1
+	jnz     .Lloop_1_\@
 
-.Lende:
+.Lende_\@:
+	.if \fence
+	sfence
+	.endif
 	movq	%r10,%rax
 	ret
 
-.Lbad_alignment:
+.Lbad_alignment_\@:
 	cmpq $7,%rdx
-	jbe	.Lhandle_7
+	jbe	.Lhandle_7_\@
 	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
 	subq %r8,%rdx
-	jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+	jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index dac6d2b7c39b..53ead7f91313 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
-MEMSET_FN(memset_orig,
+MEMSET_FN(memset_movq,
 	"x86-64-unrolled",
 	"unrolled memset() in arch/x86/lib/memset_64.S")
 
@@ -11,3 +11,7 @@ MEMSET_FN(__memset,
 MEMSET_FN(memset_erms,
 	"x86-64-stosb",
 	"movsb-based memset() in arch/x86/lib/memset_64.S")
+
+MEMSET_FN(memset_movnti,
+	"x86-64-movnt",
+	"movnt-based memset() in arch/x86/lib/memset_64.S")
-- 
2.29.2

