From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: linux-mm@kvack.org
Cc: Thomas Gleixner <tglx@linutronix.de>,
Ingo Molnar <mingo@redhat.com>, "H. Peter Anvin" <hpa@zytor.com>,
x86@kernel.org, Andi Kleen <ak@linux.intel.com>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Alex Shi <alex.shu@intel.com>, Jan Beulich <jbeulich@novell.com>,
Robert Richter <robert.richter@amd.com>,
Andy Lutomirski <luto@amacapital.net>,
Andrew Morton <akpm@linux-foundation.org>,
Andrea Arcangeli <aarcange@redhat.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Hugh Dickins <hughd@google.com>,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
Mel Gorman <mgorman@suse.de>,
linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
linux-mips@linux-mips.org, linux-sh@vger.kernel.org,
sparclinux@vger.kernel.org
Subject: [PATCH v2 6/6] x86: switch the 64bit uncached page clear to SSE/AVX v2
Date: Thu, 9 Aug 2012 18:03:03 +0300 [thread overview]
Message-ID: <1344524583-1096-7-git-send-email-kirill.shutemov@linux.intel.com> (raw)
In-Reply-To: <1344524583-1096-1-git-send-email-kirill.shutemov@linux.intel.com>
From: Andi Kleen <ak@linux.intel.com>
With multiple threads vector stores are more efficient, so use them.
This will cause the page clear to run non-preemptible and add some
overhead. However, on 32bit it was already non-preemptible (due to
kmap_atomic) and there is a preemption opportunity every 4K unit.
On an NPB (NASA Parallel Benchmark) 128GB run on a Westmere this reduces
the performance regression caused by enabling transparent huge pages
by ~2% (from 2.81% to 0.81%), which is now near the runtime variability.
On a system with AVX support more is expected.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[kirill.shutemov@linux.intel.com: Properly save/restore arguments]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
arch/x86/lib/clear_page_nocache_64.S | 91 ++++++++++++++++++++++++++++-----
1 files changed, 77 insertions(+), 14 deletions(-)
diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S
index ee16d15..c092919 100644
--- a/arch/x86/lib/clear_page_nocache_64.S
+++ b/arch/x86/lib/clear_page_nocache_64.S
@@ -1,29 +1,92 @@
+/*
+ * Clear pages with cache bypass.
+ *
+ * Copyright (C) 2011, 2012 Intel Corporation
+ * Author: Andi Kleen
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ */
+
#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
+#define SSE_UNROLL 128
+
/*
* Zero a page avoiding the caches
* rdi page
*/
ENTRY(clear_page_nocache)
CFI_STARTPROC
- xorl %eax,%eax
- movl $4096/64,%ecx
+ push %rdi /* keep the page pointer across the call below; NOTE(review): this push/pop pair carries no CFI annotation */
+ call kernel_fpu_begin
+ pop %rdi /* rdi = page again */
+ sub $16,%rsp /* 16-byte stack slot for xmm0 */
+ CFI_ADJUST_CFA_OFFSET 16
+ movdqu %xmm0,(%rsp) /* save xmm0; NOTE(review): confirm whether kernel_fpu_begin already preserves it, which would make this redundant */
+ xorpd %xmm0,%xmm0 /* xmm0 = 0, the value stored to the page */
+ movl $4096/SSE_UNROLL,%ecx /* iterations = 4096 bytes / bytes cleared per iteration */
.p2align 4
.Lloop:
decl %ecx
-#define PUT(x) movnti %rax,x*8(%rdi)
- movnti %rax,(%rdi)
- PUT(1)
- PUT(2)
- PUT(3)
- PUT(4)
- PUT(5)
- PUT(6)
- PUT(7)
- leaq 64(%rdi),%rdi
+ .set x,0 /* x = byte offset inside this iteration */
+ .rept SSE_UNROLL/16 /* one 16-byte store per repetition */
+ movntdq %xmm0,x(%rdi) /* non-temporal store of 16 zero bytes */
+ .set x,x+16
+ .endr
+ leaq SSE_UNROLL(%rdi),%rdi /* advance the pointer; lea leaves the flags from decl intact for jnz */
jnz .Lloop
- nop
- ret
+ movdqu (%rsp),%xmm0 /* restore the xmm0 value saved above */
+ addq $16,%rsp /* release the save slot */
+ CFI_ADJUST_CFA_OFFSET -16
+ jmp kernel_fpu_end /* tail call in place of call + ret */
CFI_ENDPROC
ENDPROC(clear_page_nocache)
+
+#ifdef CONFIG_AS_AVX
+
+ .section .altinstr_replacement,"ax" /* the replacement bytes (a 2-byte short jmp) are emitted here */
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_nocache_avx - clear_page_nocache) - (2f - 1b)
+ /* offset: distance to clear_page_nocache_avx measured from the end of the jmp as if it sat at clear_page_nocache; it is a single byte, so the AVX routine must stay within disp8 reach */
+2:
+ .previous
+ .section .altinstructions,"a"
+ altinstruction_entry clear_page_nocache,1b,X86_FEATURE_AVX,\
+ 16, 2b-1b /* NOTE(review): presumably the original length to patch (16) and the replacement length (2b-1b) - confirm against the altinstruction_entry macro */
+ .previous
+
+#define AVX_UNROLL 256 /* TUNE ME */
+
+ENTRY(clear_page_nocache_avx) /* AVX variant: zeroes 4096 bytes at (%rdi) with 32-byte non-temporal stores */
+ CFI_STARTPROC
+ push %rdi /* keep the page pointer across the call below; NOTE(review): this push/pop pair carries no CFI annotation */
+ call kernel_fpu_begin
+ pop %rdi /* rdi = page again */
+ sub $32,%rsp /* 32-byte stack slot for ymm0 */
+ CFI_ADJUST_CFA_OFFSET 32
+ vmovdqu %ymm0,(%rsp) /* save ymm0; NOTE(review): confirm whether kernel_fpu_begin already preserves it, which would make this redundant */
+ vxorpd %ymm0,%ymm0,%ymm0 /* ymm0 = 0, the value stored to the page */
+ movl $4096/AVX_UNROLL,%ecx /* iterations = 4096 bytes / bytes cleared per iteration */
+ .p2align 4
+.Lloop_avx:
+ decl %ecx
+ .set x,0 /* x = byte offset inside this iteration */
+ .rept AVX_UNROLL/32 /* one 32-byte store per repetition */
+ vmovntdq %ymm0,x(%rdi) /* non-temporal store of 32 zero bytes */
+ .set x,x+32
+ .endr
+ leaq AVX_UNROLL(%rdi),%rdi /* advance the pointer; lea leaves the flags from decl intact for jnz */
+ jnz .Lloop_avx
+ vmovdqu (%rsp),%ymm0 /* restore the ymm0 value saved above */
+ addq $32,%rsp /* release the save slot */
+ CFI_ADJUST_CFA_OFFSET -32
+ jmp kernel_fpu_end /* tail call in place of call + ret; NOTE(review): no vzeroupper is issued here - confirm none is needed */
+ CFI_ENDPROC
+ENDPROC(clear_page_nocache_avx)
+
+#endif
--
1.7.7.6
next prev parent reply other threads:[~2012-08-09 15:05 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-08-09 15:02 [PATCH v2 0/6] Avoid cache trashing on clearing huge/gigantic page Kirill A. Shutemov
2012-08-09 15:02 ` [PATCH v2 1/6] THP: Use real address for NUMA policy Kirill A. Shutemov
2012-08-09 15:02 ` [PATCH v2 2/6] mm: make clear_huge_page tolerate non aligned address Kirill A. Shutemov
2012-08-09 15:03 ` [PATCH v2 3/6] THP: Pass real, not rounded, address to clear_huge_page Kirill A. Shutemov
2012-08-09 15:03 ` [PATCH v2 4/6] x86: Add clear_page_nocache Kirill A. Shutemov
2012-08-09 15:22 ` Jan Beulich
2012-08-09 15:26 ` Andi Kleen
2012-08-13 11:43 ` Kirill A. Shutemov
2012-08-13 12:02 ` Jan Beulich
2012-08-13 16:27 ` Andi Kleen
2012-08-13 17:04 ` Borislav Petkov
2012-08-13 19:07 ` Kirill A. Shutemov
2012-08-09 15:23 ` H. Peter Anvin
2012-08-09 15:03 ` [PATCH v2 5/6] mm: make clear_huge_page cache clear only around the fault address Kirill A. Shutemov
2012-08-09 15:03 ` Kirill A. Shutemov [this message]
2012-08-09 15:28 ` [PATCH v2 6/6] x86: switch the 64bit uncached page clear to SSE/AVX v2 Jan Beulich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1344524583-1096-7-git-send-email-kirill.shutemov@linux.intel.com \
--to=kirill.shutemov@linux.intel.com \
--cc=aarcange@redhat.com \
--cc=ak@linux.intel.com \
--cc=akpm@linux-foundation.org \
--cc=alex.shu@intel.com \
--cc=hannes@cmpxchg.org \
--cc=hpa@zytor.com \
--cc=hughd@google.com \
--cc=jbeulich@novell.com \
--cc=kamezawa.hiroyu@jp.fujitsu.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mips@linux-mips.org \
--cc=linux-mm@kvack.org \
--cc=linux-sh@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=luto@amacapital.net \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=robert.richter@amd.com \
--cc=sparclinux@vger.kernel.org \
--cc=tglx@linutronix.de \
--cc=tim.c.chen@linux.intel.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).