linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: ling.ma@intel.com
To: mingo@elte.hu
Cc: hpa@zytor.com, tglx@linutronix.de, linux-kernel@vger.kernel.org,
	ling.ma@intel.com
Subject: [PATCH RFC V2] [x86] Optimize copy-page by reducing impact from HW prefetch
Date: Sat, 18 Jun 2011 07:24:13 +0800	[thread overview]
Message-ID: <1308353053-1928-1-git-send-email-ling.ma@intel.com> (raw)

From: Ma Ling <ling.ma@intel.com>

Programs' temporal & spatial locality introduced the cache unit to overcome
the processor-memory performance gap; hardware prefetch is very important
for improving performance by reducing cache misses. Modern CPU micro-architectures
mainly support two kinds of prefetch mechanism in the L1 data cache:

a. Data cache unit (DCU) prefetcher. Data spatial locality asks us to provide
   adjacent data while handling the current data. A larger cache line size
   is one choice, but it would cause more cached data to be evicted and more
   latency to load, so we simply prefetch the next line when accessing the
   current data. This mode only prefetches data at ascending addresses.
  
b. Instruction pointer (IP)-based strided prefetcher. Based on the load/store
   instruction address, this mechanism predicts and prefetches data with an
   adaptive stride, covering both ascending and descending addresses.

DCU mode is good when the time the program spends operating on the data is
longer than the time to prefetch the next line; however, the copy-page function
breaks that assumption, so DCU mode is hardly helpful. In particular, we append
software prefetch while the data is already in cache, so bus traffic becomes
busier, which impacts performance seriously.

In this patch we introduce backward copy to successfully avoid the HW prefetch
impact (DCU prefetcher), and simplify the original code.
The performance is improved about 15% on core2, 36% on snb respectively.
(We use our micro-benchmark, and will do further testing according to your requirements.)

Thanks
Ling

---
 arch/x86/lib/copy_page_64.S |  124 +++++++++++++++++++-----------------------
 1 files changed, 56 insertions(+), 68 deletions(-)

diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1..0a60705 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,4 +1,5 @@
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+/* Updated 2011 by Ma Ling to introduce backward copy */
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
@@ -17,83 +18,70 @@ ENDPROC(copy_page_c)
 	    
 /* Could vary the prefetch distance based on SMP/UP */
 
+/* 
+ * By backward copy we manage to reduce impact from HW prefetch
+ * when data is in L1 cache, and get benefit when data is not in L1 cache.
+ */
 ENTRY(copy_page)
 	CFI_STARTPROC
-	subq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 3*8
-	movq	%rbx,(%rsp)
-	CFI_REL_OFFSET rbx, 0
-	movq	%r12,1*8(%rsp)
-	CFI_REL_OFFSET r12, 1*8
-	movq	%r13,2*8(%rsp)
-	CFI_REL_OFFSET r13, 2*8
-
-	movl	$(4096/64)-5,%ecx
-	.p2align 4
+	lea	4096(%rsi), %rsi
+	lea	4096(%rdi), %rdi
+	mov	$(4096/64)-5,	%cl
+	mov	$5,	%dl
+	/*
+	 * Nop force following instruction to be 16 bytes aligned.
+	 */
+	nop
 .Loop64:
-  	dec     %rcx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	prefetcht0 5*64(%rsi)
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-
-	leaq    64 (%rsi), %rsi
-	leaq    64 (%rdi), %rdi
+	prefetchnta	-5*64(%rsi)
+  	dec	%cl
+
+	movq	-0x8*1(%rsi),	%rax
+	movq	-0x8*2(%rsi),	%r8
+	movq	-0x8*3(%rsi),	%r9
+	movq	-0x8*4(%rsi),	%r10
+	movq	%rax,	-0x8*1(%rdi)
+	movq	%r8,	-0x8*2(%rdi)
+	movq	%r9,	-0x8*3(%rdi)
+	movq	%r10,	-0x8*4(%rdi)
+
+	movq	-0x8*5(%rsi),	%rax
+	movq	-0x8*6(%rsi),	%r8
+	movq	-0x8*7(%rsi),	%r9
+	movq	-0x8*8(%rsi),	%r10
+	leaq	-64(%rsi),	%rsi
+	movq	%rax,	-0x8*5(%rdi)
+	movq	%r8,	-0x8*6(%rdi)
+	movq	%r9,	-0x8*7(%rdi)
+	movq	%r10,	-0x8*8(%rdi)
+	leaq	-64(%rdi),	%rdi
 
 	jnz     .Loop64
 
-	movl	$5,%ecx
-	.p2align 4
 .Loop2:
-	decl   %ecx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-
-	leaq	64(%rdi),%rdi
-	leaq	64(%rsi),%rsi
-
+	dec	%dl
+
+	movq	-0x8*1(%rsi),	%rax
+	movq	-0x8*2(%rsi),	%r8
+	movq	-0x8*3(%rsi),	%r9
+	movq	-0x8*4(%rsi),	%r10
+	movq	%rax,	-0x8*1(%rdi)
+	movq	%r8,	-0x8*2(%rdi)
+	movq	%r9,	-0x8*3(%rdi)
+	movq	%r10,	-0x8*4(%rdi)
+
+	movq	-0x8*5(%rsi),	%rax
+	movq	-0x8*6(%rsi),	%r8
+	movq	-0x8*7(%rsi),	%r9
+	movq	-0x8*8(%rsi),	%r10
+	leaq	-64(%rsi),	%rsi
+	movq	%rax,	-0x8*5(%rdi)
+	movq	%r8,	-0x8*6(%rdi)
+	movq	%r9,	-0x8*7(%rdi)
+	movq	%r10,	-0x8*8(%rdi)
+	leaq	-64(%rdi),	%rdi
 	jnz	.Loop2
 
-	movq	(%rsp),%rbx
-	CFI_RESTORE rbx
-	movq	1*8(%rsp),%r12
-	CFI_RESTORE r12
-	movq	2*8(%rsp),%r13
-	CFI_RESTORE r13
-	addq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET -3*8
 	ret
 .Lcopy_page_end:
 	CFI_ENDPROC
-- 
1.6.5.2


             reply	other threads:[~2011-06-17 16:09 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-06-17 23:24 ling.ma [this message]
2011-06-20  3:42 ` [PATCH RFC V2] [x86] Optimize copy-page by reducing impact from HW prefetch Ma, Ling
2011-06-23 14:34 ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1308353053-1928-1-git-send-email-ling.ma@intel.com \
    --to=ling.ma@intel.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).