linux-kernel.vger.kernel.org archive mirror
* [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
@ 2012-10-11 12:29 ling.ma
  2012-10-11 13:40 ` Andi Kleen
  2012-10-11 14:35 ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 16+ messages in thread
From: ling.ma @ 2012-10-11 12:29 UTC (permalink / raw)
  To: mingo; +Cc: hpa, tglx, linux-kernel, Ma Ling

From: Ma Ling <ling.ma@intel.com>

Load and store operations account for about 35% and 10% of instructions
respectively in most industry benchmarks. A fetched, 16-byte-aligned
block of code contains about 4 instructions, implying roughly 1.4
(0.35 * 4) loads and 0.4 stores per fetch. Modern CPUs can sustain 2
loads and 1 store per cycle, so store throughput is the bottleneck for
memcpy and copy_page, and some lightweight CPUs only support one memory
operation per cycle. It is therefore enough to issue one read and one
write instruction per cycle, which also lets us avoid using extra
registers.

In this patch we also re-arrange the instruction sequence to improve
performance. On Atom, performance improves by about 11% and 9% in the
hot-cache and cold-cache cases respectively.

Signed-off-by: Ma Ling <ling.ma@intel.com>

---
 arch/x86/lib/copy_page_64.S |  103 +++++++++++++++++-------------------------
 1 files changed, 42 insertions(+), 61 deletions(-)

diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 3da5527..13c97f4 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -20,76 +20,57 @@ ENDPROC(copy_page_rep)
 
 ENTRY(copy_page)
 	CFI_STARTPROC
-	subq	$2*8,	%rsp
-	CFI_ADJUST_CFA_OFFSET 2*8
-	movq	%rbx,	(%rsp)
-	CFI_REL_OFFSET rbx, 0
-	movq	%r12,	1*8(%rsp)
-	CFI_REL_OFFSET r12, 1*8
+	mov	$(4096/64)-5, %ecx
 
-	movl	$(4096/64)-5,	%ecx
-	.p2align 4
 .Loop64:
-  	dec	%rcx
-
-	movq	0x8*0(%rsi), %rax
-	movq	0x8*1(%rsi), %rbx
-	movq	0x8*2(%rsi), %rdx
-	movq	0x8*3(%rsi), %r8
-	movq	0x8*4(%rsi), %r9
-	movq	0x8*5(%rsi), %r10
-	movq	0x8*6(%rsi), %r11
-	movq	0x8*7(%rsi), %r12
-
 	prefetcht0 5*64(%rsi)
-
-	movq	%rax, 0x8*0(%rdi)
-	movq	%rbx, 0x8*1(%rdi)
-	movq	%rdx, 0x8*2(%rdi)
-	movq	%r8,  0x8*3(%rdi)
-	movq	%r9,  0x8*4(%rdi)
-	movq	%r10, 0x8*5(%rdi)
-	movq	%r11, 0x8*6(%rdi)
-	movq	%r12, 0x8*7(%rdi)
-
-	leaq	64 (%rsi), %rsi
-	leaq	64 (%rdi), %rdi
-
+	decb	%cl
+
+	movq	0x8*0(%rsi), %r10
+	movq	0x8*1(%rsi), %rax
+	movq	0x8*2(%rsi), %r8
+	movq	0x8*3(%rsi), %r9
+	movq	%r10, 0x8*0(%rdi)
+	movq	%rax, 0x8*1(%rdi)
+	movq	%r8, 0x8*2(%rdi)
+	movq	%r9, 0x8*3(%rdi)
+
+	movq	0x8*4(%rsi), %r10
+	movq	0x8*5(%rsi), %rax
+	movq	0x8*6(%rsi), %r8
+	movq	0x8*7(%rsi), %r9
+	leaq	64(%rsi), %rsi
+	movq	%r10, 0x8*4(%rdi)
+	movq	%rax, 0x8*5(%rdi)
+	movq	%r8, 0x8*6(%rdi)
+	movq	%r9, 0x8*7(%rdi)
+	leaq	64(%rdi), %rdi
 	jnz	.Loop64
 
-	movl	$5, %ecx
-	.p2align 4
+	mov	$5, %dl
 .Loop2:
-	decl	%ecx
-
-	movq	0x8*0(%rsi), %rax
-	movq	0x8*1(%rsi), %rbx
-	movq	0x8*2(%rsi), %rdx
-	movq	0x8*3(%rsi), %r8
-	movq	0x8*4(%rsi), %r9
-	movq	0x8*5(%rsi), %r10
-	movq	0x8*6(%rsi), %r11
-	movq	0x8*7(%rsi), %r12
-
-	movq	%rax, 0x8*0(%rdi)
-	movq	%rbx, 0x8*1(%rdi)
-	movq	%rdx, 0x8*2(%rdi)
-	movq	%r8,  0x8*3(%rdi)
-	movq	%r9,  0x8*4(%rdi)
-	movq	%r10, 0x8*5(%rdi)
-	movq	%r11, 0x8*6(%rdi)
-	movq	%r12, 0x8*7(%rdi)
-
-	leaq	64(%rdi), %rdi
+	decb	%dl
+	movq	0x8*0(%rsi), %r10
+	movq	0x8*1(%rsi), %rax
+	movq	0x8*2(%rsi), %r8
+	movq	0x8*3(%rsi), %r9
+	movq	%r10, 0x8*0(%rdi)
+	movq	%rax, 0x8*1(%rdi)
+	movq	%r8, 0x8*2(%rdi)
+	movq	%r9, 0x8*3(%rdi)
+
+	movq	0x8*4(%rsi), %r10
+	movq	0x8*5(%rsi), %rax
+	movq	0x8*6(%rsi), %r8
+	movq	0x8*7(%rsi), %r9
 	leaq	64(%rsi), %rsi
+	movq	%r10, 0x8*4(%rdi)
+	movq	%rax, 0x8*5(%rdi)
+	movq	%r8, 0x8*6(%rdi)
+	movq	%r9, 0x8*7(%rdi)
+	leaq	64(%rdi), %rdi
 	jnz	.Loop2
 
-	movq	(%rsp), %rbx
-	CFI_RESTORE rbx
-	movq	1*8(%rsp), %r12
-	CFI_RESTORE r12
-	addq	$2*8, %rsp
-	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
 .Lcopy_page_end:
 	CFI_ENDPROC
-- 
1.6.5.2
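
For illustration, the rewritten 64-bytes-per-iteration loop above can be
rendered in plain C roughly as follows (a sketch only, not part of the
patch; the function name is made up, and the prefetcht0 hint has no
portable C equivalent so it is omitted). It shows why each half-iteration
needs only four temporaries, mirroring %r10/%rax/%r8/%r9 in the assembly,
instead of the eight registers (two of them callee-saved) that the old
code had to spill.

#include <stdint.h>
#include <stddef.h>

void copy_page_c_sketch(void *dst, const void *src)
{
	uint64_t *d = dst;
	const uint64_t *s = src;
	size_t i;

	/* 4096-byte page, 64 bytes per iteration, two 4-quadword bursts. */
	for (i = 0; i < 4096 / 64; i++, s += 8, d += 8) {
		uint64_t t0 = s[0], t1 = s[1], t2 = s[2], t3 = s[3];

		d[0] = t0; d[1] = t1; d[2] = t2; d[3] = t3;

		t0 = s[4]; t1 = s[5]; t2 = s[6]; t3 = s[7];
		d[4] = t0; d[5] = t1; d[6] = t2; d[7] = t3;
	}
}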


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-11 12:29 [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register ling.ma
@ 2012-10-11 13:40 ` Andi Kleen
  2012-10-12  3:10   ` Ma, Ling
  2012-10-11 14:35 ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 16+ messages in thread
From: Andi Kleen @ 2012-10-11 13:40 UTC (permalink / raw)
  To: ling.ma; +Cc: mingo, hpa, tglx, linux-kernel

ling.ma@intel.com writes:

> From: Ma Ling <ling.ma@intel.com>
>
> Load and store operations account for about 35% and 10% of instructions
> respectively in most industry benchmarks. A fetched, 16-byte-aligned
> block of code contains about 4 instructions, implying roughly 1.4
> (0.35 * 4) loads and 0.4 stores per fetch. Modern CPUs can sustain 2
> loads and 1 store per cycle, so store throughput is the bottleneck for
> memcpy and copy_page, and some lightweight CPUs only support one memory
> operation per cycle. It is therefore enough to issue one read and one
> write instruction per cycle, which also lets us avoid using extra
> registers.

I don't think "saving registers" is a useful goal here.

>
> In this patch we also re-arrange the instruction sequence to improve
> performance. On Atom, performance improves by about 11% and 9% in the
> hot-cache and cold-cache cases respectively.

That's great, but the question is what happens on the older CPUs that
also use this sequence. It may be safer to add a new variant for Atom,
unless you can benchmark those too.

-Andi


-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-11 12:29 [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register ling.ma
  2012-10-11 13:40 ` Andi Kleen
@ 2012-10-11 14:35 ` Konrad Rzeszutek Wilk
  2012-10-12  3:37   ` Ma, Ling
  1 sibling, 1 reply; 16+ messages in thread
From: Konrad Rzeszutek Wilk @ 2012-10-11 14:35 UTC (permalink / raw)
  To: ling.ma; +Cc: mingo, hpa, tglx, linux-kernel

On Thu, Oct 11, 2012 at 08:29:08PM +0800, ling.ma@intel.com wrote:
> From: Ma Ling <ling.ma@intel.com>
> 
> Load and store operations account for about 35% and 10% of instructions
> respectively in most industry benchmarks. A fetched, 16-byte-aligned
> block of code contains about 4 instructions, implying roughly 1.4
> (0.35 * 4) loads and 0.4 stores per fetch. Modern CPUs can sustain 2
> loads and 1 store per cycle, so store throughput is the bottleneck for
> memcpy and copy_page, and some lightweight CPUs only support one memory
> operation per cycle. It is therefore enough to issue one read and one
> write instruction per cycle, which also lets us avoid using extra
> registers.

So is that also true for AMD CPUs?
> 
> In this patch we also re-arrange the instruction sequence to improve
> performance. On Atom, performance improves by about 11% and 9% in the
> hot-cache and cold-cache cases respectively.
> 
> Signed-off-by: Ma Ling <ling.ma@intel.com>
> 
> ---
>  arch/x86/lib/copy_page_64.S |  103 +++++++++++++++++-------------------------
>  1 files changed, 42 insertions(+), 61 deletions(-)
> 
> diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
> index 3da5527..13c97f4 100644
> --- a/arch/x86/lib/copy_page_64.S
> +++ b/arch/x86/lib/copy_page_64.S
> @@ -20,76 +20,57 @@ ENDPROC(copy_page_rep)
>  
>  ENTRY(copy_page)
>  	CFI_STARTPROC
> -	subq	$2*8,	%rsp
> -	CFI_ADJUST_CFA_OFFSET 2*8
> -	movq	%rbx,	(%rsp)
> -	CFI_REL_OFFSET rbx, 0
> -	movq	%r12,	1*8(%rsp)
> -	CFI_REL_OFFSET r12, 1*8
> +	mov	$(4096/64)-5, %ecx
>  
> -	movl	$(4096/64)-5,	%ecx
> -	.p2align 4
>  .Loop64:
> -  	dec	%rcx
> -
> -	movq	0x8*0(%rsi), %rax
> -	movq	0x8*1(%rsi), %rbx
> -	movq	0x8*2(%rsi), %rdx
> -	movq	0x8*3(%rsi), %r8
> -	movq	0x8*4(%rsi), %r9
> -	movq	0x8*5(%rsi), %r10
> -	movq	0x8*6(%rsi), %r11
> -	movq	0x8*7(%rsi), %r12
> -
>  	prefetcht0 5*64(%rsi)
> -
> -	movq	%rax, 0x8*0(%rdi)
> -	movq	%rbx, 0x8*1(%rdi)
> -	movq	%rdx, 0x8*2(%rdi)
> -	movq	%r8,  0x8*3(%rdi)
> -	movq	%r9,  0x8*4(%rdi)
> -	movq	%r10, 0x8*5(%rdi)
> -	movq	%r11, 0x8*6(%rdi)
> -	movq	%r12, 0x8*7(%rdi)
> -
> -	leaq	64 (%rsi), %rsi
> -	leaq	64 (%rdi), %rdi
> -
> +	decb	%cl
> +
> +	movq	0x8*0(%rsi), %r10
> +	movq	0x8*1(%rsi), %rax
> +	movq	0x8*2(%rsi), %r8
> +	movq	0x8*3(%rsi), %r9
> +	movq	%r10, 0x8*0(%rdi)
> +	movq	%rax, 0x8*1(%rdi)
> +	movq	%r8, 0x8*2(%rdi)
> +	movq	%r9, 0x8*3(%rdi)
> +
> +	movq	0x8*4(%rsi), %r10
> +	movq	0x8*5(%rsi), %rax
> +	movq	0x8*6(%rsi), %r8
> +	movq	0x8*7(%rsi), %r9
> +	leaq	64(%rsi), %rsi
> +	movq	%r10, 0x8*4(%rdi)
> +	movq	%rax, 0x8*5(%rdi)
> +	movq	%r8, 0x8*6(%rdi)
> +	movq	%r9, 0x8*7(%rdi)
> +	leaq	64(%rdi), %rdi
>  	jnz	.Loop64
>  
> -	movl	$5, %ecx
> -	.p2align 4
> +	mov	$5, %dl
>  .Loop2:
> -	decl	%ecx
> -
> -	movq	0x8*0(%rsi), %rax
> -	movq	0x8*1(%rsi), %rbx
> -	movq	0x8*2(%rsi), %rdx
> -	movq	0x8*3(%rsi), %r8
> -	movq	0x8*4(%rsi), %r9
> -	movq	0x8*5(%rsi), %r10
> -	movq	0x8*6(%rsi), %r11
> -	movq	0x8*7(%rsi), %r12
> -
> -	movq	%rax, 0x8*0(%rdi)
> -	movq	%rbx, 0x8*1(%rdi)
> -	movq	%rdx, 0x8*2(%rdi)
> -	movq	%r8,  0x8*3(%rdi)
> -	movq	%r9,  0x8*4(%rdi)
> -	movq	%r10, 0x8*5(%rdi)
> -	movq	%r11, 0x8*6(%rdi)
> -	movq	%r12, 0x8*7(%rdi)
> -
> -	leaq	64(%rdi), %rdi
> +	decb	%dl
> +	movq	0x8*0(%rsi), %r10
> +	movq	0x8*1(%rsi), %rax
> +	movq	0x8*2(%rsi), %r8
> +	movq	0x8*3(%rsi), %r9
> +	movq	%r10, 0x8*0(%rdi)
> +	movq	%rax, 0x8*1(%rdi)
> +	movq	%r8, 0x8*2(%rdi)
> +	movq	%r9, 0x8*3(%rdi)
> +
> +	movq	0x8*4(%rsi), %r10
> +	movq	0x8*5(%rsi), %rax
> +	movq	0x8*6(%rsi), %r8
> +	movq	0x8*7(%rsi), %r9
>  	leaq	64(%rsi), %rsi
> +	movq	%r10, 0x8*4(%rdi)
> +	movq	%rax, 0x8*5(%rdi)
> +	movq	%r8, 0x8*6(%rdi)
> +	movq	%r9, 0x8*7(%rdi)
> +	leaq	64(%rdi), %rdi
>  	jnz	.Loop2
>  
> -	movq	(%rsp), %rbx
> -	CFI_RESTORE rbx
> -	movq	1*8(%rsp), %r12
> -	CFI_RESTORE r12
> -	addq	$2*8, %rsp
> -	CFI_ADJUST_CFA_OFFSET -2*8
>  	ret
>  .Lcopy_page_end:
>  	CFI_ENDPROC
> -- 
> 1.6.5.2
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-11 13:40 ` Andi Kleen
@ 2012-10-12  3:10   ` Ma, Ling
  2012-10-12 13:35     ` Andi Kleen
  0 siblings, 1 reply; 16+ messages in thread
From: Ma, Ling @ 2012-10-12  3:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: mingo, hpa, tglx, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1521 bytes --]

> > Load and store operations account for about 35% and 10% of instructions
> > respectively in most industry benchmarks. A fetched, 16-byte-aligned
> > block of code contains about 4 instructions, implying roughly 1.4
> > (0.35 * 4) loads and 0.4 stores per fetch. Modern CPUs can sustain 2
> > loads and 1 store per cycle, so store throughput is the bottleneck for
> > memcpy and copy_page, and some lightweight CPUs only support one memory
> > operation per cycle. It is therefore enough to issue one read and one
> > write instruction per cycle, which also lets us avoid using extra
> > registers.
> 
> I don't think "saving registers" is a useful goal here.

Ling: issuing one read and one write op per cycle is enough for copy_page or memcpy
performance, so we can avoid the register save and restore operations.

> >
> > In this patch we also re-arrange the instruction sequence to improve
> > performance. On Atom, performance improves by about 11% and 9% in the
> > hot-cache and cold-cache cases respectively.
> 
> That's great, but the question is what happens on the older CPUs that
> also use this sequence. It may be safer to add a new variant for Atom,
> unless you can benchmark those too.

Ling:
I tested the new and original versions on Core 2; the patch improved performance by about 9%.
Although Core 2 has an out-of-order pipeline, which relaxes the ordering requirement on the
instruction sequence, the ROB size is limited, so the new code, by issuing the store
operations earlier, exposes more parallelism between the store/load pairs and gets a better result.
Core 2 cpu-info attached (I have no older machine).


Thanks
Ling

 

[-- Attachment #2: core2-cpu-info --]
[-- Type: application/octet-stream, Size: 2992 bytes --]

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 Quad CPU    Q6600  @ 2.40GHz
stepping	: 11
cpu MHz		: 2400.003
cache size	: 4096 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 4
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm lahf_lm tpr_shadow vnmi flexpriority
bogomips	: 4788.13
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 Quad CPU    Q6600  @ 2.40GHz
stepping	: 11
cpu MHz		: 2400.003
cache size	: 4096 KB
physical id	: 0
siblings	: 4
core id		: 1
cpu cores	: 4
apicid		: 1
initial apicid	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm lahf_lm tpr_shadow vnmi flexpriority
bogomips	: 4787.76
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 2
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 Quad CPU    Q6600  @ 2.40GHz
stepping	: 11
cpu MHz		: 2400.003
cache size	: 4096 KB
physical id	: 0
siblings	: 4
core id		: 2
cpu cores	: 4
apicid		: 2
initial apicid	: 2
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm lahf_lm tpr_shadow vnmi flexpriority
bogomips	: 4787.78
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 3
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 Quad CPU    Q6600  @ 2.40GHz
stepping	: 11
cpu MHz		: 2400.003
cache size	: 4096 KB
physical id	: 0
siblings	: 4
core id		: 3
cpu cores	: 4
apicid		: 3
initial apicid	: 3
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm lahf_lm tpr_shadow vnmi flexpriority
bogomips	: 4787.76
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:


^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-11 14:35 ` Konrad Rzeszutek Wilk
@ 2012-10-12  3:37   ` Ma, Ling
  2012-10-12  6:18     ` Borislav Petkov
  0 siblings, 1 reply; 16+ messages in thread
From: Ma, Ling @ 2012-10-12  3:37 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: mingo, hpa, tglx, linux-kernel

> > Load and store operations account for about 35% and 10% of instructions
> > respectively in most industry benchmarks. A fetched, 16-byte-aligned
> > block of code contains about 4 instructions, implying roughly 1.4
> > (0.35 * 4) loads and 0.4 stores per fetch. Modern CPUs can sustain 2
> > loads and 1 store per cycle, so store throughput is the bottleneck for
> > memcpy and copy_page, and some lightweight CPUs only support one memory
> > operation per cycle. It is therefore enough to issue one read and one
> > write instruction per cycle, which also lets us avoid using extra
> > registers.
> 
> So is that also true for AMD CPUs?
Although Bulldozer puts 32 bytes of instructions into decoupled 16-byte entry buffers,
it still decodes 4 instructions per cycle, so 4 instructions will be fed into the
execution units and 2 loads and 1 store will be issued per cycle.

Thanks
Ling

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12  3:37   ` Ma, Ling
@ 2012-10-12  6:18     ` Borislav Petkov
  2012-10-12  9:07       ` Ma, Ling
  0 siblings, 1 reply; 16+ messages in thread
From: Borislav Petkov @ 2012-10-12  6:18 UTC (permalink / raw)
  To: Ma, Ling; +Cc: Konrad Rzeszutek Wilk, mingo, hpa, tglx, linux-kernel

On Fri, Oct 12, 2012 at 03:37:50AM +0000, Ma, Ling wrote:
> > > Load and store operations account for about 35% and 10% of instructions
> > > respectively in most industry benchmarks. A fetched, 16-byte-aligned
> > > block of code contains about 4 instructions, implying roughly 1.4
> > > (0.35 * 4) loads and 0.4 stores per fetch. Modern CPUs can sustain 2
> > > loads and 1 store per cycle, so store throughput is the bottleneck for
> > > memcpy and copy_page, and some lightweight CPUs only support one memory
> > > operation per cycle. It is therefore enough to issue one read and one
> > > write instruction per cycle, which also lets us avoid using extra
> > > registers.
> > 
> > So is that also true for AMD CPUs?
> Although Bulldozer put 32byte instruction into decoupled 16byte entry buffers,
> it still decode 4 instructions per cycle, so 4 instructions will be fed into execution unit and
> 2 loads ,1 write will be issued per cycle.

I'd be very interested in which benchmarks you're seeing that perf
improvement on Atom with, and who knows, maybe I could find time to run
them on Bulldozer and see how your patch behaves there :-).

Thanks.

-- 
Regards/Gruss,
    Boris.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12  6:18     ` Borislav Petkov
@ 2012-10-12  9:07       ` Ma, Ling
  2012-10-12 18:04         ` Borislav Petkov
  0 siblings, 1 reply; 16+ messages in thread
From: Ma, Ling @ 2012-10-12  9:07 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Konrad Rzeszutek Wilk, mingo, hpa, tglx, linux-kernel, iant

[-- Attachment #1: Type: text/plain, Size: 812 bytes --]

> > > So is that also true for AMD CPUs?
> > Although Bulldozer puts 32 bytes of instructions into decoupled 16-byte
> > entry buffers, it still decodes 4 instructions per cycle, so 4
> > instructions will be fed into the execution units and
> > 2 loads and 1 store will be issued per cycle.
> 
> I'd be very interested in which benchmarks you're seeing that perf
> improvement on Atom with, and who knows, maybe I could find time to run
> them on Bulldozer and see how your patch behaves there :-).
I use another benchmark from gcc; it is a lot of code, so I extracted one simple benchmark
that you can use to test (cc -o copy_page copy_page.c). My initial result shows the new
copy_page version is still better on a Bulldozer machine, but since that machine is a
first release, please verify the result. And CC to Ian.

Thanks
Ling


[-- Attachment #2: copy_page.c --]
[-- Type: text/plain, Size: 5975 bytes --]

#include<stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define  MAXSAMPLESTPT        1000
#define  MAXCOPYSIZE          (1024 * 1024)
#define  ORIG  0
#define  NEW   1
static char* buf1 = NULL;
static char* buf2 = NULL;
static int repeat_one_test = 32;

hp_timing_t _dl_hp_timing_overhead;
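/* HP_TIMING_NOW reads the CPU timestamp counter via RDTSC; do_one_test below
   keeps the best (lowest) of repeat_one_test such timings, so the TPT numbers
   quoted in this thread are raw cycle counts. */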
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
	total_time += tmptime;						\
    }									\
  while (0)

#define HP_TIMING_BEST(best_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
      if (best_time > tmptime)						\
	best_time = tmptime;						\
    }									\
  while (0)


void copy_page_org(char *dst, char *src, int len);
void copy_page_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_test ( char *dst, char *src,
	     size_t len)
{
      hp_timing_t start __attribute ((unused));
      hp_timing_t stop __attribute ((unused));
      hp_timing_t best_time = ~ (hp_timing_t) 0;
      size_t i,j;

      for (i = 0; i < repeat_one_test; ++i)
	{
	  HP_TIMING_NOW (start);
	  do_memcpy ( dst, src, len);
	  HP_TIMING_NOW (stop);
	  HP_TIMING_BEST (best_time, start, stop);
	}

      printf ("\t%zd", (size_t) best_time);
}

static void
do_test (size_t align1, size_t align2, size_t len)
{
  size_t i, j;
  char *s1, *s2;

  s1 = (char *) (buf1 + align1);
  s2 = (char *) (buf2 + align2);


   printf ("TPT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);
   do_memcpy = copy_page_org;
   do_one_test (s2, s1, len);
   do_memcpy = copy_page_new;
   do_one_test (s2+ (1 << 16), s1 + (1 << 16), len);
    putchar ('\n');
}

static void test_init(void)
{
  int i;
  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);

  for (i = 0; i < MAXCOPYSIZE ; i = i + 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }

}
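
/*
 * Note: the two routines below reproduce the kernel assembly as sequences of
 * bare __asm__ statements with no operands or clobber lists; they rely on
 * dst/src arriving in %rdi/%rsi per the x86-64 SysV calling convention, so
 * the file is meant to be built without optimization, e.g. the plain
 * "cc -o copy_page copy_page.c" suggested in the mail above.
 */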

void copy_page_new(char *dst, char *src, int len)
{
	__asm__("mov	$(4096/64)-5, %ecx");
__asm__("1:");
	__asm__("prefetcht0 5*64(%rsi)");
	__asm__("decb	%cl");

	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");
	__asm__("jnz     1b");
	__asm__("mov	$5, %dl");
__asm__("2:");
	__asm__("decb	%dl");
	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");

	__asm__("jnz	2b");

}


void copy_page_org(char *dst, char *src, int len)
{

	__asm__("subq	$2*8,%rsp");
	__asm__("movq	%rbx,(%rsp)");
	__asm__("movq	%r12,1*8(%rsp)");
	__asm__("movl	$(4096/64)-5,%ecx");
	__asm__(".p2align 4");
__asm__("1:");
	__asm__("dec     %rcx");

	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("prefetcht0 5*64(%rsi)");

	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	__asm__("leaq    64 (%rsi), %rsi");
	__asm__("leaq    64 (%rdi), %rdi");
	__asm__("jnz     1b");

	__asm__("movl	$5,%ecx");
	__asm__(".p2align 4");
__asm__("2:");
	__asm__("decl   %ecx");

	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	__asm__("leaq	64(%rdi),%rdi");
	__asm__("leaq	64(%rsi),%rsi");

	__asm__("jnz	2b");

	__asm__("movq	(%rsp),%rbx");
	__asm__("movq	1*8(%rsp),%r12");
	__asm__("addq	$2*8,%rsp");
}


int main(void)
{
  int i;
	test_init();
	printf ("%23s", "");
	printf ("\t%s\t%s\t%s\n", "copy_page_org", "copy_page_new");

	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	return 0;
}
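
For reference, a minimal way to build and run this attachment, assuming it is
saved as copy_page.c and built without optimization as suggested above:

	cc -o copy_page copy_page.c
	./copy_page

Each output row is the best-of-32 RDTSC cycle count for one 4096-byte copy
with the original and the rewritten loop.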

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12  3:10   ` Ma, Ling
@ 2012-10-12 13:35     ` Andi Kleen
  2012-10-12 14:54       ` Ma, Ling
  0 siblings, 1 reply; 16+ messages in thread
From: Andi Kleen @ 2012-10-12 13:35 UTC (permalink / raw)
  To: Ma, Ling; +Cc: Andi Kleen, mingo, hpa, tglx, linux-kernel

> I tested the new and original versions on Core 2; the patch improved performance by about 9%.

That's not useful because core2 doesn't use this variant; it uses the
rep string variant. The primary user is the P4.

> Although Core 2 has an out-of-order pipeline, which relaxes the ordering requirement on
> the instruction sequence, the ROB size is limited, so the new code, by issuing the store
> operations earlier, exposes more parallelism between the store/load pairs and gets a better result.
> Core 2 cpu-info attached (I have no older machine).

If you can't test the CPUs that run this code, I think it's safer to
add a new variant for Atom rather than change the existing, well-tested
code. Otherwise you risk performance regressions on those older CPUs.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12 13:35     ` Andi Kleen
@ 2012-10-12 14:54       ` Ma, Ling
  2012-10-12 15:14         ` Andi Kleen
  0 siblings, 1 reply; 16+ messages in thread
From: Ma, Ling @ 2012-10-12 14:54 UTC (permalink / raw)
  To: Andi Kleen; +Cc: mingo, hpa, tglx, linux-kernel, iant

[-- Attachment #1: Type: text/plain, Size: 694 bytes --]

> If you can't test the CPUs that run this code, I think it's safer to
> add a new variant for Atom rather than change the existing, well-tested
> code. Otherwise you risk performance regressions on those older CPUs.

I found an older machine and tested the code on it; the results for the two versions are almost the same, as shown below (cpu info attached).
                                copy_page_org   copy_page_new
      TPT: Len 4096, alignment  0/ 0: 2252    2218
      TPT: Len 4096, alignment  0/ 0: 2244    2193
      TPT: Len 4096, alignment  0/ 0: 2261    2227
      TPT: Len 4096, alignment  0/ 0: 2235    2244
      TPT: Len 4096, alignment  0/ 0: 2261    2184

Thanks
Ling

[-- Attachment #2: xeon-cpu-info --]
[-- Type: application/octet-stream, Size: 2376 bytes --]

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 15
model		: 3
model name	:                   Intel(R) Xeon(TM) CPU 3.40GHz
stepping	: 4
cpu MHz		: 3400.285
cache size	: 1024 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 5
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm syscall lm pni monitor ds_cpl est tm2 cid xtpr
bogomips	: 6717.44
clflush size	: 64
cache_alignment	: 128
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 15
model		: 3
model name	:                   Intel(R) Xeon(TM) CPU 3.40GHz
stepping	: 4
cpu MHz		: 3400.285
cache size	: 1024 KB
physical id	: 3
siblings	: 2
core id		: 3
cpu cores	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 5
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm syscall lm pni monitor ds_cpl est tm2 cid xtpr
bogomips	: 6782.97
clflush size	: 64
cache_alignment	: 128
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 2
vendor_id	: GenuineIntel
cpu family	: 15
model		: 3
model name	:                   Intel(R) Xeon(TM) CPU 3.40GHz
stepping	: 4
cpu MHz		: 3400.285
cache size	: 1024 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 5
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm syscall lm pni monitor ds_cpl est tm2 cid xtpr
bogomips	: 6782.97
clflush size	: 64
cache_alignment	: 128
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 3
vendor_id	: GenuineIntel
cpu family	: 15
model		: 3
model name	:                   Intel(R) Xeon(TM) CPU 3.40GHz
stepping	: 4
cpu MHz		: 3400.285
cache size	: 1024 KB
physical id	: 3
siblings	: 2
core id		: 3
cpu cores	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 5
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm syscall lm pni monitor ds_cpl est tm2 cid xtpr
bogomips	: 6782.97
clflush size	: 64
cache_alignment	: 128
address sizes	: 36 bits physical, 48 bits virtual
power management:


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12 14:54       ` Ma, Ling
@ 2012-10-12 15:14         ` Andi Kleen
  0 siblings, 0 replies; 16+ messages in thread
From: Andi Kleen @ 2012-10-12 15:14 UTC (permalink / raw)
  To: Ma, Ling; +Cc: Andi Kleen, mingo, hpa, tglx, linux-kernel, iant

On Fri, Oct 12, 2012 at 02:54:54PM +0000, Ma, Ling wrote:
> > If you can't test the CPUs that run this code, I think it's safer to
> > add a new variant for Atom rather than change the existing, well-tested
> > code. Otherwise you risk performance regressions on those older CPUs.
> 
> I found an older machine and tested the code on it; the results for the two versions are almost the same, as shown below (cpu info attached).

Was that a P4 (family 15)? 

Those were the main users. There were a few others, but they are obscure
(early steppings of K8) 

-Andi


-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12  9:07       ` Ma, Ling
@ 2012-10-12 18:04         ` Borislav Petkov
  2012-10-14 10:58           ` Borislav Petkov
  0 siblings, 1 reply; 16+ messages in thread
From: Borislav Petkov @ 2012-10-12 18:04 UTC (permalink / raw)
  To: Ma, Ling; +Cc: Konrad Rzeszutek Wilk, mingo, hpa, tglx, linux-kernel, iant

On Fri, Oct 12, 2012 at 09:07:43AM +0000, Ma, Ling wrote:
> > > > So is that also true for AMD CPUs?
> > > Although Bulldozer puts 32 bytes of instructions into decoupled 16-byte
> > > entry buffers, it still decodes 4 instructions per cycle, so 4
> > > instructions will be fed into the execution units and
> > > 2 loads and 1 store will be issued per cycle.
> > 
> > I'd be very interested in which benchmarks you're seeing that perf
> > improvement on Atom with, and who knows, maybe I could find time to run
> > them on Bulldozer and see how your patch behaves there :-).
> I use another benchmark from gcc; it is a lot of code, so I extracted
> one simple benchmark that you can use to test (cc -o copy_page
> copy_page.c). My initial result shows the new copy_page version is
> still better on a Bulldozer machine, but since that machine is a first
> release, please verify the result. And CC to Ian.

Right, so the benchmark shows around a 20% speedup on Bulldozer, but this is
a microbenchmark, and before pursuing this further we need to verify whether
it brings any palpable speedup with a real benchmark: I don't know,
kernbench, netbench, whatever. Even something as boring as a kernel build.
And probably check for perf regressions on the rest of the uarches.

Thanks.

-- 
Regards/Gruss,
    Boris.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12 18:04         ` Borislav Petkov
@ 2012-10-14 10:58           ` Borislav Petkov
  2012-10-15  5:00             ` Ma, Ling
  2012-10-15  5:13             ` George Spelvin
  0 siblings, 2 replies; 16+ messages in thread
From: Borislav Petkov @ 2012-10-14 10:58 UTC (permalink / raw)
  To: Ma, Ling
  Cc: Konrad Rzeszutek Wilk, mingo, hpa, tglx, linux-kernel, iant,
	George Spelvin

[-- Attachment #1: Type: text/plain, Size: 807 bytes --]

On Fri, Oct 12, 2012 at 08:04:11PM +0200, Borislav Petkov wrote:
> Right, so the benchmark shows around a 20% speedup on Bulldozer, but
> this is a microbenchmark, and before pursuing this further we need to
> verify whether it brings any palpable speedup with a real benchmark: I
> don't know, kernbench, netbench, whatever. Even something as boring as
> a kernel build. And probably check for perf regressions on the rest of
> the uarches.

Ok, so to summarize, on AMD we're using REP MOVSQ which is even
faster than the unrolled version. I've added the REP MOVSQ version
to the µbenchmark. It nicely validates that we're correctly setting
X86_FEATURE_REP_GOOD on everything >= F10h and some K8s.
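
For reference, the REP MOVSQ page copy Boris refers to can be written as a
self-contained C function with explicit operand constraints, instead of the
bare __asm__ statements used in the benchmark attachments. This is only a
sketch: the function name and the fixed 4 KiB page size are assumptions of
the illustration, not kernel code.

#include <stddef.h>

void copy_page_rep_movsq_sketch(void *dst, const void *src)
{
	size_t qwords = 4096 / 8;	/* one 4 KiB page, 8 bytes per move */

	/* rep movsq copies %rcx quadwords from (%rsi) to (%rdi),
	   advancing both pointers; the ABI guarantees DF is clear. */
	__asm__ __volatile__("rep movsq"
			     : "+D" (dst), "+S" (src), "+c" (qwords)
			     :
			     : "memory");
}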

So, to answer Konrad's question: those patches don't concern AMD
machines.

Thanks.

-- 
Regards/Gruss,
    Boris.

[-- Attachment #2: copy-page.c --]
[-- Type: text/x-csrc, Size: 6205 bytes --]

#include<stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define  MAXSAMPLESTPT        1000
#define  MAXCOPYSIZE          (1024 * 1024)
#define  ORIG  0
#define  NEW   1
static char* buf1 = NULL;
static char* buf2 = NULL;
static int repeat_one_test = 32;

hp_timing_t _dl_hp_timing_overhead;
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
	total_time += tmptime;						\
    }									\
  while (0)

#define HP_TIMING_BEST(best_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
      if (best_time > tmptime)						\
	best_time = tmptime;						\
    }									\
  while (0)


void copy_page_org(char *dst, char *src, int len);
void copy_page_new(char *dst, char *src, int len);
void copy_page_rep_movsq(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_test ( char *dst, char *src,
	     size_t len)
{
      hp_timing_t start __attribute ((unused));
      hp_timing_t stop __attribute ((unused));
      hp_timing_t best_time = ~ (hp_timing_t) 0;
      size_t i;

      for (i = 0; i < repeat_one_test; ++i)
	{
	  HP_TIMING_NOW (start);
	  do_memcpy ( dst, src, len);
	  HP_TIMING_NOW (stop);
	  HP_TIMING_BEST (best_time, start, stop);
	}

      printf ("\t\t%zd", (size_t) best_time);
}

static void
do_test (size_t align1, size_t align2, size_t len)
{
  char *s1, *s2;

  s1 = (char *) (buf1 + align1);
  s2 = (char *) (buf2 + align2);


   printf ("TPT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);
   do_memcpy = copy_page_org;
   do_one_test (s2, s1, len);
   do_memcpy = copy_page_new;
   do_one_test (s2+ (1 << 16), s1 + (1 << 16), len);
   do_memcpy = copy_page_rep_movsq;
   do_one_test(s2, s1, len);
    putchar ('\n');
}

static void test_init(void)
{
  int i;
  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);

  for (i = 0; i < MAXCOPYSIZE ; i = i + 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }

}

void copy_page_new(char *dst, char *src, int len)
{
	__asm__("mov	$(4096/64)-5, %ecx");
__asm__("1:");
	__asm__("prefetcht0 5*64(%rsi)");
	__asm__("decb	%cl");

	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");
	__asm__("jnz     1b");
	__asm__("mov	$5, %dl");
__asm__("2:");
	__asm__("decb	%dl");
	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");

	__asm__("jnz	2b");

}


void copy_page_org(char *dst, char *src, int len)
{

	__asm__("subq	$2*8,%rsp");
	__asm__("movq	%rbx,(%rsp)");
	__asm__("movq	%r12,1*8(%rsp)");
	__asm__("movl	$(4096/64)-5,%ecx");
	__asm__(".p2align 4");
__asm__("1:");
	__asm__("dec     %rcx");

	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("prefetcht0 5*64(%rsi)");

	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	__asm__("leaq    64 (%rsi), %rsi");
	__asm__("leaq    64 (%rdi), %rdi");
	__asm__("jnz     1b");

	__asm__("movl	$5,%ecx");
	__asm__(".p2align 4");
__asm__("2:");
	__asm__("decl   %ecx");

	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	__asm__("leaq	64(%rdi),%rdi");
	__asm__("leaq	64(%rsi),%rsi");

	__asm__("jnz	2b");

	__asm__("movq	(%rsp),%rbx");
	__asm__("movq	1*8(%rsp),%r12");
	__asm__("addq	$2*8,%rsp");
}
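
/*
 * Note: REP MOVSQ copies %rcx quadwords from (%rsi) to (%rdi), advancing both
 * pointers; like the routines above, this relies on dst/src being passed in
 * %rdi/%rsi and on an unoptimized build.
 */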

void copy_page_rep_movsq(char *dst, char *src, int len)
{
	__asm__("movl $4096/8,%ecx");
	__asm__("rep movsq");
}

int main(void)
{
	test_init();
	printf ("%35s", "");
	printf ("\t%s\t%s\t%s\n", "copy_page_org", "copy_page_new", "REP MOVSQ");

	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	return 0;
}

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-14 10:58           ` Borislav Petkov
@ 2012-10-15  5:00             ` Ma, Ling
  2012-10-15  5:13             ` George Spelvin
  1 sibling, 0 replies; 16+ messages in thread
From: Ma, Ling @ 2012-10-15  5:00 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Konrad Rzeszutek Wilk, mingo, hpa, tglx, linux-kernel, iant,
	George Spelvin

[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1585 bytes --]

Thanks Boris!
So the patch is helpful and has no impact on other/older machines;
I will re-send a new version according to the comments.
Any further comments are appreciated!

Regards
Ling

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Sunday, October 14, 2012 6:58 PM
> To: Ma, Ling
> Cc: Konrad Rzeszutek Wilk; mingo@elte.hu; hpa@zytor.com;
> tglx@linutronix.de; linux-kernel@vger.kernel.org; iant@google.com;
> George Spelvin
> Subject: Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging
> instruction sequence and saving register
> 
> On Fri, Oct 12, 2012 at 08:04:11PM +0200, Borislav Petkov wrote:
> > Right, so the benchmark shows around a 20% speedup on Bulldozer, but
> > this is a microbenchmark, and before pursuing this further we need to
> > verify whether it brings any palpable speedup with a real benchmark: I
> > don't know, kernbench, netbench, whatever. Even something as boring as
> > a kernel build. And probably check for perf regressions on the rest of
> > the uarches.
> 
> Ok, so to summarize, on AMD we're using REP MOVSQ which is even faster
> than the unrolled version. I've added the REP MOVSQ version to the
> µbenchmark. It nicely validates that we're correctly setting
> X86_FEATURE_REP_GOOD on everything >= F10h and some K8s.
> 
> So, to answer Konrad's question: those patches don't concern AMD
> machines.
> 
> Thanks.
> 
> --
> Regards/Gruss,
>     Boris.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-14 10:58           ` Borislav Petkov
  2012-10-15  5:00             ` Ma, Ling
@ 2012-10-15  5:13             ` George Spelvin
  1 sibling, 0 replies; 16+ messages in thread
From: George Spelvin @ 2012-10-15  5:13 UTC (permalink / raw)
  To: bp, ling.ma; +Cc: hpa, iant, konrad, linux-kernel, linux, mingo, tglx

Just for everyone's information, here are the results of the updated
benchmark on the same Phenom.  The REP MOVSQ code is indeed much faster.

vendor_id       : AuthenticAMD
cpu family      : 16
model           : 2
model name      : AMD Phenom(tm) 9850 Quad-Core Processor
stepping        : 3
microcode       : 0x1000083
cpu MHz         : 2500.210
cache size      : 512 KB
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs hw_pstate npt lbrv svm_lock
bogomips        : 5000.42
TLB size        : 1024 4K pages
clflush size    : 64
cache_alignment : 64

                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		672		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		694		759		611
TPT: Len 4096, alignment  0/ 0:		672		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		759		611
TPT: Len 4096, alignment  0/ 0:		708		757		611
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		697		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		757		611
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		703		758		612
TPT: Len 4096, alignment  0/ 0:		709		758		611
TPT: Len 4096, alignment  0/ 0:		709		757		611
TPT: Len 4096, alignment  0/ 0:		709		759		613
TPT: Len 4096, alignment  0/ 0:		709		759		611
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		669		758		613
TPT: Len 4096, alignment  0/ 0:		671		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		611
TPT: Len 4096, alignment  0/ 0:		708		758		613
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		679		758		612
TPT: Len 4096, alignment  0/ 0:		671		758		612
TPT: Len 4096, alignment  0/ 0:		684		759		612
TPT: Len 4096, alignment  0/ 0:		709		759		613
TPT: Len 4096, alignment  0/ 0:		709		759		611
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		682		758		612
TPT: Len 4096, alignment  0/ 0:		673		758		613
TPT: Len 4096, alignment  0/ 0:		704		759		613
TPT: Len 4096, alignment  0/ 0:		709		758		613
TPT: Len 4096, alignment  0/ 0:		709		758		611
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		669		759		611
TPT: Len 4096, alignment  0/ 0:		671		759		611
TPT: Len 4096, alignment  0/ 0:		709		759		613
TPT: Len 4096, alignment  0/ 0:		709		759		613
TPT: Len 4096, alignment  0/ 0:		708		759		613
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		668		759		612
TPT: Len 4096, alignment  0/ 0:		709		759		612
TPT: Len 4096, alignment  0/ 0:		709		759		612
TPT: Len 4096, alignment  0/ 0:		709		759		612
TPT: Len 4096, alignment  0/ 0:		709		759		612
                                   	copy_page_org	copy_page_new	REP MOVSQ
TPT: Len 4096, alignment  0/ 0:		694		758		611
TPT: Len 4096, alignment  0/ 0:		671		759		611
TPT: Len 4096, alignment  0/ 0:		708		759		611
TPT: Len 4096, alignment  0/ 0:		708		759		611
TPT: Len 4096, alignment  0/ 0:		708		759		613

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
  2012-10-12 21:02 George Spelvin
@ 2012-10-12 23:17 ` Borislav Petkov
  0 siblings, 0 replies; 16+ messages in thread
From: Borislav Petkov @ 2012-10-12 23:17 UTC (permalink / raw)
  To: George Spelvin; +Cc: linux-kernel, Ma Ling, x86

On Fri, Oct 12, 2012 at 05:02:57PM -0400, George Spelvin wrote:
> Here are some Phenom results for that benchmark.  The average time
> increases from 700 to 760 cycles (+8.6%).

I was afraid something like that would show up.

Btw, looking at this more, and IINM, we use the REP MOVSQ version on
AMD anyway because X86_FEATURE_REP_GOOD is set on some K8s and
everything from F10h on.

So this µbenchmark should actually be comparing the REP MOVSQ case too,
and the changes to the unrolled copy_page shouldn't concern AMD boxes
at all...

Hmm.

-- 
Regards/Gruss,
    Boris.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
@ 2012-10-12 21:02 George Spelvin
  2012-10-12 23:17 ` Borislav Petkov
  0 siblings, 1 reply; 16+ messages in thread
From: George Spelvin @ 2012-10-12 21:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux

Here are some Phenom results for that benchmark.  The average time
increases from 700 to 760 cycles (+8.6%).

vendor_id       : AuthenticAMD
cpu family      : 16
model           : 2
model name      : AMD Phenom(tm) 9850 Quad-Core Processor
stepping        : 3
microcode       : 0x1000083
cpu MHz         : 2500.210
cache size      : 512 KB
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs hw_pstate npt lbrv svm_lock
bogomips        : 5000.42
TLB size        : 1024 4K pages
clflush size    : 64
cache_alignment : 64

                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	678	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	667	760
TPT: Len 4096, alignment  0/ 0:	673	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	667	760
TPT: Len 4096, alignment  0/ 0:	673	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	671	760
TPT: Len 4096, alignment  0/ 0:	673	760
TPT: Len 4096, alignment  0/ 0:	671	760
TPT: Len 4096, alignment  0/ 0:	709	760
TPT: Len 4096, alignment  0/ 0:	708	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	667	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	671	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	678	760
TPT: Len 4096, alignment  0/ 0:	709	758
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	709	759
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	680	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	667	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	709	760
TPT: Len 4096, alignment  0/ 0:	709	759
TPT: Len 4096, alignment  0/ 0:	710	760
                       	copy_page_org	copy_page_new	
TPT: Len 4096, alignment  0/ 0:	678	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760
TPT: Len 4096, alignment  0/ 0:	710	760

^ permalink raw reply	[flat|nested] 16+ messages in thread

Thread overview: 16+ messages
2012-10-11 12:29 [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register ling.ma
2012-10-11 13:40 ` Andi Kleen
2012-10-12  3:10   ` Ma, Ling
2012-10-12 13:35     ` Andi Kleen
2012-10-12 14:54       ` Ma, Ling
2012-10-12 15:14         ` Andi Kleen
2012-10-11 14:35 ` Konrad Rzeszutek Wilk
2012-10-12  3:37   ` Ma, Ling
2012-10-12  6:18     ` Borislav Petkov
2012-10-12  9:07       ` Ma, Ling
2012-10-12 18:04         ` Borislav Petkov
2012-10-14 10:58           ` Borislav Petkov
2012-10-15  5:00             ` Ma, Ling
2012-10-15  5:13             ` George Spelvin
2012-10-12 21:02 George Spelvin
2012-10-12 23:17 ` Borislav Petkov
