* [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
@ 2010-10-08  7:28 Miao Xie
  2010-10-08  7:42 ` Ma, Ling
  0 siblings, 1 reply; 15+ messages in thread
From: Miao Xie @ 2010-10-08  7:28 UTC (permalink / raw)
  To: Ingo Molnar, Andi Kleen, Ma Ling, H. Peter Anvin,
	Thomas Gleixner, ykzhao
  Cc: Linux Kernel

memcpy() on x86_64 has not been optimized for unaligned copies the way it has
been on other architectures; this patch fixes that.

I have tested this patch with my benchmark tool (doing 500-byte memory copies
5,000,000 times) with various alignments and buffer sizes on my Core2 box.
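
The numbers below come from a timing loop of roughly the following shape (a
minimal sketch assuming a gettimeofday()-based harness; the function name
bench_memcpy and its argument order are illustrative, not the actual tool):

#include <stdio.h>
#include <string.h>
#include <sys/time.h>

#define ITERATIONS	5000000

/* Time ITERATIONS copies of len bytes at the given src/dst offsets. */
static void bench_memcpy(char *dst, const char *src,
			 size_t salign, size_t dalign, size_t len)
{
	struct timeval start, stop;
	long i, sec, usec;

	gettimeofday(&start, NULL);
	for (i = 0; i < ITERATIONS; i++)
		memcpy(dst + dalign, src + salign, len);
	gettimeofday(&stop, NULL);

	sec = stop.tv_sec - start.tv_sec;
	usec = stop.tv_usec - start.tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	/* one row of the table: Len, Src/Dst align, elapsed time */
	printf("%zu\t%zu/%zu\t%lds %ldus\n", len, salign, dalign, sec, usec);
}

For example, bench_memcpy(dst, src, 4, 4, 512) would produce one "512 4/4"
row like the ones below.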

Len	Src/Dst	Old memcpy	New memcpy
	align
---	-------	-------------	-------------
1	0/0	0s 47015us	0s 28265us
1	0/4	0s 28201us	0s 28199us
1	4/0	0s 28200us	0s 28199us
1	4/4	0s 28199us	0s 28206us
7	0/0	0s 24441us	0s 24438us
7	0/4	0s 24439us	0s 24438us
7	4/0	0s 24439us	0s 24438us
7	4/4	0s 24439us	0s 24439us
8	0/0	0s 20699us	0s 20687us
8	0/4	0s 20689us	0s 20901us
8	4/0	0s 20692us	0s 20679us
8	4/4	0s 20679us	0s 20679us
16	0/0	0s 18807us	0s 18802us
16	0/4	0s 26319us	0s 18800us
16	4/0	0s 18800us	0s 18806us
16	4/4	0s 26317us	0s 18803us
32	0/0	0s 35728us	0s 18800us
32	0/4	0s 35716us	0s 18800us
32	4/0	0s 35717us	0s 18800us
32	4/4	0s 35724us	0s 18803us
48	0/0	0s 26897us	0s 30080us
48	0/4	0s 33837us	0s 33838us
48	4/0	0s 27600us	0s 30079us
48	4/4	0s 30087us	0s 33854us
64	0/0	0s 41369us	0s 45115us
64	0/4	0s 62042us	0s 65800us
64	4/0	0s 56400us	0s 58278us
64	4/4	0s 84596us	0s 84606us
80	0/0	0s 35877us	0s 37611us
80	0/4	0s 77083us	0s 56404us
80	4/0	0s 52652us	0s 55611us
80	4/4	0s 75200us	0s 78968us
128	0/0	0s 52642us	0s 56403us
128	0/4	0s 95883us	0s 95891us
128	4/0	0s 114683us	0s 108511us
128	4/4	0s 144780us	0s 110927us
256	0/0	0s 80832us	0s 86489us
256	0/4	0s 178586us	0s 163562us
256	4/0	0s 208670us	0s 181719us
256	4/4	0s 270705us	0s 148525us
512	0/0	0s 156049us	0s 148348us
512	0/4	0s 313933us	0s 298908us
512	4/0	0s 411671us	0s 329025us
512	4/4	0s 516971us	0s 208746us
1024	0/0	0s 297067us	0s 274019us
1024	0/4	0s 584703us	0s 569604us
1024	4/0	0s 818104us	0s 616419us
1024	4/4	1s 22839us	0s 328953us
2048	0/0	0s 577077us	0s 524148us
2048	0/4	1s 125953us	1s 111258us
2048	4/0	1s 894000us	1s 202724us
2048	4/4	2s 331807us	0s 822437us
4096	0/0	1s 25881us	1s 34128us
4096	0/4	2s 619273us	2s 606489us
4096	4/0	3s 553989us	2s 390272us
4096	4/4	4s 737789us	1s 433213us

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 arch/x86/lib/memcpy_64.S |  135 +++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 134 insertions(+), 1 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..b0224f8 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -46,9 +46,39 @@ ENTRY(memcpy)
 	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
 	cmp  $0x20, %edx
-	jb .Lhandle_tail
+	jbe .Lhandle_tail
 
 	/*
+	 * The unaligned-copy code only pays off for large copies (>100 bytes),
+	 * so for small sizes we skip the check of whether dst and src are
+	 * aligned.
+	 */
+	cmp $100, %edx
+	jb .Lboth_aligned
+
+	/*
+	 * Unaligned accesses generally hurt performance, so we normally
+	 * align the addresses (both src and dest) first and then copy
+	 * from an aligned src to an aligned dst using shifts.
+	 * However, we found that when the src is aligned but the dest is
+	 * unaligned, the generic memory copy (that is, reading aligned
+	 * data from the source and writing unaligned data to the dest)
+	 * performs better than the shift-based copy that avoids
+	 * unaligned accesses.
+	 * So if the src is aligned, we need not check whether the dest
+	 * is aligned; just go to .Lboth_aligned.
+	 */
+	test $7, %esi		/* src align check */
+	jz .Lboth_aligned
+
+	/* if both dst and src are unaligned, go and align the dst first */
+	test $7, %edi
+	jnz .Ldst_unaligned
+
+	jmp .Lsrc_unaligned_dst_aligned
+
+.Lboth_aligned:
+	/*
 	 * We check whether memory false dependece could occur,
 	 * then jump to corresponding copy mode.
 	 */
@@ -166,6 +196,109 @@ ENTRY(memcpy)
 
 .Lend:
 	retq
+
+	.p2align 4
+.Ldst_unaligned:
+	movq %rdi, %rcx
+	andq $7, %rcx		/* Align the destination */
+	negq %rcx
+	andq $7, %rcx
+	subq %rcx, %rdx
+
+	/* copy 8 head bytes, then advance both pointers so that dst is aligned */
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	addq %rcx, %rdi
+	addq %rcx, %rsi
+
+	test $7, %esi		/* src align check */
+	jz .Lboth_aligned
+
+	.p2align 4
+.Lsrc_unaligned_dst_aligned:
+	push %rbx
+	push %r12
+	push %r13
+	push %r14
+	push %r15
+	/*
+	 * Calculate the shift counts needed to realign the words read
+	 * from the aligned-down src for the copy.
+	 */
+	movq %rsi, %r14
+	andq $7, %r14
+	shlq $3, %r14
+
+	movq $64, %r15
+	subq %r14, %r15
+
+	andq $-8, %rsi		/* src aligned */
+	movq 0*8(%rsi), %r8
+
+	movq %rdx, %rbx
+	shrq $5, %rbx
+	jz .Lsrc_unaligned_less32
+
+	/*
+	 * %r8 : store src[0]
+	 * %r9 : store src[1]
+	 * %r10: store src[2]
+	 * %r11: store src[3]
+	 * %r12: store src[4]
+	 * %r13: store the tmp data
+	 */
+	.p2align 4
+.Lsrc_unaligned_loop32:
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	movq 4*8(%rsi), %r12
+
+	movq %r9, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq  %cl, %r9
+	orq %r8, %r9
+	movq %r10, %r8
+	shlq  %cl, %r10
+	orq %r13, %r10
+
+	movq %r11, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq  %cl, %r11
+	orq %r8, %r11
+	movq %r12, %r8
+	shlq  %cl, %r12
+	orq %r13, %r12
+
+	movq %r9, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r11, 2*8(%rdi)
+	movq %r12, 3*8(%rdi)
+
+	leaq 4*8(%rdi), %rdi
+	leaq 4*8(%rsi), %rsi
+	decq %rbx
+	jnz .Lsrc_unaligned_loop32
+
+	.p2align 4
+.Lsrc_unaligned_less32:
+	shrq $3, %r14
+	addq %r14, %rsi
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+	pop %rbx
+	andq $31, %rdx
+	jnz .Lhandle_tail
+	retq
+
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
-- 
1.7.0.1
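
For readers who prefer C to assembly, the shift-and-combine trick used in
.Lsrc_unaligned_dst_aligned corresponds roughly to the sketch below (an
illustration only, little-endian, with the src assumed genuinely unaligned
and the tail bytes left to the byte-wise tail code, as in the assembly):

#include <stddef.h>
#include <stdint.h>

/*
 * Copy len bytes from an unaligned src to an 8-byte-aligned dst using
 * only aligned 8-byte loads, combining neighbouring words with shifts.
 * Assumes little-endian and (src & 7) != 0, matching the asm path.
 */
static void copy_src_unaligned(uint64_t *dst, const unsigned char *src,
			       size_t len)
{
	size_t off = (uintptr_t)src & 7;	/* src misalignment in bytes */
	unsigned shr = off * 8;			/* bits to drop from prev   */
	unsigned shl = 64 - shr;		/* bits to take from next   */
	const uint64_t *s = (const uint64_t *)(src - off); /* aligned down */
	uint64_t prev = *s++;

	while (len >= 8) {
		uint64_t next = *s++;
		*dst++ = (prev >> shr) | (next << shl);
		prev = next;
		len -= 8;
	}
	/* the remaining bytes are handled by the tail code */
}

The assembly version additionally pre-loads the first word before entering
the loop and unrolls it to four output words per iteration.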

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-08  7:28 [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy Miao Xie
@ 2010-10-08  7:42 ` Ma, Ling
  2010-10-08  9:02   ` Miao Xie
  0 siblings, 1 reply; 15+ messages in thread
From: Ma, Ling @ 2010-10-08  7:42 UTC (permalink / raw)
  To: miaox, Ingo Molnar, Andi Kleen, H. Peter Anvin, Thomas Gleixner,
	Zhao, Yakui
  Cc: Linux Kernel

Could you please give us the full address for each comparison result? We will do some tests on my machine.
For the unaligned cases, older CPUs cross cache lines and are slowed down by the loads and stores, but for NHM there is no need to care about it.
By the way, in kernel 64-bit mode our accesses should be roughly 8-byte aligned.
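
For reference, with 64-byte cache lines (Core2/Atom/NHM) an access crosses a
line boundary exactly when the offset within the line plus the access size
exceeds the line size; a minimal check (names are mine, not from the patch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE	64	/* assumed line size */

/* True if an access of size bytes at addr spans two cache lines. */
static inline bool crosses_cache_line(uintptr_t addr, size_t size)
{
	return (addr & (CACHE_LINE - 1)) + size > CACHE_LINE;
}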

Thanks
Ling 

> -----Original Message-----
> From: Miao Xie [mailto:miaox@cn.fujitsu.com]
> Sent: Friday, October 08, 2010 3:28 PM
> To: Ingo Molnar; Andi Kleen; Ma, Ling; H. Peter Anvin; Thomas Gleixner; Zhao,
> Yakui
> Cc: Linux Kernel
> Subject: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for
> unaligned copy
> 
> memcpy of x86_64 hasn't been optimized for the unaligned copy like other
> architecture, this patch fixed this problem.
> 
> I have tested this patch by my benchmark tool(doing 500 bytes memory copy
> for 5,000,000 times)with various alignments and buffer sizes on my Core2
> box.
> 
> Len	Src/Dst	Old memcpy	New memcpy
> 	align
> ---	-------	-------------	-------------
> 1	0/0	0s 47015us	0s 28265us
> 1	0/4	0s 28201us	0s 28199us
> 1	4/0	0s 28200us	0s 28199us
> 1	4/4	0s 28199us	0s 28206us
> 7	0/0	0s 24441us	0s 24438us
> 7	0/4	0s 24439us	0s 24438us
> 7	4/0	0s 24439us	0s 24438us
> 7	4/4	0s 24439us	0s 24439us
> 8	0/0	0s 20699us	0s 20687us
> 8	0/4	0s 20689us	0s 20901us
> 8	4/0	0s 20692us	0s 20679us
> 8	4/4	0s 20679us	0s 20679us
> 16	0/0	0s 18807us	0s 18802us
> 16	0/4	0s 26319us	0s 18800us
> 16	4/0	0s 18800us	0s 18806us
> 16	4/4	0s 26317us	0s 18803us
> 32	0/0	0s 35728us	0s 18800us
> 32	0/4	0s 35716us	0s 18800us
> 32	4/0	0s 35717us	0s 18800us
> 32	4/4	0s 35724us	0s 18803us
> 48	0/0	0s 26897us	0s 30080us
> 48	0/4	0s 33837us	0s 33838us
> 48	4/0	0s 27600us	0s 30079us
> 48	4/4	0s 30087us	0s 33854us
> 64	0/0	0s 41369us	0s 45115us
> 64	0/4	0s 62042us	0s 65800us
> 64	4/0	0s 56400us	0s 58278us
> 64	4/4	0s 84596us	0s 84606us
> 80	0/0	0s 35877us	0s 37611us
> 80	0/4	0s 77083us	0s 56404us
> 80	4/0	0s 52652us	0s 55611us
> 80	4/4	0s 75200us	0s 78968us
> 128	0/0	0s 52642us	0s 56403us
> 128	0/4	0s 95883us	0s 95891us
> 128	4/0	0s 114683us	0s 108511us
> 128	4/4	0s 144780us	0s 110927us
> 256	0/0	0s 80832us	0s 86489us
> 256	0/4	0s 178586us	0s 163562us
> 256	4/0	0s 208670us	0s 181719us
> 256	4/4	0s 270705us	0s 148525us
> 512	0/0	0s 156049us	0s 148348us
> 512	0/4	0s 313933us	0s 298908us
> 512	4/0	0s 411671us	0s 329025us
> 512	4/4	0s 516971us	0s 208746us
> 1024	0/0	0s 297067us	0s 274019us
> 1024	0/4	0s 584703us	0s 569604us
> 1024	4/0	0s 818104us	0s 616419us
> 1024	4/4	1s 22839us	0s 328953us
> 2048	0/0	0s 577077us	0s 524148us
> 2048	0/4	1s 125953us	1s 111258us
> 2048	4/0	1s 894000us	1s 202724us
> 2048	4/4	2s 331807us	0s 822437us
> 4096	0/0	1s 25881us	1s 34128us
> 4096	0/4	2s 619273us	2s 606489us
> 4096	4/0	3s 553989us	2s 390272us
> 4096	4/4	4s 737789us	1s 433213us
> 
> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> ---
>  arch/x86/lib/memcpy_64.S |  135
> +++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 134 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 75ef61e..b0224f8 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -46,9 +46,39 @@ ENTRY(memcpy)
>  	 * Use 32bit CMP here to avoid long NOP padding.
>  	 */
>  	cmp  $0x20, %edx
> -	jb .Lhandle_tail
> +	jbe .Lhandle_tail
> 
>  	/*
> +	 * the code for unaligned copy is good for large-size copy(>100),
> +	 * so if the size is small, we needn't check dst and src is aligned
> +	 * or not.
> +	 */
> +	cmp $100, %edx
> +	jb .Lboth_aligned
> +
> +	/*
> +	 * unaligned access always leads to bad performance, so in order to
> +	 * avoid unaligned access, we align the address(both src and dest)
> +	 * first, and then copy from a aligned src to an aligned dst by using
> +	 * shifts.
> +	 * But we found if src is aligned, although dest is unaligned, the
> +	 * performance of generic memory copy (That is reading data aligned
> +	 * from the source and writing data unaligned to the dest) is better
> +	 * than the one that uses shifts to avoid unaligned access.
> +	 * So if src is aligned, we needn't check dest is aligned or not, just
> +	 * goto .Lboth_aligned
> +	 */
> +	test $7, %esi		/* src align check */
> +	jz .Lboth_aligned
> +
> +	/* if dest and src both are unaligned, goto unaligned copy */
> +	test $7, %edi
> +	jnz .Ldst_unaligned
> +
> +	jmp .Lsrc_unaligned_dst_aligned
> +
> +.Lboth_aligned:
> +	/*
>  	 * We check whether memory false dependece could occur,
>  	 * then jump to corresponding copy mode.
>  	 */
> @@ -166,6 +196,109 @@ ENTRY(memcpy)
> 
>  .Lend:
>  	retq
> +
> +	.p2align 4
> +.Ldst_unaligned:
> +	movq %rdi, %rcx
> +	andq $7, %rcx		/* Align the destination */
> +	negq %rcx
> +	andq $7, %rcx
> +	subq %rcx, %rdx
> +
> +	/* tune dst address */
> +	movq (%rsi), %r8
> +	movq %r8, (%rdi)
> +	addq %rcx, %rdi
> +	addq %rcx, %rsi
> +
> +	test $7, %esi		/* src align check */
> +	jz .Lboth_aligned
> +
> +	.p2align 4
> +.Lsrc_unaligned_dst_aligned:
> +	push %rbx
> +	push %r12
> +	push %r13
> +	push %r14
> +	push %r15
> +	/*
> +	 * Calculate how to shift a word read at the memory operation
> +	 * aligned srcp to make it aligned for copy.
> +	 */
> +	movq %rsi, %r14
> +	andq $7, %r14
> +	shlq $3, %r14
> +
> +	movq $64, %r15
> +	subq %r14, %r15
> +
> +	andq $-8, %rsi		/* src aligned */
> +	movq 0*8(%rsi), %r8
> +
> +	movq %rdx, %rbx
> +	shrq $5, %rbx
> +	jz .Lsrc_unaligned_less32
> +
> +	/*
> +	 * %r8 : store src[0]
> +	 * %r9 : store src[1]
> +	 * %r10: store src[2]
> +	 * %r11: store src[3]
> +	 * %r12: store src[4]
> +	 * %r13: store the tmp data
> +	 */
> +	.p2align 4
> +.Lsrc_unaligned_loop32:
> +	movq 1*8(%rsi), %r9
> +	movq 2*8(%rsi), %r10
> +	movq 3*8(%rsi), %r11
> +	movq 4*8(%rsi), %r12
> +
> +	movq %r9, %r13
> +	movb %r14b, %cl
> +	shrq %cl, %r8
> +	shrq %cl, %r13
> +	movb %r15b, %cl
> +	shlq  %cl, %r9
> +	orq %r8, %r9
> +	movq %r10, %r8
> +	shlq  %cl, %r10
> +	orq %r13, %r10
> +
> +	movq %r11, %r13
> +	movb %r14b, %cl
> +	shrq %cl, %r8
> +	shrq %cl, %r13
> +	movb %r15b, %cl
> +	shlq  %cl, %r11
> +	orq %r8, %r11
> +	movq %r12, %r8
> +	shlq  %cl, %r12
> +	orq %r13, %r12
> +
> +	movq %r9, 0*8(%rdi)
> +	movq %r10, 1*8(%rdi)
> +	movq %r11, 2*8(%rdi)
> +	movq %r12, 3*8(%rdi)
> +
> +	leaq 4*8(%rdi), %rdi
> +	leaq 4*8(%rsi), %rsi
> +	decq %rbx
> +	jnz .Lsrc_unaligned_loop32
> +
> +	.p2align 4
> +.Lsrc_unaligned_less32:
> +	shrq $3, %r14
> +	addq %r14, %rsi
> +	pop %r15
> +	pop %r14
> +	pop %r13
> +	pop %r12
> +	pop %rbx
> +	andq $31, %rdx
> +	jnz .Lhandle_tail
> +	retq
> +
>  	CFI_ENDPROC
>  ENDPROC(memcpy)
>  ENDPROC(__memcpy)
> --
> 1.7.0.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-08  7:42 ` Ma, Ling
@ 2010-10-08  9:02   ` Miao Xie
  2010-10-13 21:31     ` H. Peter Anvin
  0 siblings, 1 reply; 15+ messages in thread
From: Miao Xie @ 2010-10-08  9:02 UTC (permalink / raw)
  To: Ma, Ling
  Cc: Ingo Molnar, Andi Kleen, H. Peter Anvin, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel

[-- Attachment #1: Type: text/plain, Size: 7035 bytes --]

On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
> Could you please give us full address for each comparison result,we will do some tests on my machine.
> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.

Do you need my benchmark tool? I think it would be helpful for your tests.

Thanks
Miao

> Thanks
> Ling
>
>> -----Original Message-----
>> From: Miao Xie [mailto:miaox@cn.fujitsu.com]
>> Sent: Friday, October 08, 2010 3:28 PM
>> To: Ingo Molnar; Andi Kleen; Ma, Ling; H. Peter Anvin; Thomas Gleixner; Zhao,
>> Yakui
>> Cc: Linux Kernel
>> Subject: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for
>> unaligned copy
>>
>> memcpy of x86_64 hasn't been optimized for the unaligned copy like other
>> architecture, this patch fixed this problem.
>>
>> I have tested this patch by my benchmark tool(doing 500 bytes memory copy
>> for 5,000,000 times)with various alignments and buffer sizes on my Core2
>> box.
>>
>> Len	Src/Dst	Old memcpy	New memcpy
>> 	align
>> ---	-------	-------------	-------------
>> 1	0/0	0s 47015us	0s 28265us
>> 1	0/4	0s 28201us	0s 28199us
>> 1	4/0	0s 28200us	0s 28199us
>> 1	4/4	0s 28199us	0s 28206us
>> 7	0/0	0s 24441us	0s 24438us
>> 7	0/4	0s 24439us	0s 24438us
>> 7	4/0	0s 24439us	0s 24438us
>> 7	4/4	0s 24439us	0s 24439us
>> 8	0/0	0s 20699us	0s 20687us
>> 8	0/4	0s 20689us	0s 20901us
>> 8	4/0	0s 20692us	0s 20679us
>> 8	4/4	0s 20679us	0s 20679us
>> 16	0/0	0s 18807us	0s 18802us
>> 16	0/4	0s 26319us	0s 18800us
>> 16	4/0	0s 18800us	0s 18806us
>> 16	4/4	0s 26317us	0s 18803us
>> 32	0/0	0s 35728us	0s 18800us
>> 32	0/4	0s 35716us	0s 18800us
>> 32	4/0	0s 35717us	0s 18800us
>> 32	4/4	0s 35724us	0s 18803us
>> 48	0/0	0s 26897us	0s 30080us
>> 48	0/4	0s 33837us	0s 33838us
>> 48	4/0	0s 27600us	0s 30079us
>> 48	4/4	0s 30087us	0s 33854us
>> 64	0/0	0s 41369us	0s 45115us
>> 64	0/4	0s 62042us	0s 65800us
>> 64	4/0	0s 56400us	0s 58278us
>> 64	4/4	0s 84596us	0s 84606us
>> 80	0/0	0s 35877us	0s 37611us
>> 80	0/4	0s 77083us	0s 56404us
>> 80	4/0	0s 52652us	0s 55611us
>> 80	4/4	0s 75200us	0s 78968us
>> 128	0/0	0s 52642us	0s 56403us
>> 128	0/4	0s 95883us	0s 95891us
>> 128	4/0	0s 114683us	0s 108511us
>> 128	4/4	0s 144780us	0s 110927us
>> 256	0/0	0s 80832us	0s 86489us
>> 256	0/4	0s 178586us	0s 163562us
>> 256	4/0	0s 208670us	0s 181719us
>> 256	4/4	0s 270705us	0s 148525us
>> 512	0/0	0s 156049us	0s 148348us
>> 512	0/4	0s 313933us	0s 298908us
>> 512	4/0	0s 411671us	0s 329025us
>> 512	4/4	0s 516971us	0s 208746us
>> 1024	0/0	0s 297067us	0s 274019us
>> 1024	0/4	0s 584703us	0s 569604us
>> 1024	4/0	0s 818104us	0s 616419us
>> 1024	4/4	1s 22839us	0s 328953us
>> 2048	0/0	0s 577077us	0s 524148us
>> 2048	0/4	1s 125953us	1s 111258us
>> 2048	4/0	1s 894000us	1s 202724us
>> 2048	4/4	2s 331807us	0s 822437us
>> 4096	0/0	1s 25881us	1s 34128us
>> 4096	0/4	2s 619273us	2s 606489us
>> 4096	4/0	3s 553989us	2s 390272us
>> 4096	4/4	4s 737789us	1s 433213us
>>
>> Signed-off-by: Miao Xie<miaox@cn.fujitsu.com>
>> ---
>>   arch/x86/lib/memcpy_64.S |  135
>> +++++++++++++++++++++++++++++++++++++++++++++-
>>   1 files changed, 134 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
>> index 75ef61e..b0224f8 100644
>> --- a/arch/x86/lib/memcpy_64.S
>> +++ b/arch/x86/lib/memcpy_64.S
>> @@ -46,9 +46,39 @@ ENTRY(memcpy)
>>   	 * Use 32bit CMP here to avoid long NOP padding.
>>   	 */
>>   	cmp  $0x20, %edx
>> -	jb .Lhandle_tail
>> +	jbe .Lhandle_tail
>>
>>   	/*
>> +	 * the code for unaligned copy is good for large-size copy(>100),
>> +	 * so if the size is small, we needn't check dst and src is aligned
>> +	 * or not.
>> +	 */
>> +	cmp $100, %edx
>> +	jb .Lboth_aligned
>> +
>> +	/*
>> +	 * unaligned access always leads to bad performance, so in order to
>> +	 * avoid unaligned access, we align the address(both src and dest)
>> +	 * first, and then copy from a aligned src to an aligned dst by using
>> +	 * shifts.
>> +	 * But we found if src is aligned, although dest is unaligned, the
>> +	 * performance of generic memory copy (That is reading data aligned
>> +	 * from the source and writing data unaligned to the dest) is better
>> +	 * than the one that uses shifts to avoid unaligned access.
>> +	 * So if src is aligned, we needn't check dest is aligned or not, just
>> +	 * goto .Lboth_aligned
>> +	 */
>> +	test $7, %esi		/* src align check */
>> +	jz .Lboth_aligned
>> +
>> +	/* if dest and src both are unaligned, goto unaligned copy */
>> +	test $7, %edi
>> +	jnz .Ldst_unaligned
>> +
>> +	jmp .Lsrc_unaligned_dst_aligned
>> +
>> +.Lboth_aligned:
>> +	/*
>>   	 * We check whether memory false dependece could occur,
>>   	 * then jump to corresponding copy mode.
>>   	 */
>> @@ -166,6 +196,109 @@ ENTRY(memcpy)
>>
>>   .Lend:
>>   	retq
>> +
>> +	.p2align 4
>> +.Ldst_unaligned:
>> +	movq %rdi, %rcx
>> +	andq $7, %rcx		/* Align the destination */
>> +	negq %rcx
>> +	andq $7, %rcx
>> +	subq %rcx, %rdx
>> +
>> +	/* tune dst address */
>> +	movq (%rsi), %r8
>> +	movq %r8, (%rdi)
>> +	addq %rcx, %rdi
>> +	addq %rcx, %rsi
>> +
>> +	test $7, %esi		/* src align check */
>> +	jz .Lboth_aligned
>> +
>> +	.p2align 4
>> +.Lsrc_unaligned_dst_aligned:
>> +	push %rbx
>> +	push %r12
>> +	push %r13
>> +	push %r14
>> +	push %r15
>> +	/*
>> +	 * Calculate how to shift a word read at the memory operation
>> +	 * aligned srcp to make it aligned for copy.
>> +	 */
>> +	movq %rsi, %r14
>> +	andq $7, %r14
>> +	shlq $3, %r14
>> +
>> +	movq $64, %r15
>> +	subq %r14, %r15
>> +
>> +	andq $-8, %rsi		/* src aligned */
>> +	movq 0*8(%rsi), %r8
>> +
>> +	movq %rdx, %rbx
>> +	shrq $5, %rbx
>> +	jz .Lsrc_unaligned_less32
>> +
>> +	/*
>> +	 * %r8 : store src[0]
>> +	 * %r9 : store src[1]
>> +	 * %r10: store src[2]
>> +	 * %r11: store src[3]
>> +	 * %r12: store src[4]
>> +	 * %r13: store the tmp data
>> +	 */
>> +	.p2align 4
>> +.Lsrc_unaligned_loop32:
>> +	movq 1*8(%rsi), %r9
>> +	movq 2*8(%rsi), %r10
>> +	movq 3*8(%rsi), %r11
>> +	movq 4*8(%rsi), %r12
>> +
>> +	movq %r9, %r13
>> +	movb %r14b, %cl
>> +	shrq %cl, %r8
>> +	shrq %cl, %r13
>> +	movb %r15b, %cl
>> +	shlq  %cl, %r9
>> +	orq %r8, %r9
>> +	movq %r10, %r8
>> +	shlq  %cl, %r10
>> +	orq %r13, %r10
>> +
>> +	movq %r11, %r13
>> +	movb %r14b, %cl
>> +	shrq %cl, %r8
>> +	shrq %cl, %r13
>> +	movb %r15b, %cl
>> +	shlq  %cl, %r11
>> +	orq %r8, %r11
>> +	movq %r12, %r8
>> +	shlq  %cl, %r12
>> +	orq %r13, %r12
>> +
>> +	movq %r9, 0*8(%rdi)
>> +	movq %r10, 1*8(%rdi)
>> +	movq %r11, 2*8(%rdi)
>> +	movq %r12, 3*8(%rdi)
>> +
>> +	leaq 4*8(%rdi), %rdi
>> +	leaq 4*8(%rsi), %rsi
>> +	decq %rbx
>> +	jnz .Lsrc_unaligned_loop32
>> +
>> +	.p2align 4
>> +.Lsrc_unaligned_less32:
>> +	shrq $3, %r14
>> +	addq %r14, %rsi
>> +	pop %r15
>> +	pop %r14
>> +	pop %r13
>> +	pop %r12
>> +	pop %rbx
>> +	andq $31, %rdx
>> +	jnz .Lhandle_tail
>> +	retq
>> +
>>   	CFI_ENDPROC
>>   ENDPROC(memcpy)
>>   ENDPROC(__memcpy)
>> --
>> 1.7.0.1
>
>


[-- Attachment #2: benchmark.tar.gz --]
[-- Type: application/x-gzip, Size: 3132 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-08  9:02   ` Miao Xie
@ 2010-10-13 21:31     ` H. Peter Anvin
  2010-10-14  1:14       ` Ma, Ling
                         ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: H. Peter Anvin @ 2010-10-13 21:31 UTC (permalink / raw)
  To: miaox
  Cc: Ma, Ling, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao, Yakui,
	Linux Kernel

On 10/08/2010 02:02 AM, Miao Xie wrote:
> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
> 
> Would you need my benchmark tool? I think it is helpful for your test.
> 

If you could post the benchmark tool that would be great.

	-hpa

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-13 21:31     ` H. Peter Anvin
@ 2010-10-14  1:14       ` Ma, Ling
  2010-10-14 19:43       ` Ma, Ling
  2010-10-18  3:12       ` Miao Xie
  2 siblings, 0 replies; 15+ messages in thread
From: Ma, Ling @ 2010-10-14  1:14 UTC (permalink / raw)
  To: H. Peter Anvin, miaox
  Cc: Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao, Yakui, Linux Kernel

Sure, I will post the benchmark tool and benchmark results on 64-bit Atom soon.

Thanks
Ling

-----Original Message-----
From: H. Peter Anvin [mailto:hpa@zytor.com] 
Sent: Thursday, October 14, 2010 5:32 AM
To: miaox@cn.fujitsu.com
Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

On 10/08/2010 02:02 AM, Miao Xie wrote:
> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
> 
> Would you need my benchmark tool? I think it is helpful for your test.
> 

If you could post the benchmark tool that would be great.

	-hpa

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-13 21:31     ` H. Peter Anvin
  2010-10-14  1:14       ` Ma, Ling
@ 2010-10-14 19:43       ` Ma, Ling
  2010-10-18  6:23         ` Miao Xie
  2010-10-18  3:12       ` Miao Xie
  2 siblings, 1 reply; 15+ messages in thread
From: Ma, Ling @ 2010-10-14 19:43 UTC (permalink / raw)
  To: Ma, Ling, H. Peter Anvin, miaox
  Cc: Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao, Yakui, Linux Kernel

[-- Attachment #1: Type: text/plain, Size: 1427 bytes --]

The attachment includes memcpy-kernel.c (cc -O2 memcpy-kernel.c -o memcpy-kernel)
and the unaligned test cases on Atom.

Thanks
Ling

-----Original Message-----
From: Ma, Ling 
Sent: Thursday, October 14, 2010 9:14 AM
To: 'H. Peter Anvin'; miaox@cn.fujitsu.com
Cc: Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

Sure, I will post benchmark tool and benchmark on Atom 64bit soon.

Thanks
Ling

-----Original Message-----
From: H. Peter Anvin [mailto:hpa@zytor.com] 
Sent: Thursday, October 14, 2010 5:32 AM
To: miaox@cn.fujitsu.com
Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

On 10/08/2010 02:02 AM, Miao Xie wrote:
> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
> 
> Would you need my benchmark tool? I think it is helpful for your test.
> 

If you could post the benchmark tool that would be great.

	-hpa

[-- Attachment #2: memcpy-atom-unaligned-cases --]
[-- Type: application/octet-stream, Size: 11036 bytes --]

                       	memcpy_orig	memcpy_new
LAT: Len    1, alignment  0/ 0:	48	36
LAT: Len    1, alignment  0/ 0:	48	36
LAT: Len    2, alignment  1/ 0:	48	48
LAT: Len    2, alignment  0/ 1:	48	48
LAT: Len    4, alignment  2/ 0:	36	48
LAT: Len    4, alignment  0/ 2:	36	36
LAT: Len    8, alignment  3/ 0:	36	36
LAT: Len    8, alignment  0/ 3:	36	36
LAT: Len   16, alignment  4/ 0:	36	48
LAT: Len   16, alignment  0/ 4:	36	36
LAT: Len   32, alignment  5/ 0:	36	48
LAT: Len   32, alignment  0/ 5:	36	48
LAT: Len   64, alignment  6/ 0:	72	72
LAT: Len   64, alignment  0/ 6:	72	84
LAT: Len  128, alignment  7/ 0:	168	108
LAT: Len  128, alignment  0/ 7:	108	108
LAT: Len  256, alignment  8/ 0:	120	120
LAT: Len  256, alignment  0/ 8:	120	132
LAT: Len  512, alignment  9/ 0:	492	324
LAT: Len  512, alignment  0/ 9:	336	300
LAT: Len 1024, alignment 10/ 0:	912	588
LAT: Len 1024, alignment  0/10:	588	564
LAT: Len 2048, alignment 11/ 0:	1740	1092
LAT: Len 2048, alignment  0/11:	1128	1068
LAT: Len 4096, alignment 12/ 0:	3408	2124
LAT: Len 4096, alignment  0/12:	2172	2100
LAT: Len    0, alignment  0/ 0:	36	36
LAT: Len    0, alignment  0/ 0:	36	36
LAT: Len    1, alignment  1/ 0:	48	36
LAT: Len    1, alignment  0/ 1:	48	36
LAT: Len    2, alignment  2/ 0:	48	48
LAT: Len    2, alignment  0/ 2:	48	60
LAT: Len    3, alignment  3/ 0:	60	48
LAT: Len    3, alignment  0/ 3:	60	60
LAT: Len    4, alignment  4/ 0:	36	36
LAT: Len    4, alignment  0/ 4:	36	48
LAT: Len    5, alignment  5/ 0:	36	48
LAT: Len    5, alignment  0/ 5:	36	48
LAT: Len    6, alignment  6/ 0:	36	48
LAT: Len    6, alignment  0/ 6:	36	36
LAT: Len    7, alignment  7/ 0:	36	36
LAT: Len    7, alignment  0/ 7:	36	48
LAT: Len    8, alignment  8/ 0:	36	36
LAT: Len    8, alignment  0/ 8:	36	36
LAT: Len    9, alignment  9/ 0:	36	36
LAT: Len    9, alignment  0/ 9:	36	36
LAT: Len   10, alignment 10/ 0:	36	36
LAT: Len   10, alignment  0/10:	36	36
LAT: Len   11, alignment 11/ 0:	36	36
LAT: Len   11, alignment  0/11:	36	36
LAT: Len   12, alignment 12/ 0:	36	36
LAT: Len   12, alignment  0/12:	36	36
LAT: Len   13, alignment 13/ 0:	36	36
LAT: Len   13, alignment  0/13:	36	36
LAT: Len   14, alignment 14/ 0:	36	36
LAT: Len   14, alignment  0/14:	36	36
LAT: Len   15, alignment 15/ 0:	36	36
LAT: Len   15, alignment  0/15:	36	36
LAT: Len   16, alignment 16/ 0:	36	36
LAT: Len   16, alignment  0/16:	36	48
LAT: Len   17, alignment 17/ 0:	36	36
LAT: Len   17, alignment  0/17:	36	36
LAT: Len   18, alignment 18/ 0:	36	36
LAT: Len   18, alignment  0/18:	36	36
LAT: Len   19, alignment 19/ 0:	36	48
LAT: Len   19, alignment  0/19:	36	48
LAT: Len   20, alignment 20/ 0:	36	36
LAT: Len   20, alignment  0/20:	36	36
LAT: Len   21, alignment 21/ 0:	36	36
LAT: Len   21, alignment  0/21:	36	48
LAT: Len   22, alignment 22/ 0:	36	36
LAT: Len   22, alignment  0/22:	36	48
LAT: Len   23, alignment 23/ 0:	36	48
LAT: Len   23, alignment  0/23:	36	48
LAT: Len   24, alignment 24/ 0:	36	48
LAT: Len   24, alignment  0/24:	36	36
LAT: Len   25, alignment 25/ 0:	36	48
LAT: Len   25, alignment  0/25:	36	36
LAT: Len   26, alignment 26/ 0:	36	36
LAT: Len   26, alignment  0/26:	36	36
LAT: Len   27, alignment 27/ 0:	36	48
LAT: Len   27, alignment  0/27:	36	36
LAT: Len   28, alignment 28/ 0:	36	48
LAT: Len   28, alignment  0/28:	36	36
LAT: Len   29, alignment 29/ 0:	36	36
LAT: Len   29, alignment  0/29:	36	48
LAT: Len   30, alignment 30/ 0:	36	48
LAT: Len   30, alignment  0/30:	36	36
LAT: Len   31, alignment 31/ 0:	36	48
LAT: Len   31, alignment  0/31:	36	48
LAT: Len    0, alignment  0/ 8:	36	36
LAT: Len    0, alignment  1/ 8:	36	36
LAT: Len    0, alignment  4/ 8:	36	36
LAT: Len    1, alignment  0/ 8:	48	36
LAT: Len    1, alignment  1/ 8:	48	36
LAT: Len    1, alignment  4/ 8:	36	36
LAT: Len    2, alignment  0/ 8:	48	48
LAT: Len    2, alignment  1/ 8:	48	48
LAT: Len    2, alignment  4/ 8:	48	48
LAT: Len    3, alignment  0/ 8:	48	48
LAT: Len    3, alignment  1/ 8:	60	60
LAT: Len    3, alignment  4/ 8:	60	60
LAT: Len    4, alignment  0/ 8:	36	48
LAT: Len    4, alignment  1/ 8:	36	48
LAT: Len    4, alignment  4/ 8:	36	48
LAT: Len    5, alignment  0/ 8:	36	48
LAT: Len    5, alignment  1/ 8:	36	48
LAT: Len    5, alignment  4/ 8:	36	48
LAT: Len    6, alignment  0/ 8:	36	36
LAT: Len    6, alignment  1/ 8:	36	36
LAT: Len    6, alignment  4/ 8:	36	48
LAT: Len    7, alignment  0/ 8:	36	48
LAT: Len    7, alignment  1/ 8:	36	48
LAT: Len    7, alignment  4/ 8:	36	48
LAT: Len    8, alignment  0/ 8:	36	36
LAT: Len    8, alignment  1/ 8:	36	36
LAT: Len    8, alignment  4/ 8:	36	36
LAT: Len    9, alignment  0/ 8:	36	36
LAT: Len    9, alignment  1/ 8:	36	36
LAT: Len    9, alignment  4/ 8:	36	36
LAT: Len   10, alignment  0/ 8:	36	36
LAT: Len   10, alignment  1/ 8:	36	36
LAT: Len   10, alignment  4/ 8:	36	36
LAT: Len   11, alignment  0/ 8:	36	36
LAT: Len   11, alignment  1/ 8:	36	36
LAT: Len   11, alignment  4/ 8:	36	36
LAT: Len   12, alignment  0/ 8:	36	36
LAT: Len   12, alignment  1/ 8:	36	36
LAT: Len   12, alignment  4/ 8:	36	36
LAT: Len   13, alignment  0/ 8:	36	36
LAT: Len   13, alignment  1/ 8:	36	36
LAT: Len   13, alignment  4/ 8:	36	36
LAT: Len   14, alignment  0/ 8:	36	36
LAT: Len   14, alignment  1/ 8:	36	36
LAT: Len   14, alignment  4/ 8:	36	36
LAT: Len   15, alignment  0/ 8:	36	36
LAT: Len   15, alignment  1/ 8:	36	36
LAT: Len   15, alignment  4/ 8:	36	36
LAT: Len   16, alignment  0/ 8:	36	36
LAT: Len   16, alignment  1/ 8:	36	36
LAT: Len   16, alignment  4/ 8:	36	48
LAT: Len   17, alignment  0/ 8:	36	36
LAT: Len   17, alignment  1/ 8:	36	36
LAT: Len   17, alignment  4/ 8:	36	48
LAT: Len   18, alignment  0/ 8:	36	48
LAT: Len   18, alignment  1/ 8:	36	36
LAT: Len   18, alignment  4/ 8:	36	48
LAT: Len   19, alignment  0/ 8:	36	36
LAT: Len   19, alignment  1/ 8:	36	48
LAT: Len   19, alignment  4/ 8:	36	48
LAT: Len   20, alignment  0/ 8:	36	48
LAT: Len   20, alignment  1/ 8:	36	36
LAT: Len   20, alignment  4/ 8:	36	36
LAT: Len   21, alignment  0/ 8:	36	48
LAT: Len   21, alignment  1/ 8:	36	48
LAT: Len   21, alignment  4/ 8:	36	48
LAT: Len   22, alignment  0/ 8:	36	48
LAT: Len   22, alignment  1/ 8:	36	36
LAT: Len   22, alignment  4/ 8:	36	36
LAT: Len   23, alignment  0/ 8:	36	48
LAT: Len   23, alignment  1/ 8:	36	36
LAT: Len   23, alignment  4/ 8:	36	36
LAT: Len   24, alignment  0/ 8:	36	36
LAT: Len   24, alignment  1/ 8:	36	36
LAT: Len   24, alignment  4/ 8:	36	36
LAT: Len   25, alignment  0/ 8:	36	36
LAT: Len   25, alignment  1/ 8:	36	36
LAT: Len   25, alignment  4/ 8:	36	36
LAT: Len   26, alignment  0/ 8:	36	48
LAT: Len   26, alignment  1/ 8:	36	36
LAT: Len   26, alignment  4/ 8:	36	36
LAT: Len   27, alignment  0/ 8:	36	48
LAT: Len   27, alignment  1/ 8:	36	36
LAT: Len   27, alignment  4/ 8:	36	36
LAT: Len   28, alignment  0/ 8:	36	36
LAT: Len   28, alignment  1/ 8:	36	48
LAT: Len   28, alignment  4/ 8:	36	36
LAT: Len   29, alignment  0/ 8:	36	48
LAT: Len   29, alignment  1/ 8:	36	48
LAT: Len   29, alignment  4/ 8:	36	36
LAT: Len   30, alignment  0/ 8:	36	36
LAT: Len   30, alignment  1/ 8:	36	48
LAT: Len   30, alignment  4/ 8:	36	48
LAT: Len   31, alignment  0/ 8:	36	36
LAT: Len   31, alignment  1/ 8:	36	48
LAT: Len   31, alignment  4/ 8:	36	36
LAT: Len   32, alignment  0/ 8:	36	48
LAT: Len   32, alignment  1/ 8:	36	48
LAT: Len   32, alignment  4/ 8:	36	48
LAT: Len   33, alignment  0/ 8:	60	48
LAT: Len   33, alignment  1/ 8:	60	48
LAT: Len   33, alignment  4/ 8:	60	48
LAT: Len   34, alignment  0/ 8:	60	48
LAT: Len   34, alignment  1/ 8:	60	48
LAT: Len   34, alignment  4/ 8:	60	48
LAT: Len   35, alignment  0/ 8:	72	48
LAT: Len   35, alignment  1/ 8:	72	48
LAT: Len   35, alignment  4/ 8:	72	48
LAT: Len   36, alignment  0/ 8:	60	48
LAT: Len   36, alignment  1/ 8:	60	48
LAT: Len   36, alignment  4/ 8:	60	48
LAT: Len   37, alignment  0/ 8:	60	48
LAT: Len   37, alignment  1/ 8:	60	48
LAT: Len   37, alignment  4/ 8:	60	48
LAT: Len   38, alignment  0/ 8:	60	48
LAT: Len   38, alignment  1/ 8:	60	48
LAT: Len   38, alignment  4/ 8:	60	48
LAT: Len   39, alignment  0/ 8:	60	48
LAT: Len   39, alignment  1/ 8:	60	48
LAT: Len   39, alignment  4/ 8:	60	48
LAT: Len   40, alignment  0/ 8:	60	60
LAT: Len   40, alignment  1/ 8:	60	72
LAT: Len   40, alignment  4/ 8:	60	72
LAT: Len   41, alignment  0/ 8:	60	60
LAT: Len   41, alignment  1/ 8:	60	72
LAT: Len   41, alignment  4/ 8:	60	72
LAT: Len   42, alignment  0/ 8:	60	60
LAT: Len   42, alignment  1/ 8:	60	72
LAT: Len   42, alignment  4/ 8:	60	72
LAT: Len   43, alignment  0/ 8:	60	60
LAT: Len   43, alignment  1/ 8:	60	72
LAT: Len   43, alignment  4/ 8:	60	72
LAT: Len   44, alignment  0/ 8:	60	60
LAT: Len   44, alignment  1/ 8:	60	60
LAT: Len   44, alignment  4/ 8:	60	60
LAT: Len   45, alignment  0/ 8:	60	60
LAT: Len   45, alignment  1/ 8:	60	60
LAT: Len   45, alignment  4/ 8:	60	60
LAT: Len   46, alignment  0/ 8:	60	60
LAT: Len   46, alignment  1/ 8:	60	60
LAT: Len   46, alignment  4/ 8:	60	60
LAT: Len   47, alignment  0/ 8:	60	60
LAT: Len   47, alignment  1/ 8:	60	60
LAT: Len   47, alignment  4/ 8:	60	60
LAT: Len   48, alignment  3/ 0:	48	60
LAT: Len   48, alignment  0/ 3:	60	72
LAT: Len   80, alignment  5/ 0:	72	84
LAT: Len   80, alignment  0/ 5:	84	84
LAT: Len   96, alignment  6/ 0:	84	84
LAT: Len   96, alignment  0/ 6:	84	84
LAT: Len  112, alignment  7/ 0:	156	84
LAT: Len  112, alignment  0/ 7:	96	96
LAT: Len  144, alignment  9/ 0:	168	108
LAT: Len  144, alignment  0/ 9:	120	120
LAT: Len  160, alignment 10/ 0:	192	120
LAT: Len  160, alignment  0/10:	120	120
LAT: Len  176, alignment 11/ 0:	204	120
LAT: Len  176, alignment  0/11:	120	132
LAT: Len  192, alignment 12/ 0:	216	156
LAT: Len  192, alignment  0/12:	144	132
LAT: Len  208, alignment 13/ 0:	228	144
LAT: Len  208, alignment  0/13:	156	144
LAT: Len  224, alignment 14/ 0:	252	156
LAT: Len  224, alignment  0/14:	156	156
LAT: Len  240, alignment 15/ 0:	252	156
LAT: Len  240, alignment  0/15:	156	156
LAT: Len  272, alignment 17/ 0:	276	180
LAT: Len  272, alignment  0/17:	180	180
LAT: Len  288, alignment 18/ 0:	300	180
LAT: Len  288, alignment  0/18:	180	180
LAT: Len  304, alignment 19/ 0:	336	216
LAT: Len  304, alignment  0/19:	204	204
LAT: Len  320, alignment 20/ 0:	324	204
LAT: Len  320, alignment  0/20:	204	192
LAT: Len  336, alignment 21/ 0:	324	204
LAT: Len  336, alignment  0/21:	216	216
LAT: Len  352, alignment 22/ 0:	348	216
LAT: Len  352, alignment  0/22:	216	216
LAT: Len  368, alignment 23/ 0:	384	252
LAT: Len  368, alignment  0/23:	240	240
LAT: Len  384, alignment 24/ 0:	156	156
LAT: Len  384, alignment  0/24:	180	168
LAT: Len  400, alignment 25/ 0:	384	240
LAT: Len  400, alignment  0/25:	264	252
LAT: Len  416, alignment 26/ 0:	396	240
LAT: Len  416, alignment  0/26:	276	252
LAT: Len  432, alignment 27/ 0:	432	264
LAT: Len  432, alignment  0/27:	288	276
LAT: Len  448, alignment 28/ 0:	444	264
LAT: Len  448, alignment  0/28:	300	276
LAT: Len  464, alignment 29/ 0:	444	288
LAT: Len  464, alignment  0/29:	300	300
LAT: Len  480, alignment 30/ 0:	468	300
LAT: Len  480, alignment  0/30:	300	300
LAT: Len  496, alignment 31/ 0:	504	312
LAT: Len  496, alignment  0/31:	324	312

[-- Attachment #3: memcpy-kernel.c --]
[-- Type: text/plain, Size: 12972 bytes --]

#include <stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define  MAXSAMPLESTPT        1000
#define  MAXCOPYSIZE          (1024 * 1024 * 100)
#define  ORIG  0
#define  NEW   1
static char* buf1 = NULL;
static char* buf2 = NULL;
static int repeat_one_test = 32;

hp_timing_t _dl_hp_timing_overhead;
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
	total_time += tmptime;						\
    }									\
  while (0)

#define HP_TIMING_BEST(best_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
      if (best_time > tmptime)						\
	best_time = tmptime;						\
    }									\
  while (0)


void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_test ( char *dst, char *src,
	     size_t len)
{
      hp_timing_t start __attribute ((unused));
      hp_timing_t stop __attribute ((unused));
      hp_timing_t best_time = ~ (hp_timing_t) 0;
      size_t i,j;

      for (i = 0; i < repeat_one_test; ++i)
	{
	  HP_TIMING_NOW (start);
	  do_memcpy ( dst, src, len);
	  HP_TIMING_NOW (stop);
	  HP_TIMING_BEST (best_time, start, stop);
	}

      printf ("\t%zd", (size_t) best_time);
}

static void
do_test (size_t align1, size_t align2, size_t len)
{
  size_t i, j;
  char *s1, *s2;

  s1 = (char *) (buf1 + align1);
  s2 = (char *) (buf2 + align2);


   printf ("LAT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);
   do_memcpy = memcpy_orig;
   do_one_test (s2, s1, len);
   do_memcpy = memcpy_new;
   do_one_test (s2, s1, len);

    putchar ('\n');
}

static void test_init(void)
{
  int i;
  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);

  for (i = 0; i < MAXCOPYSIZE ; i = i + 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }

}


void memcpy_new(char *dst, char *src, int len)
{

	__asm__("movq %rdi, %rax");
	__asm__("cmp  $0x28, %rdx");
	__asm__("jb 1f");

	/*
	 * We check whether memory false dependece could occur,
	 * then jump to corresponding copy mode.
	 */
	__asm__("cmp  %dil, %sil");
	__asm__("jl 2f");
	/*
	 * We append data to avoid store crossing cache.
	 */
	__asm__("movq (%rsi), %rcx");
	__asm__("movq %rdi, %r8");
	__asm__("addq $8, %rdi");
	__asm__("andq $-8, %rdi");
	__asm__("movq %rcx, (%r8)");
	__asm__("subq %rdi, %r8");
	__asm__("addq %r8, %rdx");
	__asm__("subq %r8, %rsi");

	__asm__("subq $0x20, %rdx");
__asm__("3:");
	__asm__("subq $0x20,	%rdx");

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	__asm__("movq 0*8(%rsi),	%r8");
	__asm__("movq 1*8(%rsi),	%r9");
	__asm__("movq 2*8(%rsi),	%r10");
	__asm__("movq 3*8(%rsi),	%r11");
	__asm__("leaq 4*8(%rsi),	%rsi");

	__asm__("movq %r8,	0*8(%rdi)");
	__asm__("movq %r9,	1*8(%rdi)");
	__asm__("movq %r10,	2*8(%rdi)");
	__asm__("movq %r11,	3*8(%rdi)");
	__asm__("leaq 4*8(%rdi),	%rdi");
	__asm__("jae  3b");
	__asm__("addq $0x20, %rdx");
	__asm__("jmp  10f");

__asm__("2:");
	/*
	 * Calculate copy position to tail.
	 */
	__asm__("addq %rdx, %rsi");
	__asm__("addq %rdx, %rdi");
	/*
	 * We append data to avoid store crossing cache.
	 */

	__asm__("movq -8(%rsi), %rcx");
	__asm__("movq %rdi, %r8");
	__asm__("andq $-8, %rdi");
	__asm__("movq %rcx, -8(%r8)");
	__asm__("subq %rdi, %r8");
	__asm__("subq %r8, %rdx");
	__asm__("subq %r8, %rsi");

	__asm__("subq $0x20,	%rdx");
	__asm__(".p2align 4");
__asm__("4:");
	__asm__("subq $0x20,	%rdx");
	__asm__("movq -1*8(%rsi),	%r8");
	__asm__("movq -2*8(%rsi),	%r9");
	__asm__("movq -3*8(%rsi),	%r10");
	__asm__("movq -4*8(%rsi),	%r11");
	__asm__("leaq -4*8(%rsi),	%rsi");
	__asm__("movq %r8,		-1*8(%rdi)");
	__asm__("movq %r9,		-2*8(%rdi)");
	__asm__("movq %r10,		-3*8(%rdi)");
	__asm__("movq %r11,		-4*8(%rdi)");
	__asm__("leaq -4*8(%rdi),	%rdi");
	__asm__("jae  4b");

	/*
	 * Calculate copy position to head.
	 */
	__asm__("addq $0x20, %rdx");
	__asm__("subq %rdx, %rsi");
	__asm__("subq %rdx, %rdi");
	__asm__("jmp  10f");
__asm__("1:");
	__asm__("cmpq $32, %rdx");
	__asm__("jb   10f");
	/*
	 * Move data from 32 bytes to 39 bytes.
	 */
	__asm__("movq 0*8(%rsi), %rcx");
	__asm__("movq 1*8(%rsi),	%r8");
	__asm__("movq -3*8(%rsi, %rdx),	%r9");
	__asm__("movq -2*8(%rsi, %rdx),	%r10");
	__asm__("movq -1*8(%rsi, %rdx),	%r11");
	__asm__("movq %rcx,	0*8(%rdi)");
	__asm__("movq %r8,	1*8(%rdi)");
	__asm__("movq %r9,	-3*8(%rdi, %rdx)");
	__asm__("movq %r10,	-2*8(%rdi, %rdx)");
	__asm__("movq %r11,	-1*8(%rdi, %rdx)");
	__asm__("retq");

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
__asm__("10:");
	__asm__("cmpq $16, %rdx");
	__asm__("jb   5f");
	__asm__("movq 0*8(%rsi),	%r8");
	__asm__("movq 1*8(%rsi),	%r9");
	__asm__("movq -2*8(%rsi, %rdx),	%r10");
	__asm__("movq -1*8(%rsi, %rdx),	%r11");
	__asm__("movq %r8,	0*8(%rdi)");
	__asm__("movq %r9,	1*8(%rdi)");
	__asm__("movq %r10,	-2*8(%rdi, %rdx)");
	__asm__("movq %r11,	-1*8(%rdi, %rdx)");
	__asm__("retq");
	__asm__(".p2align 4");
__asm__("5:");
	__asm__("cmpq $8, %rdx");
	__asm__("jb   6f");
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	__asm__("movq 0*8(%rsi),	%r8");
	__asm__("movq -1*8(%rsi, %rdx),	%r9");
	__asm__("movq %r8,	0*8(%rdi)");
	__asm__("movq %r9,	-1*8(%rdi, %rdx)");
	__asm__("retq");
	__asm__(".p2align 4");
__asm__("6:");
	__asm__("cmpq $4, %rdx");
	__asm__("jb   7f");

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	__asm__("movl (%rsi), %ecx");
	__asm__("movl -4(%rsi, %rdx), %r8d");
	__asm__("movl %ecx, (%rdi)");
	__asm__("movl %r8d, -4(%rdi, %rdx)");
	__asm__("retq");
	__asm__(".p2align 4");
__asm__("7:");
	__asm__("cmpl $0, %edx");
	__asm__("je 8f");
	/*
	 * Move data from 1 bytes to 3 bytes.
	 */
__asm__("9:");
	__asm__("movb (%rsi), %r8b");
	__asm__("movb %r8b, (%rdi)");
	__asm__("incq %rdi");
	__asm__("incq %rsi");
	__asm__("decl %edx");
	__asm__("jnz 9b");

__asm__("8:");
}

void memcpy_orig(char *dst, char *src, int len)
{

	__asm("movq %rdi, %rax");

	/*
	 * Use 32bit CMP here to avoid long NOP padding.
	 */
	__asm("cmp  $0x20, %edx");
	__asm("jbe 1f");

	/*
	 * the code for unaligned copy is good for large-size copy(>100),
	 * so if the size is small, we needn't check dst and src is aligned
	 * or not.
	 */
	__asm("cmp $100, %edx");
	__asm("jb 2f");

	/*
	 * unaligned access always leads to bad performance, so in order to
	 * avoid unaligned access, we align the address(both src and dest)
	 * first, and then copy from a aligned src to an aligned dst by using
	 * shifts.
	 * But we found if src is aligned, although dest is unaligned, the
	 * performance of generic memory copy (That is reading data aligned
	 * from the source and writing data unaligned to the dest) is better
	 * than the one that uses shifts to avoid unaligned access.
	 * So if src is aligned, we needn't check dest is aligned or not, just
	 * goto 2:
	 */
	__asm("test $7, %esi");		/* src align check */
	__asm("jz 2f");

	/* if dest and src both are unaligned, goto unaligned copy */
	__asm("test $7, %edi");
	__asm("jnz 3f");

	__asm("jmp 4f");

__asm("2:");
	/*
	 * We check whether memory false dependece could occur,
	 * then jump to corresponding copy mode.
	 */
	__asm("cmp  %dil, %sil");
	__asm("jl 5f");
	__asm("subl $0x20, %edx");
__asm("6:");
	__asm("subq $0x20,	%rdx");

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	__asm("movq 0*8(%rsi),	%r8");
	__asm("movq 1*8(%rsi),	%r9");
	__asm("movq 2*8(%rsi),	%r10");
	__asm("movq 3*8(%rsi),	%r11");
	__asm("leaq 4*8(%rsi),	%rsi");

	__asm("movq %r8,	0*8(%rdi)");
	__asm("movq %r9,	1*8(%rdi)");
	__asm("movq %r10,	2*8(%rdi)");
	__asm("movq %r11,	3*8(%rdi)");
	__asm("leaq 4*8(%rdi),	%rdi");
	__asm("jae  6b");
	__asm("addq $0x20,	%rdx");
	__asm("jmp  1f");

__asm("5:");
	/*
	 * Calculate copy position to tail.
	 */
	__asm("addq %rdx,	%rsi");
	__asm("addq %rdx,	%rdi");
	__asm("subq $0x20,	%rdx");
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16bytes trunk.
	 */
	__asm(".p2align 4");
__asm("6:");
	__asm("subq $0x20,	%rdx");
	__asm("movq -1*8(%rsi),	%r8");
	__asm("movq -2*8(%rsi),	%r9");
	__asm("movq -3*8(%rsi),	%r10");
	__asm("movq -4*8(%rsi),	%r11");
	__asm("leaq -4*8(%rsi),	%rsi");
	__asm("movq %r8,		-1*8(%rdi)");
	__asm("movq %r9,		-2*8(%rdi)");
	__asm("movq %r10,		-3*8(%rdi)");
	__asm("movq %r11,		-4*8(%rdi)");
	__asm("leaq -4*8(%rdi),	%rdi");
	__asm("jae  6b");

	/*
	 * Calculate copy position to head.
	 */
	__asm("addq $0x20,	%rdx");
	__asm("subq %rdx,	%rsi");
	__asm("subq %rdx,	%rdi");
__asm__("1:");
	__asm("cmpq $16,	%rdx");
	__asm("jb   7f");

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	__asm("movq 0*8(%rsi), %r8");
	__asm("movq 1*8(%rsi),	%r9");
	__asm("movq -2*8(%rsi, %rdx),	%r10");
	__asm("movq -1*8(%rsi, %rdx),	%r11");
	__asm("movq %r8,	0*8(%rdi)");
	__asm("movq %r9,	1*8(%rdi)");
	__asm("movq %r10,	-2*8(%rdi, %rdx)");
	__asm("movq %r11,	-1*8(%rdi, %rdx)");
	__asm("retq");
	__asm(".p2align 4");
__asm__("7:");
	__asm("cmpq $8,	%rdx");
	__asm("jb   8f");
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	__asm("movq 0*8(%rsi),	%r8");
	__asm("movq -1*8(%rsi, %rdx),	%r9");
	__asm("movq %r8,	0*8(%rdi)");
	__asm("movq %r9,	-1*8(%rdi, %rdx)");
	__asm("retq");
	__asm(".p2align 4");
__asm__("8:");
	__asm("cmpq $4,	%rdx");
	__asm("jb   9f");

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	__asm("movl (%rsi), %ecx");
	__asm("movl -4(%rsi, %rdx), %r8d");
	__asm("movl %ecx, (%rdi)");
	__asm("movl %r8d, -4(%rdi, %rdx)");
	__asm("retq");
	__asm(".p2align 4");
__asm__("9:");
	__asm("cmpl $0, %edx");
	__asm("je 10f");
	/*
	 * Move data from 1 bytes to 3 bytes.
	 */
__asm__("11:");
	__asm("movb (%rsi), %r8b");
	__asm("movb %r8b, (%rdi)");
	__asm("incq %rdi");
	__asm("incq %rsi");
	__asm("decl %edx");
	__asm("jnz 11b");

__asm__("10:");
	__asm("retq");

	__asm(".p2align 4");
__asm__("3:");
	__asm("movq %rdi, %rcx");
	__asm("andq $7, %rcx");		/* Align the destination */
	__asm("negq %rcx");
	__asm("andq $7, %rcx");
	__asm("subq %rcx, %rdx");

	/* tune dst address */
	__asm("movq (%rsi), %r8");
	__asm("movq %r8, (%rdi)");
	__asm("addq %rcx, %rdi");
	__asm("addq %rcx, %rsi");

	__asm("test $7, %esi");		/* src align check */
	__asm("jz 2b");

	__asm(".p2align 4");
__asm__("4:");
	__asm("push %rbx");
	__asm("push %r12");
	__asm("push %r13");
	__asm("push %r14");
	__asm("push %r15");
	/*
	 * Calculate how to shift a word read at the memory operation
	 * aligned srcp to make it aligned for copy.
	 */
	__asm("movq %rsi, %r14");
	__asm("andq $7, %r14");
	__asm("shlq $3, %r14");

	__asm("movq $64, %r15");
	__asm("subq %r14, %r15");

	__asm("andq $-8, %rsi");		/* src aligned */
	__asm("movq 0*8(%rsi), %r8");

	__asm("movq %rdx, %rbx");
	__asm("shrq $5, %rbx");
	__asm("jz 12f");

	/*
	 * %r8 : store src[0]
	 * %r9 : store src[1]
	 * %r10: store src[2]
	 * %r11: store src[3]
	 * %r12: store src[4]
	 * %r13: store the tmp data
	 */
	__asm(".p2align 4");
__asm("13:");
	__asm("movq 1*8(%rsi), %r9");
	__asm("movq 2*8(%rsi), %r10");
	__asm("movq 3*8(%rsi), %r11");
	__asm("movq 4*8(%rsi), %r12");

	__asm("movq %r9, %r13");
	__asm("movb %r14b, %cl");
	__asm("shrq %cl, %r8");
	__asm("shrq %cl, %r13");
	__asm("movb %r15b, %cl");
	__asm("shlq  %cl, %r9");
	__asm("orq %r8, %r9");
	__asm("movq %r10, %r8");
	__asm("shlq  %cl, %r10");
	__asm("orq %r13, %r10");

	__asm("movq %r11, %r13");
	__asm("movb %r14b, %cl");
	__asm("shrq %cl, %r8");
	__asm("shrq %cl, %r13");
	__asm("movb %r15b, %cl");
	__asm("shlq  %cl, %r11");
	__asm("orq %r8, %r11");
	__asm("movq %r12, %r8");
	__asm("shlq  %cl, %r12");
	__asm("orq %r13, %r12");

	__asm("movq %r9, 0*8(%rdi)");
	__asm("movq %r10, 1*8(%rdi)");
	__asm("movq %r11, 2*8(%rdi)");
	__asm("movq %r12, 3*8(%rdi)");

	__asm("leaq 4*8(%rdi), %rdi");
	__asm("leaq 4*8(%rsi), %rsi");
	__asm("decq %rbx");
	__asm("jnz 13b");

	__asm(".p2align 4");
__asm("12:");
	__asm("shrq $3, %r14");
	__asm("addq %r14, %rsi");
	__asm("pop %r15");
	__asm("pop %r14");
	__asm("pop %r13");
	__asm("pop %r12");
	__asm("pop %rbx");
	__asm("andq $31, %rdx");
	__asm("jnz 1b");
	__asm("retq");



}


int main(void)
{
  int i;
  test_init();
  printf ("%23s", "");
  printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new");

for (i = 0; i <= 12; ++i)
    {
      do_test (i, 0, 1 << i);
      do_test (0, i, 1 << i);
    }
for (i = 0; i < 32; ++i)
    {
      do_test (i, 0, i);
      do_test (0, i, i);
    }

  for (i = 0; i < 48; ++i)
    {
      do_test (0, 8, i);
      do_test (1, 8, i);
      do_test (4, 8, i);
    }

  for (i = 3; i < 32; ++i)
    {
      if ((i & (i - 1)) == 0)
	continue;
      do_test (i, 0, 16 * i);
      do_test (0, i, 16 * i);
    }


}

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-13 21:31     ` H. Peter Anvin
  2010-10-14  1:14       ` Ma, Ling
  2010-10-14 19:43       ` Ma, Ling
@ 2010-10-18  3:12       ` Miao Xie
  2 siblings, 0 replies; 15+ messages in thread
From: Miao Xie @ 2010-10-18  3:12 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ma, Ling, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao, Yakui,
	Linux Kernel

On Wed, 13 Oct 2010 14:31:47 -0700, H. Peter Anvin wrote:
> On 10/08/2010 02:02 AM, Miao Xie wrote:
>> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
>>
>> Would you need my benchmark tool? I think it is helpful for your test.
>>
>
> If you could post the benchmark tool that would be great.

The following is the URL of the benchmark tool that I have posted.
   http://marc.info/?l=linux-kernel&m=128652854608425&q=p3

Thanks
Miao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-14 19:43       ` Ma, Ling
@ 2010-10-18  6:23         ` Miao Xie
  2010-10-18  6:27           ` Ma, Ling
  0 siblings, 1 reply; 15+ messages in thread
From: Miao Xie @ 2010-10-18  6:23 UTC (permalink / raw)
  To: Ma, Ling
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel

[-- Attachment #1: Type: text/plain, Size: 1796 bytes --]

On Fri, 15 Oct 2010 03:43:53 +0800, Ma, Ling wrote:
> Attachment includes memcpy-kernel.c(cc -O2 memcpy-kernel.c -o memcpy-kernel),
> and unaligned test cases on Atom.

I have tested with your benchmark tool on my Core2 Duo machine; the test result
is attached. But the result is different from yours on Atom: on my Core2 the
performance seems better with this patch.

Given these two different results, maybe we need to optimize memcpy() by CPU
model.
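
A user-space sketch of what per-model selection could look like (purely an
illustration: the prototypes match the attached memcpy-kernel.c, the CPUID
family/model decoding is simplified, and the Atom model check is only an
example; the kernel itself would use its CPU-feature/alternatives machinery
instead):

#include <cpuid.h>

void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);

static void (*memcpy_impl)(char *dst, char *src, int len) = memcpy_orig;

/* Pick a memcpy variant once, based on CPUID family/model. */
static void memcpy_select(void)
{
	unsigned int eax, ebx, ecx, edx, family, model;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return;		/* no CPUID info: keep the default */

	family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
	model  = ((eax >> 4) & 0xf) | (((eax >> 16) & 0xf) << 4);

	/* example policy only: use the new copy on Atom (family 6, model 0x1c) */
	if (family == 6 && model == 0x1c)
		memcpy_impl = memcpy_new;
}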

Thanks
Miao

>
> Thanks
> Ling
>
> -----Original Message-----
> From: Ma, Ling
> Sent: Thursday, October 14, 2010 9:14 AM
> To: 'H. Peter Anvin'; miaox@cn.fujitsu.com
> Cc: Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
> Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
>
> Sure, I will post benchmark tool and benchmark on Atom 64bit soon.
>
> Thanks
> Ling
>
> -----Original Message-----
> From: H. Peter Anvin [mailto:hpa@zytor.com]
> Sent: Thursday, October 14, 2010 5:32 AM
> To: miaox@cn.fujitsu.com
> Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
>
> On 10/08/2010 02:02 AM, Miao Xie wrote:
>> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
>>
>> Would you need my benchmark tool? I think it is helpful for your test.
>>
>
> If you could post the benchmark tool that would be great.
>
> 	-hpa


[-- Attachment #2: memcpy-Core2-Duo-CPU-unaligned-result --]
[-- Type: text/plain, Size: 11008 bytes --]

                       	memcpy_orig	memcpy_new
LAT: Len    1, alignment  0/ 0:	40	40
LAT: Len    1, alignment  0/ 0:	40	40
LAT: Len    2, alignment  1/ 0:	40	40
LAT: Len    2, alignment  0/ 1:	50	50
LAT: Len    4, alignment  2/ 0:	40	40
LAT: Len    4, alignment  0/ 2:	40	40
LAT: Len    8, alignment  3/ 0:	40	40
LAT: Len    8, alignment  0/ 3:	40	40
LAT: Len   16, alignment  4/ 0:	40	40
LAT: Len   16, alignment  0/ 4:	40	40
LAT: Len   32, alignment  5/ 0:	40	40
LAT: Len   32, alignment  0/ 5:	40	40
LAT: Len   64, alignment  6/ 0:	60	50
LAT: Len   64, alignment  0/ 6:	60	60
LAT: Len  128, alignment  7/ 0:	70	70
LAT: Len  128, alignment  0/ 7:	80	80
LAT: Len  256, alignment  8/ 0:	80	70
LAT: Len  256, alignment  0/ 8:	80	80
LAT: Len  512, alignment  9/ 0:	190	260
LAT: Len  512, alignment  0/ 9:	190	220
LAT: Len 1024, alignment 10/ 0:	340	490
LAT: Len 1024, alignment  0/10:	340	440
LAT: Len 2048, alignment 11/ 0:	650	940
LAT: Len 2048, alignment  0/11:	620	870
LAT: Len 4096, alignment 12/ 0:	1280	2140
LAT: Len 4096, alignment  0/12:	1410	1750
LAT: Len    0, alignment  0/ 0:	40	40
LAT: Len    0, alignment  0/ 0:	40	40
LAT: Len    1, alignment  1/ 0:	40	40
LAT: Len    1, alignment  0/ 1:	40	40
LAT: Len    2, alignment  2/ 0:	40	40
LAT: Len    2, alignment  0/ 2:	40	40
LAT: Len    3, alignment  3/ 0:	40	40
LAT: Len    3, alignment  0/ 3:	40	40
LAT: Len    4, alignment  4/ 0:	40	40
LAT: Len    4, alignment  0/ 4:	40	40
LAT: Len    5, alignment  5/ 0:	40	40
LAT: Len    5, alignment  0/ 5:	40	40
LAT: Len    6, alignment  6/ 0:	40	40
LAT: Len    6, alignment  0/ 6:	40	40
LAT: Len    7, alignment  7/ 0:	40	40
LAT: Len    7, alignment  0/ 7:	40	40
LAT: Len    8, alignment  8/ 0:	40	40
LAT: Len    8, alignment  0/ 8:	40	40
LAT: Len    9, alignment  9/ 0:	40	40
LAT: Len    9, alignment  0/ 9:	40	40
LAT: Len   10, alignment 10/ 0:	40	40
LAT: Len   10, alignment  0/10:	40	40
LAT: Len   11, alignment 11/ 0:	40	40
LAT: Len   11, alignment  0/11:	40	40
LAT: Len   12, alignment 12/ 0:	40	40
LAT: Len   12, alignment  0/12:	40	40
LAT: Len   13, alignment 13/ 0:	40	40
LAT: Len   13, alignment  0/13:	40	40
LAT: Len   14, alignment 14/ 0:	40	40
LAT: Len   14, alignment  0/14:	40	40
LAT: Len   15, alignment 15/ 0:	40	40
LAT: Len   15, alignment  0/15:	40	40
LAT: Len   16, alignment 16/ 0:	40	40
LAT: Len   16, alignment  0/16:	40	40
LAT: Len   17, alignment 17/ 0:	40	40
LAT: Len   17, alignment  0/17:	40	40
LAT: Len   18, alignment 18/ 0:	40	40
LAT: Len   18, alignment  0/18:	40	40
LAT: Len   19, alignment 19/ 0:	40	40
LAT: Len   19, alignment  0/19:	40	40
LAT: Len   20, alignment 20/ 0:	40	40
LAT: Len   20, alignment  0/20:	40	40
LAT: Len   21, alignment 21/ 0:	40	40
LAT: Len   21, alignment  0/21:	40	40
LAT: Len   22, alignment 22/ 0:	40	40
LAT: Len   22, alignment  0/22:	40	40
LAT: Len   23, alignment 23/ 0:	40	40
LAT: Len   23, alignment  0/23:	40	40
LAT: Len   24, alignment 24/ 0:	40	40
LAT: Len   24, alignment  0/24:	40	40
LAT: Len   25, alignment 25/ 0:	40	40
LAT: Len   25, alignment  0/25:	40	40
LAT: Len   26, alignment 26/ 0:	40	40
LAT: Len   26, alignment  0/26:	40	40
LAT: Len   27, alignment 27/ 0:	40	40
LAT: Len   27, alignment  0/27:	40	40
LAT: Len   28, alignment 28/ 0:	40	40
LAT: Len   28, alignment  0/28:	40	40
LAT: Len   29, alignment 29/ 0:	40	40
LAT: Len   29, alignment  0/29:	40	40
LAT: Len   30, alignment 30/ 0:	40	40
LAT: Len   30, alignment  0/30:	40	40
LAT: Len   31, alignment 31/ 0:	40	40
LAT: Len   31, alignment  0/31:	40	40
LAT: Len    0, alignment  0/ 8:	40	40
LAT: Len    0, alignment  1/ 8:	40	40
LAT: Len    0, alignment  4/ 8:	40	40
LAT: Len    1, alignment  0/ 8:	40	40
LAT: Len    1, alignment  1/ 8:	40	40
LAT: Len    1, alignment  4/ 8:	40	40
LAT: Len    2, alignment  0/ 8:	40	40
LAT: Len    2, alignment  1/ 8:	40	40
LAT: Len    2, alignment  4/ 8:	40	40
LAT: Len    3, alignment  0/ 8:	40	40
LAT: Len    3, alignment  1/ 8:	40	40
LAT: Len    3, alignment  4/ 8:	40	40
LAT: Len    4, alignment  0/ 8:	40	40
LAT: Len    4, alignment  1/ 8:	40	40
LAT: Len    4, alignment  4/ 8:	40	40
LAT: Len    5, alignment  0/ 8:	40	40
LAT: Len    5, alignment  1/ 8:	40	40
LAT: Len    5, alignment  4/ 8:	40	40
LAT: Len    6, alignment  0/ 8:	40	40
LAT: Len    6, alignment  1/ 8:	40	40
LAT: Len    6, alignment  4/ 8:	40	40
LAT: Len    7, alignment  0/ 8:	40	40
LAT: Len    7, alignment  1/ 8:	40	40
LAT: Len    7, alignment  4/ 8:	40	40
LAT: Len    8, alignment  0/ 8:	40	40
LAT: Len    8, alignment  1/ 8:	40	40
LAT: Len    8, alignment  4/ 8:	40	40
LAT: Len    9, alignment  0/ 8:	40	40
LAT: Len    9, alignment  1/ 8:	40	40
LAT: Len    9, alignment  4/ 8:	40	40
LAT: Len   10, alignment  0/ 8:	40	40
LAT: Len   10, alignment  1/ 8:	40	40
LAT: Len   10, alignment  4/ 8:	40	40
LAT: Len   11, alignment  0/ 8:	40	40
LAT: Len   11, alignment  1/ 8:	40	40
LAT: Len   11, alignment  4/ 8:	40	40
LAT: Len   12, alignment  0/ 8:	40	40
LAT: Len   12, alignment  1/ 8:	40	40
LAT: Len   12, alignment  4/ 8:	40	40
LAT: Len   13, alignment  0/ 8:	40	40
LAT: Len   13, alignment  1/ 8:	40	40
LAT: Len   13, alignment  4/ 8:	40	40
LAT: Len   14, alignment  0/ 8:	40	40
LAT: Len   14, alignment  1/ 8:	40	40
LAT: Len   14, alignment  4/ 8:	40	40
LAT: Len   15, alignment  0/ 8:	40	40
LAT: Len   15, alignment  1/ 8:	40	40
LAT: Len   15, alignment  4/ 8:	40	40
LAT: Len   16, alignment  0/ 8:	40	40
LAT: Len   16, alignment  1/ 8:	40	40
LAT: Len   16, alignment  4/ 8:	40	40
LAT: Len   17, alignment  0/ 8:	40	40
LAT: Len   17, alignment  1/ 8:	40	40
LAT: Len   17, alignment  4/ 8:	40	40
LAT: Len   18, alignment  0/ 8:	40	40
LAT: Len   18, alignment  1/ 8:	40	40
LAT: Len   18, alignment  4/ 8:	40	40
LAT: Len   19, alignment  0/ 8:	40	40
LAT: Len   19, alignment  1/ 8:	40	40
LAT: Len   19, alignment  4/ 8:	40	40
LAT: Len   20, alignment  0/ 8:	40	40
LAT: Len   20, alignment  1/ 8:	40	40
LAT: Len   20, alignment  4/ 8:	40	40
LAT: Len   21, alignment  0/ 8:	40	40
LAT: Len   21, alignment  1/ 8:	40	40
LAT: Len   21, alignment  4/ 8:	40	40
LAT: Len   22, alignment  0/ 8:	40	40
LAT: Len   22, alignment  1/ 8:	40	40
LAT: Len   22, alignment  4/ 8:	40	40
LAT: Len   23, alignment  0/ 8:	40	40
LAT: Len   23, alignment  1/ 8:	40	40
LAT: Len   23, alignment  4/ 8:	40	40
LAT: Len   24, alignment  0/ 8:	40	40
LAT: Len   24, alignment  1/ 8:	40	40
LAT: Len   24, alignment  4/ 8:	40	40
LAT: Len   25, alignment  0/ 8:	40	40
LAT: Len   25, alignment  1/ 8:	40	40
LAT: Len   25, alignment  4/ 8:	40	40
LAT: Len   26, alignment  0/ 8:	40	40
LAT: Len   26, alignment  1/ 8:	40	40
LAT: Len   26, alignment  4/ 8:	40	40
LAT: Len   27, alignment  0/ 8:	40	40
LAT: Len   27, alignment  1/ 8:	40	40
LAT: Len   27, alignment  4/ 8:	40	40
LAT: Len   28, alignment  0/ 8:	40	40
LAT: Len   28, alignment  1/ 8:	40	40
LAT: Len   28, alignment  4/ 8:	40	40
LAT: Len   29, alignment  0/ 8:	40	40
LAT: Len   29, alignment  1/ 8:	40	40
LAT: Len   29, alignment  4/ 8:	40	40
LAT: Len   30, alignment  0/ 8:	40	40
LAT: Len   30, alignment  1/ 8:	40	40
LAT: Len   30, alignment  4/ 8:	40	40
LAT: Len   31, alignment  0/ 8:	40	40
LAT: Len   31, alignment  1/ 8:	40	40
LAT: Len   31, alignment  4/ 8:	40	40
LAT: Len   32, alignment  0/ 8:	40	40
LAT: Len   32, alignment  1/ 8:	40	40
LAT: Len   32, alignment  4/ 8:	40	40
LAT: Len   33, alignment  0/ 8:	50	40
LAT: Len   33, alignment  1/ 8:	50	40
LAT: Len   33, alignment  4/ 8:	50	40
LAT: Len   34, alignment  0/ 8:	50	40
LAT: Len   34, alignment  1/ 8:	50	40
LAT: Len   34, alignment  4/ 8:	50	40
LAT: Len   35, alignment  0/ 8:	50	40
LAT: Len   35, alignment  1/ 8:	50	40
LAT: Len   35, alignment  4/ 8:	50	40
LAT: Len   36, alignment  0/ 8:	40	40
LAT: Len   36, alignment  1/ 8:	40	40
LAT: Len   36, alignment  4/ 8:	40	40
LAT: Len   37, alignment  0/ 8:	40	40
LAT: Len   37, alignment  1/ 8:	40	40
LAT: Len   37, alignment  4/ 8:	50	40
LAT: Len   38, alignment  0/ 8:	40	40
LAT: Len   38, alignment  1/ 8:	40	40
LAT: Len   38, alignment  4/ 8:	50	40
LAT: Len   39, alignment  0/ 8:	40	40
LAT: Len   39, alignment  1/ 8:	40	40
LAT: Len   39, alignment  4/ 8:	50	40
LAT: Len   40, alignment  0/ 8:	40	40
LAT: Len   40, alignment  1/ 8:	40	50
LAT: Len   40, alignment  4/ 8:	40	50
LAT: Len   41, alignment  0/ 8:	40	40
LAT: Len   41, alignment  1/ 8:	40	50
LAT: Len   41, alignment  4/ 8:	40	50
LAT: Len   42, alignment  0/ 8:	40	40
LAT: Len   42, alignment  1/ 8:	40	50
LAT: Len   42, alignment  4/ 8:	40	50
LAT: Len   43, alignment  0/ 8:	40	40
LAT: Len   43, alignment  1/ 8:	40	50
LAT: Len   43, alignment  4/ 8:	40	50
LAT: Len   44, alignment  0/ 8:	40	40
LAT: Len   44, alignment  1/ 8:	40	50
LAT: Len   44, alignment  4/ 8:	40	50
LAT: Len   45, alignment  0/ 8:	40	40
LAT: Len   45, alignment  1/ 8:	40	50
LAT: Len   45, alignment  4/ 8:	50	50
LAT: Len   46, alignment  0/ 8:	40	40
LAT: Len   46, alignment  1/ 8:	40	50
LAT: Len   46, alignment  4/ 8:	50	50
LAT: Len   47, alignment  0/ 8:	40	40
LAT: Len   47, alignment  1/ 8:	40	50
LAT: Len   47, alignment  4/ 8:	50	50
LAT: Len   48, alignment  3/ 0:	40	40
LAT: Len   48, alignment  0/ 3:	40	50
LAT: Len   80, alignment  5/ 0:	60	60
LAT: Len   80, alignment  0/ 5:	60	70
LAT: Len   96, alignment  6/ 0:	60	60
LAT: Len   96, alignment  0/ 6:	60	70
LAT: Len  112, alignment  7/ 0:	70	60
LAT: Len  112, alignment  0/ 7:	60	80
LAT: Len  144, alignment  9/ 0:	80	90
LAT: Len  144, alignment  0/ 9:	90	90
LAT: Len  160, alignment 10/ 0:	80	90
LAT: Len  160, alignment  0/10:	80	90
LAT: Len  176, alignment 11/ 0:	90	100
LAT: Len  176, alignment  0/11:	90	100
LAT: Len  192, alignment 12/ 0:	90	120
LAT: Len  192, alignment  0/12:	100	90
LAT: Len  208, alignment 13/ 0:	100	120
LAT: Len  208, alignment  0/13:	110	110
LAT: Len  224, alignment 14/ 0:	100	120
LAT: Len  224, alignment  0/14:	110	110
LAT: Len  240, alignment 15/ 0:	100	130
LAT: Len  240, alignment  0/15:	110	130
LAT: Len  272, alignment 17/ 0:	110	150
LAT: Len  272, alignment  0/17:	110	140
LAT: Len  288, alignment 18/ 0:	120	150
LAT: Len  288, alignment  0/18:	130	140
LAT: Len  304, alignment 19/ 0:	140	180
LAT: Len  304, alignment  0/19:	130	180
LAT: Len  320, alignment 20/ 0:	140	180
LAT: Len  320, alignment  0/20:	150	160
LAT: Len  336, alignment 21/ 0:	150	180
LAT: Len  336, alignment  0/21:	140	170
LAT: Len  352, alignment 22/ 0:	140	180
LAT: Len  352, alignment  0/22:	150	170
LAT: Len  368, alignment 23/ 0:	160	210
LAT: Len  368, alignment  0/23:	140	200
LAT: Len  384, alignment 24/ 0:	90	90
LAT: Len  384, alignment  0/24:	100	90
LAT: Len  400, alignment 25/ 0:	150	190
LAT: Len  400, alignment  0/25:	150	200
LAT: Len  416, alignment 26/ 0:	150	190
LAT: Len  416, alignment  0/26:	190	190
LAT: Len  432, alignment 27/ 0:	180	220
LAT: Len  432, alignment  0/27:	170	210
LAT: Len  448, alignment 28/ 0:	160	220
LAT: Len  448, alignment  0/28:	210	200
LAT: Len  464, alignment 29/ 0:	170	220
LAT: Len  464, alignment  0/29:	170	230
LAT: Len  480, alignment 30/ 0:	170	220
LAT: Len  480, alignment  0/30:	220	220
LAT: Len  496, alignment 31/ 0:	200	240
LAT: Len  496, alignment  0/31:	180	240
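
Each LAT line above gives the copy length, the source/destination alignment offsets, and two latency figures, presumably one per memcpy variant under comparison. Ma Ling's actual memcpy-kernel.c is not reproduced here; the loop below is only a minimal user-space sketch, assuming rdtsc-based timing of a single memcpy implementation and illustrative buffer sizes, of how numbers in this format can be collected:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static unsigned char src_buf[8192 + 64], dst_buf[8192 + 64];

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

/* Time one (len, src_align, dst_align) combination and print a LAT-style line. */
static void time_one(size_t len, size_t src_align, size_t dst_align)
{
	unsigned char *dst = dst_buf + dst_align;
	const unsigned char *src = src_buf + src_align;
	uint64_t best = UINT64_MAX, t0, t1;
	int i;

	for (i = 0; i < 32; i++) {
		t0 = rdtsc();
		memcpy(dst, src, len);
		t1 = rdtsc();
		if (t1 - t0 < best)
			best = t1 - t0;
	}
	printf("LAT: Len %4lu, alignment %2lu/%2lu:\t%llu\n",
	       (unsigned long)len, (unsigned long)src_align,
	       (unsigned long)dst_align, (unsigned long long)best);
}

int main(void)
{
	size_t len;

	for (len = 1; len <= 4096; len *= 2) {
		time_one(len, len % 16, 0);
		time_one(len, 0, len % 16);
	}
	return 0;
}

Built the same way as the quoted instructions for the real tool (cc -O2), this prints one LAT-style line per length/alignment pair; the real harness presumably times both memcpy variants side by side, which is why the table has two columns.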


* RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-18  6:23         ` Miao Xie
@ 2010-10-18  6:27           ` Ma, Ling
  2010-10-18  6:34             ` Miao Xie
  0 siblings, 1 reply; 15+ messages in thread
From: Ma, Ling @ 2010-10-18  6:27 UTC (permalink / raw)
  To: miaox
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel

Could you please send out the CPU info for this CPU model?

Thanks
Ling

-----Original Message-----
From: Miao Xie [mailto:miaox@cn.fujitsu.com] 
Sent: Monday, October 18, 2010 2:24 PM
To: Ma, Ling
Cc: H. Peter Anvin; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

On Fri, 15 Oct 2010 03:43:53 +0800, Ma, Ling wrote:
> Attachment includes memcpy-kernel.c(cc -O2 memcpy-kernel.c -o 
> memcpy-kernel), and unaligned test cases on Atom.

I have tested on my Core2 Duo machine with your benchmark tool; the attachment is the test result. But the result is different from yours on Atom: it seems the performance is better with this patch.

According to these two different results, maybe we need to optimize memcpy() by CPU model.

Thanks
Miao

>
> Thanks
> Ling
>
> -----Original Message-----
> From: Ma, Ling
> Sent: Thursday, October 14, 2010 9:14 AM
> To: 'H. Peter Anvin'; miaox@cn.fujitsu.com
> Cc: Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux 
> Kernel
> Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of 
> memcpy() for unaligned copy
>
> Sure, I will post benchmark tool and benchmark on Atom 64bit soon.
>
> Thanks
> Ling
>
> -----Original Message-----
> From: H. Peter Anvin [mailto:hpa@zytor.com]
> Sent: Thursday, October 14, 2010 5:32 AM
> To: miaox@cn.fujitsu.com
> Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; 
> Linux Kernel
> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of 
> memcpy() for unaligned copy
>
> On 10/08/2010 02:02 AM, Miao Xie wrote:
>> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
>>
>> Would you need my benchmark tool? I think it is helpful for your test.
>>
>
> If you could post the benchmark tool that would be great.
>
> 	-hpa



* Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-18  6:27           ` Ma, Ling
@ 2010-10-18  6:34             ` Miao Xie
  2010-10-18  6:43               ` Ma, Ling
  0 siblings, 1 reply; 15+ messages in thread
From: Miao Xie @ 2010-10-18  6:34 UTC (permalink / raw)
  To: Ma, Ling
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel

On Mon, 18 Oct 2010 14:27:40 +0800, Ma, Ling wrote:
> Could please send out cpu info for this cpu model.

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 23
model name	: Intel(R) Core(TM)2 Duo CPU     E7300  @ 2.66GHz
stepping	: 6
cpu MHz		: 1603.000
cache size	: 3072 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm
bogomips	: 5319.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

Thanks
Miao

>
> Thanks
> Ling
>
> -----Original Message-----
> From: Miao Xie [mailto:miaox@cn.fujitsu.com]
> Sent: Monday, October 18, 2010 2:24 PM
> To: Ma, Ling
> Cc: H. Peter Anvin; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
>
> On Fri, 15 Oct 2010 03:43:53 +0800, Ma, Ling wrote:
>> Attachment includes memcpy-kernel.c(cc -O2 memcpy-kernel.c -o
>> memcpy-kernel), and unaligned test cases on Atom.
>
> I have tested on my Core2 Duo machine with your benchmark tool. Attachment is the test result. But the result is different with yours on Atom, It seems the performance is better with this patch.
>
> According to these two different result, maybe we need optimize memcpy() by CPU model.
>
> Thanks
> Miao
>
>>
>> Thanks
>> Ling
>>
>> -----Original Message-----
>> From: Ma, Ling
>> Sent: Thursday, October 14, 2010 9:14 AM
>> To: 'H. Peter Anvin'; miaox@cn.fujitsu.com
>> Cc: Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux
>> Kernel
>> Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of
>> memcpy() for unaligned copy
>>
>> Sure, I will post benchmark tool and benchmark on Atom 64bit soon.
>>
>> Thanks
>> Ling
>>
>> -----Original Message-----
>> From: H. Peter Anvin [mailto:hpa@zytor.com]
>> Sent: Thursday, October 14, 2010 5:32 AM
>> To: miaox@cn.fujitsu.com
>> Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui;
>> Linux Kernel
>> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of
>> memcpy() for unaligned copy
>>
>> On 10/08/2010 02:02 AM, Miao Xie wrote:
>>> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>>>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>>>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>>>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
>>>
>>> Would you need my benchmark tool? I think it is helpful for your test.
>>>
>>
>> If you could post the benchmark tool that would be great.
>>
>> 	-hpa
>
>
>



* RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-18  6:34             ` Miao Xie
@ 2010-10-18  6:43               ` Ma, Ling
  2010-10-18  7:42                 ` Miao Xie
  0 siblings, 1 reply; 15+ messages in thread
From: Ma, Ling @ 2010-10-18  6:43 UTC (permalink / raw)
  To: miaox
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel

"wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm"

rep_good causes memcpy to jump to memcpy_c, so this patch is not run on that CPU;
we may continue to do further optimization on it later.
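
For context, memcpy_c is essentially a bulk "rep movsq" copy followed by a "rep movsb" tail, selected by the alternatives patching. A rough user-space sketch of that behaviour, using GNU inline asm and not claiming to match the kernel source line for line:

#include <stddef.h>

/*
 * Rough user-space equivalent of the rep-movs based copy that the
 * X86_FEATURE_REP_GOOD alternative selects: bulk 8-byte moves with
 * "rep movsq", then up to 7 trailing bytes with "rep movsb".
 */
static void *memcpy_rep(void *dst, const void *src, size_t len)
{
	void *ret = dst;
	size_t qwords = len >> 3;
	size_t tail = len & 7;

	asm volatile("rep movsq"
		     : "+D" (dst), "+S" (src), "+c" (qwords)
		     : : "memory");
	asm volatile("rep movsb"
		     : "+D" (dst), "+S" (src), "+c" (tail)
		     : : "memory");
	return ret;
}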

BTW, the improvement is only from the Core2 shift-register optimization,
but on most earlier CPUs register shifts are very sensitive because of the decode stage.
I have tested Atom, Opteron, and Nocona; the new patch is still better.
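
The shift technique referred to here performs only 8-byte-aligned loads from the source and reconstructs each unaligned quadword with a pair of shifts. A simplified little-endian C sketch of the idea (head and tail bytes are assumed to be handled separately, and the names are illustrative, not taken from the patch):

#include <stdint.h>
#include <string.h>

/* Aligned 8-byte load/store helpers; memcpy keeps the sketch free of
 * strict-aliasing issues and compiles down to single mov instructions. */
static inline uint64_t load8(const unsigned char *p)
{
	uint64_t w;
	memcpy(&w, p, 8);
	return w;
}

static inline void store8(unsigned char *p, uint64_t w)
{
	memcpy(p, &w, 8);
}

/*
 * Copy qwords * 8 bytes from the unaligned address src_base + off to dst
 * using only 8-byte-aligned loads, merging neighbouring words with shifts
 * (little-endian). 'off' is the source misalignment, 1..7. The final load
 * stays inside the aligned word that holds the last source byte, so it
 * cannot fault.
 */
static void copy_shifted(unsigned char *dst, const unsigned char *src_base,
			 unsigned int off, size_t qwords)
{
	uint64_t cur, next;
	size_t i;

	cur = load8(src_base);
	for (i = 0; i < qwords; i++) {
		next = load8(src_base + 8 * (i + 1));
		store8(dst + 8 * i,
		       (cur >> (8 * off)) | (next << (8 * (8 - off))));
		cur = next;
	}
}

On CPUs such as this Core2 the point is that an unaligned load or store that crosses a cache line is expensive, while aligned loads combined with shifts are comparatively cheap.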

Thanks
Ling

-----Original Message-----
From: Miao Xie [mailto:miaox@cn.fujitsu.com] 
Sent: Monday, October 18, 2010 2:35 PM
To: Ma, Ling
Cc: H. Peter Anvin; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

On Mon, 18 Oct 2010 14:27:40 +0800, Ma, Ling wrote:
> Could please send out cpu info for this cpu model.

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 23
model name	: Intel(R) Core(TM)2 Duo CPU     E7300  @ 2.66GHz
stepping	: 6
cpu MHz		: 1603.000
cache size	: 3072 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm
bogomips	: 5319.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

Thanks
Miao

>
> Thanks
> Ling
>
> -----Original Message-----
> From: Miao Xie [mailto:miaox@cn.fujitsu.com]
> Sent: Monday, October 18, 2010 2:24 PM
> To: Ma, Ling
> Cc: H. Peter Anvin; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
>
> On Fri, 15 Oct 2010 03:43:53 +0800, Ma, Ling wrote:
>> Attachment includes memcpy-kernel.c(cc -O2 memcpy-kernel.c -o
>> memcpy-kernel), and unaligned test cases on Atom.
>
> I have tested on my Core2 Duo machine with your benchmark tool. Attachment is the test result. But the result is different with yours on Atom, It seems the performance is better with this patch.
>
> According to these two different result, maybe we need optimize memcpy() by CPU model.
>
> Thanks
> Miao
>
>>
>> Thanks
>> Ling
>>
>> -----Original Message-----
>> From: Ma, Ling
>> Sent: Thursday, October 14, 2010 9:14 AM
>> To: 'H. Peter Anvin'; miaox@cn.fujitsu.com
>> Cc: Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux
>> Kernel
>> Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of
>> memcpy() for unaligned copy
>>
>> Sure, I will post benchmark tool and benchmark on Atom 64bit soon.
>>
>> Thanks
>> Ling
>>
>> -----Original Message-----
>> From: H. Peter Anvin [mailto:hpa@zytor.com]
>> Sent: Thursday, October 14, 2010 5:32 AM
>> To: miaox@cn.fujitsu.com
>> Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui;
>> Linux Kernel
>> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of
>> memcpy() for unaligned copy
>>
>> On 10/08/2010 02:02 AM, Miao Xie wrote:
>>> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>>>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>>>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>>>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
>>>
>>> Would you need my benchmark tool? I think it is helpful for your test.
>>>
>>
>> If you could post the benchmark tool that would be great.
>>
>> 	-hpa
>
>
>



* Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-18  6:43               ` Ma, Ling
@ 2010-10-18  7:42                 ` Miao Xie
  2010-10-18  8:01                   ` Ma, Ling
  0 siblings, 1 reply; 15+ messages in thread
From: Miao Xie @ 2010-10-18  7:42 UTC (permalink / raw)
  To: Ma, Ling
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel

On Mon, 18 Oct 2010 14:43:32 +0800, Ma, Ling wrote:
> "wp		: yes
> flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm"
>
> rep_good will cause memcpy jump to memcpy_c, so not run this patch,
> we may continue to do further optimization on it later.

Yes, but in fact the performance of memcpy_c is not better on some micro-architectures (such as
Wolfdale-3M), especially in the unaligned cases, so we need to do optimization for it, and I think
the first step of that optimization is optimizing the original code of memcpy().

> BTW the improvement is only from core2 shift register optimization,
> but for most previous cpus shift register is very sensitive because of decode stage.
> I have test Atom, Opteron, and Nocona, new patch is still better.

I think we can add a flag to make this improvement valid only for Core2 or other CPUs like it,
just like X86_FEATURE_REP_GOOD.
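
Purely as an illustration (the in-kernel mechanism would be a cpufeature bit plus alternatives patching; the model list and the run-time dispatch below are made up for the sketch), selecting a copy routine by CPU family/model can be prototyped in user space like this:

#include <cpuid.h>
#include <stdio.h>

/* Hypothetical sketch: detect Core2-class CPUs by family/model, in the
 * spirit of a new X86_FEATURE_* flag. The model list is illustrative. */
static int is_core2_like(void)
{
	unsigned int eax, ebx, ecx, edx, family, model;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 0;

	family = (eax >> 8) & 0xf;
	model = ((eax >> 4) & 0xf) | (((eax >> 16) & 0xf) << 4);

	/* Family 6, models 0x0f/0x17/0x1d are the Core2 line; the E7300 in
	 * this thread reports family 6, model 23 (0x17). */
	return family == 6 &&
	       (model == 0x0f || model == 0x17 || model == 0x1d);
}

int main(void)
{
	printf("would use the %s copy loop\n",
	       is_core2_like() ? "shift-based" : "generic");
	return 0;
}

A real flag would instead be set once during CPU identification, so memcpy itself never has to branch at run time.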

Regards
Miao

>
> Thanks
> Ling
>
> -----Original Message-----
> From: Miao Xie [mailto:miaox@cn.fujitsu.com]
> Sent: Monday, October 18, 2010 2:35 PM
> To: Ma, Ling
> Cc: H. Peter Anvin; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
>
> On Mon, 18 Oct 2010 14:27:40 +0800, Ma, Ling wrote:
>> Could please send out cpu info for this cpu model.
>
> processor	: 0
> vendor_id	: GenuineIntel
> cpu family	: 6
> model		: 23
> model name	: Intel(R) Core(TM)2 Duo CPU     E7300  @ 2.66GHz
> stepping	: 6
> cpu MHz		: 1603.000
> cache size	: 3072 KB
> physical id	: 0
> siblings	: 2
> core id		: 0
> cpu cores	: 2
> apicid		: 0
> initial apicid	: 0
> fpu		: yes
> fpu_exception	: yes
> cpuid level	: 10
> wp		: yes
> flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm
> bogomips	: 5319.99
> clflush size	: 64
> cache_alignment	: 64
> address sizes	: 36 bits physical, 48 bits virtual
> power management:
>
> Thanks
> Miao
>
>>
>> Thanks
>> Ling
>>
>> -----Original Message-----
>> From: Miao Xie [mailto:miaox@cn.fujitsu.com]
>> Sent: Monday, October 18, 2010 2:24 PM
>> To: Ma, Ling
>> Cc: H. Peter Anvin; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
>> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
>>
>> On Fri, 15 Oct 2010 03:43:53 +0800, Ma, Ling wrote:
>>> Attachment includes memcpy-kernel.c(cc -O2 memcpy-kernel.c -o
>>> memcpy-kernel), and unaligned test cases on Atom.
>>
>> I have tested on my Core2 Duo machine with your benchmark tool. Attachment is the test result. But the result is different with yours on Atom, It seems the performance is better with this patch.
>>
>> According to these two different result, maybe we need optimize memcpy() by CPU model.
>>
>> Thanks
>> Miao
>>
>>>
>>> Thanks
>>> Ling
>>>
>>> -----Original Message-----
>>> From: Ma, Ling
>>> Sent: Thursday, October 14, 2010 9:14 AM
>>> To: 'H. Peter Anvin'; miaox@cn.fujitsu.com
>>> Cc: Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux
>>> Kernel
>>> Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of
>>> memcpy() for unaligned copy
>>>
>>> Sure, I will post benchmark tool and benchmark on Atom 64bit soon.
>>>
>>> Thanks
>>> Ling
>>>
>>> -----Original Message-----
>>> From: H. Peter Anvin [mailto:hpa@zytor.com]
>>> Sent: Thursday, October 14, 2010 5:32 AM
>>> To: miaox@cn.fujitsu.com
>>> Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui;
>>> Linux Kernel
>>> Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of
>>> memcpy() for unaligned copy
>>>
>>> On 10/08/2010 02:02 AM, Miao Xie wrote:
>>>> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>>>>> Could you please give us full address for each comparison result,we will do some tests on my machine.
>>>>> For unaligned cases older cpus will crossing cache line and slow down caused by load and store, but for nhm, no necessary to care about it.
>>>>> By the way in kernel 64bit mode, our access mode should be around 8byte aligned.
>>>>
>>>> Would you need my benchmark tool? I think it is helpful for your test.
>>>>
>>>
>>> If you could post the benchmark tool that would be great.
>>>
>>> 	-hpa
>>
>>
>>
>
>
>



* RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-18  7:42                 ` Miao Xie
@ 2010-10-18  8:01                   ` Ma, Ling
  2010-10-19  2:53                     ` Miao Xie
  0 siblings, 1 reply; 15+ messages in thread
From: Ma, Ling @ 2010-10-18  8:01 UTC (permalink / raw)
  To: miaox
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel




>> rep_good will cause memcpy jump to memcpy_c, so not run this patch,
>> we may continue to do further optimization on it later.

>Yes, but in fact, the performance of memcpy_c is not better on some micro-architecture(such as:
>Wolfdale-3M, ), especially in the unaligned cases, so we need do optimization for it, and I think
>the first step of optimization is optimizing the original code of memcpy().

As mentioned above, we will further optimize memcpy_c soon.
Two reasons:
  1. the movs instruction needs a long latency to start up
  2. the movs instruction is not good for the unaligned case
  
>> BTW the improvement is only from core2 shift register optimization,
>> but for most previous cpus shift register is very sensitive because of decode stage. 
>> I have test Atom, Opteron, and Nocona, new patch is still better.

>I think we can add a flag to make this improvement only valid for Core2 or other CPU like it,
>just like X86_FEATURE_REP_GOOD.

We should optimize Core2 in the memcpy_c function in the future, I think.

Thanks
Ling




* Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-18  8:01                   ` Ma, Ling
@ 2010-10-19  2:53                     ` Miao Xie
  2010-10-19  4:06                       ` Ma, Ling
  0 siblings, 1 reply; 15+ messages in thread
From: Miao Xie @ 2010-10-19  2:53 UTC (permalink / raw)
  To: Ma, Ling
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel

On Mon, 18 Oct 2010 16:01:13 +0800, Ma, Ling wrote:
>>> rep_good will cause memcpy jump to memcpy_c, so not run this patch,
>>> we may continue to do further optimization on it later.
>
>> Yes, but in fact, the performance of memcpy_c is not better on some micro-architecture(such as:
>> Wolfdale-3M, ), especially in the unaligned cases, so we need do optimization for it, and I think
>> the first step of optimization is optimizing the original code of memcpy().
>
> As mentioned above , we will optimize further memcpy_c soon.
> Two reasons :
>    1. movs instruction need long lantency to startup
>    2. movs instruction is not good for unaligned case.
>
>>> BTW the improvement is only from core2 shift register optimization,
>>> but for most previous cpus shift register is very sensitive because of decode stage.
>>> I have test Atom, Opteron, and Nocona, new patch is still better.
>
>> I think we can add a flag to make this improvement only valid for Core2 or other CPU like it,
>> just like X86_FEATURE_REP_GOOD.
>
> We should optimize core2 in memcpy_c function in future, I think.

But there is a problem: if we use alternatives, the length of the new instructions must be less
than or equal to the length of the original instructions, but it seems the length of the
Core2-optimized code may be greater than the original. So I think we can't optimize Core2 in the
memcpy_c function, only in the memcpy function.

Regards
Miao


* RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
  2010-10-19  2:53                     ` Miao Xie
@ 2010-10-19  4:06                       ` Ma, Ling
  0 siblings, 0 replies; 15+ messages in thread
From: Ma, Ling @ 2010-10-19  4:06 UTC (permalink / raw)
  To: miaox
  Cc: H. Peter Anvin, Ingo Molnar, Andi Kleen, Thomas Gleixner, Zhao,
	Yakui, Linux Kernel



On Mon, 18 Oct 2010 16:01:13 +0800, Ma, Ling wrote:
>>>> rep_good will cause memcpy jump to memcpy_c, so not run this patch,
>>> we may continue to do further optimization on it later.
>>
>>> Yes, but in fact, the performance of memcpy_c is not better on some micro-architecture(such as:
>>> Wolfdale-3M, ), especially in the unaligned cases, so we need do optimization for it, and I think
>>> the first step of optimization is optimizing the original code of memcpy().
>>
>> As mentioned above , we will optimize further memcpy_c soon.
>> Two reasons :
>>    1. movs instruction need long lantency to startup
>>    2. movs instruction is not good for unaligned case.
>>
>>>> BTW the improvement is only from core2 shift register optimization,
>>>> but for most previous cpus shift register is very sensitive because of decode stage.
>>>> I have test Atom, Opteron, and Nocona, new patch is still better.
>>
>>> I think we can add a flag to make this improvement only valid for Core2 or other CPU like it,
>>> just like X86_FEATURE_REP_GOOD.
>>
>> We should optimize core2 in memcpy_c function in future, I think.

>But there is a problem, the length of new instruction must be less or equal the length of
>original instruction if we use alternatives, but IT seems the length of core2's optimization
>instruction may be greater than the original instruction. So I think we can't optimize core2
>in memcpy_c function, just in memcpy function.
We keep the above rule because we worry about i-cache capacity misses hurting total performance.
However, we have several questions about it given modern CPU architecture:
1. The current Linux kernel is far bigger than previous versions relative to the i-cache size (32KB).
2. Hardware prefetch prediction has become more important and sophisticated; even when we access the
   current cache line, the hardware prefetcher will fetch the next line/lines on Intel and AMD platforms.
3. Based on our tests, we don't find that compiling with -Os (for size) is consistently better than -O2
   (for performance) on modern CPUs, on workloads such as specjbb2005/2000, volano, kbuild, etc.
4. We have found that memcpy_c has a performance problem; we should manage to resolve it in as small a
   code size as possible. It is strange to separate Core2 from other CPUs by appending a new flag,
   and I think your patch must be bigger than the last version.

Thanks
Ling




end of thread (newest: 2010-10-19  4:06 UTC)

Thread overview: 15+ messages
2010-10-08  7:28 [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy Miao Xie
2010-10-08  7:42 ` Ma, Ling
2010-10-08  9:02   ` Miao Xie
2010-10-13 21:31     ` H. Peter Anvin
2010-10-14  1:14       ` Ma, Ling
2010-10-14 19:43       ` Ma, Ling
2010-10-18  6:23         ` Miao Xie
2010-10-18  6:27           ` Ma, Ling
2010-10-18  6:34             ` Miao Xie
2010-10-18  6:43               ` Ma, Ling
2010-10-18  7:42                 ` Miao Xie
2010-10-18  8:01                   ` Ma, Ling
2010-10-19  2:53                     ` Miao Xie
2010-10-19  4:06                       ` Ma, Ling
2010-10-18  3:12       ` Miao Xie
