From: "Ma, Ling" <ling.ma@intel.com>
To: Borislav Petkov <bp@alien8.de>
Cc: Konrad Rzeszutek Wilk <konrad@kernel.org>,
"mingo@elte.hu" <mingo@elte.hu>, "hpa@zytor.com" <hpa@zytor.com>,
"tglx@linutronix.de" <tglx@linutronix.de>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
"iant@google.com" <iant@google.com>
Subject: RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
Date: Fri, 12 Oct 2012 09:07:43 +0000 [thread overview]
Message-ID: <B2310DA9850C8743AA7AA0055500E90F0FD70AC5@SHSMSX102.ccr.corp.intel.com> (raw)
In-Reply-To: <20121012061813.GC9881@liondog.tnic>
[-- Attachment #1: Type: text/plain, Size: 812 bytes --]
> > > So is that also true for AMD CPUs?
> > Although Bulldozer puts 32 bytes of instructions into decoupled 16-byte entry
> > buffers, it still decodes 4 instructions per cycle, so 4 instructions
> > will be fed into the execution units and
> > 2 loads and 1 write will be issued per cycle.
>
> I'd be very interested in which benchmarks show that perf
> improvement on Atom and, who knows, maybe I could find time to run them
> on Bulldozer and see how your patch behaves there :-).
I used another benchmark taken from gcc; it contains a lot of code, so I extracted one simple benchmark that you may use for testing (cc -o copy_page copy_page.c).
My initial results show that the new copy_page version is still better on a Bulldozer machine; because that machine is a first release, please verify the result.
And CC to Ian.
Thanks
Ling
[-- Attachment #2: copy_page.c --]
[-- Type: text/plain, Size: 5975 bytes --]
#include<stdio.h>
#include <stdlib.h>
/* Unsigned 64-bit-or-wider counter type used for every timestamp and
   time difference in this benchmark. */
typedef unsigned long long int hp_timing_t;
/* Not referenced by any code shown in this file. */
#define MAXSAMPLESTPT 1000
/* Size in bytes of each of the two buffers allocated by test_init(). */
#define MAXCOPYSIZE (1024 * 1024)
/* Selector constants; not referenced by any code shown in this file. */
#define ORIG 0
#define NEW 1
/* Work buffers allocated by test_init().  do_test() derives the source
   pointer from buf1 and the destination pointer from buf2. */
static char* buf1 = NULL;
static char* buf2 = NULL;
/* Number of timed calls do_one_test() makes per measurement. */
static int repeat_one_test = 32;
/* Added to the start timestamp by HP_TIMING_TOTAL and HP_TIMING_BEST.
   Nothing in the code shown here assigns it, so it keeps the zero value
   that file-scope objects start with. */
hp_timing_t _dl_hp_timing_overhead;
/*
 * Read the x86 time-stamp counter into Var.
 *
 * RDTSC delivers the low half of the counter in EAX ("=a") and the high
 * half in EDX ("=d"); the two halves are merged into one 64-bit value.
 * The read is not preceded or followed by any serializing instruction.
 *
 * The GNU alternate keywords __asm__/__volatile__ are used instead of plain
 * `asm` so that the macro also expands correctly when the file is compiled
 * in a strict ISO mode such as -std=c99 or -std=c11, where `asm` is not a
 * keyword.  This matches the __asm__ spelling used by the copy routines.
 */
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long hi_, lo_; \
     __asm__ __volatile__ ("rdtsc" : "=a" (lo_), "=d" (hi_)); \
     (Var) = (hi_ << 32) | lo_; })
/* Store End - Start into Diff. */
#define HP_TIMING_DIFF(Diff, Start, End) ((Diff) = (End) - (Start))

/*
 * Accumulate one measured interval into total_time.  The interval is
 * end - (start + _dl_hp_timing_overhead).
 */
#define HP_TIMING_TOTAL(total_time, start, end) \
  do { \
    hp_timing_t elapsed_; \
    HP_TIMING_DIFF (elapsed_, start + _dl_hp_timing_overhead, end); \
    total_time += elapsed_; \
  } while (0)

/*
 * Keep the smallest interval seen so far in best_time.  The interval is
 * end - (start + _dl_hp_timing_overhead); best_time is overwritten only
 * when the new interval is strictly smaller.
 */
#define HP_TIMING_BEST(best_time, start, end) \
  do { \
    hp_timing_t elapsed_; \
    HP_TIMING_DIFF (elapsed_, start + _dl_hp_timing_overhead, end); \
    if (elapsed_ < best_time) \
      best_time = elapsed_; \
  } while (0)
/* The two copy routines being compared; both are defined later in this
   file and share the (dst, src, len) signature. */
void copy_page_org(char *dst, char *src, int len);
void copy_page_new(char *dst, char *src, int len);
/* Declared only: no definition or use of memcpy_c appears in the code
   shown in this file. */
void memcpy_c(char *dst, char *src, int len);
/* Routine currently being timed.  do_test() assigns it before each call
   to do_one_test(), which invokes it indirectly. */
void (*do_memcpy)(char *dst, char *src, int len);
/*
 * Time the routine currently selected in do_memcpy and print the result.
 *
 * The routine is called repeat_one_test times with the same arguments; each
 * call is bracketed by two HP_TIMING_NOW reads and the smallest interval is
 * kept.  That best value is printed as a tab-prefixed decimal number with no
 * trailing newline, so several results can share one output line.
 *
 * dst, src: forwarded unchanged to the routine under test.
 * len:      forwarded as the routine's int length argument.
 */
static void
do_one_test (char *dst, char *src, size_t len)
{
  hp_timing_t start;
  hp_timing_t stop;
  /* Start from the largest representable value so the first sample wins. */
  hp_timing_t best_time = ~(hp_timing_t) 0;
  int i;	/* same type as repeat_one_test: no signed/unsigned compare */

  for (i = 0; i < repeat_one_test; ++i)
    {
      HP_TIMING_NOW (start);
      /* The routines take an int length; make the narrowing explicit. */
      do_memcpy (dst, src, (int) len);
      HP_TIMING_NOW (stop);
      HP_TIMING_BEST (best_time, start, stop);
    }

  /* %zu is the conversion that matches the unsigned size_t argument. */
  printf ("\t%zu", (size_t) best_time);
}
/*
 * Print one result line comparing copy_page_org with copy_page_new.
 *
 * align1: byte offset added to buf1 to form the source pointer.
 * align2: byte offset added to buf2 to form the destination pointer.
 * len:    length passed through to do_one_test().
 *
 * The line starts with the length and both offsets, followed by the best
 * time of each routine (printed by do_one_test()) and a newline.
 * copy_page_new is measured on pointers a further 1 << 16 bytes into the
 * same buffers, so the two routines do not copy the same memory.
 */
static void
do_test (size_t align1, size_t align2, size_t len)
{
  char *s1 = buf1 + align1;
  char *s2 = buf2 + align2;

  /* size_t is unsigned, so the matching conversion is %zu. */
  printf ("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

  do_memcpy = copy_page_org;
  do_one_test (s2, s1, len);

  do_memcpy = copy_page_new;
  do_one_test (s2 + (1 << 16), s1 + (1 << 16), len);

  putchar ('\n');
}
/*
 * Allocate the two MAXCOPYSIZE-byte work buffers and write to them.
 *
 * buf1 and buf2 are obtained from valloc().  One byte is then written at
 * every 64th offset of both buffers (value: low 8 bits of the offset), so
 * the whole range has been stored to before any timing starts.
 *
 * The function returns nothing; the return type is spelled out because
 * implicit int is not valid C since C99.  If either allocation fails the
 * program reports the failure and exits instead of dereferencing NULL.
 */
static void test_init(void)
{
	int i;

	buf1 = valloc(MAXCOPYSIZE);
	buf2 = valloc(MAXCOPYSIZE);
	if (buf1 == NULL || buf2 == NULL) {
		fprintf(stderr, "copy_page: cannot allocate test buffers\n");
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < MAXCOPYSIZE; i += 64)
		buf1[i] = buf2[i] = i & 0xff;
}
/*
 * Copy one 4096-byte page from src to dst (proposed instruction ordering).
 *
 * dst: destination, at least 4096 writable bytes.
 * src: source, at least 4096 readable bytes.
 * len: ignored; the byte count is fixed at 4096 by the loop counters.
 *
 * Loop 1 runs (4096/64)-5 = 59 times and copies 64 bytes per iteration
 * while prefetching the address 5*64 bytes ahead of the current source
 * position.  Loop 2 copies the remaining 5 * 64 bytes without prefetching,
 * so no prefetch address falls outside the source page.  Each 64-byte step
 * is done as two groups of four 8-byte loads followed by four 8-byte
 * stores, using only rax, r8, r9 and r10 for data.
 *
 * The whole sequence lives in ONE extended asm statement: the pointers are
 * bound to %rdi/%rsi through the "+D"/"+S" operands and every scratch
 * register, the flags and memory are listed as clobbered.  Separate
 * operand-less asm statements give the compiler no such information, are
 * not guaranteed to stay adjacent, and must not jump into one another.
 * dec sets ZF and neither mov nor lea changes the flags, so the jnz at the
 * bottom of each loop still tests the counter decremented at its top.
 */
void copy_page_new(char *dst, char *src, int len)
{
	(void) len;

	__asm__ __volatile__ (
		"mov $(4096/64)-5, %%ecx\n"
		"1:\n\t"
		"prefetcht0 5*64(%%rsi)\n\t"
		"decb %%cl\n\t"
		"movq 0x8*0(%%rsi), %%r10\n\t"
		"movq 0x8*1(%%rsi), %%rax\n\t"
		"movq 0x8*2(%%rsi), %%r8\n\t"
		"movq 0x8*3(%%rsi), %%r9\n\t"
		"movq %%r10, 0x8*0(%%rdi)\n\t"
		"movq %%rax, 0x8*1(%%rdi)\n\t"
		"movq %%r8, 0x8*2(%%rdi)\n\t"
		"movq %%r9, 0x8*3(%%rdi)\n\t"
		"movq 0x8*4(%%rsi), %%r10\n\t"
		"movq 0x8*5(%%rsi), %%rax\n\t"
		"movq 0x8*6(%%rsi), %%r8\n\t"
		"movq 0x8*7(%%rsi), %%r9\n\t"
		"leaq 64(%%rsi), %%rsi\n\t"
		"movq %%r10, 0x8*4(%%rdi)\n\t"
		"movq %%rax, 0x8*5(%%rdi)\n\t"
		"movq %%r8, 0x8*6(%%rdi)\n\t"
		"movq %%r9, 0x8*7(%%rdi)\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"
		"jnz 1b\n\t"
		"mov $5, %%dl\n"
		"2:\n\t"
		"decb %%dl\n\t"
		"movq 0x8*0(%%rsi), %%r10\n\t"
		"movq 0x8*1(%%rsi), %%rax\n\t"
		"movq 0x8*2(%%rsi), %%r8\n\t"
		"movq 0x8*3(%%rsi), %%r9\n\t"
		"movq %%r10, 0x8*0(%%rdi)\n\t"
		"movq %%rax, 0x8*1(%%rdi)\n\t"
		"movq %%r8, 0x8*2(%%rdi)\n\t"
		"movq %%r9, 0x8*3(%%rdi)\n\t"
		"movq 0x8*4(%%rsi), %%r10\n\t"
		"movq 0x8*5(%%rsi), %%rax\n\t"
		"movq 0x8*6(%%rsi), %%r8\n\t"
		"movq 0x8*7(%%rsi), %%r9\n\t"
		"leaq 64(%%rsi), %%rsi\n\t"
		"movq %%r10, 0x8*4(%%rdi)\n\t"
		"movq %%rax, 0x8*5(%%rdi)\n\t"
		"movq %%r8, 0x8*6(%%rdi)\n\t"
		"movq %%r9, 0x8*7(%%rdi)\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"
		"jnz 2b"
		: "+D" (dst), "+S" (src)
		:
		: "rax", "rcx", "rdx", "r8", "r9", "r10", "cc", "memory");
}
/*
 * Copy one 4096-byte page from src to dst (baseline instruction ordering).
 *
 * dst: destination, at least 4096 writable bytes.
 * src: source, at least 4096 readable bytes.
 * len: ignored; the byte count is fixed at 4096 by the loop counters.
 *
 * Loop 1 runs (4096/64)-5 = 59 times: eight 8-byte loads, a prefetch of the
 * address 5*64 bytes ahead, then eight 8-byte stores.  Loop 2 copies the
 * remaining 5 * 64 bytes the same way but without the prefetch.  Eight data
 * registers are used, two of which (rbx, r12) are callee-saved.
 *
 * The whole sequence lives in ONE extended asm statement with "+D"/"+S"
 * operands for the pointers and a full clobber list.  rbx and r12 are
 * declared as clobbered, so the compiler saves and restores them around the
 * function; this replaces a hand-written save that lowered %rsp and stored
 * below the incoming stack pointer behind the compiler's back.  The save
 * and restore still happen inside the timed call, but the exact
 * instructions are now chosen by the compiler.
 * dec sets ZF and neither mov, prefetch nor lea changes the flags, so the
 * jnz at the bottom of each loop tests the counter decremented at its top.
 */
void copy_page_org(char *dst, char *src, int len)
{
	(void) len;

	__asm__ __volatile__ (
		"movl $(4096/64)-5, %%ecx\n\t"
		".p2align 4\n"
		"1:\n\t"
		"dec %%rcx\n\t"
		"movq (%%rsi), %%rax\n\t"
		"movq 8(%%rsi), %%rbx\n\t"
		"movq 16(%%rsi), %%rdx\n\t"
		"movq 24(%%rsi), %%r8\n\t"
		"movq 32(%%rsi), %%r9\n\t"
		"movq 40(%%rsi), %%r10\n\t"
		"movq 48(%%rsi), %%r11\n\t"
		"movq 56(%%rsi), %%r12\n\t"
		"prefetcht0 5*64(%%rsi)\n\t"
		"movq %%rax, (%%rdi)\n\t"
		"movq %%rbx, 8(%%rdi)\n\t"
		"movq %%rdx, 16(%%rdi)\n\t"
		"movq %%r8, 24(%%rdi)\n\t"
		"movq %%r9, 32(%%rdi)\n\t"
		"movq %%r10, 40(%%rdi)\n\t"
		"movq %%r11, 48(%%rdi)\n\t"
		"movq %%r12, 56(%%rdi)\n\t"
		"leaq 64(%%rsi), %%rsi\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"
		"jnz 1b\n\t"
		"movl $5, %%ecx\n\t"
		".p2align 4\n"
		"2:\n\t"
		"decl %%ecx\n\t"
		"movq (%%rsi), %%rax\n\t"
		"movq 8(%%rsi), %%rbx\n\t"
		"movq 16(%%rsi), %%rdx\n\t"
		"movq 24(%%rsi), %%r8\n\t"
		"movq 32(%%rsi), %%r9\n\t"
		"movq 40(%%rsi), %%r10\n\t"
		"movq 48(%%rsi), %%r11\n\t"
		"movq 56(%%rsi), %%r12\n\t"
		"movq %%rax, (%%rdi)\n\t"
		"movq %%rbx, 8(%%rdi)\n\t"
		"movq %%rdx, 16(%%rdi)\n\t"
		"movq %%r8, 24(%%rdi)\n\t"
		"movq %%r9, 32(%%rdi)\n\t"
		"movq %%r10, 40(%%rdi)\n\t"
		"movq %%r11, 48(%%rdi)\n\t"
		"movq %%r12, 56(%%rdi)\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"
		"leaq 64(%%rsi), %%rsi\n\t"
		"jnz 2b"
		: "+D" (dst), "+S" (src)
		:
		: "rax", "rbx", "rcx", "rdx", "r8", "r9", "r10", "r11", "r12",
		  "cc", "memory");
}
/*
 * Benchmark driver.
 *
 * Sets up the buffers, prints a header row naming the two result columns,
 * then runs the same 4096-byte, offset-0 comparison five times so that
 * run-to-run variation is visible in the output.
 *
 * Returns 0.  main is declared to return int as the C standard requires,
 * and the header format has exactly one %s per argument; a third %s with
 * no matching argument is undefined behaviour.
 */
int main(void)
{
	int run;

	test_init();

	/* 23 blanks stand in for the row label printed by do_test(). */
	printf("%23s", "");
	printf("\t%s\t%s\n", "copy_page_org", "copy_page_new");

	for (run = 0; run < 5; ++run)
		do_test(0, 0, 4096);

	return 0;
}
next prev parent reply other threads:[~2012-10-12 9:07 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-10-11 12:29 [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register ling.ma
2012-10-11 13:40 ` Andi Kleen
2012-10-12 3:10 ` Ma, Ling
2012-10-12 13:35 ` Andi Kleen
2012-10-12 14:54 ` Ma, Ling
2012-10-12 15:14 ` Andi Kleen
2012-10-11 14:35 ` Konrad Rzeszutek Wilk
2012-10-12 3:37 ` Ma, Ling
2012-10-12 6:18 ` Borislav Petkov
2012-10-12 9:07 ` Ma, Ling [this message]
2012-10-12 18:04 ` Borislav Petkov
2012-10-14 10:58 ` Borislav Petkov
2012-10-15 5:00 ` Ma, Ling
2012-10-15 5:13 ` George Spelvin
2012-10-12 21:02 George Spelvin
2012-10-12 23:17 ` Borislav Petkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=B2310DA9850C8743AA7AA0055500E90F0FD70AC5@SHSMSX102.ccr.corp.intel.com \
--to=ling.ma@intel.com \
--cc=bp@alien8.de \
--cc=hpa@zytor.com \
--cc=iant@google.com \
--cc=konrad@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).