RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register

From: "Ma, Ling" <ling.ma@intel.com>
To: Borislav Petkov <bp@alien8.de>
Cc: Konrad Rzeszutek Wilk <konrad@kernel.org>,
	"mingo@elte.hu" <mingo@elte.hu>, "hpa@zytor.com" <hpa@zytor.com>,
	"tglx@linutronix.de" <tglx@linutronix.de>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"iant@google.com" <iant@google.com>
Subject: RE: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
Date: Fri, 12 Oct 2012 09:07:43 +0000	[thread overview]
Message-ID: <B2310DA9850C8743AA7AA0055500E90F0FD70AC5@SHSMSX102.ccr.corp.intel.com> (raw)
In-Reply-To: <20121012061813.GC9881@liondog.tnic>

[-- Attachment #1: Type: text/plain, Size: 812 bytes --]

> > > So is that also true for AMD CPUs?
> > Although Bulldozer put 32byte instruction into decoupled 16byte entry
> > buffers, it still decode 4 instructions per cycle, so 4 instructions
> > will be fed into execution unit and
> > 2 loads ,1 write will be issued per cycle.
> 
> I'd be very interested with what benchmarks are you seeing that perf
> improvement on Atom and who knows, maybe I could find time to run them
> on Bulldozer and see how your patch behaves there :-).M
I use another benchmark from gcc, there are many code, and extract one simple benchmark, you may use it to test (cc -o copy_page copy_page.c),
my initial result shows new copy page version is still better on bulldozer machine, because the machine is first release, please verify result.
And CC to Ian.

Thanks
Ling


[-- Attachment #2: copy_page.c --]
[-- Type: text/plain, Size: 5975 bytes --]

#include<stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define  MAXSAMPLESTPT        1000
#define  MAXCOPYSIZE          (1024 * 1024)
#define  ORIG  0
#define  NEW   1
static char* buf1 = NULL;
static char* buf2 = NULL;
static int repeat_one_test = 32;

hp_timing_t _dl_hp_timing_overhead;
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
	total_time += tmptime;						\
    }									\
  while (0)

#define HP_TIMING_BEST(best_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
      if (best_time > tmptime)						\
	best_time = tmptime;						\
    }									\
  while (0)


void copy_page_org(char *dst, char *src, int len);
void copy_page_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_test ( char *dst, char *src,
	     size_t len)
{
      hp_timing_t start __attribute ((unused));
      hp_timing_t stop __attribute ((unused));
      hp_timing_t best_time = ~ (hp_timing_t) 0;
      size_t i,j;

      for (i = 0; i < repeat_one_test; ++i)
	{
	  HP_TIMING_NOW (start);
	  do_memcpy ( dst, src, len);
	  HP_TIMING_NOW (stop);
	  HP_TIMING_BEST (best_time, start, stop);
	}

      printf ("\t%zd", (size_t) best_time);
}

static void
do_test (size_t align1, size_t align2, size_t len)
{
  size_t i, j;
  char *s1, *s2;

  s1 = (char *) (buf1 + align1);
  s2 = (char *) (buf2 + align2);


   printf ("TPT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);
   do_memcpy = copy_page_org;
   do_one_test (s2, s1, len);
   do_memcpy = copy_page_new;
   do_one_test (s2+ (1 << 16), s1 + (1 << 16), len);
    putchar ('\n');
}

static test_init(void)
{
  int i;
  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);

  for (i = 0; i < MAXCOPYSIZE ; i = i + 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }

}

void copy_page_new(char *dst, char *src, int len)
{
	__asm__("mov	$(4096/64)-5, %ecx");
__asm__("1:");
	__asm__("prefetcht0 5*64(%rsi)");
	__asm__("decb	%cl");

	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");
	__asm__("jnz     1b");
	__asm__("mov	$5, %dl");
__asm__("2:");
	__asm__("decb	%dl");
	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");

	__asm__("jnz	2b");

}


void copy_page_org(char *dst, char *src, int len)
{

	__asm__("subq	$2*8,%rsp");
	__asm__("movq	%rbx,(%rsp)");
	__asm__("movq	%r12,1*8(%rsp)");
	__asm__("movl	$(4096/64)-5,%ecx");
	__asm__(".p2align 4");
__asm__("1:");
	__asm__("dec     %rcx");

	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("prefetcht0 5*64(%rsi)");

	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	__asm__("leaq    64 (%rsi), %rsi");
	__asm__("leaq    64 (%rdi), %rdi");
	__asm__("jnz     1b");

	__asm__("movl	$5,%ecx");
	__asm__(".p2align 4");
__asm__("2:");
	__asm__("decl   %ecx");

	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	__asm__("leaq	64(%rdi),%rdi");
	__asm__("leaq	64(%rsi),%rsi");

	__asm__("jnz	2b");

	__asm__("movq	(%rsp),%rbx");
	__asm__("movq	1*8(%rsp),%r12");
	__asm__("addq	$2*8,%rsp");
}


void main(void)
{
  int i;
	test_init();
	printf ("%23s", "");
	printf ("\t%s\t%s\t%s\n", "copy_page_org", "copy_page_new");

	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	do_test(0, 0, 4096);
	return ;
}