RE: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast string.

From: "Ma, Ling" <ling.ma@intel.com>
To: Cyrill Gorcunov <gorcunov@gmail.com>, "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>, Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	linux-kernel <linux-kernel@vger.kernel.org>
Subject: RE: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast 	string.
Date: Thu, 12 Nov 2009 12:49:04 +0800	[thread overview]
Message-ID: <8FED46E8A9CA574792FC7AACAC38FE7714FE8307DB@PDSMSX501.ccr.corp.intel.com> (raw)
In-Reply-To: <aa79d98a0911112028nf3fc475r30aa8dc37936ea22@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2814 bytes --]

Hi All
Attached is the latest memcpy.c; please update your copy and build it with
"cc -o memcpy memcpy.c -O2 -m64".

Thanks
Ling

>-----Original Message-----
>From: Cyrill Gorcunov [mailto:gorcunov@gmail.com]
>Sent: 2009年11月12日 12:28
>To: H. Peter Anvin
>Cc: Ma, Ling; Ingo Molnar; Ingo Molnar; Thomas Gleixner; linux-kernel
>Subject: Re: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast
>string.
>
>On Thu, Nov 12, 2009 at 1:39 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 11/11/2009 12:34 PM, Cyrill Gorcunov wrote:
>>>                                               memcpy_orig     memcpy_new
>>> TPT: Len 1024, alignment  8/ 0:               490             570
>>> TPT: Len 2048, alignment  8/ 0:               826             329
>>> TPT: Len 3072, alignment  8/ 0:               441             464
>>> TPT: Len 4096, alignment  8/ 0:               579             596
>>> TPT: Len 5120, alignment  8/ 0:               723             729
>>> TPT: Len 6144, alignment  8/ 0:               859             861
>>> TPT: Len 7168, alignment  8/ 0:               996             994
>>> TPT: Len 8192, alignment  8/ 0:               1165            1127
>>> TPT: Len 9216, alignment  8/ 0:               1273            1260
>>> TPT: Len 10240, alignment  8/ 0:              1402            1395
>>> TPT: Len 11264, alignment  8/ 0:              1543            1525
>>> TPT: Len 12288, alignment  8/ 0:              1682            1659
>>> TPT: Len 13312, alignment  8/ 0:              1869            1815
>>> TPT: Len 14336, alignment  8/ 0:              1982            1951
>>> TPT: Len 15360, alignment  8/ 0:              2185            2110
>>>
>>> I've run this test a few times and the results are almost the same;
>>> for lengths 1024, 3072, 4096, 5120 and 6144 the new version is a bit slower.
>>>
>>
>> Was the result for 2048 consistent (it seems odd in the extreme)... the
>> discrepancy between this result and Ling's results bothers me; perhaps
>> the right answer is to leave the current code for Core2 and use new code
>> (with a lower than 1024 threshold?) for NHM and K8?
>>
>>        -hpa
>>
>
>Hi Peter,
>
>No, the result for 2048 is not repeatable (that is why I didn't mention this number
>in my earlier report).
>
>Test1:
>TPT: Len 2048, alignment  8/ 0:	826	329
>Test2:
>TPT: Len 2048, alignment  8/ 0:	359	329
>Test3:
>TPT: Len 2048, alignment  8/ 0:	306	331
>Test4:
>TPT: Len 2048, alignment  8/ 0:	415	329
>
>I guess this was due to the CPU frequency changing from 800 MHz to 2.1 GHz, since
>I ran the tests manually
>rather than using any kind of bash loop to run the test program.

[-- Attachment #2: memcpy.c --]
[-- Type: text/plain, Size: 5495 bytes --]

#include<stdio.h>
#include <stdlib.h>

/* 64-bit cycle count as assembled from the two halves returned by rdtsc. */
typedef unsigned long long int hp_timing_t;

/* Benchmark configuration. */
#define  MAXSAMPLESTPT        100000		/* timed calls per measurement */
#define  MAXCOPYSIZE          (1024 * 32)	/* size in bytes of each test buffer */
#define  ORIG  0	/* presumably selectors for the two routines; */
#define  NEW   1	/* not referenced by the code visible in this file */

/* Source and destination test buffers; allocated by test_init(). */
static char *buf1 = NULL;
static char *buf2 = NULL;

/*
 * Value added to the start stamp of every sample before the difference is
 * taken. Nothing visible in this file assigns it, so it keeps its static
 * zero initialisation.
 */
hp_timing_t _dl_hp_timing_overhead;

/*
 * Store the current time-stamp counter in Var. rdtsc returns the low half
 * in eax and the high half in edx. Spelled __asm__ so the macro also works
 * when the compiler runs in an ISO -std= mode, matching the rest of the file.
 */
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     __asm__ __volatile__ ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = (_hi << 32) | _lo; })

/* Diff = End - Start. */
#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))

/* Add (end - (start + _dl_hp_timing_overhead)) to total_time. */
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, (start) + _dl_hp_timing_overhead, (end));	\
      (total_time) += tmptime;						\
    }									\
  while (0)

/* Copy routines under test; each copies len bytes from src to dst. */
void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
/* NOTE(review): no definition of memcpy_c is visible in this file. */
void memcpy_c(char *dst, char *src, int len);

/* Routine that do_one_throughput() times; set by do_tpt_test(). */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time MAXSAMPLESTPT calls of the routine currently selected in do_memcpy,
 * each copying len bytes from src to dst, and print the mean per-call
 * time-stamp-counter delta as one tab-prefixed column (no newline).
 *
 * dst, src: destination and source of every timed copy; the caller applies
 *           any alignment offset before passing them in.
 * len:      number of bytes copied per call.
 */
static void
do_one_throughput (char *dst, char *src, size_t len)
{
  size_t i;
  hp_timing_t start;
  hp_timing_t stop;
  hp_timing_t total_time = 0;

  /* cpuid is a serialising instruction: let earlier work retire before
     the first time stamp is taken.  */
  __asm__("cpuid" : : : "eax", "ebx", "ecx", "edx");
  __asm__("cpuid" : : : "eax", "ebx", "ecx", "edx");

  for (i = 0; i < MAXSAMPLESTPT; ++i)
    {
      HP_TIMING_NOW (start);
      /* Copy between the buffers the caller asked for, so that its
         alignment offsets actually take effect.  */
      do_memcpy (dst, src, (int) len);
      HP_TIMING_NOW (stop);
      HP_TIMING_TOTAL (total_time, start, stop);
    }

  printf ("\t%zu", (size_t) (total_time / MAXSAMPLESTPT));
}

/*
 * Print one result row for a copy of len bytes: the row label, then the
 * mean timing of memcpy_orig, then the mean timing of memcpy_new.
 *
 * align1: byte offset applied to buf1, which is used as the copy source.
 * align2: byte offset applied to buf2, which is used as the destination.
 * len:    number of bytes copied per timed call.
 */
static void
do_tpt_test (size_t align1, size_t align2, size_t len)
{
  char *s1 = buf1 + align1;
  char *s2 = buf2 + align2;

  printf ("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

  do_memcpy = memcpy_orig;
  do_one_throughput (s2, s1, len);

  do_memcpy = memcpy_new;
  do_one_throughput (s2, s1, len);

  putchar ('\n');
}

/*
 * Allocate the two MAXCOPYSIZE-byte test buffers and write one byte every
 * 64 bytes of each. Terminates the program if either allocation fails.
 */
static void test_init(void)
{
  int i;

  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);
  if (buf1 == NULL || buf2 == NULL) {
        perror("valloc");
        exit(EXIT_FAILURE);
  }

  /* Both buffers receive the same value at each touched offset. */
  for (i = 0; i < MAXCOPYSIZE; i += 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }
}

/*
 * Candidate copy routine (x86-64 only): copy len bytes from src to dst,
 * front to back.
 *
 *   len <  64    straight to the tail code
 *   len <  1024  unrolled loop moving 64 bytes per iteration, then the tail
 *   len >= 1024  "rep movsq" for the whole quadwords, "rep movsb" for the
 *                remaining 0..7 bytes
 *
 * The tail code copies the remaining whole quadwords and then the last
 * 0..7 bytes one at a time.
 *
 * len is compared and shifted as an unsigned 32-bit value, so callers must
 * not pass a negative length. The string instructions rely on the direction
 * flag being clear, as the System V ABI requires at function entry.
 *
 * The whole routine is a single extended asm statement: the operands pin
 * dst/src/len to rdi/rsi/edx and the clobber list names every scratch
 * register, so the compiler cannot reorder or split the sequence or keep
 * live values in registers the code overwrites.
 */
void memcpy_new(char *dst, char *src, int len)
{
	__asm__ __volatile__(
		"movl	%%edx, %%ecx\n\t"
		"shrl	$6, %%ecx\n\t"		/* ecx = number of 64-byte blocks */
		"jz	2f\n\t"

		"cmpl	$0x400, %%edx\n\t"	/* 1024 bytes or more: string path */
		"jae	7f\n\t"

		/*
		 * decl sets ZF at the top of the loop; movq and leaq leave the
		 * flags alone, so the jnz at the bottom still sees that result.
		 */
		"1:\n\t"
		"decl	%%ecx\n\t"

		"movq	0*8(%%rsi), %%r11\n\t"
		"movq	1*8(%%rsi), %%r8\n\t"
		"movq	%%r11, 0*8(%%rdi)\n\t"
		"movq	%%r8, 1*8(%%rdi)\n\t"

		"movq	2*8(%%rsi), %%r9\n\t"
		"movq	3*8(%%rsi), %%r10\n\t"
		"movq	%%r9, 2*8(%%rdi)\n\t"
		"movq	%%r10, 3*8(%%rdi)\n\t"

		"movq	4*8(%%rsi), %%r11\n\t"
		"movq	5*8(%%rsi), %%r8\n\t"
		"movq	%%r11, 4*8(%%rdi)\n\t"
		"movq	%%r8, 5*8(%%rdi)\n\t"

		"movq	6*8(%%rsi), %%r9\n\t"
		"movq	7*8(%%rsi), %%r10\n\t"
		"movq	%%r9, 6*8(%%rdi)\n\t"
		"movq	%%r10, 7*8(%%rdi)\n\t"

		"leaq	64(%%rsi), %%rsi\n\t"
		"leaq	64(%%rdi), %%rdi\n\t"

		"jnz	1b\n\t"

		/* Tail: whole quadwords left in the last partial block. */
		"2:\n\t"
		"movl	%%edx, %%ecx\n\t"
		"andl	$63, %%ecx\n\t"
		"shrl	$3, %%ecx\n\t"
		"jz	4f\n\t"

		"3:\n\t"
		"decl	%%ecx\n\t"
		"movq	(%%rsi), %%r8\n\t"
		"movq	%%r8, (%%rdi)\n\t"
		"leaq	8(%%rdi), %%rdi\n\t"
		"leaq	8(%%rsi), %%rsi\n\t"
		"jnz	3b\n\t"

		/* Tail: last 0..7 bytes. */
		"4:\n\t"
		"movl	%%edx, %%ecx\n\t"
		"andl	$7, %%ecx\n\t"
		"jz	8f\n\t"

		"5:\n\t"
		"movb	(%%rsi), %%r8b\n\t"
		"movb	%%r8b, (%%rdi)\n\t"
		"incq	%%rdi\n\t"
		"incq	%%rsi\n\t"
		"decl	%%ecx\n\t"
		"jnz	5b\n\t"
		"jmp	8f\n\t"

		/*
		 * String path. The andl sets ZF when no byte tail remains;
		 * rep movsq does not modify the flags, so the jz after it
		 * tests the andl result.
		 */
		"7:\n\t"
		"movl	%%edx, %%ecx\n\t"
		"shrl	$3, %%ecx\n\t"
		"andl	$7, %%edx\n\t"
		"rep movsq\n\t"
		"jz	8f\n\t"
		"movl	%%edx, %%ecx\n\t"
		"rep movsb\n\t"

		"8:\n\t"
		: "+D" (dst), "+S" (src), "+d" (len)
		:
		: "rcx", "r8", "r9", "r10", "r11", "cc", "memory");
}
/*
 * Baseline copy routine (x86-64 only): copy len bytes from src to dst,
 * front to back, using an unrolled loop that moves 64 bytes per iteration,
 * then the remaining whole quadwords, then the last 0..7 bytes one at a
 * time.
 *
 * len is shifted and masked as an unsigned 32-bit value, so callers must
 * not pass a negative length.
 *
 * The whole routine is a single extended asm statement: the operands pin
 * dst/src/len to rdi/rsi/edx and the clobber list names every scratch
 * register, so the compiler cannot reorder or split the sequence or keep
 * live values in registers the code overwrites.
 */
void memcpy_orig(char *dst, char *src, int len)
{
	__asm__ __volatile__(
		"movl	%%edx, %%ecx\n\t"
		"shrl	$6, %%ecx\n\t"		/* ecx = number of 64-byte blocks */
		"jz	2f\n\t"

		/*
		 * decl sets ZF at the top of the loop; movq and leaq leave the
		 * flags alone, so the jnz at the bottom still sees that result.
		 */
		"1:\n\t"
		"decl	%%ecx\n\t"

		"movq	0*8(%%rsi), %%r11\n\t"
		"movq	1*8(%%rsi), %%r8\n\t"
		"movq	%%r11, 0*8(%%rdi)\n\t"
		"movq	%%r8, 1*8(%%rdi)\n\t"

		"movq	2*8(%%rsi), %%r9\n\t"
		"movq	3*8(%%rsi), %%r10\n\t"
		"movq	%%r9, 2*8(%%rdi)\n\t"
		"movq	%%r10, 3*8(%%rdi)\n\t"

		"movq	4*8(%%rsi), %%r11\n\t"
		"movq	5*8(%%rsi), %%r8\n\t"
		"movq	%%r11, 4*8(%%rdi)\n\t"
		"movq	%%r8, 5*8(%%rdi)\n\t"

		"movq	6*8(%%rsi), %%r9\n\t"
		"movq	7*8(%%rsi), %%r10\n\t"
		"movq	%%r9, 6*8(%%rdi)\n\t"
		"movq	%%r10, 7*8(%%rdi)\n\t"

		"leaq	64(%%rsi), %%rsi\n\t"
		"leaq	64(%%rdi), %%rdi\n\t"

		"jnz	1b\n\t"

		/* Tail: whole quadwords left in the last partial block. */
		"2:\n\t"
		"movl	%%edx, %%ecx\n\t"
		"andl	$63, %%ecx\n\t"
		"shrl	$3, %%ecx\n\t"
		"jz	4f\n\t"

		"3:\n\t"
		"decl	%%ecx\n\t"
		"movq	(%%rsi), %%r8\n\t"
		"movq	%%r8, (%%rdi)\n\t"
		"leaq	8(%%rdi), %%rdi\n\t"
		"leaq	8(%%rsi), %%rsi\n\t"
		"jnz	3b\n\t"

		/* Tail: last 0..7 bytes. */
		"4:\n\t"
		"movl	%%edx, %%ecx\n\t"
		"andl	$7, %%ecx\n\t"
		"jz	6f\n\t"

		"5:\n\t"
		"movb	(%%rsi), %%r8b\n\t"
		"movb	%%r8b, (%%rdi)\n\t"
		"incq	%%rdi\n\t"
		"incq	%%rsi\n\t"
		"decl	%%ecx\n\t"
		"jnz	5b\n\t"

		"6:\n\t"
		: "+D" (dst), "+S" (src), "+d" (len)
		:
		: "rcx", "r8", "r9", "r10", "r11", "cc", "memory");
}

/*
 * Benchmark driver: set up the buffers, print the column header, then
 * print one result row per copy length from 1 KiB to 15 KiB in 1 KiB
 * steps, with both alignment offsets set to 0.
 */
int main(void)
{
  int i;

  test_init();

  /* Blank padding so the column titles sit to the right of the row labels. */
  printf ("%23s", "");
  /* One %s per title: a conversion without a matching argument is
     undefined behaviour.  */
  printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new");

  for (i = 1024; i < 1024 * 16; i = i + 1024)
     do_tpt_test(0, 0, i);

  return 0;
}