From: "Ma, Ling" <ling.ma@intel.com>
To: Ingo Molnar <mingo@elte.hu>, "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	linux-kernel <linux-kernel@vger.kernel.org>
Subject: RE: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast string.
Date: Wed, 11 Nov 2009 15:05:34 +0800	[thread overview]
Message-ID: <8FED46E8A9CA574792FC7AACAC38FE7714FE830398@PDSMSX501.ccr.corp.intel.com> (raw)
In-Reply-To: <20091109080830.GI453@elte.hu>

[-- Attachment #1: Type: text/plain, Size: 2415 bytes --]

Hi All,
Please use the attached memcpy.c (compile with "cc -o memcpy memcpy.c -O2")
to test more cases if you are interested. In this program we made a simple
modification to the memcpy_new function.
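For example, more alignment combinations could be covered by adding
do_tpt_test() calls in main(); the extra calls below are only an
illustration:

  for (i = 1024; i < 1024 * 16; i = i + 1024) {
      do_tpt_test(8, 0, i);   /* current case: source offset by 8 bytes */
      do_tpt_test(0, 0, i);   /* both buffers page-aligned */
      do_tpt_test(1, 3, i);   /* misaligned source and destination */
  }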

Thanks
Ling


>-----Original Message-----
>From: Ingo Molnar [mailto:mingo@elte.hu]
>Sent: 9 November 2009 16:09
>To: H. Peter Anvin
>Cc: Ma, Ling; Ingo Molnar; Thomas Gleixner; linux-kernel
>Subject: Re: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast
>string.
>
>
>* H. Peter Anvin <hpa@zytor.com> wrote:
>
>> On 11/08/2009 11:24 PM, Ma, Ling wrote:
>> > Hi All
>> >
>> > Today we run our benchmark on Core2 and Sandy Bridge:
>> >
>>
>> Hi Ling,
>>
>> Thanks for doing that.  Do you also have access to any older CPUs?  I
>> suspect that the CPUs that Andi is worried about are older CPUs like
>> P4, K8 or Pentium M/Core 1.  (Andi: please do clarify if you have
>> additional information.)
>>
>> My personal opinion is that if we can show no significant slowdown on
>> P4, K8, P-M/Core 1, Core 2, and Nehalem then we can simply use this
>> code unconditionally.  If one of them is radically worse than
>> baseline, then we have to do something conditional, which is a lot
>> more complicated.
>>
>> [Ingo, Thomas: do you agree?]
>
>Yeah. IIRC the worst case was the old P2s, which had really slow,
>microcode-based string ops. (Some of them even had errata in early
>prototypes, although we can certainly ignore those, as string ops are
>relied on quite frequently.)
>
>IIRC the original PPro core came up with some nifty, hardwired string
>ops, but those had to be dumbed down and emulated in microcode due to
>SMP bugs - making it an inferior choice in the end.
>
>But that should be ancient history and i'd suggest we ignore the P4
>dead-end too, unless it's some really big slowdown (which i doubt). If
>anyone cares then some optional assembly implementations could be added
>back.
>
>Ling, if you are interested, could you send a user-space test-app to
>this thread that everyone could just compile and run on various older
>boxes, to gather a performance profile of hand-coded versus string ops
>performance?
>
>( And i think we can make a judgement based on cache-hot performance
>  alone - if anything the string ops will perform comparatively better in
>  cache-cold scenarios, so the cache-hot numbers would be a conservative
>  estimate. )
>
>	Ingo

[-- Attachment #2: memcpy.c --]
[-- Type: text/plain, Size: 5683 bytes --]

#include <stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define  MAXSAMPLESTPT        100000
#define  MAXCOPYSIZE          (1024 * 32)
#define  ORIG  0
#define  NEW   1
static char* buf1 = NULL;
static char* buf2 = NULL;

hp_timing_t _dl_hp_timing_overhead;
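/* Read the time-stamp counter: rdtsc returns the low 32 bits in %eax and
   the high 32 bits in %edx, which are combined into one 64-bit value. */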
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
	total_time += tmptime;						\
    }									\
  while (0)

void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_throughput ( char *dst, char *src,
	     size_t len)
{
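      /* cpuid is a serializing instruction: it is issued here (and again
         before the measurement loop) so that earlier in-flight instructions
         do not leak into the rdtsc-timed region. */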
      __asm__("cpuid" : : : "eax", "ebx", "ecx", "edx");
      size_t i;
      hp_timing_t start __attribute ((unused));
      hp_timing_t stop __attribute ((unused));
      hp_timing_t total_time =  (hp_timing_t) 0;

      __asm__("cpuid" : : : "eax", "ebx", "ecx", "edx");
      for (i = 0; i < MAXSAMPLESTPT; ++i)  {
	  HP_TIMING_NOW (start);
	  do_memcpy(dst, src, len);
	  HP_TIMING_NOW (stop);
	  HP_TIMING_TOTAL (total_time, start, stop);
      }

      printf ("\t%zd", (size_t) total_time/MAXSAMPLESTPT);

}

static void
do_tpt_test (size_t align1, size_t align2, size_t len)
{
  size_t i, j;
  char *s1, *s2;

  s1 = (char *) (buf1 + align1);
  s2 = (char *) (buf2 + align2);


   printf ("TPT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);
   do_memcpy = memcpy_orig;
   do_one_throughput (s2, s1, len);
   do_memcpy = memcpy_new;
   do_one_throughput (s2, s1, len);

    putchar ('\n');
}

static void test_init(void)
{
  int i;
  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);

  for (i = 0; i < MAXCOPYSIZE ; i = i + 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }

}

void memcpy_new(char *dst, char *src, int len)
{
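	/*
	 * x86-64 SysV calling convention: dst arrives in %rdi, src in %rsi
	 * and len in %edx.  Copies of 1 KB or more branch to the rep-string
	 * path at label 8; smaller copies go through the 64-byte loop at
	 * label 1, then the 8-byte and 1-byte tails.
	 */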

	__asm__("movq %rdi, %rax");
	__asm__("movl %edx, %ecx");
	__asm__("shrl   $6, %ecx");
	__asm__("jz 2f");

	__asm__("cmp $0x400, %rdx");
	__asm__("jae 8f");

	__asm__("1:");
	__asm__("decl %ecx");

	__asm__("movq 0*8(%rsi), %r11");
	__asm__("movq 1*8(%rdi), %r8");
	__asm__("movq %r11,	0*8(%rdi)");
	__asm__("movq %r8,	1*8(%rdi)");

	__asm__("movq 2*8(%rsi), %r9");
	__asm__("movq 3*8(%rdi), %r10");
	__asm__("movq %r9,	2*8(%rdi)");
	__asm__("movq %r10,	3*8(%rdi)");

	__asm__("movq 4*8(%rsi), %r11");
	__asm__("movq 5*8(%rdi), %r8");
	__asm__("movq %r11,	4*8(%rdi)");
	__asm__("movq %r8,	5*8(%rdi)");

	__asm__("movq 6*8(%rsi), %r9");
	__asm__("movq 7*8(%rdi), %r10");
	__asm__("movq %r9,	6*8(%rdi)");
	__asm__("movq %r10,	7*8(%rdi)");

	__asm__("leaq 64(%rsi), %rsi");
	__asm__("leaq 64(%rdi), %rdi");

	__asm__("jnz  1b");

	__asm__("2:");
	__asm__("movl %edx, %ecx");
	__asm__("andl $63, %ecx");
	__asm__("shl   $3, %ecx");
	__asm__("jz 5f");

	__asm__("3:");
	__asm__("cmp %edi, %esi");
	__asm__("mov $8, %r9");
	__asm__("jl 4f");
	__asm__("neg %r9");

	__asm__("4:");
	__asm__("decl %ecx");
	__asm__("movq (%rsi),	%r8");
	__asm__("movq %r8,	(%rdi)");
	__asm__("leaq 8(%rdi),	%rdi");
	__asm__("leaq 8(%rsi),	%rsi");
	__asm__("jnz 3b");

	__asm__("5:");
	__asm__("movl %edx,	%ecx");
	__asm__("andl $7,	%ecx");
	__asm__("jz 7f");

	__asm__("6:");
	__asm__("movb (%rsi),	%r8b");
	__asm__("movb %r8b, (%rdi)");
	__asm__("incq %rdi");
	__asm__("incq %rsi");
	__asm__("decl %ecx");
	__asm__("jnz 6b");

	__asm__("7:");
	__asm__("retq");

	__asm__("8:");
	__asm__("movl %edx, %ecx");
	__asm__ ("shr $3, %ecx");
	__asm__ ("andl $7, %edx");
	__asm__("rep movsq ");
	__asm__ ("jz 9f");
	__asm__("movl %edx, %ecx");
	__asm__("rep movsb");

	__asm__("9:");
}
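
/*
 * A rough C-level sketch of the control flow memcpy_new implements, for
 * readers who prefer C to the inline asm above: copies of 1 KB or more go
 * through the "fast string" rep movsq/movsb path, shorter copies use a
 * 64-byte unrolled loop followed by 8-byte and 1-byte tails.  The function
 * name memcpy_new_sketch exists only for this illustration; the asm version
 * above additionally returns dst in %rax, which the sketch omits.
 */
static void memcpy_new_sketch(char *dst, char *src, size_t len)
{
	if (len >= 0x400) {
		/* >= 1 KB: let the hardware string move do the bulk copy. */
		size_t qwords = len >> 3;
		size_t rest   = len & 7;
		__asm__ volatile ("rep movsq"
				  : "+D" (dst), "+S" (src), "+c" (qwords)
				  : : "memory");
		__asm__ volatile ("rep movsb"
				  : "+D" (dst), "+S" (src), "+c" (rest)
				  : : "memory");
		return;
	}
	while (len >= 64) {		/* 64-byte unrolled chunks */
		size_t i;
		for (i = 0; i < 64; i += 8)
			*(unsigned long long *)(dst + i) =
				*(unsigned long long *)(src + i);
		dst += 64;
		src += 64;
		len -= 64;
	}
	while (len >= 8) {		/* 8-byte tail */
		*(unsigned long long *)dst = *(unsigned long long *)src;
		dst += 8;
		src += 8;
		len -= 8;
	}
	while (len--)			/* remaining bytes */
		*dst++ = *src++;
}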
void memcpy_orig(char *dst, char *src, int len)
{
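	/*
	 * Baseline: the same 64-byte unrolled loop and tails as memcpy_new,
	 * but without the rep-string fast path for large copies.
	 */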
	__asm__("movq %rdi, %rax");
	__asm__("movl %edx, %ecx");
	__asm__("shrl   $6, %ecx");
	__asm__("jz 2f");

	__asm__("mov $0x80, %r8d  ");  /*aligned case for loop 1 */

	__asm__("1:");
	__asm__("decl %ecx");

	__asm__("movq 0*8(%rsi), %r11");
	__asm__("movq 1*8(%rdi), %r8");
	__asm__("movq %r11,	0*8(%rdi)");
	__asm__("movq %r8,	1*8(%rdi)");

	__asm__("movq 2*8(%rsi), %r9");
	__asm__("movq 3*8(%rdi), %r10");
	__asm__("movq %r9,	2*8(%rdi)");
	__asm__("movq %r10,	3*8(%rdi)");

	__asm__("movq 4*8(%rsi), %r11");
	__asm__("movq 5*8(%rdi), %r8");
	__asm__("movq %r11,	4*8(%rdi)");
	__asm__("movq %r8,	5*8(%rdi)");

	__asm__("movq 6*8(%rsi), %r9");
	__asm__("movq 7*8(%rdi), %r10");
	__asm__("movq %r9,	6*8(%rdi)");
	__asm__("movq %r10,	7*8(%rdi)");

	__asm__("leaq 64(%rsi), %rsi");
	__asm__("leaq 64(%rdi), %rdi");

	__asm__("jnz  1b");

	__asm__("2:");
	__asm__("movl %edx, %ecx");
	__asm__("andl $63, %ecx");
	__asm__("shl   $3, %ecx");
	__asm__("jz 5f");

	__asm__("3:");
	__asm__("cmp %edi, %esi");
	__asm__("mov $8, %r9");
	__asm__("jl 4f");
	__asm__("neg %r9");

	__asm__("4:");
	__asm__("decl %ecx");
	__asm__("movq (%rsi),	%r8");
	__asm__("movq %r8,	(%rdi)");
	__asm__("leaq 8(%rdi),	%rdi");
	__asm__("leaq 8(%rsi),	%rsi");
	__asm__("jnz 3b");

	__asm__("5:");
	__asm__("movl %edx,	%ecx");
	__asm__("andl $7,	%ecx");
	__asm__("jz 7f");

	__asm__("6:");
	__asm__("movb (%rsi),	%r8b");
	__asm__("movb %r8b, (%rdi)");
	__asm__("incq %rdi");
	__asm__("incq %rsi");
	__asm__("decl %ecx");
	__asm__("jnz 6b");

	__asm__("7:");
	__asm__("retq");
}


int main(void)
{
  int i;
  test_init();
  printf ("%23s", "");
  printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new");

  for (i = 1024; i < 1024 * 16; i = i + 1024)
     do_tpt_test(8, 0, i);

  return 0;
}
