linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Borislav Petkov <bp@alien8.de>
To: "Ma, Ling" <ling.ma@intel.com>
Cc: Konrad Rzeszutek Wilk <konrad@kernel.org>,
	"mingo@elte.hu" <mingo@elte.hu>, "hpa@zytor.com" <hpa@zytor.com>,
	"tglx@linutronix.de" <tglx@linutronix.de>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"iant@google.com" <iant@google.com>,
	George Spelvin <linux@horizon.com>
Subject: Re: [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register
Date: Sun, 14 Oct 2012 12:58:21 +0200	[thread overview]
Message-ID: <20121014105821.GB2165@liondog.tnic> (raw)
In-Reply-To: <20121012180411.GA26245@liondog.tnic>

[-- Attachment #1: Type: text/plain, Size: 807 bytes --]

On Fri, Oct 12, 2012 at 08:04:11PM +0200, Borislav Petkov wrote:
> Right, so benchmark shows around 20% speedup on Bulldozer but this is
> a microbenchmark and before pursuing this further, we need to verify
> whether this brings any palpable speedup with a real benchmark, I
> don't know, kernbench, netbench, whatever. Even something as boring as
> kernel build. And probably check for perf regressions on the rest of
> the uarches.

Ok, so to summarize, on AMD we're using REP MOVSQ which is even
faster than the unrolled version. I've added the REP MOVSQ version
to the µbenchmark. It nicely validates that we're correctly setting
X86_FEATURE_REP_GOOD on everything >= F10h and some K8s.

So, to answer Konrad's question: those patches don't concern AMD
machines.

Thanks.

-- 
Regards/Gruss,
    Boris.

[-- Attachment #2: copy-page.c --]
[-- Type: text/x-csrc, Size: 6205 bytes --]

#include<stdio.h>
#include <stdlib.h>


/* Cycle-count type for raw rdtsc readings (always 64-bit). */
typedef unsigned long long int hp_timing_t;
/* Max samples per test point (declared but unused in this harness). */
#define  MAXSAMPLESTPT        1000
/* Size of each source/destination buffer: 1 MiB. */
#define  MAXCOPYSIZE          (1024 * 1024)
/* Selector constants for original vs. new implementation (unused here). */
#define  ORIG  0
#define  NEW   1
/* Page-aligned benchmark buffers, allocated in test_init(). */
static char* buf1 = NULL;
static char* buf2 = NULL;
/* Number of timed repetitions per test point; best-of-N is reported. */
static int repeat_one_test = 32;

/* Measurement overhead to subtract from each sample; never set here,
   so it stays zero (BSS) -- effectively a no-op correction. */
hp_timing_t _dl_hp_timing_overhead;
/* Read the TSC into Var.  rdtsc returns low 32 bits in eax, high 32
   in edx; recombine into a 64-bit count.  NOTE(review): no serializing
   instruction (cpuid/lfence) around rdtsc, so out-of-order execution
   can blur short measurements -- acceptable for a rough microbenchmark. */
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

/* Diff = End - Start (overhead handling is done by the callers below). */
#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
/* Accumulate (end - start - overhead) into total_time.  The overhead is
   added to `start` before the subtraction, which is equivalent. */
#define HP_TIMING_TOTAL(total_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
	total_time += tmptime;						\
    }									\
  while (0)

/* Keep the minimum of best_time and (end - start - overhead):
   best-of-N filters out interrupts and cache-cold outliers. */
#define HP_TIMING_BEST(best_time, start, end)	\
  do									\
    {									\
      hp_timing_t tmptime;						\
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end);	\
      if (best_time > tmptime)						\
	best_time = tmptime;						\
    }									\
  while (0)


void copy_page_org(char *dst, char *src, int len);
void copy_page_new(char *dst, char *src, int len);
void copy_page_rep_movsq(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time the routine currently installed in do_memcpy: run it
 * repeat_one_test times over the same dst/src pair and print the best
 * (minimum) rdtsc cycle count, tab-indented so rows line up.
 *
 * dst/src: destination and source buffers (page-aligned by caller).
 * len:     byte count forwarded to the routine (the asm routines
 *          actually ignore it and copy a fixed 4096 bytes).
 *
 * Fixes vs. original: best_time is hp_timing_t (unsigned long long),
 * but it was cast to size_t and printed with %zd -- the *signed*
 * size specifier, a format/argument mismatch.  Print it with %llu
 * instead.  The loop index is also made int to match the int
 * repeat_one_test and avoid a signed/unsigned comparison.
 */
static void
do_one_test ( char *dst, char *src,
	     size_t len)
{
      hp_timing_t start __attribute ((unused));
      hp_timing_t stop __attribute ((unused));
      hp_timing_t best_time = ~ (hp_timing_t) 0;   /* start at max */
      int i;

      for (i = 0; i < repeat_one_test; ++i)
	{
	  HP_TIMING_NOW (start);
	  do_memcpy ( dst, src, len);
	  HP_TIMING_NOW (stop);
	  HP_TIMING_BEST (best_time, start, stop);
	}

      printf ("\t\t%llu", best_time);
}

/*
 * Print one result row: the same (len, alignment) case timed through
 * each of the three page-copy routines.  The copy_page_new pass runs
 * 64 KiB further into the buffers so it does not reuse the cache
 * lines just warmed by copy_page_org.
 */
static void
do_test (size_t align1, size_t align2, size_t len)
{
  char *src = buf1 + align1;
  char *dst = buf2 + align2;

   printf ("TPT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);

   do_memcpy = copy_page_org;
   do_one_test (dst, src, len);

   do_memcpy = copy_page_new;
   do_one_test (dst + (1 << 16), src + (1 << 16), len);

   do_memcpy = copy_page_rep_movsq;
   do_one_test (dst, src, len);

    putchar ('\n');
}

/*
 * Allocate the two page-aligned 1 MiB benchmark buffers and touch one
 * byte per 64-byte cache line so every page is faulted in (and partly
 * warmed) before any timing runs.
 *
 * Fix vs. original: the valloc() results were never checked, so an
 * allocation failure fell straight into the init loop and dereferenced
 * NULL.  Fail loudly instead.
 */
static void test_init(void)
{
  int i;

  buf1 = valloc(MAXCOPYSIZE);
  buf2 = valloc(MAXCOPYSIZE);
  if (buf1 == NULL || buf2 == NULL) {
    fprintf(stderr, "test_init: valloc(%d) failed\n", MAXCOPYSIZE);
    exit(EXIT_FAILURE);
  }

  for (i = 0; i < MAXCOPYSIZE ; i = i + 64) {
        buf1[i] = buf2[i] = i & 0xff;
  }

}

/*
 * Proposed copy_page: copy one 4096-byte page in 64-byte chunks using
 * two unrolled movq loops -- (4096/64)-5 = 59 prefetching iterations,
 * then 5 tail iterations with no prefetch (the prefetch runs 5 lines
 * ahead, so the tail would prefetch past the page).
 *
 * The dst/src/len parameters are not referenced in C; the code assumes
 * the SysV AMD64 ABI left dst in %rdi and src in %rsi.  len is ignored:
 * exactly 4096 bytes are copied.
 *
 * NOTE(review): each __asm__() is a separate statement with no
 * input/output/clobber lists, so this only works if the compiler keeps
 * %rdi/%rsi live and does not schedule its own code into
 * %rax/%rcx/%rdx/%r8-%r10 between statements, and if no flag-setting
 * instruction is inserted between the dec and the jnz -- fragile
 * anywhere above -O0; fine for a throwaway benchmark, not for real use.
 */
void copy_page_new(char *dst, char *src, int len)
{
	/* Main loop: 59 iterations, prefetching 5 cache lines ahead. */
	__asm__("mov	$(4096/64)-5, %ecx");
__asm__("1:");
	__asm__("prefetcht0 5*64(%rsi)");
	__asm__("decb	%cl");

	/* First half of the 64-byte line: 4 loads, then 4 stores. */
	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	/* Second half; leaq advances the pointers without touching
	   flags, so the jnz below still tests the decb result. */
	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");
	__asm__("jnz     1b");
	/* Tail loop: the remaining 5 cache lines, counted in %dl so the
	   pattern mirrors the main loop without reloading %ecx. */
	__asm__("mov	$5, %dl");
__asm__("2:");
	__asm__("decb	%dl");
	__asm__("movq	0x8*0(%rsi), %r10");
	__asm__("movq	0x8*1(%rsi), %rax");
	__asm__("movq	0x8*2(%rsi), %r8");
	__asm__("movq	0x8*3(%rsi), %r9");
	__asm__("movq	%r10, 0x8*0(%rdi)");
	__asm__("movq	%rax, 0x8*1(%rdi)");
	__asm__("movq	%r8, 0x8*2(%rdi)");
	__asm__("movq	%r9, 0x8*3(%rdi)");

	__asm__("movq	0x8*4(%rsi), %r10");
	__asm__("movq	0x8*5(%rsi), %rax");
	__asm__("movq	0x8*6(%rsi), %r8");
	__asm__("movq	0x8*7(%rsi), %r9");
	__asm__("leaq	64(%rsi), %rsi");
	__asm__("movq	%r10, 0x8*4(%rdi)");
	__asm__("movq	%rax, 0x8*5(%rdi)");
	__asm__("movq	%r8, 0x8*6(%rdi)");
	__asm__("movq	%r9, 0x8*7(%rdi)");
	__asm__("leaq	64(%rdi), %rdi");

	__asm__("jnz	2b");

}


/*
 * Reference copy_page (the then-current kernel version): copy one
 * 4096-byte page in 64-byte chunks, 8 movq's per iteration, using
 * %rbx and %r12 as extra scratch (hence the manual save/restore of
 * those callee-saved registers on the stack).  Structure matches
 * copy_page_new: 59 prefetching iterations, then a 5-iteration tail
 * without prefetch.
 *
 * Like the other routines here, dst/src/len are not referenced in C;
 * the asm assumes dst arrived in %rdi and src in %rsi (SysV AMD64
 * ABI), and len is ignored -- exactly 4096 bytes are copied.
 *
 * NOTE(review): the bare __asm__ statements (no operand/clobber lists)
 * and the direct %rsp adjustment are only safe if the compiler emits
 * them verbatim in order and does not itself touch the stack or these
 * registers in between -- benchmark-only code.
 */
void copy_page_org(char *dst, char *src, int len)
{

	/* Manually spill the callee-saved %rbx and %r12. */
	__asm__("subq	$2*8,%rsp");
	__asm__("movq	%rbx,(%rsp)");
	__asm__("movq	%r12,1*8(%rsp)");
	/* Main loop: 59 cache lines with prefetch 5 lines ahead. */
	__asm__("movl	$(4096/64)-5,%ecx");
	__asm__(".p2align 4");
__asm__("1:");
	__asm__("dec     %rcx");

	/* Load a whole 64-byte line into 8 registers... */
	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("prefetcht0 5*64(%rsi)");

	/* ...then store it out in one burst. */
	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	/* leaq advances pointers without clobbering the dec's flags. */
	__asm__("leaq    64 (%rsi), %rsi");
	__asm__("leaq    64 (%rdi), %rdi");
	__asm__("jnz     1b");

	/* Tail loop: last 5 cache lines, no prefetch. */
	__asm__("movl	$5,%ecx");
	__asm__(".p2align 4");
__asm__("2:");
	__asm__("decl   %ecx");

	__asm__("movq        (%rsi), %rax");
	__asm__("movq      8 (%rsi), %rbx");
	__asm__("movq     16 (%rsi), %rdx");
	__asm__("movq     24 (%rsi), %r8");
	__asm__("movq     32 (%rsi), %r9");
	__asm__("movq     40 (%rsi), %r10");
	__asm__("movq     48 (%rsi), %r11");
	__asm__("movq     56 (%rsi), %r12");

	__asm__("movq     %rax,    (%rdi)");
	__asm__("movq     %rbx,  8 (%rdi)");
	__asm__("movq     %rdx, 16 (%rdi)");
	__asm__("movq     %r8,  24 (%rdi)");
	__asm__("movq     %r9,  32 (%rdi)");
	__asm__("movq     %r10, 40 (%rdi)");
	__asm__("movq     %r11, 48 (%rdi)");
	__asm__("movq     %r12, 56 (%rdi)");

	__asm__("leaq	64(%rdi),%rdi");
	__asm__("leaq	64(%rsi),%rsi");

	__asm__("jnz	2b");

	/* Restore %rbx/%r12 and release the spill slots. */
	__asm__("movq	(%rsp),%rbx");
	__asm__("movq	1*8(%rsp),%r12");
	__asm__("addq	$2*8,%rsp");
}

/*
 * Page copy via the string-move instruction: copy 4096/8 = 512
 * quadwords with REP MOVSQ.  This is what the kernel selects on CPUs
 * with X86_FEATURE_REP_GOOD (AMD >= F10h and some K8s, per the mail
 * above).
 *
 * dst/src/len are unused in C; the asm assumes dst in %rdi and src in
 * %rsi (SysV AMD64 ABI).  len is ignored -- always 4096 bytes.
 * NOTE(review): %rcx/%rsi/%rdi are modified without a clobber list;
 * benchmark-only code.
 */
void copy_page_rep_movsq(char *dst, char *src, int len)
{
	__asm__("movl $4096/8,%ecx");
	__asm__("rep movsq");
}

/*
 * Driver: fault in the buffers, print the column header, then run the
 * same page-copy case five times so warm-up effects are visible
 * across rows.
 */
int main(void)
{
	int run;

	test_init();

	printf ("%35s", "");
	printf ("\t%s\t%s\t%s\n", "copy_page_org", "copy_page_new", "REP MOVSQ");

	for (run = 0; run < 5; run++)
		do_test(0, 0, 4096);

	return 0;
}

  reply	other threads:[~2012-10-14 10:58 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-10-11 12:29 [PATCH RFC 2/2] [x86] Optimize copy_page by re-arranging instruction sequence and saving register ling.ma
2012-10-11 13:40 ` Andi Kleen
2012-10-12  3:10   ` Ma, Ling
2012-10-12 13:35     ` Andi Kleen
2012-10-12 14:54       ` Ma, Ling
2012-10-12 15:14         ` Andi Kleen
2012-10-11 14:35 ` Konrad Rzeszutek Wilk
2012-10-12  3:37   ` Ma, Ling
2012-10-12  6:18     ` Borislav Petkov
2012-10-12  9:07       ` Ma, Ling
2012-10-12 18:04         ` Borislav Petkov
2012-10-14 10:58           ` Borislav Petkov [this message]
2012-10-15  5:00             ` Ma, Ling
2012-10-15  5:13             ` George Spelvin
2012-10-12 21:02 George Spelvin
2012-10-12 23:17 ` Borislav Petkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20121014105821.GB2165@liondog.tnic \
    --to=bp@alien8.de \
    --cc=hpa@zytor.com \
    --cc=iant@google.com \
    --cc=konrad@kernel.org \
    --cc=ling.ma@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux@horizon.com \
    --cc=mingo@elte.hu \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).