linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Borislav Petkov <bp@alien8.de>
To: Rasmus Villemoes <mail@rasmusvillemoes.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>,
	x86-ml <x86@kernel.org>, Andy Lutomirski <luto@kernel.org>,
	Josh Poimboeuf <jpoimboe@redhat.com>,
	lkml <linux-kernel@vger.kernel.org>
Subject: Re: [RFC] Improve memset
Date: Fri, 13 Sep 2019 18:36:45 +0200	[thread overview]
Message-ID: <20190913163645.GC4190@zn.tnic> (raw)
In-Reply-To: <20190913104232.GA4190@zn.tnic>

On Fri, Sep 13, 2019 at 12:42:32PM +0200, Borislav Petkov wrote:
> Or should we talk to Intel hw folks about it...

Or, I can do something like this, while waiting. Benchmark at the end.

The numbers are from a KBL box:

model           : 158
model name      : Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz
stepping        : 12

and if I'm not doing anything wrong with the benchmark (the asm looks
ok but I could very well be missing something), the numbers say that
the REP; STOSB is better from sizes of 8 and upwards and up to two
cachelines we're pretty much on-par with the builtin variant.

But after that we're pretty close too.

But this is just a single uarch which is pretty new, I guess the 32 or
64 bytes cap should be good enough for most, especially those where REP;
STOSB isn't as good.

Hmm.

Loops: 1000000
size            builtin_memset  rep;stosb       memset_rep
0               46              85              131
8               88              65              104
16              66              65              103
24              66              65              103
32              66              65              104
40              66              65              104
48              66              65              103
56              66              65              103
64              66              65              103
72              66              65              103
80              66              65              103
88              66              65              103
96              66              65              103
104             66              65              103
112             66              65              103
120             66              65              103
128             66              65              103
136             66              71              108
144             73              71              108
152             72              71              108
160             73              71              108
168             73              71              108
176             73              71              108
184             73              71              108
192             72              74              107
200             75              78              116
208             80              78              116
216             80              78              116
224             80              78              116
232             80              78              116
240             80              78              116
248             80              78              116


---
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned long long u64;

#define DECLARE_ARGS(val, low, high)    unsigned low, high
#define EAX_EDX_VAL(val, low, high)     ((low) | ((u64)(high) << 32))
#define EAX_EDX_ARGS(val, low, high)    "a" (low), "d" (high)
#define EAX_EDX_RET(val, low, high)     "=a" (low), "=d" (high)

static __always_inline unsigned long long rdtsc(void)
{
        DECLARE_ARGS(val, low, high);

        asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

        return EAX_EDX_VAL(val, low, high);
}

/**
 * rdtsc_ordered() - read the current TSC in program order
 *
 * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
 * It is ordered like a load to a global in-memory counter.  It should
 * be impossible to observe non-monotonic rdtsc_unordered() behavior
 * across multiple CPUs as long as the TSC is synced.
 */
static __always_inline unsigned long long rdtsc_ordered(void)
{
        /*
         * The RDTSC instruction is not ordered relative to memory
         * access.  The Intel SDM and the AMD APM are both vague on this
         * point, but empirically an RDTSC instruction can be
         * speculatively executed before prior loads.  An RDTSC
         * immediately after an appropriate barrier appears to be
         * ordered as a normal load, that is, it provides the same
         * ordering guarantees as reading from a global memory location
         * that some other imaginary CPU is updating continuously with a
         * time stamp.
         */
        asm volatile("mfence");
        return rdtsc();
}

static __always_inline void *rep_stosb(void *dest, int c, size_t n)
{
        void *ret, *dummy;

	asm volatile("rep; stosb"
		     : "=&D" (ret), "=c" (dummy)
		     : "0" (dest), "a" (c), "c" (n)
		     : "memory");

	return ret;
}

static __always_inline void *memset_rep(void *dest, int c, size_t n)
{
	void *ret, *dummy;

	asm volatile("push %%rdi\n\t"
		     "mov %%rax, %%rsi\n\t"
		     "mov %%rcx, %%rdx\n\t"
		     "andl $7,%%edx\n\t"
		     "shrq $3,%%rcx\n\t"
		     "movzbl %%sil,%%esi\n\t"
		     "movabs $0x0101010101010101,%%rax\n\t"
		     "imulq %%rsi,%%rax\n\t"
		     "rep stosq\n\t"
		     "movl %%edx,%%ecx\n\t"
		     "rep stosb\n\t"
		     "pop %%rax\n\t"
		     : "=&D" (ret), "=c" (dummy)
		     : "0" (dest), "a" (c), "c" (n)
		     : "rsi", "rdx", "memory");

	return ret;
}

#define BUF_SIZE 1024
static void bench_memset(unsigned int loops)
{
	u64 p0, p1, m_cycles = 0, r_cycles = 0, rep_cyc = 0;
	int i, s;
	void *dst;

	dst = malloc(BUF_SIZE);
	if (!dst)
		perror("malloc");

	printf("Loops: %d\n", loops);
	printf("size\t\tbuiltin_memset\trep;stosb\tmemset_rep\n");

	for (s = 0; s < BUF_SIZE; s += 16) {
		for (i = 0; i < loops; i++) {
			p0 = rdtsc_ordered();
			__builtin_memset(dst, 0, s);
			p1 = rdtsc_ordered();

			m_cycles += p1 - p0;
		}

		for (i = 0; i < loops; i++) {
			p0 = rdtsc_ordered();
			rep_stosb(dst, 0, s);
			p1 = rdtsc_ordered();

			r_cycles += p1 - p0;
		}

		for (i = 0; i < loops; i++) {
			p0 = rdtsc_ordered();
			memset_rep(dst, 0, s);
			p1 = rdtsc_ordered();

			rep_cyc += p1 - p0;
		}


		m_cycles /= loops;
		r_cycles /= loops;
		rep_cyc  /= loops;

		printf("%d\t\t%llu\t\t%llu\t\t%llu\n", s, m_cycles, r_cycles, rep_cyc);
		m_cycles = r_cycles = rep_cyc = 0;
	}

	free(dst);
}

int main(void)
{
	bench_memset(1000000);

	return 0;
}


-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

  reply	other threads:[~2019-09-13 16:36 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-13  7:22 [RFC] Improve memset Borislav Petkov
2019-09-13  7:35 ` Ingo Molnar
2019-09-13  7:50   ` Borislav Petkov
2019-09-13  8:51 ` Rasmus Villemoes
2019-09-13  9:00 ` Linus Torvalds
2019-09-13  9:18   ` Rasmus Villemoes
2019-09-13 10:42     ` Borislav Petkov
2019-09-13 16:36       ` Borislav Petkov [this message]
2019-09-16  9:18         ` Rasmus Villemoes
2019-09-16 17:25           ` Linus Torvalds
2019-09-16 17:40             ` Andy Lutomirski
2019-09-16 21:29               ` Linus Torvalds
2019-09-16 23:13                 ` Andy Lutomirski
2019-09-16 23:26                   ` Linus Torvalds
2019-09-17  8:15             ` Borislav Petkov
2019-09-17 10:55             ` David Laight
2019-09-17 20:10 ` Josh Poimboeuf
2019-09-17 20:45   ` Linus Torvalds
2019-09-19 12:55     ` Borislav Petkov
2019-09-19 12:49   ` Borislav Petkov
2019-09-14  9:29 Alexey Dobriyan
2019-09-14 11:39 ` Borislav Petkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190913163645.GC4190@zn.tnic \
    --to=bp@alien8.de \
    --cc=jpoimboe@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mail@rasmusvillemoes.dk \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).