From: David Laight <David.Laight@ACULAB.COM>
To: 'Noah Goldstein' <goldstein.w.n@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>,
	"tglx@linutronix.de" <tglx@linutronix.de>,
	"mingo@redhat.com" <mingo@redhat.com>,
	Borislav Petkov <bp@alien8.de>,
	"dave.hansen@linux.intel.com" <dave.hansen@linux.intel.com>,
	X86 ML <x86@kernel.org>, "hpa@zytor.com" <hpa@zytor.com>,
	"peterz@infradead.org" <peterz@infradead.org>,
	"alexanderduyck@fb.com" <alexanderduyck@fb.com>,
	open list <linux-kernel@vger.kernel.org>,
	netdev <netdev@vger.kernel.org>
Subject: RE: [PATCH v1] x86/lib: Optimize 8x loop and memory clobbers in csum_partial.c
Date: Thu, 2 Dec 2021 14:24:47 +0000	[thread overview]
Message-ID: <29cf408370b749069f3b395781fe434c@AcuMS.aculab.com> (raw)
In-Reply-To: <CAFUsyfLoEckBrnYKUgqWC7AJPTBDfarjBOgBvtK7eGVZj9muYQ@mail.gmail.com>

I've dug out my test program and measured the performance of
various copies of the inner loop - usually 64 bytes/iteration.
Code is below.

It uses the hardware performance counter to get the number of
clocks the inner loop takes.
This is reasonably stable once the branch predictor has settled down.
So the difference in clocks between a 64 byte buffer and a 128 byte
buffer is the number of clocks for 64 bytes.
(Unlike the TSC the pmc count doesn't depend on the cpu frequency.)

What is interesting is that even some of the trivial loops appear
to be doing 16 bytes per clock for short buffers - which is impossible.
Checksum 1k bytes and you get an entirely different answer.
The only loop that really exceeds 8 bytes/clock for long buffers
is the adcx/adox one.

What is almost certainly happening is that all the memory reads and
the dependent add/adc instructions are all queued up in the 'out of
order' execution unit.
Since 'rdpmc' isn't a serialising instruction they can still be
outstanding when the function returns.
Uncomment the 'rdtsc' and you get much slower values for short buffers.

When testing the full checksum function the queued up memory
reads and adc are probably running in parallel with the logic
that is handling lengths that aren't multiples of 64.

I also found nothing consistently different for misaligned reads.

These were all tested on my i7-7700 cpu.
 

#define _GNU_SOURCE             /* for syscall() in <unistd.h> */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

static int init_pmc(void)
{
        static struct perf_event_attr perf_attr = {
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .pinned = 1,
        };
        struct perf_event_mmap_page *pc;

        int perf_fd;
        perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
        if (perf_fd < 0) {
                fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
                exit(1);
        }
        pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
        if (pc == MAP_FAILED) {
                fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
                exit(1);
        }
        return pc->index - 1;
}

static inline unsigned int rdpmc(unsigned int counter)
{
        unsigned int low, high;

        // asm volatile("rdtsc" : "=a" (low), "=d" (high));
        asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

        // return low bits, the counter might be 32 or 40 bits wide.
        return low;
}

#ifndef MODE
#define MODE 0
#endif

__attribute__((noinline))
unsigned long csum64(unsigned long csum, const unsigned char *buff, unsigned long len)
{
#if MODE == 0
// Simple 'adc' loop, abs max 8 bytes/clock.
// Will be very slow on old (pre-Ivy Bridge) cpus where 'dec'
// has a false dependency on the carry flag.
// Nearly as good if doing 32 bytes/iteration.
#define DFLT_OFFSET 40
        long count = len/32;
        asm("   clc\n"
            "1: adcq    (%[buff]), %[csum]\n"
            "   adcq    8(%[buff]), %[csum]\n"
            "   adcq    16(%[buff]), %[csum]\n"
            "   adcq    24(%[buff]), %[csum]\n"
            "   lea     32(%[buff]), %[buff]\n"
            "   dec     %[count]\n"
            "   jnz     1b\n"
            "   adcq    $0, %[csum]\n"
                : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
                : [buff] "r" (buff)
                : "memory");
#endif

#if MODE == 1
// Alternate 'adc' loop that avoids using the carry flag.
#define DFLT_OFFSET 40
        buff += len;
        len = -len;
        asm("   clc\n"
            "1: jrcxz   2f\n"
            "   adcq    0(%[buff], %[len]), %[csum]\n"
            "   adcq    8(%[buff], %[len]), %[csum]\n"
            "   adcq    16(%[buff], %[len]), %[csum]\n"
            "   adcq    24(%[buff], %[len]), %[csum]\n"
            "   lea     32(%[len]), %[len]\n"
            "   jmp     1b\n"
            "2: adcq    $0, %[csum]\n"
                : [csum] "+&r" (csum), [len] "+&c" (len)
                : [buff] "r" (buff)
                : "memory");
#endif

#if MODE == 2
// Existing kernel loop.
// The adc chain limits it to about 8 bytes/clock.
#define DFLT_OFFSET 40
        long count = len/64;
        asm("1: addq    0(%[buff]), %[csum]\n"
            "   adcq    8(%[buff]), %[csum]\n"
            "   adcq    16(%[buff]), %[csum]\n"
            "   adcq    24(%[buff]), %[csum]\n"
            "   adcq    32(%[buff]), %[csum]\n"
            "   adcq    40(%[buff]), %[csum]\n"
            "   adcq    48(%[buff]), %[csum]\n"
            "   adcq    56(%[buff]), %[csum]\n"
            "   adcq    $0, %[csum]\n"
            "   lea     64(%[buff]), %[buff]\n"
            "   dec     %[count]\n"
            "   jnz     1b"
                : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
                : [buff] "r" (buff)
                : "memory");
#endif

#if MODE == 3
// Proposed kernel loop.
// This splits the adc chain in two.
// Faster than (2) for short buffers - not sure why.
#define DFLT_OFFSET 38
        long count = len/64;
        unsigned long csum1;
        asm("1: movq    0(%[buff]), %[csum1]\n"
            "   adcq    8(%[buff]), %[csum1]\n"
            "   adcq    16(%[buff]), %[csum1]\n"
            "   adcq    24(%[buff]), %[csum1]\n"
            "   adcq    32(%[buff]), %[csum1]\n"
            "   adcq    $0, %[csum1]\n"
            "   lea     64(%[buff]), %[buff]\n"
            "   addq    40-64(%[buff]), %[csum]\n"
            "   adcq    48-64(%[buff]), %[csum]\n"
            "   adcq    56-64(%[buff]), %[csum]\n"
            "   adcq    %[csum1], %[csum]\n"
            "   adcq    $0, %[csum]\n"
            "   dec     %[count]\n"
            "   jnz     1b"
                : [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
                : [buff] "r" (buff)
                : "memory");
#endif

#if MODE == 4
// Less unrolled version of (3).
// Maybe just as fast!
#define DFLT_OFFSET 43
        long count = len;
        unsigned long csum1;
        asm("1: movq    0(%[buff]), %[csum1]\n"
            "   adcq    8(%[buff]), %[csum1]\n"
            "   adcq    $0, %[csum1]\n"
            "   lea     32(%[buff]), %[buff]\n"
            "   addq    16-32(%[buff]), %[csum]\n"
            "   adcq    24-32(%[buff]), %[csum]\n"
            "   adcq    %[csum1], %[csum]\n"
            "   adcq    $0, %[csum]\n"
            "   sub     $32, %[count]\n"
            "   jnz     1b"
                : [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
                : [buff] "r" (buff)
                : "memory");
#endif

#if MODE == 5
// adcx/adox loop.
// This is the only loop that exceeds 8 bytes/clock for 1k buffers.
#define DFLT_OFFSET 40
        unsigned long csum_odd;
        buff += len;
        len = -len;
        asm(    "       xor   %[csum_odd], %[csum_odd]\n"    // Also clears carry and overflow
                "10:    jrcxz 20f\n"
                "       adcx    (%[buff], %[len]), %[csum]\n"
                "       adox   8(%[buff], %[len]), %[csum_odd]\n"
                "       adcx  16(%[buff], %[len]), %[csum]\n"
                "       adox  24(%[buff], %[len]), %[csum_odd]\n"
                "       adcx  32(%[buff], %[len]), %[csum]\n"
                "       adox  40(%[buff], %[len]), %[csum_odd]\n"
                "       adcx  48(%[buff], %[len]), %[csum]\n"
                "       adox  56(%[buff], %[len]), %[csum_odd]\n"
                "       lea   64(%[len]), %[len]\n"
                "       jmp   10b\n"
                "20:    adox  %[len], %[csum_odd]\n"  // [len] is zero
                "       adcx  %[csum_odd], %[csum]\n"
                "       adcx  %[len], %[csum]"
            : [csum] "+&r" (csum), [len] "+&c" (len), [csum_odd] "=&r" (csum_odd)
            : [buff] "r" (buff)
            : "memory");
#endif

        return csum;
}

unsigned char buff[8192] __attribute__((aligned(4096))) = {
0x46,0x56,0x20,0x04,0x10,0x02,0x50,0x07,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
0xd4,0x70,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,
};

#ifndef PASSES
#define PASSES 10
#endif

#ifndef OFFSET
#ifdef DFLT_OFFSET
#define OFFSET DFLT_OFFSET
#else
#define OFFSET 0
#endif
#endif

int main(int argc, char **argv)
{
        unsigned long csum;
        unsigned int tick;
        unsigned int ticks[PASSES];
        unsigned int len, off;
        unsigned int i;
        unsigned int id = init_pmc();

        len = sizeof buff;
        off = 0;
        if (argv[1]) {
                len = atoi(argv[1]);
                len = (len + 63) & ~63;
                if (argv[2])
                        off = atoi(argv[2]) & 7;
        }

        for (i = 0; i < PASSES; i++) {
                tick = rdpmc(id);
                csum = csum64(0, buff + off, len);
                ticks[i] = rdpmc(id) - tick;
        }

        printf("   ticks    rate %lx\n", csum);
        for (i = 0; i < PASSES; i++)
                printf(" %7u %7u\n", ticks[i], 100 * len / (ticks[i] - OFFSET));
        return 0;
}

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
