From: David Laight <David.Laight@ACULAB.COM>
To: 'Linus Torvalds' <torvalds@linux-foundation.org>
Cc: Noah Goldstein <goldstein.w.n@gmail.com>,
	"x86@kernel.org" <x86@kernel.org>,
	"oe-kbuild-all@lists.linux.dev" <oe-kbuild-all@lists.linux.dev>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"edumazet@google.com" <edumazet@google.com>,
	"tglx@linutronix.de" <tglx@linutronix.de>,
	"mingo@redhat.com" <mingo@redhat.com>,
	"bp@alien8.de" <bp@alien8.de>,
	"dave.hansen@linux.intel.com" <dave.hansen@linux.intel.com>,
	"hpa@zytor.com" <hpa@zytor.com>
Subject: RE: x86/csum: Remove unnecessary odd handling
Date: Sat, 6 Jan 2024 22:08:48 +0000	[thread overview]
Message-ID: <124b21857fe44e499e29800cbf4f63f8@AcuMS.aculab.com> (raw)
In-Reply-To: <CAHk-=wg8vssPVO68_qH_BHBCj6_DDawKQHBOgZh4gw5YFmpCKA@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2325 bytes --]

From: Linus Torvalds
> Sent: 05 January 2024 18:06
> 
> On Fri, 5 Jan 2024 at 02:41, David Laight <David.Laight@aculab.com> wrote:
> >
> > Interesting, I'm pretty sure trying to get two blocks of
> >  'adc' scheduled in parallel like that doesn't work.
> 
> You should check out the benchmark at
> 
>        https://github.com/fenrus75/csum_partial
> 
> and see if you can improve on it. I'm including the patch (on top of
> that code by Arjan) to implement the actual current kernel version as
> "New version".

Annoyingly (for me) you are partially right...

I found where my IP checksum perf code was hiding and revisited it.
Although I'd found comments elsewhere that the 'jrcxz, adc, adc, lea, jmp'
loop managed an adc every clock, it isn't happening for me now.

I'm only measuring the inner loop, for multiples of 64 bytes.
Handling buffers of less than 8 bytes and partial final words is a
separate problem.
The less unrolled the main loop, the less overhead there'll
be for 'normal' sizes.
So I've changed your '80 byte' block to 64 bytes for consistency.

I'm ignoring pre-Sandy Bridge CPUs (no split flags) and pre-Broadwell
ones (adc takes two clocks - although adc to alternate regs is one clock
on Sandy Bridge).
My test system is an i7-7700; I'd expect anything from Broadwell (gen 5)
onwards to be at least as good.
I don't have a modern AMD CPU.

The best loop for 256+ bytes is an adcx/adox one.
However, that requires run-time patching, since those instructions
aren't available on all 64-bit CPUs.
Next best is the new kernel version (two blocks of 4 adc).
The surprising one is:
		xor	sum, sum
	1:	adc	(buff), sum
		adc	8(buff), sum
		lea	16(buff), buff
		dec	count
		jnz	1b
		adc	$0, sum
For 256 bytes it is only a couple of clocks slower,
and maybe 10% slower for 512+ bytes.
But it needs almost no extra code for 'normal' buffer sizes.
By comparison the adcx/adox one is 20% faster.
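
(For reference, a minimal userspace sketch of the adcx/adox feature
test - the ADX bit is CPUID leaf 7, sub-leaf 0, EBX bit 19; in the
kernel this would be an ALTERNATIVE site rather than a run-time check:)

	#include <cpuid.h>

	/* Return non-zero if adcx/adox are available. */
	static int have_adx(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
			return 0;
		return (ebx >> 19) & 1;
	}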

The code is doing:
	old = rdpmc
	mfence
	csum = do_csum(buf, len);
	mfence
	clocks = rdpmc - old
(That is directly reading the PMC register.)
With a 'no-op' function it takes 160 clocks (I-cache resident).
Without the mfences it is 40 - but then pretty much everything can
execute after the 2nd rdpmc.

I've attached my (horrid) test program.

	David


[-- Attachment #2: ipcsum.c --]
[-- Type: text/plain, Size: 12950 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef PASSES
#define PASSES 10
#endif

static int init_pmc(void)
{
	static struct perf_event_attr perf_attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.pinned = 1,
	};
	struct perf_event_mmap_page *pc;

	int perf_fd;
	perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
	if (perf_fd < 0) {
		fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
		exit(1);
	}
	pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
	if (pc == MAP_FAILED) {
		fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
		exit(1);
	}
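	/* index is the rdpmc counter number plus one; zero would mean
	 * rdpmc is not permitted for this event. */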
	return pc->index - 1;
}

static inline unsigned int rdpmc(unsigned int counter)
{
	unsigned int low, high;

#ifndef NOFENCE
	asm volatile("mfence");
#endif
	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
#ifndef NOFENCE
	asm volatile("mfence");
#endif

	// Return the low bits; the counter might be 32 or 40 bits wide.
	return low;
}

__attribute__((noinline))
unsigned long overhead(const unsigned char *buff, unsigned long len)
{
	return 0;
}

// Existing kernel loop.
// Simple 'adc' loop, abs max 8 bytes/clock.
// Will be very slow on old (pre-Ivy Bridge) CPUs where 'dec'
// has a false dependency against the carry flag,
// and also pre-Broadwell, where adc takes two clocks.
__attribute__((noinline))
unsigned long adc_dec_8(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/64;
	asm("1:	addq	0(%[buff]), %[csum]\n"
	    "	adcq	8(%[buff]), %[csum]\n"
	    "	adcq	16(%[buff]), %[csum]\n"
	    "	adcq	24(%[buff]), %[csum]\n"
	    "	adcq	32(%[buff]), %[csum]\n"
	    "	adcq	40(%[buff]), %[csum]\n"
	    "	adcq	48(%[buff]), %[csum]\n"
	    "	adcq	56(%[buff]), %[csum]\n"
	    "	adcq	$0, %[csum]\n"
	    " 	lea	64(%[buff]), %[buff]\n"
	    " 	dec	%[count]\n"
	    "	jnz	1b"
		: [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
		: [buff] "r" (buff)
		: "memory");

	return csum;
}

// As above but only 4 adc
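// 'dec' doesn't modify the carry flag, so the adc chain survives across
// iterations and is only folded in by the final 'adcq $0'.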
__attribute__((noinline))
unsigned long adc_dec_4(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/32;
	asm("	clc\n"
	    "1:	adcq	(%[buff]), %[csum]\n"
	    "	adcq	8(%[buff]), %[csum]\n"
	    "	adcq	16(%[buff]), %[csum]\n"
	    "	adcq	24(%[buff]), %[csum]\n"
	    " 	lea	32(%[buff]), %[buff]\n"
	    " 	dec	%[count]\n"
	    "	jnz	1b\n"
	    "	adcq	$0, %[csum]\n"
		: [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
		: [buff] "r" (buff)
		: "memory");

	return csum;
}

// As above but only 2 adc
__attribute__((noinline))
unsigned long adc_dec_2(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/16;
	asm("	clc\n"
	    "1:	adcq	(%[buff]), %[csum]\n"
	    "	adcq	8(%[buff]), %[csum]\n"
	    " 	lea	16(%[buff]), %[buff]\n"
	    " 	dec	%[count]\n"
	    "	jnz	1b\n"
	    "	adcq	$0, %[csum]\n"
		: [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
		: [buff] "r" (buff)
		: "memory");

	return csum;
}

// Alternate 'adc' loop that avoids using the zero flag.
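// 'jrcxz' tests %rcx directly and reads no flags, and 'lea'/'jmp' write
// none, so the loop control never touches the carry chain.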
__attribute__((noinline))
unsigned long adc_jcxz_4(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	buff += len;
	len = -len;
	asm("	clc\n"
	    "1: jrcxz	2f\n"
	    "	adcq	0(%[buff], %[len]), %[csum]\n"
	    "	adcq	8(%[buff], %[len]), %[csum]\n"
	    "	adcq	16(%[buff], %[len]), %[csum]\n"
	    "	adcq	24(%[buff], %[len]), %[csum]\n"
	    " 	lea	32(%[len]), %[len]\n"
	    "	jmp	1b\n"
	    "2:	adcq	$0, %[csum]\n"
		: [csum] "+&r" (csum), [len] "+&c" (len)
		: [buff] "r" (buff)
		: "memory");

	return csum;
}

// Shorter alternate 'adc' loop that avoids using the zero flag.
__attribute__((noinline))
unsigned long adc_jcxz_2(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	buff += len;
	len = -len;
	asm("	clc\n"
	    "1: jrcxz	2f\n"
	    "	adcq	0(%[buff], %[len]), %[csum]\n"
	    "	adcq	8(%[buff], %[len]), %[csum]\n"
	    " 	lea	16(%[len]), %[len]\n"
	    "	jmp	1b\n"
	    "2:	adcq	$0, %[csum]\n"
		: [csum] "+&r" (csum), [len] "+&c" (len)
		: [buff] "r" (buff)
		: "memory");

	return csum;
}

// Proposed kernel loop.
// This splits the adc chain in two.
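// The two partial sums live in separate registers, so the cpu can
// overlap the two adc chains.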
__attribute__((noinline))
unsigned long adc_4_pair(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/64;
	unsigned long csum1;
	asm("1:	movq	0(%[buff]), %[csum1]\n"
	    "	adcq	8(%[buff]), %[csum1]\n"
	    "	adcq	16(%[buff]), %[csum1]\n"
	    "	adcq	24(%[buff]), %[csum1]\n"
	    "	adcq	32(%[buff]), %[csum1]\n"
	    "	adcq	$0, %[csum1]\n"
	    " 	lea	64(%[buff]), %[buff]\n"
	    "	addq	40-64(%[buff]), %[csum]\n"
	    "	adcq	48-64(%[buff]), %[csum]\n"
	    "	adcq	56-64(%[buff]), %[csum]\n"
	    "	adcq	%[csum1], %[csum]\n"
	    "	adcq	$0, %[csum]\n"
	    " 	dec	%[count]\n"
	    "	jnz	1b"
		: [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
		: [buff] "r" (buff)
		: "memory");

	return csum;
}

// Less unrolled version of previous.
__attribute__((noinline))
unsigned long adc_2_pair(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len;
	unsigned long csum1;
	asm("1:	movq	0(%[buff]), %[csum1]\n"
	    "	adcq	8(%[buff]), %[csum1]\n"
	    "	adcq	$0, %[csum1]\n"
	    " 	lea	32(%[buff]), %[buff]\n"
	    "	addq	16-32(%[buff]), %[csum]\n"
	    "	adcq	24-32(%[buff]), %[csum]\n"
	    "	adcq	%[csum1], %[csum]\n"
	    "	adcq	$0, %[csum]\n"
	    " 	sub	$32, %[count]\n"
	    "	jnz	1b"
		: [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
		: [buff] "r" (buff)
		: "memory");

	return csum;
}

// adcx/adox loop.
// adcx and adox update only the carry and overflow flags respectively,
// so the two sum chains can run in parallel.
// This is the only loop that exceeds 8 bytes/clock for 1k buffers.
__attribute__((noinline))
unsigned long adcx_adox(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	unsigned long csum_odd;
	buff += len;
	len = -len;
	asm(    "       xor   %[csum_odd], %[csum_odd]\n"    // Also clears carry and overflow
		"10:    jrcxz 20f\n"
		"       adcx    (%[buff], %[len]), %[csum]\n"
		"       adox   8(%[buff], %[len]), %[csum_odd]\n"
		"       adcx  16(%[buff], %[len]), %[csum]\n"
		"       adox  24(%[buff], %[len]), %[csum_odd]\n"
		"       adcx  32(%[buff], %[len]), %[csum]\n"
		"       adox  40(%[buff], %[len]), %[csum_odd]\n"
		"       adcx  48(%[buff], %[len]), %[csum]\n"
		"       adox  56(%[buff], %[len]), %[csum_odd]\n"
		"       lea   64(%[len]), %[len]\n"
		"       jmp   10b\n"
		"20:    adox  %[len], %[csum_odd]\n"  // [len] is zero
		"       adcx  %[csum_odd], %[csum]\n"
		"       adcx  %[len], %[csum]"
	    : [csum] "+&r" (csum), [len] "+&c" (len), [csum_odd] "=&r" (csum_odd)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

#define OPTIMISER_HIDE_VAR(x) asm volatile( "": "+r" (x))
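// The empty asm makes the compiler assume 'x' is read and rewritten,
// which stops it recombining the adjacent adds into one long chain.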

// In principle the cpu can do 2 reads every clock,
// so, combined with adds to different registers, it should
// be possible to get this C code to run near to 8 bytes/clock.
// But the compiler doesn't want to 'play ball'.
// The OPTIMISER_HIDE_VAR() helps a bit.
__attribute__((noinline))
unsigned long add_c_16(const unsigned char *buff, unsigned long len)
{
	unsigned long csum_odd = 0;
	unsigned long csum = 0;

	buff += len;
	len = -len;
	do {
		csum += *(unsigned int *)(buff + len + 0);
		csum_odd += *(unsigned int *)(buff + len + 4);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 8);
		csum_odd += *(unsigned int *)(buff + len + 12);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
	} while (len += 16);
	csum += csum_odd;

	return csum;
}

// As above but unrolled further
__attribute__((noinline))
unsigned long add_c_32(const unsigned char *buff, unsigned long len)
{
	unsigned long csum_odd = 0;
	unsigned long csum = 0;

	buff += len;
	len = -len;
	do {
		csum += *(unsigned int *)(buff + len + 0);
		csum_odd += *(unsigned int *)(buff + len + 4);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 8);
		csum_odd += *(unsigned int *)(buff + len + 12);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 16);
		csum_odd += *(unsigned int *)(buff + len + 20);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 24);
		csum_odd += *(unsigned int *)(buff + len + 28);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
	} while (len += 32);
	csum += csum_odd;

	return csum;
}

// Alternative C loop that also sums 'val >> 32' to recover the carries.
// Might be better on arm64 than x86.
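// This works because 2^32 == 1 (mod 0xffff), so the high 32-bit halves
// can be accumulated on their own and folded back in at the end.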
__attribute__((noinline))
unsigned long add_c_high(const unsigned char *buff, unsigned long len)
{
	unsigned long sum_hi_odd = 0, sum_lo_odd = 0;
	unsigned long sum_hi = 0, sum_lo = 0;
	unsigned long val;

	buff += len;
	len = -len;
	do {
		val = *(unsigned long *)(buff + len + 0);
		sum_lo += val;
		sum_hi += val >> 32;
		val = *(unsigned long *)(buff + len + 8);
		sum_lo_odd += val;
		sum_hi_odd += val >> 32;
	} while (len += 16);
	OPTIMISER_HIDE_VAR(sum_lo);
	OPTIMISER_HIDE_VAR(sum_hi);
	sum_lo += sum_lo_odd;
	sum_hi += sum_hi_odd;

	sum_hi += (sum_lo >> 32) - (sum_hi & 0xffffffffu);
	return sum_hi + (sum_lo & 0xffffffff);
}

unsigned char buffer[8192] __attribute__((aligned(4096))) = {
0x46,0x56,0x20,0x04,0x10,0x02,0x50,0x07,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
0xd4,0x70,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,
};


static __attribute__((noinline)) void
print_csum(const char *msg, unsigned long csum, unsigned int *ticks)
{
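	/* 'offset' is latched after the first call (the 'overhead'
	 * function), so later rows print cycles net of the measurement
	 * overhead. */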
	static unsigned int offset;
	unsigned int i;

	csum = (csum & 0xffffffff) + (csum >> 32);
	csum = (csum & 0xffffffff) + (csum >> 32);
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	printf("  %4lx", csum);

	for (i = 0; i < PASSES; i++)
		printf(" %5d", ticks[i] - offset);
	printf("  %s\n", msg);

	if (!offset) {
		offset = ticks[0];
		for (i = 1; i < PASSES; i++)
			if (offset > ticks[i])
				offset = ticks[i];
	}
}

static inline __attribute__((always_inline)) void
do_csum(int pmc_id, const char *msg, unsigned long (*fn)(const unsigned char *, unsigned long),
	const unsigned char *buf, unsigned long len)
{
	unsigned int ticks[PASSES];
	unsigned long csum;
	unsigned int tick;
	unsigned int i;

	for (i = 0; i < PASSES; i++) {
		tick = rdpmc(pmc_id);
		csum = fn(buf, len);
		ticks[i] = rdpmc(pmc_id) - tick;
	}

	print_csum(msg, csum, ticks);
}

int main(int argc, char **argv)
{
	unsigned int len, off;
	unsigned int pmc_id = init_pmc();

	len = 128;
	off = 0;
	for (;;) {
		switch (getopt(argc, argv, "l:o:")) {
		case -1:
			break;
		default:
			exit(1);
		case 'l':
			len = atoi(optarg);
			continue;
		case 'o':
			off = atoi(optarg);
			continue;
		}
		break;
	}

	/* Solving partial blocks is a different problem... */
	len = (len + 63) & ~63;
	if (!len)
		len = 64;

	do_csum(pmc_id, "overhead", overhead, buffer + off, len);
	do_csum(pmc_id, "adc_dec_2", adc_dec_2, buffer + off, len);
	do_csum(pmc_id, "adc_dec_4", adc_dec_4, buffer + off, len);
	do_csum(pmc_id, "adc_dec_8", adc_dec_8, buffer + off, len);
	do_csum(pmc_id, "adc_jcxz_2", adc_jcxz_2, buffer + off, len);
	do_csum(pmc_id, "adc_jcxz_4", adc_jcxz_4, buffer + off, len);
	do_csum(pmc_id, "adc_2_pair", adc_2_pair, buffer + off, len);
	do_csum(pmc_id, "adc_4_pair", adc_4_pair, buffer + off, len);
	do_csum(pmc_id, "adcx_adox", adcx_adox, buffer + off, len);
	do_csum(pmc_id, "add_c_16", add_c_16, buffer + off, len);
	do_csum(pmc_id, "add_c_32", add_c_32, buffer + off, len);
	do_csum(pmc_id, "add_c_high", add_c_high, buffer + off, len);

	return 0;
}
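
/*
 * Build and run (a sketch; the option values are only examples):
 *
 *	gcc -O2 -o ipcsum ipcsum.c
 *	./ipcsum -l 1024	# bytes to sum, rounded up to a multiple of 64
 *	./ipcsum -l 1024 -o 32	# start 32 bytes into the (aligned) buffer
 *
 * If perf_event_open() or rdpmc faults, kernel.perf_event_paranoid and
 * (on x86) /sys/devices/cpu/rdpmc may need adjusting.
 */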

