linux-arch.vger.kernel.org archive mirror
From: David Laight <David.Laight@ACULAB.COM>
To: 'Al Viro' <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-arch@vger.kernel.org" <linux-arch@vger.kernel.org>
Subject: RE: [PATCH 04/18] csum_and_copy_..._user(): pass 0xffffffff instead of 0 as initial sum
Date: Thu, 23 Jul 2020 13:54:47 +0000
Message-ID: <02938acd78fd40beb02ffc5a1b803d85@AcuMS.aculab.com>
In-Reply-To: <20200722173903.GG2786714@ZenIV.linux.org.uk>

[-- Attachment #1: Type: text/plain, Size: 1012 bytes --]

From: Al Viro
> Sent: 22 July 2020 18:39
> I would love to see your patch, anyway, along with the testcases and performance
> comparison.

See attached program.
Compile and run (as root): csum_iov 1
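
Something like this should do for building it (the csum-copy_64.o
path is an assumption - take it from wherever your kernel build put
it):

	gcc -O2 -o csum_iov csum_iov.c arch/x86/lib/csum-copy_64.o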

Unpatched (as shipped), 16 vectors of 1 byte take ~430 clocks on my Haswell CPU.
With dsl_patch defined they take ~393.

The maximum throughput is ~1.16 clocks/word for 16 vectors of 1k.
For longer vectors the data gets lost from the cache between the iterations.

On an older Ivy Bridge CPU it never goes faster than 2 clocks/word
(due to the 2-clock latency of its ADC instruction).

The absolute limit is 1 clock/word - limited by the memory write.
I suspect that is achievable on Haswell with much less loop unrolling.
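
To illustrate what limits the loop (just a sketch of the dependency
chain, not the actual csum-copy_64.S code - x86-64 gcc inline asm
assumed): each adc consumes the carry flag the previous one produced,
so no amount of unrolling makes the summing itself faster than the
adc latency:

	/* Sum four 64-bit words into a ones-complement accumulator;
	 * each adc has to wait for the carry from the one before. */
	static inline uint64_t adc_sum4(const uint64_t *p, uint64_t sum)
	{
		asm("addq 0(%1),%0\n\t"
		    "adcq 8(%1),%0\n\t"
		    "adcq 16(%1),%0\n\t"
		    "adcq 24(%1),%0\n\t"
		    "adcq $0,%0"	/* fold the final carry back in */
		    : "+r" (sum)
		    : "r" (p)
		    : "memory");
		return sum;
	}

That chain is what runs at 2 clocks/adc before Haswell.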

I had to replace the ror32() with __builtin_bswap32().
The kernel object does contain the 'ror' instruction - even though I
didn't find the asm for it.
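
For reference, the kernel helper being replaced is just a plain 32-bit
rotate, something like the generic version:

	static inline uint32_t ror32(uint32_t word, unsigned int shift)
	{
		return (word >> (shift & 31)) | (word << ((-shift) & 31));
	}

ror32(sum, 8) and __builtin_bswap32(sum) shuffle the byte lanes
differently, but once the sum is folded down to 16 bits the result
is the same.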

	David


[-- Attachment #2: csum_iov.c --]
[-- Type: text/plain, Size: 7804 bytes --]

/* Test program for checksum+copy
 *
 * Executes csum_and_copy_from_iter() in userspace.
 * Uses PERF_COUNT_HW_CPU_CYCLES to see how fast it runs.
 * Always copies 16 copies of the same buffer to the target.
 * Length of each fragment is taken from argv[1].
 *
 * It needs linking with a copy of csum-copy_64.o (eg from a kernel build).
 *
 * For large buffers the 'adc' loop dominates.
 * On anything prior to Haswell this is 2 clocks per adc.
 * On Haswell adc is faster and it seems to approach 1.16 clocks/word.
 * It ought to be possible to get to 1 clock/word on Ivy Bridge (Sandy
 * Bridge?) or later.
 */
// Define dsl_patch for my version (initial sum carried through the asm helper).
// #define dsl_patch

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

/* Userspace stubs for the kernel's branch-prediction annotations. */
#define likely(x) (x)
#define unlikely(x) (x)

typedef uint32_t __wsum;

struct kvec {
	size_t iov_len;
	void   *iov_base;
};

struct iov_iter {
	unsigned int count;
	unsigned int nr_segs;
	const struct kvec *kvec;
	size_t       iov_offset;
};

#define min(a,b) ((a) < (b) ? (a) : (b))

/* Fold a 32-bit partial sum down to 16 bits, adding the carries back in
 * (e.g. fold(0x12345678) == 0x68ac). */
static unsigned short fold(unsigned int csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return csum + (csum >> 16);
}

extern __wsum csum_partial_copy_generic(const void *, void *, size_t, __wsum, void *, void *);


/* The old kernel calling convention: the trailing NULLs are the
 * source/destination exception pointers (unused in userspace). */
__wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
{
	return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
}

/* 32-bit add that feeds the carry back in (end-around carry). */
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
	asm("addl %2,%0\n\t"
	    "adcl $0,%0"
	    : "=r" (a)
	    : "0" (a), "rm" (b));
	return a;
}


static inline __wsum csum_add(__wsum csum, __wsum addend)
{
	return add32_with_carry(csum, addend);
}

static inline __wsum
csum_block_add(__wsum csum, __wsum sum, int offset)
{
	/* Swap byte lanes when this fragment started at an odd offset;
	 * after folding this is equivalent to the kernel's ror32(sum, 8). */
	if (offset & 1)
		sum = __builtin_bswap32(sum);

	return csum_add(csum, sum);
}
//////////////////////////////////////////////////////////////////////

/* Necessary bits from iov_iter.c: walk the kvec array, running STEP on
 * each (iov_base, iov_len) fragment until n bytes have been consumed. */

#define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
	size_t wanted = n;				\
	__p = i->kvec;					\
	__v.iov_len = min(n, __p->iov_len - skip);	\
	if (likely(__v.iov_len)) {			\
		__v.iov_base = __p->iov_base + skip;	\
		(void)(STEP);				\
		skip += __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	while (unlikely(n)) {				\
		__p++;					\
		__v.iov_len = min(n, __p->iov_len);	\
		if (unlikely(!__v.iov_len))		\
			continue;			\
		__v.iov_base = __p->iov_base;		\
		(void)(STEP);				\
		skip = __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	n = wanted;					\
}



#define iterate_and_advance(i, n, v, I, B, K) {			\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (i->count) {						\
		size_t skip = i->iov_offset;			\
			const struct kvec *kvec;		\
			struct kvec v;				\
			iterate_kvec(i, n, v, kvec, skip, (K))	\
			if (skip == kvec->iov_len) {		\
				kvec++;				\
				skip = 0;			\
			}					\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		i->count -= n;					\
		i->iov_offset = skip;				\
	}							\
}
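
/*
 * The crux of the comparison: with dsl_patch the running sum is fed
 * straight into the asm helper and byte-swapped after odd-length
 * fragments; without it each fragment is summed from 0 and merged into
 * the running sum with csum_block_add().
 */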


static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
#ifdef dsl_patch
	return csum_partial_copy_nocheck(from, to, len, sum);
#else
	__wsum next = csum_partial_copy_nocheck(from, to, len, 0);
	return csum_block_add(sum, next, off);
#endif
}



size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *to = addr;
	__wsum sum, next;
	size_t off = 0;
	sum = *csum;
	/* Checksum+copy every fragment, tracking the running byte offset
	 * so that odd-length fragments can be corrected for. */
	iterate_and_advance(i, bytes, v, , ,({
		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
				      v.iov_base, v.iov_len,
				      sum, off);
		off += v.iov_len;
#ifdef dsl_patch
		if (v.iov_len & 1)
			sum = __builtin_bswap32(sum);
#endif
	})
	)
#ifdef dsl_patch
	if (off & 1)
		sum = __builtin_bswap32(sum);
#endif
	*csum = sum;
	return bytes;
}

//////////////////////////////////////////////////////////////////////

/* Stubs for the exception-table handlers referenced by csum-copy_64.o. */
void ex_handler_uaccess(void) { }
void ex_handler_default(void) { }

static char data[65536] = {

0x46,0x56,0x20,0x04,0x00,0x02,0x00,0x00,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
0xd4,0x00,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,

};


static inline unsigned int rdpmc(unsigned int counter)
{
	unsigned int low, high;

	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

	// Return the low bits; the counter might be 32 or 40 bits wide.
	return low;
}

unsigned int read_cpu_cycles(void)
{
	static struct perf_event_attr perf_attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		// .config = PERF_COUNT_HW_INSTRUCTIONS,
		.pinned = 1,
	};
	static struct perf_event_mmap_page *pc;
	unsigned int seq, idx, count;

	if (!pc) {
		int perf_fd;
		perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
		if (perf_fd < 0) {
			fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
			exit(1);
		}
		pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
		if (pc == MAP_FAILED) {
			fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
			exit(1);
		}
	}

	/* pc->lock is a sequence count: retry if the kernel changed the
	 * event (and hence index/offset) while we were reading them. */
	do {
		seq = pc->lock;
		asm volatile("":::"memory");
		idx = pc->index;
		if (!idx) /* || !pc->cap_user_rdpmc */
			return 0;
		count = pc->offset + rdpmc(idx - 1);
		asm volatile("":::"memory");
	} while (pc->lock != seq);

	return count;
}


/* Room for 16 copies of data[]. */
static int target[16 * sizeof data / 4];

#define PASSES 16
int main(int argc, char **argv)
{
	struct kvec kvec[16];
	struct iov_iter i;
	int len;
	unsigned int clocks[PASSES];
	__wsum csum[PASSES] = {};
	unsigned int pass;
	unsigned int frag_len;

	/* The first call creates the perf event and maps it;
	 * the second reads the baseline cycle count. */
	read_cpu_cycles();
	clocks[0] = read_cpu_cycles();

	frag_len = argv[1] ? atoi(argv[1]) : 0;
	if (!frag_len || frag_len > sizeof data)
		frag_len = sizeof data;

	for (pass = 1; pass < PASSES; pass++) {
		/* Sum the same data 16 times */
		i.count = frag_len * 16;
		i.nr_segs = 16;
		i.kvec = kvec;
		i.iov_offset = 0;

		for (len = 0; len < 16; len++) {
			kvec[len].iov_len = frag_len;
			kvec[len].iov_base = data;
		}
		csum_and_copy_from_iter(target, i.count, csum + pass, &i);
		clocks[pass] = read_cpu_cycles();
	}
	for (pass = 1; pass < PASSES; pass++) {
		unsigned int delta = clocks[pass] - clocks[pass - 1];
		printf("pass %d: length %d, csum %x, clocks %d, clocks/word %5f\n",
			pass, frag_len * 16, fold(csum[pass]), delta, delta / (frag_len * 16/8 + 0.0));
	}

	return 0;
}


Thread overview: 102+ messages
2020-07-21 20:24 [RFC][CFT][PATCHSET] saner calling conventions for csum-and-copy primitives Al Viro
2020-07-21 20:25 ` [PATCH 01/18] skb_copy_and_csum_bits(): don't bother with the last argument Al Viro
2020-07-21 20:25   ` [PATCH 02/18] icmp_push_reply(): reorder adding the checksum up Al Viro
2020-07-21 20:25   ` [PATCH 03/18] csum_partial_copy_nocheck(): drop the last argument Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 04/18] csum_and_copy_..._user(): pass 0xffffffff instead of 0 as initial sum Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:55     ` Linus Torvalds
2020-07-21 20:58       ` Linus Torvalds
2020-07-21 21:11         ` Al Viro
2020-07-21 21:16           ` Linus Torvalds
2020-07-21 21:16             ` Linus Torvalds
2020-07-25 17:54           ` Al Viro
2020-07-22  9:45       ` David Laight
2020-07-22  9:27     ` David Laight
2020-07-22 14:42       ` Al Viro
2020-07-22 15:22         ` David Laight
2020-07-22 15:54           ` Al Viro
2020-07-22 16:17             ` David Laight
2020-07-22 17:39               ` Al Viro
2020-07-23  8:29                 ` David Laight
2020-07-23 13:54                 ` David Laight [this message]
2020-07-23 14:30                   ` David Laight
2020-07-23 14:53                   ` Al Viro
2020-07-23 15:19                     ` David Laight
2020-07-23 15:21                     ` Al Viro
2020-07-23 15:36                       ` David Laight
2020-07-21 20:25   ` [PATCH 05/18] saner calling conventions for csum_and_copy_..._user() Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 06/18] alpha: propagate the calling convention changes down to csum_partial_copy.c helpers Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 07/18] arm: propagate the calling convention changes down to csum_partial_copy_from_user() Al Viro
2020-07-21 20:25   ` [PATCH 08/18] m68k: get rid of zeroing destination on error in csum_and_copy_from_user() Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 09/18] sh: propage the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-07-21 20:25   ` [PATCH 10/18] i386: propagate " Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 11/18] sparc32: propagate the calling conventions change down to __csum_partial_copy_sparc_generic() Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-22  1:20     ` David Miller
2020-07-21 20:25   ` [PATCH 12/18] mips: csum_and_copy_{to,from}_user() are never called under KERNEL_DS Al Viro
2020-07-21 20:25   ` [PATCH 13/18] mips: __csum_partial_copy_kernel() has no users left Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 14/18] mips: propagate the calling convention change down into __csum_partial_copy_..._user() Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 15/18] xtensa: propagate the calling conventions change down into csum_partial_copy_generic() Al Viro
2020-07-22  8:56     ` Max Filippov
2020-07-21 20:25   ` [PATCH 16/18] sparc64: propagate the calling convention changes down to __csum_partial_copy_...() Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-22  1:21     ` David Miller
2020-07-21 20:25   ` [PATCH 17/18] amd64: switch csum_partial_copy_generic() to new calling conventions Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-21 20:25   ` [PATCH 18/18] ppc: propagate the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-07-21 20:25     ` Al Viro
2020-07-24  1:25 ` [RFC][CFT][PATCHSET v2] saner calling conventions for csum-and-copy primitives Al Viro
2020-07-24  1:25   ` [PATCH v2 01/20] xtensa: fix access check in csum_and_copy_from_user Al Viro
2020-07-24  1:25     ` [PATCH v2 02/20] skb_copy_and_csum_bits(): don't bother with the last argument Al Viro
2020-07-24  1:25     ` [PATCH v2 03/20] icmp_push_reply(): reorder adding the checksum up Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 04/20] unify generic instances of csum_partial_copy_nocheck() Al Viro
2020-07-24  6:41       ` Christoph Hellwig
2020-07-24 12:19         ` Al Viro
2020-07-24 12:23           ` Christoph Hellwig
2020-07-24 12:30             ` Al Viro
2020-07-26  7:11               ` Christoph Hellwig
2020-07-26  7:11                 ` Christoph Hellwig
2020-07-27  3:58                 ` Al Viro
2020-07-24  1:25     ` [PATCH v2 05/20] csum_partial_copy_nocheck(): drop the last argument Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 06/20] csum_and_copy_..._user(): pass 0xffffffff instead of 0 as initial sum Al Viro
2020-07-24  1:25     ` [PATCH v2 07/20] saner calling conventions for csum_and_copy_..._user() Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 08/20] alpha: propagate the calling convention changes down to csum_partial_copy.c helpers Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 09/20] arm: propagate the calling convention changes down to csum_partial_copy_from_user() Al Viro
2020-07-24  1:25     ` [PATCH v2 10/20] m68k: get rid of zeroing destination on error in csum_and_copy_from_user() Al Viro
2020-07-24  1:25     ` [PATCH v2 11/20] sh: propage the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 12/20] i386: propagate " Al Viro
2020-07-24  1:25     ` [PATCH v2 13/20] sparc32: propagate the calling conventions change down to __csum_partial_copy_sparc_generic() Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 14/20] mips: csum_and_copy_{to,from}_user() are never called under KERNEL_DS Al Viro
2020-07-24  1:25     ` [PATCH v2 15/20] mips: __csum_partial_copy_kernel() has no users left Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 16/20] mips: propagate the calling convention change down into __csum_partial_copy_..._user() Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 17/20] xtensa: propagate the calling conventions change down into csum_partial_copy_generic() Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 18/20] sparc64: propagate the calling convention changes down to __csum_partial_copy_...() Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 19/20] amd64: switch csum_partial_copy_generic() to new calling conventions Al Viro
2020-07-24  1:25       ` Al Viro
2020-07-24  1:25     ` [PATCH v2 20/20] ppc: propagate the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-07-24  1:25       ` Al Viro
2020-10-14 22:26       ` Jason A. Donenfeld
2020-10-14 22:51         ` Linus Torvalds
2020-10-14 22:53           ` Linus Torvalds
2020-10-14 22:54             ` Jason A. Donenfeld
2020-10-14 22:53           ` Jason A. Donenfeld
2020-10-14 23:12           ` Al Viro
2020-10-14 23:02         ` [PATCH] powerpc32: don't adjust unmoved stack pointer in csum_partial_copy_generic() epilogue Jason A. Donenfeld
2020-10-14 23:05           ` Linus Torvalds
