* [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
@ 2016-01-05 18:41 Tom Herbert
  2016-01-05 22:18 ` Eric Dumazet
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Tom Herbert @ 2016-01-05 18:41 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team, tglx, mingo, hpa, x86

Implement assembly routine for csum_partial for 64 bit x86. This
primarily speeds up checksum calculation for smaller lengths such as
those that are present when doing skb_postpull_rcsum when getting
CHECKSUM_COMPLETE from device or after CHECKSUM_UNNECESSARY
conversion.

This implementation is similar to the csum_partial implemented in
checksum_32.S; however, since we are dealing with 8 bytes at a time
there are more cases for small lengths, and for those we employ a jump
table. Also, we don't do anything special for alignment; unaligned
accesses on x86 do not appear to be a performance issue.
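
In rough C terms, the structure of the new routine looks like the
following. This is only a sketch and not part of the patch: the helper
name is made up, and a 128-bit accumulator stands in for the adc carry
chain.

#include <string.h>

/* Illustrative only; mirrors the control flow of the assembly below. */
static unsigned int csum_sketch(const unsigned char *buff, int len,
				unsigned int sum)
{
	unsigned __int128 acc = sum;

	/* 1) sum 8 bytes at a time (the asm unrolls this into 64-byte
	 *    blocks plus a jump into a partially unrolled ladder)
	 */
	for (int quads = len >> 3; quads; quads--, buff += 8) {
		unsigned long long q;

		memcpy(&q, buff, 8);
		acc += q;
	}

	/* 2) remaining 0-7 bytes (the asm uses a jump table here) */
	if (len & 4) {
		unsigned int v;

		memcpy(&v, buff, 4);
		acc += v;
		buff += 4;
	}
	if (len & 2) {
		unsigned short v;

		memcpy(&v, buff, 2);
		acc += v;
		buff += 2;
	}
	if (len & 1)
		acc += *buff;	/* little endian: odd byte is the low byte */

	/* 3) fold to 32 bits with end-around carry */
	while (acc >> 32)
		acc = (acc & 0xffffffff) + (acc >> 32);

	return (unsigned int)acc;
}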

Testing:

Verified correctness by testing arbitrary length buffer filled with
random data. For each buffer I compared the computed checksum
using the original algorithm for each possible alignment (0-7 bytes).
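
A userspace harness along those lines might look like this. It is only
a sketch: csum_ref() and csum_new() are hypothetical stand-ins for the
old C routine and the new assembly, assumed to be linked into the test
separately.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the old and new implementations. */
extern uint32_t csum_ref(const void *buff, int len, uint32_t sum);
extern uint32_t csum_new(const void *buff, int len, uint32_t sum);

int main(void)
{
	static unsigned char buf[2048 + 8];

	for (int trial = 0; trial < 100000; trial++) {
		int len = rand() % 2048;

		for (int i = 0; i < len + 8; i++)
			buf[i] = rand();

		/* check every possible starting alignment (0-7 bytes) */
		for (int align = 0; align < 8; align++)
			assert(csum_ref(buf + align, len, 0) ==
			       csum_new(buf + align, len, 0));
	}
	printf("OK\n");
	return 0;
}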

Checksum performance:

Isolating old and new implementation for some common cases:

                        Old      New
Case                    nsecs    nsecs     Improvement
---------------------+--------+--------+-----------------------------
1400 bytes (0 align)    194.5    174.3     10%    (Big packet)
40 bytes (0 align)      13.8     5.8       57%    (Ipv6 hdr common case)
8 bytes (4 align)       8.4      2.9       65%    (UDP, VXLAN in IPv4)
14 bytes (0 align)      10.6     5.8       45%    (Eth hdr)
14 bytes (4 align)      10.8     5.8       46%    (Eth hdr in IPv4)

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 arch/x86/include/asm/checksum_64.h |   5 ++
 arch/x86/lib/csum-partial_64.S     | 147 ++++++++++++++++++++++++++++++++++++
 arch/x86/lib/csum-partial_64.c     | 148 -------------------------------------
 3 files changed, 152 insertions(+), 148 deletions(-)
 create mode 100644 arch/x86/lib/csum-partial_64.S
 delete mode 100644 arch/x86/lib/csum-partial_64.c

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e17..a888f65 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -128,6 +128,11 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
  */
 extern __wsum csum_partial(const void *buff, int len, __wsum sum);
 
+static inline __sum16 ip_compute_csum(const void *buff, int len)
+{
+	return csum_fold(csum_partial(buff, len, 0));
+}
+
 #define  _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
 #define HAVE_CSUM_COPY_USER 1
 
diff --git a/arch/x86/lib/csum-partial_64.S b/arch/x86/lib/csum-partial_64.S
new file mode 100644
index 0000000..8e387bb
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.S
@@ -0,0 +1,147 @@
+/* Copyright 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * Checksum partial calculation
+ *
+ * __wsum csum_partial(const void *buff, int len, __wsum sum)
+ *
+ * Computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * Returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * Register usage:
+ *   %rdi: argument 1, buff
+ *   %rsi: argument 2, length
+ *   %rdx: argument 3, add in value
+ *   %rax,%eax: accumulator and return value
+ *   %rcx,%ecx: counter and tmp
+ *   %r11: tmp
+ *
+ * Basic algorithm:
+ *   1) Sum 8 bytes at a time using adcq (unroll main loop
+ *      to do 64 bytes at a time)
+ *   2) Sum remaining length (less than 8 bytes)
+ *
+ * Note that buffer alignment is not considered, unaligned accesses on x86 don't
+ * seem to be a performance hit (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set).
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/asm.h>
+
+#define branch_tbl_len		.L_branch_tbl_len
+
+ENTRY(csum_partial)
+	movl	%edx, %eax	/* Initialize with initial sum argument */
+
+	/* Check length */
+	cmpl	$8, %esi
+	jg	10f
+	jl	20f
+
+	/* Exactly 8 bytes length */
+	addl	(%rdi), %eax
+	adcl	4(%rdi), %eax
+	adcl	$0, %eax
+	ret
+
+	/* Less than 8 bytes length */
+20:	clc
+	jmpq *branch_tbl_len(, %rsi, 8)
+
+	/* Greater than 8 bytes length. Determine number of quads (n). Sum
+	 * over first n % 8 quads
+	 */
+10:	movl	%esi, %ecx
+	shrl	$3, %ecx
+	andl	$0x7, %ecx
+	negq	%rcx
+	lea	20f(, %rcx, 4), %r11
+	clc
+	jmp	*%r11
+
+.align 8
+	adcq	6*8(%rdi),%rax
+	adcq	5*8(%rdi),%rax
+	adcq	4*8(%rdi),%rax
+	adcq	3*8(%rdi),%rax
+	adcq	2*8(%rdi),%rax
+	adcq	1*8(%rdi),%rax
+	adcq	0*8(%rdi),%rax
+	nop
+20:	/* #quads % 8 jump table base */
+
+	adcq	$0, %rax
+	shlq	$3, %rcx
+	subq	%rcx, %rdi /* %rcx is already negative length */
+
+	/* Now determine number of blocks of 8 quads. Sum 64 bytes at a time
+	 * using unrolled loop.
+	 */
+	movl	%esi, %ecx
+	shrl	$6, %ecx
+	jz	30f
+	clc
+
+	/* Main loop */
+40:	adcq	0*8(%rdi),%rax
+	adcq	1*8(%rdi),%rax
+	adcq	2*8(%rdi),%rax
+	adcq	3*8(%rdi),%rax
+	adcq	4*8(%rdi),%rax
+	adcq	5*8(%rdi),%rax
+	adcq	6*8(%rdi),%rax
+	adcq	7*8(%rdi),%rax
+	lea	64(%rdi), %rdi
+	loop	40b
+
+	adcq	$0, %rax
+
+	/* Handle remaining length which is < 8 bytes */
+30:	andl	$0x7, %esi
+
+	/* Fold 64 bit sum to 32 bits */
+	movq	%rax, %rcx
+	shrq	$32, %rcx
+	addl	%ecx, %eax
+
+	jmpq *branch_tbl_len(, %rsi, 8)
+
+/* Length table targets */
+
+107:	/* Length 7 */
+	adcw	4(%rdi), %ax
+105:	/* Length 5 */
+	adcw	2(%rdi), %ax
+103:	/* Length 3 */
+	adcw	(%rdi), %ax
+101:	/* Length 1, grab the odd byte */
+	adcb	-1(%rdi, %rsi), %al
+	adcb	$0, %ah
+	adcl	$0, %eax
+	ret
+106:	/* Length 6 */
+	adcw	4(%rdi), %ax
+104:	/* Length 4 */
+	adcl	(%rdi), %eax
+	adcl	$0, %eax
+	ret
+102:	/* Length 2 */
+	adcw	(%rdi), %ax
+100:	/* Length 0 */
+	adcl	$0, %eax
+	ret
+
+.section .rodata
+.align 64
+.L_branch_tbl_len:
+	.quad	100b
+	.quad	101b
+	.quad	102b
+	.quad	103b
+	.quad	104b
+	.quad	105b
+	.quad	106b
+	.quad	107b
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
deleted file mode 100644
index 9845371..0000000
--- a/arch/x86/lib/csum-partial_64.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * arch/x86_64/lib/csum-partial.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed.
- */
- 
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <asm/checksum.h>
-
-static inline unsigned short from32to16(unsigned a) 
-{
-	unsigned short b = a >> 16; 
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n" 
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
-}
-
-/*
- * Do a 64-bit checksum on an arbitrary memory area.
- * Returns a 32bit checksum.
- *
- * This isn't as time critical as it used to be because many NICs
- * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
- */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
-{
-	unsigned odd, count;
-	unsigned long result = 0;
-
-	if (unlikely(len == 0))
-		return result; 
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
-
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-					buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
-
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
-	if (unlikely(odd)) { 
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
-}
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum(const void *buff, int len)
-{
-	return csum_fold(csum_partial(buff,len,0));
-}
-EXPORT_SYMBOL(ip_compute_csum);
-
-- 
2.4.6


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-05 18:41 [PATCH v2 net-next] net: Implement fast csum_partial for x86_64 Tom Herbert
@ 2016-01-05 22:18 ` Eric Dumazet
  2016-01-06  1:10   ` H. Peter Anvin
  2016-01-06 10:16   ` David Laight
  2016-01-05 23:35 ` Hannes Frederic Sowa
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 14+ messages in thread
From: Eric Dumazet @ 2016-01-05 22:18 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, kernel-team, tglx, mingo, hpa, x86

On Tue, 2016-01-05 at 10:41 -0800, Tom Herbert wrote:
> Implement assembly routine for csum_partial for 64 bit x86. This
> primarily speeds up checksum calculation for smaller lengths such as
> those that are present when doing skb_postpull_rcsum when getting
> CHECKSUM_COMPLETE from device or after CHECKSUM_UNNECESSARY
> conversion.

Very nice !


You might add a comment noting that the '4' comes from the length of the
'adcq 6*8(%rdi),%rax' instruction, and that the 'nop' compensates for
'adcq 0*8(%rdi),%rax' using only 3 bytes.

We could also use .byte 0x48, 0x13, 0x47, 0x00 to force a 4-byte
instruction and remove the nop.


+       lea     20f(, %rcx, 4), %r11
+       clc
+       jmp     *%r11
+
+.align 8
+       adcq    6*8(%rdi),%rax
+       adcq    5*8(%rdi),%rax
+       adcq    4*8(%rdi),%rax
+       adcq    3*8(%rdi),%rax
+       adcq    2*8(%rdi),%rax
+       adcq    1*8(%rdi),%rax
+       adcq    0*8(%rdi),%rax // could force a 4 byte instruction (.byte 0x48, 0x13, 0x47, 0x00)
+       nop
+20:    /* #quads % 8 jump table base */
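
Spelled out, the encodings behind the '4' and the nop look like this (a
sketch; the byte values are the ones quoted above, 0x30 being the disp8
for 6*8):

/* Illustration only, not part of the patch: */
static const unsigned char adcq_6x8[] = { 0x48, 0x13, 0x47, 0x30 };	/* adcq 6*8(%rdi),%rax: 4 bytes, disp8 0x30 */
static const unsigned char adcq_0x8[] = { 0x48, 0x13, 0x07 };		/* adcq 0*8(%rdi),%rax: 3 bytes, no disp */
static const unsigned char adcq_0x8_d8[] = { 0x48, 0x13, 0x47, 0x00 };	/* forced 4-byte form; makes the nop unnecessary */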


Acked-by: Eric Dumazet <edumazet@google.com>


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-05 18:41 [PATCH v2 net-next] net: Implement fast csum_partial for x86_64 Tom Herbert
  2016-01-05 22:18 ` Eric Dumazet
@ 2016-01-05 23:35 ` Hannes Frederic Sowa
  2016-01-06  3:21   ` Eric Dumazet
  2016-01-06 20:05 ` Andi Kleen
  2016-01-07  1:52 ` Hannes Frederic Sowa
  3 siblings, 1 reply; 14+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-05 23:35 UTC (permalink / raw)
  To: Tom Herbert, davem, netdev; +Cc: kernel-team, tglx, mingo, hpa, x86

Hi,

On 05.01.2016 19:41, Tom Herbert wrote:
> Implement assembly routine for csum_partial for 64 bit x86. This
> primarily speeds up checksum calculation for smaller lengths such as
> those that are present when doing skb_postpull_rcsum when getting
> CHECKSUM_COMPLETE from device or after CHECKSUM_UNNECESSARY
> conversion.
>
> This implementation is similar to csum_partial implemented in
> checksum_32.S, however since we are dealing with 8 bytes at a time
> there are more cases for small lengths-- for that we employ a jump
> table. Also, we don't do anything special for alignment, unaligned
> accesses on x86 do not appear to be a performance issue.
>
> Testing:
>
> Verified correctness by testing arbitrary length buffer filled with
> random data. For each buffer I compared the computed checksum
> using the original algorithm for each possible alignment (0-7 bytes).
>
> Checksum performance:
>
> Isolating old and new implementation for some common cases:
>
>                          Old      New
> Case                    nsecs    nsecs     Improvement
> ---------------------+--------+--------+-----------------------------
> 1400 bytes (0 align)    194.5    174.3     10%    (Big packet)
> 40 bytes (0 align)      13.8     5.8       57%    (Ipv6 hdr common case)
> 8 bytes (4 align)       8.4      2.9       65%    (UDP, VXLAN in IPv4)
> 14 bytes (0 align)      10.6     5.8       45%    (Eth hdr)
> 14 bytes (4 align)      10.8     5.8       46%    (Eth hdr in IPv4)
>
> Signed-off-by: Tom Herbert <tom@herbertland.com>

Also,

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

Tested with the same test cases as the old patch; it showed no problems
and the same improvements.

Tom, did you have a look at whether it makes sense to add a second carry
addition chain using the adcx/adox instructions? adox signals its carry
via the overflow flag rather than the carry flag, so that chain has no
dependency on the adc carry chain and could help the CPU parallelize the
code even more (increased instructions per cycle).
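
For illustration, a minimal sketch of such a dual-chain inner loop
follows. It is an assumption-laden sketch rather than a proposal: it
needs an ADX-capable CPU, assumes at least one 64-byte block, and the
caller still has to fold the 64-bit result; the helper name is made up.

/* Illustrative only: two independent carry chains, adcx via CF and
 * adox via OF, merged at the end with an end-around carry.
 */
static inline unsigned long csum_64byte_blocks_adx(const void *buff,
						   unsigned long blocks,
						   unsigned long sum)
{
	unsigned long odd = 0;

	asm("xorl %k[odd],%k[odd]\n\t"		/* clear CF/OF, re-zero 2nd accumulator */
	    "1:\n\t"
	    "adcx 0*8(%[src]),%[sum]\n\t"	/* even quads: CF chain */
	    "adox 1*8(%[src]),%[odd]\n\t"	/* odd quads: OF chain */
	    "adcx 2*8(%[src]),%[sum]\n\t"
	    "adox 3*8(%[src]),%[odd]\n\t"
	    "adcx 4*8(%[src]),%[sum]\n\t"
	    "adox 5*8(%[src]),%[odd]\n\t"
	    "adcx 6*8(%[src]),%[sum]\n\t"
	    "adox 7*8(%[src]),%[odd]\n\t"
	    "lea 64(%[src]),%[src]\n\t"
	    "loop 1b\n\t"			/* 'loop' leaves CF and OF intact */
	    "adcx %%rcx,%[sum]\n\t"		/* %rcx is 0 here: fold in the final CF */
	    "adox %%rcx,%[odd]\n\t"		/* fold in the final OF */
	    "addq %[odd],%[sum]\n\t"		/* merge the two chains */
	    "adcq $0,%[sum]"
	    : [sum] "+r" (sum), [odd] "+r" (odd), [src] "+r" (buff),
	      "+c" (blocks)
	    :
	    : "memory", "cc");

	return sum;
}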

Thanks,
Hannes


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-05 22:18 ` Eric Dumazet
@ 2016-01-06  1:10   ` H. Peter Anvin
  2016-01-06  3:02     ` Eric Dumazet
  2016-01-06 10:16   ` David Laight
  1 sibling, 1 reply; 14+ messages in thread
From: H. Peter Anvin @ 2016-01-06  1:10 UTC (permalink / raw)
  To: Eric Dumazet, Tom Herbert; +Cc: davem, netdev, kernel-team, tglx, mingo, x86

On 01/05/2016 02:18 PM, Eric Dumazet wrote:
> On Tue, 2016-01-05 at 10:41 -0800, Tom Herbert wrote:
>> Implement assembly routine for csum_partial for 64 bit x86. This
>> primarily speeds up checksum calculation for smaller lengths such as
>> those that are present when doing skb_postpull_rcsum when getting
>> CHECKSUM_COMPLETE from device or after CHECKSUM_UNNECESSARY
>> conversion.
> 
> Very nice !
> 
> 
> You might add a comment telling the '4' comes from length of 'adcq
> 6*8(%rdi),%rax' instruction, and that the 'nop' is to compensate that
> 'adcq    0*8(%rdi),%rax' is using 3 bytes instead.
> 
> We also could use .byte 0x48, 0x13, 0x47, 0x00 to force a 4 bytes
> instruction and remove the nop.
> 

Apparently "adcq.d8" will do The Right Thing for this.

	-hpa


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-06  1:10   ` H. Peter Anvin
@ 2016-01-06  3:02     ` Eric Dumazet
  0 siblings, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2016-01-06  3:02 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Tom Herbert, davem, netdev, kernel-team, tglx, mingo, x86

On Tue, 2016-01-05 at 17:10 -0800, H. Peter Anvin wrote:

> Apparently "adcq.d8" will do The Right Thing for this.

Nice trick ;)


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-05 23:35 ` Hannes Frederic Sowa
@ 2016-01-06  3:21   ` Eric Dumazet
  0 siblings, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2016-01-06  3:21 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Tom Herbert, davem, netdev, kernel-team, tglx, mingo, hpa, x86

On Wed, 2016-01-06 at 00:35 +0100, Hannes Frederic Sowa wrote:

> 
> Tom, did you have a look if it makes sense to add a second carry 
> addition train with the adcx instruction, which does not signal carry 
> via the carry flag but with the overflow flag? This instruction should 
> not have any dependencies with  the adc instructions and could help the 
> CPU to parallelize the code even more (increased instructions per cycle).

I guess adcx would (possibly) bring improvements for large areas, but
for this case the bottleneck is to bring data from memory.

Note this topic was discussed 2 years ago and no conclusive action was
taken.

https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg529610.html


* RE: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-05 22:18 ` Eric Dumazet
  2016-01-06  1:10   ` H. Peter Anvin
@ 2016-01-06 10:16   ` David Laight
  2016-01-06 14:25     ` Eric Dumazet
  1 sibling, 1 reply; 14+ messages in thread
From: David Laight @ 2016-01-06 10:16 UTC (permalink / raw)
  To: 'Eric Dumazet', Tom Herbert
  Cc: davem, netdev, kernel-team, tglx, mingo, hpa, x86

From: Eric Dumazet
> Sent: 05 January 2016 22:19
> To: Tom Herbert
> You might add a comment telling the '4' comes from length of 'adcq
> 6*8(%rdi),%rax' instruction, and that the 'nop' is to compensate that
> 'adcq    0*8(%rdi),%rax' is using 3 bytes instead.
> 
> We also could use .byte 0x48, 0x13, 0x47, 0x00 to force a 4 bytes
> instruction and remove the nop.
> 
> 
> +       lea     20f(, %rcx, 4), %r11
> +       clc
> +       jmp     *%r11
> +
> +.align 8
> +       adcq    6*8(%rdi),%rax
> +       adcq    5*8(%rdi),%rax
> +       adcq    4*8(%rdi),%rax
> +       adcq    3*8(%rdi),%rax
> +       adcq    2*8(%rdi),%rax
> +       adcq    1*8(%rdi),%rax
> +       adcq    0*8(%rdi),%rax // could force a 4 byte instruction (.byte 0x48, 0x13, 0x47, 0x00)
> +       nop
> +20:    /* #quads % 8 jump table base */

Or move label at the top (after the .align) and adjust the maths.
You could add a second label after the first adcq and use the
difference between them for the '4'.

	David



* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-06 10:16   ` David Laight
@ 2016-01-06 14:25     ` Eric Dumazet
  2016-01-06 14:49       ` David Laight
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2016-01-06 14:25 UTC (permalink / raw)
  To: David Laight
  Cc: Tom Herbert, davem, netdev, kernel-team, tglx, mingo, hpa, x86

On Wed, 2016-01-06 at 10:16 +0000, David Laight wrote:
> From: Eric Dumazet
> > Sent: 05 January 2016 22:19
> > To: Tom Herbert
> > You might add a comment telling the '4' comes from length of 'adcq
> > 6*8(%rdi),%rax' instruction, and that the 'nop' is to compensate that
> > 'adcq    0*8(%rdi),%rax' is using 3 bytes instead.
> > 
> > We also could use .byte 0x48, 0x13, 0x47, 0x00 to force a 4 bytes
> > instruction and remove the nop.
> > 
> > 
> > +       lea     20f(, %rcx, 4), %r11
> > +       clc
> > +       jmp     *%r11
> > +
> > +.align 8
> > +       adcq    6*8(%rdi),%rax
> > +       adcq    5*8(%rdi),%rax
> > +       adcq    4*8(%rdi),%rax
> > +       adcq    3*8(%rdi),%rax
> > +       adcq    2*8(%rdi),%rax
> > +       adcq    1*8(%rdi),%rax
> > +       adcq    0*8(%rdi),%rax // could force a 4 byte instruction (.byte 0x48, 0x13, 0x47, 0x00)
> > +       nop
> > +20:    /* #quads % 8 jump table base */
> 
> Or move label at the top (after the .align) and adjust the maths.
> You could add a second label after the first adcq and use the
> difference between them for the '4'.

Not really.

We could not use the trick if the length was 5.

Only 1, 2, 4 and 8 are supported.


* RE: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-06 14:25     ` Eric Dumazet
@ 2016-01-06 14:49       ` David Laight
  2016-01-06 15:03         ` Eric Dumazet
  0 siblings, 1 reply; 14+ messages in thread
From: David Laight @ 2016-01-06 14:49 UTC (permalink / raw)
  To: 'Eric Dumazet'
  Cc: Tom Herbert, davem, netdev, kernel-team, tglx, mingo, hpa, x86

From: Eric Dumazet
> Sent: 06 January 2016 14:25
> On Wed, 2016-01-06 at 10:16 +0000, David Laight wrote:
> > From: Eric Dumazet
> > > Sent: 05 January 2016 22:19
> > > To: Tom Herbert
> > > You might add a comment telling the '4' comes from length of 'adcq
> > > 6*8(%rdi),%rax' instruction, and that the 'nop' is to compensate that
> > > 'adcq    0*8(%rdi),%rax' is using 3 bytes instead.
> > >
> > > We also could use .byte 0x48, 0x13, 0x47, 0x00 to force a 4 bytes
> > > instruction and remove the nop.
> > >
> > >
> > > +       lea     20f(, %rcx, 4), %r11
> > > +       clc
> > > +       jmp     *%r11
> > > +
> > > +.align 8
> > > +       adcq    6*8(%rdi),%rax
> > > +       adcq    5*8(%rdi),%rax
> > > +       adcq    4*8(%rdi),%rax
> > > +       adcq    3*8(%rdi),%rax
> > > +       adcq    2*8(%rdi),%rax
> > > +       adcq    1*8(%rdi),%rax
> > > +       adcq    0*8(%rdi),%rax // could force a 4 byte instruction (.byte 0x48, 0x13, 0x47, 0x00)
> > > +       nop
> > > +20:    /* #quads % 8 jump table base */
> >
> > Or move label at the top (after the .align) and adjust the maths.
> > You could add a second label after the first adcq and use the
> > difference between them for the '4'.
> 
> Not really.
> 
> We could not use the trick it the length was 5.
> 
> Only 1, 2, 4 and 8 are supported.

Indeed, and 'lea  20f(, %rcx, 5), %r11' will generate an error from the
assembler.
Seems appropriate to get the assembler to verify this for you.

Assuming this code block is completely skipped for aligned lengths,
the nop isn't needed, provided the '20:' label is at the right place.

Someone also pointed out that the code is memory limited (dual add
chains making no difference), so why is it unrolled at all?

OTOH I'm sure I remember something about false dependencies on the
x86 flags register because of instructions only changing some bits.
So it might be that you can't (or couldn't) get concurrency between
instructions that update different parts of the flags register.

	David




* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-06 14:49       ` David Laight
@ 2016-01-06 15:03         ` Eric Dumazet
  0 siblings, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2016-01-06 15:03 UTC (permalink / raw)
  To: David Laight
  Cc: Tom Herbert, davem, netdev, kernel-team, tglx, mingo, hpa, x86

On Wed, 2016-01-06 at 14:49 +0000, David Laight wrote:

> Someone also pointed out that the code is memory limited (dual add
> chains making no difference), so why is it unrolled at all?

Because it matters if the data is already present in CPU caches.

So why not unroll if it helps in some situations?


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-05 18:41 [PATCH v2 net-next] net: Implement fast csum_partial for x86_64 Tom Herbert
  2016-01-05 22:18 ` Eric Dumazet
  2016-01-05 23:35 ` Hannes Frederic Sowa
@ 2016-01-06 20:05 ` Andi Kleen
  2016-01-07  1:52 ` Hannes Frederic Sowa
  3 siblings, 0 replies; 14+ messages in thread
From: Andi Kleen @ 2016-01-06 20:05 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, kernel-team, tglx, mingo, hpa, x86

Tom Herbert <tom@herbertland.com> writes:

> Also, we don't do anything special for alignment, unaligned
> accesses on x86 do not appear to be a performance issue.

This is not true on Atom CPUs.

Also on most CPUs there is still a larger penalty when crossing
cache lines.

> Verified correctness by testing arbitrary length buffer filled with
> random data. For each buffer I compared the computed checksum
> using the original algorithm for each possible alignment (0-7 bytes).
>
> Checksum performance:
>
> Isolating old and new implementation for some common cases:

You forgot to state the CPU. The results likely depend heavily
on the microarchitecture.

The original C code was optimized for K8 FWIW.

Overall your assembler looks similar to the C code, except for the jump
table. A jump table has the disadvantage that it is much harder to
branch-predict, with a large penalty if it's mispredicted.

I would expect it to be slower for cases where the length
changes frequently. Did you benchmark that case?
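
One way to exercise that case, as a sketch (csum_new() is a hypothetical
stand-in for the new routine; lengths are drawn at random per call so
the indirect jump target rarely repeats):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

extern uint32_t csum_new(const void *buff, int len, uint32_t sum); /* stand-in */

#define CALLS (1 << 20)

int main(void)
{
	static unsigned char buf[256];
	static int lens[CALLS];
	struct timespec t0, t1;
	uint32_t acc = 0;

	for (int i = 0; i < CALLS; i++)
		lens[i] = 1 + rand() % 64;	/* length changes on every call */

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < CALLS; i++)
		acc += csum_new(buf, lens[i], 0);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%.1f ns/call (acc=%u)\n",
	       ((t1.tv_sec - t0.tv_sec) * 1e9 +
		(t1.tv_nsec - t0.tv_nsec)) / CALLS,
	       (unsigned)acc);
	return 0;
}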

-Andi


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-05 18:41 [PATCH v2 net-next] net: Implement fast csum_partial for x86_64 Tom Herbert
                   ` (2 preceding siblings ...)
  2016-01-06 20:05 ` Andi Kleen
@ 2016-01-07  1:52 ` Hannes Frederic Sowa
  2016-01-07  2:36   ` Tom Herbert
  3 siblings, 1 reply; 14+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-07  1:52 UTC (permalink / raw)
  To: Tom Herbert, davem, netdev; +Cc: kernel-team, tglx, mingo, hpa, x86

Hi Tom,

On 05.01.2016 19:41, Tom Herbert wrote:
> --- /dev/null
> +++ b/arch/x86/lib/csum-partial_64.S
> @@ -0,0 +1,147 @@
> +/* Copyright 2016 Tom Herbert <tom@herbertland.com>
> + *
> + * Checksum partial calculation
> + *
> + * __wsum csum_partial(const void *buff, int len, __wsum sum)
> + *
> + * Computes the checksum of a memory block at buff, length len,
> + * and adds in "sum" (32-bit)
> + *
> + * Returns a 32-bit number suitable for feeding into itself
> + * or csum_tcpudp_magic
> + *
> + * Register usage:
> + *   %rdi: argument 1, buff
> + *   %rsi: argument 2, length
> + *   %rdx: argument 3, add in value

I think you forgot to carry-add-in the %rdx register.

The assembly code replaces do_csum but not csum_partial.

Bye,
Hannes


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-07  1:52 ` Hannes Frederic Sowa
@ 2016-01-07  2:36   ` Tom Herbert
  2016-01-07  2:43     ` Hannes Frederic Sowa
  0 siblings, 1 reply; 14+ messages in thread
From: Tom Herbert @ 2016-01-07  2:36 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: David S. Miller, Linux Kernel Network Developers, Kernel Team,
	tglx, mingo, hpa, x86

On Wed, Jan 6, 2016 at 5:52 PM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> Hi Tom,
>
> On 05.01.2016 19:41, Tom Herbert wrote:
>>
>> --- /dev/null
>> +++ b/arch/x86/lib/csum-partial_64.S
>> @@ -0,0 +1,147 @@
>> +/* Copyright 2016 Tom Herbert <tom@herbertland.com>
>> + *
>> + * Checksum partial calculation
>> + *
>> + * __wsum csum_partial(const void *buff, int len, __wsum sum)
>> + *
>> + * Computes the checksum of a memory block at buff, length len,
>> + * and adds in "sum" (32-bit)
>> + *
>> + * Returns a 32-bit number suitable for feeding into itself
>> + * or csum_tcpudp_magic
>> + *
>> + * Register usage:
>> + *   %rdi: argument 1, buff
>> + *   %rsi: argument 2, length
>> + *   %rdx: argument 3, add in value
>
>
> I think you forgot to carry-add-in the %rdx register.
>
> The assembly code replaces do_csum but not csum_partial.
>
First instruction is: movl    %edx, %eax      /* Initialize with
initial sum argument */

Tom

> Bye,
> Hannes
>


* Re: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
  2016-01-07  2:36   ` Tom Herbert
@ 2016-01-07  2:43     ` Hannes Frederic Sowa
  0 siblings, 0 replies; 14+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-07  2:43 UTC (permalink / raw)
  To: Tom Herbert
  Cc: David S. Miller, Linux Kernel Network Developers, Kernel Team,
	tglx, mingo, hpa, x86

On 07.01.2016 03:36, Tom Herbert wrote:
> On Wed, Jan 6, 2016 at 5:52 PM, Hannes Frederic Sowa
> <hannes@stressinduktion.org> wrote:
>> Hi Tom,
>>
>> On 05.01.2016 19:41, Tom Herbert wrote:
>>>
>>> --- /dev/null
>>> +++ b/arch/x86/lib/csum-partial_64.S
>>> @@ -0,0 +1,147 @@
>>> +/* Copyright 2016 Tom Herbert <tom@herbertland.com>
>>> + *
>>> + * Checksum partial calculation
>>> + *
>>> + * __wsum csum_partial(const void *buff, int len, __wsum sum)
>>> + *
>>> + * Computes the checksum of a memory block at buff, length len,
>>> + * and adds in "sum" (32-bit)
>>> + *
>>> + * Returns a 32-bit number suitable for feeding into itself
>>> + * or csum_tcpudp_magic
>>> + *
>>> + * Register usage:
>>> + *   %rdi: argument 1, buff
>>> + *   %rsi: argument 2, length
>>> + *   %rdx: argument 3, add in value
>>
>>
>> I think you forgot to carry-add-in the %rdx register.
>>
>> The assembly code replaces do_csum but not csum_partial.
>>
> First instruction is: movl    %edx, %eax      /* Initialize with
> initial sum argument */

Oops, sorry. I only grepped through the source. Meanwhile my test also
uses non-zero sums and shows that it is fine.

Bye,
Hannes

