All of lore.kernel.org
 help / color / mirror / Atom feed
From: Al Viro <viro@ZenIV.linux.org.uk>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org
Subject: [PATCH 17/18] amd64: switch csum_partial_copy_generic() to new calling conventions
Date: Tue, 21 Jul 2020 21:25:48 +0100	[thread overview]
Message-ID: <20200721202549.4150745-17-viro@ZenIV.linux.org.uk> (raw)
In-Reply-To: <20200721202549.4150745-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

... and fold handling of misaligned case into it.

Implementation note: we stash the "will we need to rol8 the sum in the end"
flag into the MSB of %rcx (the lower 32 bits are used for length); the rest
is pretty straightforward.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/include/asm/checksum_64.h |   5 +-
 arch/x86/lib/csum-copy_64.S        | 140 ++++++++++++++++++++++---------------
 arch/x86/lib/csum-wrappers_64.c    |  72 +++----------------
 3 files changed, 94 insertions(+), 123 deletions(-)

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 9af3aed54c6b..407beebadaf4 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -130,10 +130,7 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 extern __wsum csum_partial(const void *buff, int len, __wsum sum);
 
 /* Do not call this directly. Use the wrappers below */
-extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst,
-					int len, __wsum sum,
-					int *src_err_ptr, int *dst_err_ptr);
-
+extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
 
 extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len);
 extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 3394a8ff7fd0..1fbd8ee9642d 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -18,9 +18,6 @@
  * rdi  source
  * rsi  destination
  * edx  len (32bit)
- * ecx  sum (32bit)
- * r8   src_err_ptr (int)
- * r9   dst_err_ptr (int)
  *
  * Output
  * eax  64bit sum. undefined in case of exception.
@@ -31,44 +28,32 @@
 
 	.macro source
 10:
-	_ASM_EXTABLE_UA(10b, .Lbad_source)
+	_ASM_EXTABLE_UA(10b, .Lfault)
 	.endm
 
 	.macro dest
 20:
-	_ASM_EXTABLE_UA(20b, .Lbad_dest)
+	_ASM_EXTABLE_UA(20b, .Lfault)
 	.endm
 
-	/*
-	 * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
-	 * potentially unmapped kernel address.
-	 */
-	.macro ignore L=.Lignore
-30:
-	_ASM_EXTABLE(30b, \L)
-	.endm
-
-
 SYM_FUNC_START(csum_partial_copy_generic)
-	cmpl	$3*64, %edx
-	jle	.Lignore
-
-.Lignore:
-	subq  $7*8, %rsp
-	movq  %rbx, 2*8(%rsp)
-	movq  %r12, 3*8(%rsp)
-	movq  %r14, 4*8(%rsp)
-	movq  %r13, 5*8(%rsp)
-	movq  %r15, 6*8(%rsp)
+	subq  $5*8, %rsp
+	movq  %rbx, 0*8(%rsp)
+	movq  %r12, 1*8(%rsp)
+	movq  %r14, 2*8(%rsp)
+	movq  %r13, 3*8(%rsp)
+	movq  %r15, 4*8(%rsp)
 
-	movq  %r8, (%rsp)
-	movq  %r9, 1*8(%rsp)
-
-	movl  %ecx, %eax
+	movl  $-1, %eax
+	xorl  %r9d, %r9d
 	movl  %edx, %ecx
+	cmpl  $8, %ecx
+	jb    .Lshort
 
-	xorl  %r9d, %r9d
-	movq  %rcx, %r12
+	testb  $7, %sil
+	jne   .Lunaligned
+.Laligned:
+	movl  %ecx, %r12d
 
 	shrq  $6, %r12
 	jz	.Lhandle_tail       /* < 64 */
@@ -99,7 +84,12 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	source
 	movq  56(%rdi), %r13
 
-	ignore 2f
+30:
+	/*
+	 * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
+	 * potentially unmapped kernel address.
+	 */
+	_ASM_EXTABLE(30b, 2f)
 	prefetcht0 5*64(%rdi)
 2:
 	adcq  %rbx, %rax
@@ -131,8 +121,6 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	dest
 	movq %r13, 56(%rsi)
 
-3:
-
 	leaq 64(%rdi), %rdi
 	leaq 64(%rsi), %rsi
 
@@ -142,8 +130,8 @@ SYM_FUNC_START(csum_partial_copy_generic)
 
 	/* do last up to 56 bytes */
 .Lhandle_tail:
-	/* ecx:	count */
-	movl %ecx, %r10d
+	/* ecx:	count, rcx.63: the end result needs to be rol8 */
+	movq %rcx, %r10
 	andl $63, %ecx
 	shrl $3, %ecx
 	jz	.Lfold
@@ -172,6 +160,7 @@ SYM_FUNC_START(csum_partial_copy_generic)
 .Lhandle_7:
 	movl %r10d, %ecx
 	andl $7, %ecx
+.L1:				/* .Lshort rejoins the common path here */
 	shrl $1, %ecx
 	jz   .Lhandle_1
 	movl $2, %edx
@@ -203,26 +192,65 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	adcl %r9d, %eax		/* carry */
 
 .Lende:
-	movq 2*8(%rsp), %rbx
-	movq 3*8(%rsp), %r12
-	movq 4*8(%rsp), %r14
-	movq 5*8(%rsp), %r13
-	movq 6*8(%rsp), %r15
-	addq $7*8, %rsp
+	testq %r10, %r10
+	js  .Lwas_odd
+.Lout:
+	movq 0*8(%rsp), %rbx
+	movq 1*8(%rsp), %r12
+	movq 2*8(%rsp), %r14
+	movq 3*8(%rsp), %r13
+	movq 4*8(%rsp), %r15
+	addq $5*8, %rsp
 	ret
+.Lshort:
+	movl %ecx, %r10d
+	jmp  .L1
+.Lunaligned:
+	xorl %ebx, %ebx
+	testb $1, %sil
+	jne  .Lodd
+1:	testb $2, %sil
+	je   2f
+	source
+	movw (%rdi), %bx
+	dest
+	movw %bx, (%rsi)
+	leaq 2(%rdi), %rdi
+	subq $2, %rcx
+	leaq 2(%rsi), %rsi
+	addq %rbx, %rax
+2:	testb $4, %sil
+	je .Laligned
+	source
+	movl (%rdi), %ebx
+	dest
+	movl %ebx, (%rsi)
+	leaq 4(%rdi), %rdi
+	subq $4, %rcx
+	leaq 4(%rsi), %rsi
+	addq %rbx, %rax
+	jmp .Laligned
+
+.Lodd:
+	source
+	movb (%rdi), %bl
+	dest
+	movb %bl, (%rsi)
+	leaq 1(%rdi), %rdi
+	leaq 1(%rsi), %rsi
+	/* decrement, set MSB */
+	leaq -1(%rcx, %rcx), %rcx
+	rorq $1, %rcx
+	shll $8, %ebx
+	addq %rbx, %rax
+	jmp 1b
+
+.Lwas_odd:
+	roll $8, %eax
+	jmp .Lout
 
-	/* Exception handlers. Very simple, zeroing is done in the wrappers */
-.Lbad_source:
-	movq (%rsp), %rax
-	testq %rax, %rax
-	jz   .Lende
-	movl $-EFAULT, (%rax)
-	jmp  .Lende
-
-.Lbad_dest:
-	movq 8(%rsp), %rax
-	testq %rax, %rax
-	jz   .Lende
-	movl $-EFAULT, (%rax)
-	jmp .Lende
+	/* Exception: just return 0 */
+.Lfault:
+	xorl %eax, %eax
+	jmp  .Lout
 SYM_FUNC_END(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index ae2fb87e2274..189344924a2b 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -21,49 +21,16 @@
  * src and dst are best aligned to 64bits.
  */
 __wsum
-csum_and_copy_from_user(const void __user *src, void *dst,
-			    int len)
+csum_and_copy_from_user(const void __user *src, void *dst, int len)
 {
-	int err = 0;
-	__wsum isum = ~0U;
+	__wsum sum;
 
 	might_sleep();
-
 	if (!user_access_begin(src, len))
 		return 0;
-
-	/*
-	 * Why 6, not 7? To handle odd addresses aligned we
-	 * would need to do considerable complications to fix the
-	 * checksum which is defined as an 16bit accumulator. The
-	 * fix alignment code is primarily for performance
-	 * compatibility with 32bit and that will handle odd
-	 * addresses slowly too.
-	 */
-	if (unlikely((unsigned long)src & 6)) {
-		while (((unsigned long)src & 6) && len >= 2) {
-			__u16 val16;
-
-			unsafe_get_user(val16, (const __u16 __user *)src, out);
-
-			*(__u16 *)dst = val16;
-			isum = (__force __wsum)add32_with_carry(
-					(__force unsigned)isum, val16);
-			src += 2;
-			dst += 2;
-			len -= 2;
-		}
-	}
-	isum = csum_partial_copy_generic((__force const void *)src,
-				dst, len, isum, &err, NULL);
-	user_access_end();
-	if (unlikely(err))
-		isum = 0;
-	return isum;
-
-out:
+	sum = csum_partial_copy_generic((__force const void *)src, dst, len);
 	user_access_end();
-	return 0;
+	return sum;
 }
 EXPORT_SYMBOL(csum_and_copy_from_user);
 
@@ -79,37 +46,16 @@ EXPORT_SYMBOL(csum_and_copy_from_user);
  * src and dst are best aligned to 64bits.
  */
 __wsum
-csum_and_copy_to_user(const void *src, void __user *dst,
-			  int len)
+csum_and_copy_to_user(const void *src, void __user *dst, int len)
 {
-	__wsum ret, isum = ~0U;
-	int err = 0;
+	__wsum sum;
 
 	might_sleep();
-
 	if (!user_access_begin(dst, len))
 		return 0;
-
-	if (unlikely((unsigned long)dst & 6)) {
-		while (((unsigned long)dst & 6) && len >= 2) {
-			__u16 val16 = *(__u16 *)src;
-
-			isum = (__force __wsum)add32_with_carry(
-					(__force unsigned)isum, val16);
-			unsafe_put_user(val16, (__u16 __user *)dst, out);
-			src += 2;
-			dst += 2;
-			len -= 2;
-		}
-	}
-
-	ret = csum_partial_copy_generic(src, (void __force *)dst,
-					len, isum, NULL, &err);
-	user_access_end();
-	return err ? 0 : ret;
-out:
+	sum = csum_partial_copy_generic(src, (void __force *)dst, len);
 	user_access_end();
-	return 0;
+	return sum;
 }
 EXPORT_SYMBOL(csum_and_copy_to_user);
 
@@ -125,7 +71,7 @@ EXPORT_SYMBOL(csum_and_copy_to_user);
 __wsum
 csum_partial_copy_nocheck(const void *src, void *dst, int len)
 {
-	return csum_partial_copy_generic(src, dst, len, 0, NULL, NULL);
+	return csum_partial_copy_generic(src, dst, len);
 }
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
 
-- 
2.11.0


  parent reply	other threads:[~2020-07-21 20:26 UTC|newest]

Thread overview: 77+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-21 20:24 [RFC][CFT][PATCHSET] saner calling conventions for csum-and-copy primitives Al Viro
2020-07-21 20:25 ` [PATCH 01/18] skb_copy_and_csum_bits(): don't bother with the last argument Al Viro
2020-07-21 20:25   ` [PATCH 02/18] icmp_push_reply(): reorder adding the checksum up Al Viro
2020-07-21 20:25   ` [PATCH 03/18] csum_partial_copy_nocheck(): drop the last argument Al Viro
2020-07-21 20:25   ` [PATCH 04/18] csum_and_copy_..._user(): pass 0xffffffff instead of 0 as initial sum Al Viro
2020-07-21 20:55     ` Linus Torvalds
2020-07-21 20:58       ` Linus Torvalds
2020-07-21 21:11         ` Al Viro
2020-07-21 21:16           ` Linus Torvalds
2020-07-25 17:54           ` Al Viro
2020-07-22  9:45       ` David Laight
2020-07-22  9:27     ` David Laight
2020-07-22 14:42       ` Al Viro
2020-07-22 15:22         ` David Laight
2020-07-22 15:54           ` Al Viro
2020-07-22 16:17             ` David Laight
2020-07-22 17:39               ` Al Viro
2020-07-23  8:29                 ` David Laight
2020-07-23 13:54                 ` David Laight
2020-07-23 14:30                   ` David Laight
2020-07-23 14:53                   ` Al Viro
2020-07-23 15:19                     ` David Laight
2020-07-23 15:21                     ` Al Viro
2020-07-23 15:36                       ` David Laight
2020-07-21 20:25   ` [PATCH 05/18] saner calling conventions for csum_and_copy_..._user() Al Viro
2020-07-21 20:25   ` [PATCH 06/18] alpha: propagate the calling convention changes down to csum_partial_copy.c helpers Al Viro
2020-07-21 20:25   ` [PATCH 07/18] arm: propagate the calling convention changes down to csum_partial_copy_from_user() Al Viro
2020-07-21 20:25   ` [PATCH 08/18] m68k: get rid of zeroing destination on error in csum_and_copy_from_user() Al Viro
2020-07-21 20:25   ` [PATCH 09/18] sh: propage the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-07-21 20:25   ` [PATCH 10/18] i386: propagate " Al Viro
2020-07-21 20:25   ` [PATCH 11/18] sparc32: propagate the calling conventions change down to __csum_partial_copy_sparc_generic() Al Viro
2020-07-22  1:20     ` David Miller
2020-07-21 20:25   ` [PATCH 12/18] mips: csum_and_copy_{to,from}_user() are never called under KERNEL_DS Al Viro
2020-07-21 20:25   ` [PATCH 13/18] mips: __csum_partial_copy_kernel() has no users left Al Viro
2020-07-21 20:25   ` [PATCH 14/18] mips: propagate the calling convention change down into __csum_partial_copy_..._user() Al Viro
2020-07-21 20:25   ` [PATCH 15/18] xtensa: propagate the calling conventions change down into csum_partial_copy_generic() Al Viro
2020-07-22  8:56     ` Max Filippov
2020-07-21 20:25   ` [PATCH 16/18] sparc64: propagate the calling convention changes down to __csum_partial_copy_...() Al Viro
2020-07-22  1:21     ` David Miller
2020-07-21 20:25   ` Al Viro [this message]
2020-07-21 20:25   ` [PATCH 18/18] ppc: propagate the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-07-24  1:25 ` [RFC][CFT][PATCHSET v2] saner calling conventions for csum-and-copy primitives Al Viro
2020-07-24  1:25   ` [PATCH v2 01/20] xtensa: fix access check in csum_and_copy_from_user Al Viro
2020-07-24  1:25     ` [PATCH v2 02/20] skb_copy_and_csum_bits(): don't bother with the last argument Al Viro
2020-07-24  1:25     ` [PATCH v2 03/20] icmp_push_reply(): reorder adding the checksum up Al Viro
2020-07-24  1:25     ` [PATCH v2 04/20] unify generic instances of csum_partial_copy_nocheck() Al Viro
2020-07-24  6:41       ` Christoph Hellwig
2020-07-24 12:19         ` Al Viro
2020-07-24 12:23           ` Christoph Hellwig
2020-07-24 12:30             ` Al Viro
2020-07-26  7:11               ` Christoph Hellwig
2020-07-27  3:58                 ` Al Viro
2020-07-24  1:25     ` [PATCH v2 05/20] csum_partial_copy_nocheck(): drop the last argument Al Viro
2020-07-24 12:21       ` kernel test robot
2020-07-24  1:25     ` [PATCH v2 06/20] csum_and_copy_..._user(): pass 0xffffffff instead of 0 as initial sum Al Viro
2020-07-24  1:25     ` [PATCH v2 07/20] saner calling conventions for csum_and_copy_..._user() Al Viro
2020-07-24  1:25     ` [PATCH v2 08/20] alpha: propagate the calling convention changes down to csum_partial_copy.c helpers Al Viro
2020-07-24  1:25     ` [PATCH v2 09/20] arm: propagate the calling convention changes down to csum_partial_copy_from_user() Al Viro
2020-07-24  1:25     ` [PATCH v2 10/20] m68k: get rid of zeroing destination on error in csum_and_copy_from_user() Al Viro
2020-07-24  1:25     ` [PATCH v2 11/20] sh: propage the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-07-24  1:25     ` [PATCH v2 12/20] i386: propagate " Al Viro
2020-07-24  1:25     ` [PATCH v2 13/20] sparc32: propagate the calling conventions change down to __csum_partial_copy_sparc_generic() Al Viro
2020-07-24  1:25     ` [PATCH v2 14/20] mips: csum_and_copy_{to,from}_user() are never called under KERNEL_DS Al Viro
2020-07-24  1:25     ` [PATCH v2 15/20] mips: __csum_partial_copy_kernel() has no users left Al Viro
2020-07-24  1:25     ` [PATCH v2 16/20] mips: propagate the calling convention change down into __csum_partial_copy_..._user() Al Viro
2020-07-24  1:25     ` [PATCH v2 17/20] xtensa: propagate the calling conventions change down into csum_partial_copy_generic() Al Viro
2020-07-24  1:25     ` [PATCH v2 18/20] sparc64: propagate the calling convention changes down to __csum_partial_copy_...() Al Viro
2020-07-24  1:25     ` [PATCH v2 19/20] amd64: switch csum_partial_copy_generic() to new calling conventions Al Viro
2020-07-24  1:25     ` [PATCH v2 20/20] ppc: propagate the calling conventions change down to csum_partial_copy_generic() Al Viro
2020-10-14 22:26       ` Jason A. Donenfeld
2020-10-14 22:51         ` Linus Torvalds
2020-10-14 22:53           ` Linus Torvalds
2020-10-14 22:54             ` Jason A. Donenfeld
2020-10-14 22:53           ` Jason A. Donenfeld
2020-10-14 23:12           ` Al Viro
2020-10-14 23:02         ` [PATCH] powerpc32: don't adjust unmoved stack pointer in csum_partial_copy_generic() epilogue Jason A. Donenfeld
2020-10-14 23:05           ` Linus Torvalds

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200721202549.4150745-17-viro@ZenIV.linux.org.uk \
    --to=viro@zeniv.linux.org.uk \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.