From: Oliver Swede <oli.swede@arm.com>
To: catalin.marinas@arm.com, will@kernel.org
Cc: robin.murphy@arm.com, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH v5 08/14] arm64: Import latest optimization of memcpy
Date: Mon, 14 Sep 2020 15:09:52 +0000
Message-ID: <20200914150958.2200-9-oli.swede@arm.com>
In-Reply-To: <20200914150958.2200-1-oli.swede@arm.com>

From: Sam Tebbs <sam.tebbs@arm.com>

Import the latest memcpy implementation into memcpy and the
copy_{from,to,in}_user routines.
The user copy routines are now provided in two forms, one for when
UAO is enabled and one for when it is not, with the appropriate
variant selected at runtime by code patching.
This avoids executing the many NOPs that the per-access alternatives
would otherwise emit when UAO is disabled.
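
As a rough sketch, the selection added in copy_template_user.S below
amounts to a single patched branch over the UAO variant of the
template (comments are explanatory only):

	alternative_if_not ARM64_HAS_UAO
	b	L(copy_non_uao)		// patched to NOP once UAO is detected
	alternative_else_nop_endif
	// UAO variant of copy_template.S is assembled here; the
	// non-UAO variant follows at L(copy_non_uao).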

The project containing optimized implementations for various library
functions has now been renamed from 'cortex-strings' to
'optimized-routines', and the new upstream source is
string/aarch64/memcpy.S as of commit 4c175c8be12 in
https://github.com/ARM-software/optimized-routines.

Signed-off-by: Sam Tebbs <sam.tebbs@arm.com>
[ rm: add UAO fixups, streamline copy_exit paths, expand commit message ]
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
[ os: import newer memcpy algorithm, update commit message ]
Signed-off-by: Oliver Swede <oli.swede@arm.com>
---
 arch/arm64/include/asm/alternative.h |  36 ---
 arch/arm64/lib/copy_from_user.S      | 113 ++++++--
 arch/arm64/lib/copy_in_user.S        | 129 +++++++--
 arch/arm64/lib/copy_template.S       | 375 +++++++++++++++------------
 arch/arm64/lib/copy_template_user.S  |  24 ++
 arch/arm64/lib/copy_to_user.S        | 112 ++++++--
 arch/arm64/lib/copy_user_fixup.S     |  14 +
 arch/arm64/lib/memcpy.S              |  47 ++--
 8 files changed, 557 insertions(+), 293 deletions(-)
 create mode 100644 arch/arm64/lib/copy_template_user.S
 create mode 100644 arch/arm64/lib/copy_user_fixup.S

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 619db9b4c9d5..581bacacc1bc 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -230,36 +230,6 @@ alternative_endif
  * unprivileged instructions, and USER() only works for single instructions.
  */
 #ifdef CONFIG_ARM64_UAO
-	.macro uao_ldp l, reg1, reg2, addr, post_inc
-		alternative_if_not ARM64_HAS_UAO
-8888:			ldp	\reg1, \reg2, [\addr], \post_inc;
-8889:			nop;
-			nop;
-		alternative_else
-			ldtr	\reg1, [\addr];
-			ldtr	\reg2, [\addr, #8];
-			add	\addr, \addr, \post_inc;
-		alternative_endif
-
-		_asm_extable	8888b,\l;
-		_asm_extable	8889b,\l;
-	.endm
-
-	.macro uao_stp l, reg1, reg2, addr, post_inc
-		alternative_if_not ARM64_HAS_UAO
-8888:			stp	\reg1, \reg2, [\addr], \post_inc;
-8889:			nop;
-			nop;
-		alternative_else
-			sttr	\reg1, [\addr];
-			sttr	\reg2, [\addr, #8];
-			add	\addr, \addr, \post_inc;
-		alternative_endif
-
-		_asm_extable	8888b,\l;
-		_asm_extable	8889b,\l;
-	.endm
-
 	.macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
 		alternative_if_not ARM64_HAS_UAO
 8888:			\inst	\reg, [\addr], \post_inc;
@@ -272,12 +242,6 @@ alternative_endif
 		_asm_extable	8888b,\l;
 	.endm
 #else
-	.macro uao_ldp l, reg1, reg2, addr, post_inc
-		USER(\l, ldp \reg1, \reg2, [\addr], \post_inc)
-	.endm
-	.macro uao_stp l, reg1, reg2, addr, post_inc
-		USER(\l, stp \reg1, \reg2, [\addr], \post_inc)
-	.endm
 	.macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
 		USER(\l, \inst \reg, [\addr], \post_inc)
 	.endm
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 0f8a3a9e3795..86945e84c009 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -19,50 +19,111 @@
  * Returns:
  *	x0 - bytes not copied
  */
+	.macro ldrb1 reg, ptr, offset=0
+	8888: ldtrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro strb1 reg, ptr, offset=0
+	strb \reg, [\ptr, \offset]
+	.endm
+
+	.macro ldrb1_reg reg, ptr, offset
+	add \ptr, \ptr, \offset
+	8888: ldtrb \reg, [\ptr]
+	sub \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro strb1_reg reg, ptr, offset
+	strb \reg, [\ptr, \offset]
+	.endm
 
-	.macro ldrb1 reg, ptr, val
-	uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
+	.macro ldr1 reg, ptr, offset=0
+	8888: ldtr \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro strb1 reg, ptr, val
-	strb \reg, [\ptr], \val
+	.macro str1 reg, ptr, offset=0
+	str \reg, [\ptr, \offset]
 	.endm
 
-	.macro ldrh1 reg, ptr, val
-	uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
+	.macro ldp1 regA, regB, ptr, offset=0
+	8888: ldtr \regA, [\ptr, \offset]
+	8889: ldtr \regB, [\ptr, \offset + 8]
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
 	.endm
 
-	.macro strh1 reg, ptr, val
-	strh \reg, [\ptr], \val
+	.macro stp1 regA, regB, ptr, offset=0
+	stp \regA, \regB, [\ptr, \offset]
 	.endm
 
-	.macro ldr1 reg, ptr, val
-	uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
+	.macro ldp1_pre regA, regB, ptr, offset
+	8888: ldtr \regA, [\ptr, \offset]
+	8889: ldtr \regB, [\ptr, \offset + 8]
+	add \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
 	.endm
 
-	.macro str1 reg, ptr, val
-	str \reg, [\ptr], \val
+	.macro stp1_pre regA, regB, ptr, offset
+	stp \regA, \regB, [\ptr, \offset]!
 	.endm
 
-	.macro ldp1 reg1, reg2, ptr, val
-	uao_ldp 9998f, \reg1, \reg2, \ptr, \val
+	.macro ldrb1_nuao reg, ptr, offset=0
+	8888: ldrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro stp1 reg1, reg2, ptr, val
-	stp \reg1, \reg2, [\ptr], \val
+	.macro strb1_nuao reg, ptr, offset=0
+	strb \reg, [\ptr, \offset]
+	.endm
+
+	.macro ldrb1_nuao_reg reg, ptr, offset=0
+	8888: ldrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro strb1_nuao_reg reg, ptr, offset=0
+	strb \reg, [\ptr, \offset]
+	.endm
+
+	.macro ldr1_nuao reg, ptr, offset=0
+	8888: ldr \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro str1_nuao reg, ptr, offset=0
+	str \reg, [\ptr, \offset]
+	.endm
+
+	.macro ldp1_nuao  regA, regB, ptr, offset=0
+	8888: ldp \regA, \regB, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro stp1_nuao regA, regB, ptr, offset=0
+	stp \regA, \regB, [\ptr, \offset]
+	.endm
+
+	.macro ldp1_pre_nuao regA, regB, ptr, offset
+	8888: ldp \regA, \regB, [\ptr, \offset]!
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro stp1_pre_nuao regA, regB, ptr, offset
+	stp \regA, \regB, [\ptr, \offset]!
+	.endm
+
+	.macro copy_exit
+	b	.Luaccess_finish
 	.endm
 
-end	.req	x5
 SYM_FUNC_START(__arch_copy_from_user)
-	add	end, x0, x2
-#include "copy_template.S"
-	mov	x0, #0				// Nothing to copy
+#include "copy_template_user.S"
+.Luaccess_finish:
+	mov	x0, #0
 	ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
-
-	.section .fixup,"ax"
-	.align	2
-9998:	sub	x0, end, dst			// bytes not copied
-	ret
-	.previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 80e37ada0ee1..77dfccc618b6 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -21,50 +21,129 @@
  * Returns:
  *	x0 - bytes not copied
  */
-	.macro ldrb1 reg, ptr, val
-	uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
+	.macro ldrb1 reg, ptr, offset=0
+	8888: ldtrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro strb1 reg, ptr, val
-	uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
+	.macro strb1 reg, ptr, offset=0
+	8888: sttrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro ldrh1 reg, ptr, val
-	uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
+	.macro ldrb1_reg reg, ptr, offset
+	add \ptr, \ptr, \offset
+	8888: ldtrb \reg, [\ptr]
+	sub \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro strh1 reg, ptr, val
-	uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
+	.macro strb1_reg reg, ptr, offset
+	add \ptr, \ptr, \offset
+	8888: sttrb \reg, [\ptr]
+	sub \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro ldr1 reg, ptr, val
-	uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
+	.macro ldr1 reg, ptr, offset=0
+	8888: ldtr \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro str1 reg, ptr, val
-	uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
+	.macro str1 reg, ptr, offset=0
+	8888: sttr \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro ldp1 reg1, reg2, ptr, val
-	uao_ldp 9998f, \reg1, \reg2, \ptr, \val
+	.macro ldp1 regA, regB, ptr, offset=0
+	8888: ldtr \regA, [\ptr, \offset]
+	8889: ldtr \regB, [\ptr, \offset + 8]
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
 	.endm
 
-	.macro stp1 reg1, reg2, ptr, val
-	uao_stp 9998f, \reg1, \reg2, \ptr, \val
+	.macro stp1 regA, regB, ptr, offset=0
+	8888: sttr \regA, [\ptr, \offset]
+	8889: sttr \regB, [\ptr, \offset + 8]
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
 	.endm
 
-end	.req	x5
+	.macro ldp1_pre regA, regB, ptr, offset
+	8888: ldtr \regA, [\ptr, \offset]
+	8889: ldtr \regB, [\ptr, \offset + 8]
+	add \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
+	.endm
+
+	.macro stp1_pre regA, regB, ptr, offset
+	8888: sttr \regA, [\ptr, \offset]
+	8889: sttr \regB, [\ptr, \offset + 8]
+	add \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
+	.endm
+
+	.macro ldrb1_nuao reg, ptr, offset=0
+	8888: ldrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro strb1_nuao reg, ptr, offset=0
+	8888: strb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro ldrb1_nuao_reg reg, ptr, offset=0
+	8888: ldrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro strb1_nuao_reg reg, ptr, offset=0
+	8888: strb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro ldr1_nuao reg, ptr, offset=0
+	8888: ldr \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro str1_nuao reg, ptr, offset=0
+	8888: str \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro ldp1_nuao  regA, regB, ptr, offset=0
+	8888: ldp \regA, \regB, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro stp1_nuao regA, regB, ptr, offset=0
+	8888: stp \regA, \regB, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro ldp1_pre_nuao regA, regB, ptr, offset
+	8888: ldp \regA, \regB, [\ptr, \offset]!
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro stp1_pre_nuao regA, regB, ptr, offset
+	8888: stp \regA, \regB, [\ptr, \offset]!
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro copy_exit
+	b	.Luaccess_finish
+	.endm
 
 SYM_FUNC_START(__arch_copy_in_user)
-	add	end, x0, x2
-#include "copy_template.S"
+#include "copy_template_user.S"
+.Luaccess_finish:
 	mov	x0, #0
 	ret
 SYM_FUNC_END(__arch_copy_in_user)
 EXPORT_SYMBOL(__arch_copy_in_user)
-
-	.section .fixup,"ax"
-	.align	2
-9998:	sub	x0, end, dst			// bytes not copied
-	ret
-	.previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..90b5f63ff227 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -1,13 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012 Linaro Limited. All rights reserved.
+ * Copyright (c) 2015 ARM Ltd. All rights reserved.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
+ * This code is based on work originally authored by Linaro,
+ * found at:
  *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * https://github.com/ARM-software/optimized-routines
  */
 
 
@@ -21,161 +20,209 @@
  * Returns:
  *	x0 - dest
  */
-dstin	.req	x0
-src	.req	x1
-count	.req	x2
-tmp1	.req	x3
-tmp1w	.req	w3
-tmp2	.req	x4
-tmp2w	.req	w4
-dst	.req	x6
-
-A_l	.req	x7
-A_h	.req	x8
-B_l	.req	x9
-B_h	.req	x10
-C_l	.req	x11
-C_h	.req	x12
-D_l	.req	x13
-D_h	.req	x14
-
-	mov	dst, dstin
-	cmp	count, #16
-	/*When memory length is less than 16, the accessed are not aligned.*/
-	b.lo	.Ltiny15
-
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
-	b.eq	.LSrcAligned
-	sub	count, count, tmp2
-	/*
-	* Copy the leading memory data from src to dst in an increasing
-	* address order.By this way,the risk of overwriting the source
-	* memory data is eliminated when the distance between src and
-	* dst is less than 16. The memory accesses here are alignment.
-	*/
-	tbz	tmp2, #0, 1f
-	ldrb1	tmp1w, src, #1
-	strb1	tmp1w, dst, #1
-1:
-	tbz	tmp2, #1, 2f
-	ldrh1	tmp1w, src, #2
-	strh1	tmp1w, dst, #2
-2:
-	tbz	tmp2, #2, 3f
-	ldr1	tmp1w, src, #4
-	str1	tmp1w, dst, #4
-3:
-	tbz	tmp2, #3, .LSrcAligned
-	ldr1	tmp1, src, #8
-	str1	tmp1, dst, #8
-
-.LSrcAligned:
-	cmp	count, #64
-	b.ge	.Lcpy_over64
-	/*
-	* Deal with small copies quickly by dropping straight into the
-	* exit block.
-	*/
-.Ltail63:
-	/*
-	* Copy up to 48 bytes of data. At this point we only need the
-	* bottom 6 bits of count to be accurate.
-	*/
-	ands	tmp1, count, #0x30
-	b.eq	.Ltiny15
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
-1:
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
-2:
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
-.Ltiny15:
-	/*
-	* Prefer to break one ldp/stp into several load/store to access
-	* memory in an increasing address order,rather than to load/store 16
-	* bytes from (src-16) to (dst-16) and to backward the src to aligned
-	* address,which way is used in original cortex memcpy. If keeping
-	* the original memcpy process here, memmove need to satisfy the
-	* precondition that src address is at least 16 bytes bigger than dst
-	* address,otherwise some source data will be overwritten when memove
-	* call memcpy directly. To make memmove simpler and decouple the
-	* memcpy's dependency on memmove, withdrew the original process.
-	*/
-	tbz	count, #3, 1f
-	ldr1	tmp1, src, #8
-	str1	tmp1, dst, #8
-1:
-	tbz	count, #2, 2f
-	ldr1	tmp1w, src, #4
-	str1	tmp1w, dst, #4
-2:
-	tbz	count, #1, 3f
-	ldrh1	tmp1w, src, #2
-	strh1	tmp1w, dst, #2
-3:
-	tbz	count, #0, .Lexitfunc
-	ldrb1	tmp1w, src, #1
-	strb1	tmp1w, dst, #1
-
-	b	.Lexitfunc
-
-.Lcpy_over64:
-	subs	count, count, #128
-	b.ge	.Lcpy_body_large
-	/*
-	* Less than 128 bytes to copy, so handle 64 here and then jump
-	* to the tail.
-	*/
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	D_l, D_h, src, #16
-	stp1	D_l, D_h, dst, #16
-
-	tst	count, #0x3f
-	b.ne	.Ltail63
-	b	.Lexitfunc
-
-	/*
-	* Critical loop.  Start at a new cache line boundary.  Assuming
-	* 64 bytes per line this ensures the entire loop is in one line.
-	*/
-	.p2align	L1_CACHE_SHIFT
-.Lcpy_body_large:
-	/* pre-get 64 bytes data. */
-	ldp1	A_l, A_h, src, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	ldp1	D_l, D_h, src, #16
-1:
-	/*
-	* interlace the load of next 64 bytes data block with store of the last
-	* loaded 64 bytes data.
-	*/
-	stp1	A_l, A_h, dst, #16
-	ldp1	A_l, A_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	D_l, D_h, dst, #16
-	ldp1	D_l, D_h, src, #16
-	subs	count, count, #64
-	b.ge	1b
-	stp1	A_l, A_h, dst, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	stp1	D_l, D_h, dst, #16
-
-	tst	count, #0x3f
-	b.ne	.Ltail63
-.Lexitfunc:
+ #define dstin	x0
+ #define src	x1
+ #define count	x2
+ #define dst	x3
+ #define srcend	x4
+ #define dstend	x5
+ #define A_l	x6
+ #define A_lw	w6
+ #define A_h	x7
+ #define B_l	x8
+ #define B_lw	w8
+ #define B_h	x9
+ #define C_l	x10
+ #define C_lw	w10
+ #define C_h	x11
+ #define D_l	x12
+ #define D_h	x13
+ #define E_l	x14
+ #define E_h	x15
+ #define F_l	x16
+ #define F_h	x17
+ #define G_l	count
+ #define G_h	dst
+ #define H_l	src
+ #define H_h	srcend
+ #define tmp1	x14
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes. */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldp1	A_l, A_h, src
+	ldp1	D_l, D_h, srcend, -16
+	stp1	A_l, A_h, dstin
+	stp1	D_l, D_h, dstend, -16
+	copy_exit
+
+	/* Copy 8-15 bytes. */
+L(copy16):
+	tbz	count, 3, L(copy8)
+	ldr1	A_l, src
+	ldr1	A_h, srcend, -8
+	str1	A_l, dstin
+	str1	A_h, dstend, -8
+	copy_exit
+
+	.p2align 3
+	/* Copy 4-7 bytes. */
+L(copy8):
+	tbz	count, 2, L(copy4)
+	ldr1	A_lw, src
+	ldr1	B_lw, srcend, -4
+	str1	A_lw, dstin
+	str1	B_lw, dstend, -4
+	copy_exit
+
+	/* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb1	A_lw, src
+	ldrb1	C_lw, srcend, -1
+	ldrb1_reg	B_lw, src, tmp1
+	strb1	A_lw, dstin
+	strb1_reg	B_lw, dstin, tmp1
+	strb1	C_lw, dstend, -1
+L(copy0):
+	copy_exit
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes. */
+L(copy32_128):
+	ldp1	A_l, A_h, src
+	ldp1	B_l, B_h, src, 16
+	ldp1	C_l, C_h, srcend, -32
+	ldp1	D_l, D_h, srcend, -16
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp1	A_l, A_h, dstin
+	stp1	B_l, B_h, dstin, 16
+	stp1	C_l, C_h, dstend, -32
+	stp1	D_l, D_h, dstend, -16
+	copy_exit
+
+	.p2align 4
+	/* Copy 65..128 bytes. */
+L(copy128):
+	ldp1	E_l, E_h, src, 32
+	ldp1	F_l, F_h, src, 48
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp1	G_l, G_h, srcend, -64
+	ldp1	H_l, H_h, srcend, -48
+	stp1	G_l, G_h, dstend, -64
+	stp1	H_l, H_h, dstend, -48
+L(copy96):
+	stp1	A_l, A_h, dstin
+	stp1	B_l, B_h, dstin, 16
+	stp1	E_l, E_h, dstin, 32
+	stp1	F_l, F_h, dstin, 48
+	stp1	C_l, C_h, dstend, -32
+	stp1	D_l, D_h, dstend, -16
+	copy_exit
+
+	.p2align 4
+	/* Copy more than 128 bytes. */
+L(copy_long):
+	/* Use backwards copy if there is an overlap. */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(copy0)
+	cmp	tmp1, count
+	b.lo	L(copy_long_backwards)
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+	ldp1	D_l, D_h, src
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large. */
+	ldp1	A_l, A_h, src, 16
+	stp1	D_l, D_h, dstin
+	ldp1	B_l, B_h, src, 32
+	ldp1	C_l, C_h, src, 48
+	ldp1_pre	D_l, D_h, src, 64
+	subs	count, count, 128 + 16 /* Test and readjust count. */
+	b.ls	L(copy64_from_end)
+
+L(loop64):
+	stp1	A_l, A_h, dst, 16
+	ldp1	A_l, A_h, src, 16
+	stp1	B_l, B_h, dst, 32
+	ldp1	B_l, B_h, src, 32
+	stp1	C_l, C_h, dst, 48
+	ldp1	C_l, C_h, src, 48
+	stp1_pre	D_l, D_h, dst, 64
+	ldp1_pre	D_l, D_h, src, 64
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+	ldp1	E_l, E_h, srcend, -64
+	stp1	A_l, A_h, dst, 16
+	ldp1	A_l, A_h, srcend, -48
+	stp1	B_l, B_h, dst, 32
+	ldp1	B_l, B_h, srcend, -32
+	stp1	C_l, C_h, dst, 48
+	ldp1	C_l, C_h, srcend, -16
+	stp1	D_l, D_h, dst, 64
+	stp1	E_l, E_h, dstend, -64
+	stp1	A_l, A_h, dstend, -48
+	stp1	B_l, B_h, dstend, -32
+	stp1	C_l, C_h, dstend, -16
+	copy_exit
+
+	.p2align 4
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+	ldp1	D_l, D_h, srcend, -16
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp1	A_l, A_h, srcend, -16
+	stp1	D_l, D_h, dstend, -16
+	ldp1	B_l, B_h, srcend, -32
+	ldp1	C_l, C_h, srcend, -48
+	ldp1_pre	D_l, D_h, srcend, -64
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	stp1	A_l, A_h, dstend, -16
+	ldp1	A_l, A_h, srcend, -16
+	stp1	B_l, B_h, dstend, -32
+	ldp1	B_l, B_h, srcend, -32
+	stp1	C_l, C_h, dstend, -48
+	ldp1	C_l, C_h, srcend, -48
+	stp1_pre	D_l, D_h, dstend, -64
+	ldp1_pre	D_l, D_h, srcend, -64
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+	ldp1	G_l, G_h, src, 48
+	stp1	A_l, A_h, dstend, -16
+	ldp1	A_l, A_h, src, 32
+	stp1	B_l, B_h, dstend, -32
+	ldp1	B_l, B_h, src, 16
+	stp1	C_l, C_h, dstend, -48
+	ldp1	C_l, C_h, src
+	stp1	D_l, D_h, dstend, -64
+	stp1	G_l, G_h, dstin, 48
+	stp1	A_l, A_h, dstin, 32
+	stp1	B_l, B_h, dstin, 16
+	stp1	C_l, C_h, dstin
+	copy_exit
diff --git a/arch/arm64/lib/copy_template_user.S b/arch/arm64/lib/copy_template_user.S
new file mode 100644
index 000000000000..3db24dcdab05
--- /dev/null
+++ b/arch/arm64/lib/copy_template_user.S
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#define L(l) .L ## l
+
+	alternative_if_not ARM64_HAS_UAO
+	b	L(copy_non_uao)
+	alternative_else_nop_endif
+#include "copy_template.S"
+
+#define ldp1 ldp1_nuao
+#define ldp1_pre ldp1_pre_nuao
+#define stp1 stp1_nuao
+#define stp1_pre stp1_pre_nuao
+#define ldr1 ldr1_nuao
+#define str1 str1_nuao
+#define ldrb1 ldrb1_nuao
+#define strb1 strb1_nuao
+#define ldrb1_reg ldrb1_nuao_reg
+#define strb1_reg strb1_nuao_reg
+
+L(copy_non_uao):
+#undef L
+#define L(l) .Lnuao ## l
+#include "copy_template.S"
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 4ec59704b8f2..6b4742cac083 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,49 +19,111 @@
  * Returns:
  *	x0 - bytes not copied
  */
-	.macro ldrb1 reg, ptr, val
-	ldrb  \reg, [\ptr], \val
+	.macro ldrb1 reg, ptr, offset=0
+	ldrb \reg, [\ptr, \offset]
 	.endm
 
-	.macro strb1 reg, ptr, val
-	uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
+	.macro strb1 reg, ptr, offset=0
+	8888: sttrb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro ldrh1 reg, ptr, val
-	ldrh  \reg, [\ptr], \val
+	.macro ldrb1_reg reg, ptr, offset
+	ldrb \reg, [\ptr, \offset]
 	.endm
 
-	.macro strh1 reg, ptr, val
-	uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
+	.macro strb1_reg reg, ptr, offset
+	add \ptr, \ptr, \offset
+	8888: sttrb \reg, [\ptr]
+	sub \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro ldr1 reg, ptr, val
-	ldr \reg, [\ptr], \val
+	.macro ldr1 reg, ptr, offset=0
+	ldr \reg, [\ptr, \offset]
 	.endm
 
-	.macro str1 reg, ptr, val
-	uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
+	.macro str1 reg, ptr, offset=0
+	8888: sttr \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
 	.endm
 
-	.macro ldp1 reg1, reg2, ptr, val
-	ldp \reg1, \reg2, [\ptr], \val
+	.macro ldp1 regA, regB, ptr, offset=0
+	ldp \regA, \regB, [\ptr, \offset]
 	.endm
 
-	.macro stp1 reg1, reg2, ptr, val
-	uao_stp 9998f, \reg1, \reg2, \ptr, \val
+	.macro stp1 regA, regB, ptr, offset=0
+	8888: sttr \regA, [\ptr, \offset]
+	8889: sttr \regB, [\ptr, \offset + 8]
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
+	.endm
+
+	.macro ldp1_pre regA, regB, ptr, offset
+	ldp \regA, \regB, [\ptr, \offset]!
+	.endm
+
+	.macro stp1_pre regA, regB, ptr, offset
+	8888: sttr \regA, [\ptr, \offset]
+	8889: sttr \regB, [\ptr, \offset + 8]
+	add \ptr, \ptr, \offset
+	_asm_extable_faultaddr	8888b,9998f;
+	_asm_extable_faultaddr	8889b,9998f;
+	.endm
+
+	.macro ldrb1_nuao reg, ptr, offset=0
+	ldrb \reg, [\ptr, \offset]
+	.endm
+
+	.macro strb1_nuao reg, ptr, offset=0
+	8888: strb \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro ldrb1_nuao_reg reg, ptr, offset=0
+	ldrb \reg, [\ptr, \offset]
+	.endm
+
+	.macro strb1_nuao_reg reg, ptr, offset=0
+	strb \reg, [\ptr, \offset]
+	.endm
+
+	.macro ldr1_nuao reg, ptr, offset=0
+	ldr \reg, [\ptr, \offset]
+	.endm
+
+	.macro str1_nuao reg, ptr, offset=0
+	8888: str \reg, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro ldp1_nuao  regA, regB, ptr, offset=0
+	ldp \regA, \regB, [\ptr, \offset]
+	.endm
+
+	.macro ldp1_pre_nuao regA, regB, ptr, offset
+	ldp \regA, \regB, [\ptr, \offset]!
+	.endm
+
+	.macro stp1_nuao regA, regB, ptr, offset=0
+	8888: stp \regA, \regB, [\ptr, \offset]
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro stp1_pre_nuao regA, regB, ptr, offset
+	8888: stp \regA, \regB, [\ptr, \offset]!
+	_asm_extable_faultaddr	8888b,9998f;
+	.endm
+
+	.macro copy_exit
+	b	.Luaccess_finish
 	.endm
 
-end	.req	x5
 SYM_FUNC_START(__arch_copy_to_user)
-	add	end, x0, x2
-#include "copy_template.S"
+#include "copy_template_user.S"
+.Luaccess_finish:
 	mov	x0, #0
 	ret
 SYM_FUNC_END(__arch_copy_to_user)
 EXPORT_SYMBOL(__arch_copy_to_user)
-
-	.section .fixup,"ax"
-	.align	2
-9998:	sub	x0, end, dst			// bytes not copied
-	ret
-	.previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_user_fixup.S b/arch/arm64/lib/copy_user_fixup.S
new file mode 100644
index 000000000000..32fae9e2e799
--- /dev/null
+++ b/arch/arm64/lib/copy_user_fixup.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+addr	.req	x15
+.section .fixup,"ax"
+.align	2
+9998:
+	// If it falls in the src range then it was a load that failed,
+	// otherwise it was a store
+	cmp addr, src
+	ccmp addr, srcend, #0x0, ge
+	csel x0, srcend, dstend, lt
+	sub x0, x0, addr
+	ret
+
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index e0bf83d556f2..c24925aef236 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -24,43 +24,56 @@
  * Returns:
  *	x0 - dest
  */
-	.macro ldrb1 reg, ptr, val
-	ldrb  \reg, [\ptr], \val
+ #define L(l) .L ## l
+
+	.macro ldrb1 reg, ptr, offset=0
+	ldrb \reg, [\ptr, \offset]
+	.endm
+
+	.macro strb1 reg, ptr, offset=0
+	strb \reg, [\ptr, \offset]
+	.endm
+
+	.macro ldr1 reg, ptr, offset=0
+	ldr \reg, [\ptr, \offset]
 	.endm
 
-	.macro strb1 reg, ptr, val
-	strb \reg, [\ptr], \val
+	.macro str1 reg, ptr, offset=0
+	str \reg, [\ptr, \offset]
 	.endm
 
-	.macro ldrh1 reg, ptr, val
-	ldrh  \reg, [\ptr], \val
+	.macro ldp1 regA, regB, ptr, offset=0
+	ldp \regA, \regB, [\ptr, \offset]
 	.endm
 
-	.macro strh1 reg, ptr, val
-	strh \reg, [\ptr], \val
+	.macro stp1 regA, regB, ptr, offset=0
+	stp \regA, \regB, [\ptr, \offset]
 	.endm
 
-	.macro ldr1 reg, ptr, val
-	ldr \reg, [\ptr], \val
+	.macro ldrb1_reg reg, ptr, offset
+	ldrb1 \reg, \ptr, \offset
 	.endm
 
-	.macro str1 reg, ptr, val
-	str \reg, [\ptr], \val
+	.macro strb1_reg reg, ptr, offset
+	strb1 \reg, \ptr, \offset
 	.endm
 
-	.macro ldp1 reg1, reg2, ptr, val
-	ldp \reg1, \reg2, [\ptr], \val
+	.macro ldp1_pre regA, regB, ptr, offset
+	ldp \regA, \regB, [\ptr, \offset]!
 	.endm
 
-	.macro stp1 reg1, reg2, ptr, val
-	stp \reg1, \reg2, [\ptr], \val
+	.macro stp1_pre regA, regB, ptr, offset
+	stp \regA, \regB, [\ptr, \offset]!
+	.endm
+
+	.macro copy_exit
+	ret
 	.endm
 
 	.weak memcpy
 SYM_FUNC_START_ALIAS(__memcpy)
 SYM_FUNC_START_PI(memcpy)
 #include "copy_template.S"
-	ret
 SYM_FUNC_END_PI(memcpy)
 EXPORT_SYMBOL(memcpy)
 SYM_FUNC_END_ALIAS(__memcpy)
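
A note on the new fixup: copy_user_fixup.S above derives the return
value (bytes not copied) from the fault address by checking whether
the fault lies within the source buffer (a failed load) or not (a
failed store). In rough C terms, assuming addr, src, srcend and
dstend hold the values of the correspondingly named registers:

	/* Illustrative sketch only; mirrors the cmp/ccmp/csel sequence. */
	unsigned long bytes_not_copied(unsigned long addr, unsigned long src,
				       unsigned long srcend, unsigned long dstend)
	{
		/*
		 * A fault in [src, srcend) means a load failed, so the rest
		 * of the source range is uncopied; otherwise a store failed
		 * and the rest of the destination range is uncopied.
		 */
		unsigned long end = (addr >= src && addr < srcend) ? srcend : dstend;

		return end - addr;
	}
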
-- 
2.17.1


Thread overview: 20+ messages
2020-09-14 15:09 [PATCH v5 00/14] Optimise and update memcpy, user copy and string routines Oliver Swede
2020-09-14 15:09 ` [PATCH v5 01/14] arm64: Allow passing fault address to fixup handlers Oliver Swede
2020-09-14 15:09 ` [PATCH v5 02/14] arm64: kprobes: Drop open-coded exception fixup Oliver Swede
2020-09-14 15:09 ` [PATCH v5 03/14] arm64: Import latest version of Cortex Strings' memcmp Oliver Swede
2020-09-14 15:09 ` [PATCH v5 04/14] arm64: Import latest version of Cortex Strings' memmove Oliver Swede
2020-09-14 15:09 ` [PATCH v5 05/14] arm64: Import latest version of Cortex Strings' strcmp Oliver Swede
2020-09-14 15:09 ` [PATCH v5 06/14] arm64: Import latest version of Cortex Strings' strlen Oliver Swede
2020-09-14 15:09 ` [PATCH v5 07/14] arm64: Import latest version of Cortex Strings' strncmp Oliver Swede
2020-09-14 15:09 ` Oliver Swede [this message]
2021-06-01 10:03   ` [PATCH v5 08/14] arm64: Import latest optimization of memcpy Sunil Kovvuri
2021-06-01 12:06     ` Robin Murphy
2021-06-01 12:31       ` Sunil Kovvuri
2021-06-03  8:45       ` David Laight
2020-09-14 15:09 ` [PATCH v5 09/14] arm64: Tidy up _asm_extable_faultaddr usage Oliver Swede
2020-09-14 15:09 ` [PATCH v5 10/14] arm64: usercopy: Store the arguments on stack Oliver Swede
2020-09-14 15:09 ` [PATCH v5 11/14] arm64: usercopy: Check for overlapping buffers in fixup Oliver Swede
2020-09-14 15:09 ` [PATCH v5 12/14] arm64: usercopy: Add intermediate fixup routine Oliver Swede
2020-09-14 15:09 ` [PATCH v5 13/14] arm64: usercopy: Add conclusive " Oliver Swede
2020-09-14 15:09 ` [PATCH v5 14/14] arm64: usercopy: Reduce overhead in fixup Oliver Swede
2020-09-14 15:17 [PATCH v5 00/14] Optimise and update memcpy, user copy and string routines Oliver Swede
2020-09-14 15:17 ` [PATCH v5 08/14] arm64: Import latest optimization of memcpy Oliver Swede
