From: Oliver Swede <oli.swede@arm.com>
To: catalin.marinas@arm.com, will@kernel.org
Cc: robin.murphy@arm.com, linux-arm-kernel@lists.infradead.org,
linux-kernel@vger.kernel.org
Subject: [PATCH v5 08/14] arm64: Import latest optimization of memcpy
Date: Mon, 14 Sep 2020 15:09:52 +0000 [thread overview]
Message-ID: <20200914150958.2200-9-oli.swede@arm.com> (raw)
In-Reply-To: <20200914150958.2200-1-oli.swede@arm.com>
From: Sam Tebbs <sam.tebbs@arm.com>
Import the latest memcpy implementation into memcpy,
copy_{from, to and in}_user.
The implementation of the user routines is separated into two forms:
one for when UAO is enabled and one for when UAO is disabled; the
form that is executed is selected by a runtime patch.
This avoids executing the many NOPs emitted when UAO is disabled.
The project containing optimized implementations for various library
functions has now been renamed from 'cortex-strings' to
'optimized-routines', and the new upstream source is
string/aarch64/memcpy.S as of commit 4c175c8be12 in
https://github.com/ARM-software/optimized-routines.
Signed-off-by: Sam Tebbs <sam.tebbs@arm.com>
[ rm: add UAO fixups, streamline copy_exit paths, expand commit message ]
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
[ os: import newer memcpy algorithm, update commit message ]
Signed-off-by: Oliver Swede <oli.swede@arm.com>
---
arch/arm64/include/asm/alternative.h | 36 ---
arch/arm64/lib/copy_from_user.S | 113 ++++++--
arch/arm64/lib/copy_in_user.S | 129 +++++++--
arch/arm64/lib/copy_template.S | 375 +++++++++++++++------------
arch/arm64/lib/copy_template_user.S | 24 ++
arch/arm64/lib/copy_to_user.S | 112 ++++++--
arch/arm64/lib/copy_user_fixup.S | 14 +
arch/arm64/lib/memcpy.S | 47 ++--
8 files changed, 557 insertions(+), 293 deletions(-)
create mode 100644 arch/arm64/lib/copy_template_user.S
create mode 100644 arch/arm64/lib/copy_user_fixup.S
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 619db9b4c9d5..581bacacc1bc 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -230,36 +230,6 @@ alternative_endif
* unprivileged instructions, and USER() only works for single instructions.
*/
#ifdef CONFIG_ARM64_UAO
- .macro uao_ldp l, reg1, reg2, addr, post_inc
- alternative_if_not ARM64_HAS_UAO
-8888: ldp \reg1, \reg2, [\addr], \post_inc;
-8889: nop;
- nop;
- alternative_else
- ldtr \reg1, [\addr];
- ldtr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
- alternative_endif
-
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
- .endm
-
- .macro uao_stp l, reg1, reg2, addr, post_inc
- alternative_if_not ARM64_HAS_UAO
-8888: stp \reg1, \reg2, [\addr], \post_inc;
-8889: nop;
- nop;
- alternative_else
- sttr \reg1, [\addr];
- sttr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
- alternative_endif
-
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
- .endm
-
.macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
alternative_if_not ARM64_HAS_UAO
8888: \inst \reg, [\addr], \post_inc;
@@ -272,12 +242,6 @@ alternative_endif
_asm_extable 8888b,\l;
.endm
#else
- .macro uao_ldp l, reg1, reg2, addr, post_inc
- USER(\l, ldp \reg1, \reg2, [\addr], \post_inc)
- .endm
- .macro uao_stp l, reg1, reg2, addr, post_inc
- USER(\l, stp \reg1, \reg2, [\addr], \post_inc)
- .endm
.macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
USER(\l, \inst \reg, [\addr], \post_inc)
.endm
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 0f8a3a9e3795..86945e84c009 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -19,50 +19,112 @@
* Returns:
* x0 - bytes not copied
*/
+ .macro ldrb1 reg, ptr, offset=0
+ 8888: ldtrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1 reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldrb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: ldtrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_reg reg, ptr, offset
+ strb \reg, [\ptr, \offset]
+ .endm
- .macro ldrb1 reg, ptr, val
- uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
+ .macro ldr1 reg, ptr, offset=0
+ 8888: ldtr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
+ .macro str1 reg, ptr, offset=0
+ str \reg, [\ptr, \offset]
.endm
- .macro ldrh1 reg, ptr, val
- uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm
- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
+ .macro stp1 regA, regB, ptr, offset=0
+ stp \regA, \regB, [\ptr, \offset]
.endm
- .macro ldr1 reg, ptr, val
- uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
+ .macro ldp1_pre regA, regB, ptr, offset
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm
- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
+ .macro stp1_pre regA, regB, ptr, offset
+ stp \regA, \regB, [\ptr, \offset]!
.endm
- .macro ldp1 reg1, reg2, ptr, val
- uao_ldp 9998f, \reg1, \reg2, \ptr, \val
+ .macro ldrb1_nuao reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
+ .macro strb1_nuao reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldrb1_nuao_reg reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_nuao_reg reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldr1_nuao reg, ptr, offset=0
+ 8888: ldr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro str1_nuao reg, ptr, offset=0
+ str \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldp1_nuao regA, regB, ptr, offset=0
+ 8888: ldp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_nuao regA, regB, ptr, offset=0
+ stp \regA, \regB, [\ptr, \offset]
+ .endm
+
+ .macro ldp1_pre_nuao regA, regB, ptr, offset
+ 8888: ldp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_pre_nuao regA, regB, ptr, offset
+ stp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro copy_exit
+ b .Luaccess_finish
.endm
-end .req x5
SYM_FUNC_START(__arch_copy_from_user)
- add end, x0, x2
-#include "copy_template.S"
- mov x0, #0 // Nothing to copy
+#include "copy_template_user.S"
+.Luaccess_finish:
+ mov x0, #0
ret
SYM_FUNC_END(__arch_copy_from_user)
EXPORT_SYMBOL(__arch_copy_from_user)
-
- .section .fixup,"ax"
- .align 2
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 80e37ada0ee1..77dfccc618b6 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -21,50 +21,129 @@
* Returns:
* x0 - bytes not copied
*/
- .macro ldrb1 reg, ptr, val
- uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
+ .macro ldrb1 reg, ptr, offset=0
+ 8888: ldtrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro strb1 reg, ptr, val
- uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
+ .macro strb1 reg, ptr, offset=0
+ 8888: sttrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro ldrh1 reg, ptr, val
- uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
+ .macro ldrb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: ldtrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro strh1 reg, ptr, val
- uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
+ .macro strb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: sttrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro ldr1 reg, ptr, val
- uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
+ .macro ldr1 reg, ptr, offset=0
+ 8888: ldtr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro str1 reg, ptr, val
- uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
+ .macro str1 reg, ptr, offset=0
+ 8888: sttr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro ldp1 reg1, reg2, ptr, val
- uao_ldp 9998f, \reg1, \reg2, \ptr, \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm
- .macro stp1 reg1, reg2, ptr, val
- uao_stp 9998f, \reg1, \reg2, \ptr, \val
+ .macro stp1 regA, regB, ptr, offset=0
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm
-end .req x5
+ .macro ldp1_pre regA, regB, ptr, offset
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro stp1_pre regA, regB, ptr, offset
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro ldrb1_nuao reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_nuao reg, ptr, offset=0
+ 8888: strb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldrb1_nuao_reg reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_nuao_reg reg, ptr, offset=0
+ 8888: strb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldr1_nuao reg, ptr, offset=0
+ 8888: ldr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro str1_nuao reg, ptr, offset=0
+ 8888: str \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldp1_nuao regA, regB, ptr, offset=0
+ 8888: ldp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_nuao regA, regB, ptr, offset=0
+ 8888: stp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldp1_pre_nuao regA, regB, ptr, offset
+ 8888: ldp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_pre_nuao regA, regB, ptr, offset
+ 8888: stp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro copy_exit
+ b .Luaccess_finish
+ .endm
SYM_FUNC_START(__arch_copy_in_user)
- add end, x0, x2
-#include "copy_template.S"
+#include "copy_template_user.S"
+.Luaccess_finish:
mov x0, #0
ret
SYM_FUNC_END(__arch_copy_in_user)
EXPORT_SYMBOL(__arch_copy_in_user)
-
- .section .fixup,"ax"
- .align 2
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..90b5f63ff227 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -1,13 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012 Linaro Limited. All rights reserved.
+ * Copyright (c) 2015 ARM Ltd. All rights reserved.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
+ * This code is based on work originally authored by Linaro,
+ * found at:
*
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * https://github.com/ARM-software/optimized-routines
*/
@@ -21,161 +20,209 @@
* Returns:
* x0 - dest
*/
-dstin .req x0
-src .req x1
-count .req x2
-tmp1 .req x3
-tmp1w .req w3
-tmp2 .req x4
-tmp2w .req w4
-dst .req x6
-
-A_l .req x7
-A_h .req x8
-B_l .req x9
-B_h .req x10
-C_l .req x11
-C_h .req x12
-D_l .req x13
-D_h .req x14
-
- mov dst, dstin
- cmp count, #16
- /*When memory length is less than 16, the accessed are not aligned.*/
- b.lo .Ltiny15
-
- neg tmp2, src
- ands tmp2, tmp2, #15/* Bytes to reach alignment. */
- b.eq .LSrcAligned
- sub count, count, tmp2
- /*
- * Copy the leading memory data from src to dst in an increasing
- * address order.By this way,the risk of overwriting the source
- * memory data is eliminated when the distance between src and
- * dst is less than 16. The memory accesses here are alignment.
- */
- tbz tmp2, #0, 1f
- ldrb1 tmp1w, src, #1
- strb1 tmp1w, dst, #1
-1:
- tbz tmp2, #1, 2f
- ldrh1 tmp1w, src, #2
- strh1 tmp1w, dst, #2
-2:
- tbz tmp2, #2, 3f
- ldr1 tmp1w, src, #4
- str1 tmp1w, dst, #4
-3:
- tbz tmp2, #3, .LSrcAligned
- ldr1 tmp1, src, #8
- str1 tmp1, dst, #8
-
-.LSrcAligned:
- cmp count, #64
- b.ge .Lcpy_over64
- /*
- * Deal with small copies quickly by dropping straight into the
- * exit block.
- */
-.Ltail63:
- /*
- * Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate.
- */
- ands tmp1, count, #0x30
- b.eq .Ltiny15
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-1:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-2:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-.Ltiny15:
- /*
- * Prefer to break one ldp/stp into several load/store to access
- * memory in an increasing address order,rather than to load/store 16
- * bytes from (src-16) to (dst-16) and to backward the src to aligned
- * address,which way is used in original cortex memcpy. If keeping
- * the original memcpy process here, memmove need to satisfy the
- * precondition that src address is at least 16 bytes bigger than dst
- * address,otherwise some source data will be overwritten when memove
- * call memcpy directly. To make memmove simpler and decouple the
- * memcpy's dependency on memmove, withdrew the original process.
- */
- tbz count, #3, 1f
- ldr1 tmp1, src, #8
- str1 tmp1, dst, #8
-1:
- tbz count, #2, 2f
- ldr1 tmp1w, src, #4
- str1 tmp1w, dst, #4
-2:
- tbz count, #1, 3f
- ldrh1 tmp1w, src, #2
- strh1 tmp1w, dst, #2
-3:
- tbz count, #0, .Lexitfunc
- ldrb1 tmp1w, src, #1
- strb1 tmp1w, dst, #1
-
- b .Lexitfunc
-
-.Lcpy_over64:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /*
- * Less than 128 bytes to copy, so handle 64 here and then jump
- * to the tail.
- */
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- ldp1 D_l, D_h, src, #16
- stp1 D_l, D_h, dst, #16
-
- tst count, #0x3f
- b.ne .Ltail63
- b .Lexitfunc
-
- /*
- * Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line.
- */
- .p2align L1_CACHE_SHIFT
-.Lcpy_body_large:
- /* pre-get 64 bytes data. */
- ldp1 A_l, A_h, src, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- ldp1 D_l, D_h, src, #16
-1:
- /*
- * interlace the load of next 64 bytes data block with store of the last
- * loaded 64 bytes data.
- */
- stp1 A_l, A_h, dst, #16
- ldp1 A_l, A_h, src, #16
- stp1 B_l, B_h, dst, #16
- ldp1 B_l, B_h, src, #16
- stp1 C_l, C_h, dst, #16
- ldp1 C_l, C_h, src, #16
- stp1 D_l, D_h, dst, #16
- ldp1 D_l, D_h, src, #16
- subs count, count, #64
- b.ge 1b
- stp1 A_l, A_h, dst, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- stp1 D_l, D_h, dst, #16
-
- tst count, #0x3f
- b.ne .Ltail63
-.Lexitfunc:
+ #define dstin x0
+ #define src x1
+ #define count x2
+ #define dst x3
+ #define srcend x4
+ #define dstend x5
+ #define A_l x6
+ #define A_lw w6
+ #define A_h x7
+ #define B_l x8
+ #define B_lw w8
+ #define B_h x9
+ #define C_l x10
+ #define C_lw w10
+ #define C_h x11
+ #define D_l x12
+ #define D_h x13
+ #define E_l x14
+ #define E_h x15
+ #define F_l x16
+ #define F_h x17
+ #define G_l count
+ #define G_h dst
+ #define H_l src
+ #define H_h srcend
+ #define tmp1 x14
+
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp1 A_l, A_h, src
+ ldp1 D_l, D_h, srcend, -16
+ stp1 A_l, A_h, dstin
+ stp1 D_l, D_h, dstend, -16
+ copy_exit
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr1 A_l, src
+ ldr1 A_h, srcend, -8
+ str1 A_l, dstin
+ str1 A_h, dstend, -8
+ copy_exit
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr1 A_lw, src
+ ldr1 B_lw, srcend, -4
+ str1 A_lw, dstin
+ str1 B_lw, dstend, -4
+ copy_exit
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb1 A_lw, src
+ ldrb1 C_lw, srcend, -1
+ ldrb1_reg B_lw, src, tmp1
+ strb1 A_lw, dstin
+ strb1_reg B_lw, dstin, tmp1
+ strb1 C_lw, dstend, -1
+L(copy0):
+ copy_exit
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp1 A_l, A_h, src
+ ldp1 B_l, B_h, src, 16
+ ldp1 C_l, C_h, srcend, -32
+ ldp1 D_l, D_h, srcend, -16
+ cmp count, 64
+ b.hi L(copy128)
+ stp1 A_l, A_h, dstin
+ stp1 B_l, B_h, dstin, 16
+ stp1 C_l, C_h, dstend, -32
+ stp1 D_l, D_h, dstend, -16
+ copy_exit
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp1 E_l, E_h, src, 32
+ ldp1 F_l, F_h, src, 48
+ cmp count, 96
+ b.ls L(copy96)
+ ldp1 G_l, G_h, srcend, -64
+ ldp1 H_l, H_h, srcend, -48
+ stp1 G_l, G_h, dstend, -64
+ stp1 H_l, H_h, dstend, -48
+L(copy96):
+ stp1 A_l, A_h, dstin
+ stp1 B_l, B_h, dstin, 16
+ stp1 E_l, E_h, dstin, 32
+ stp1 F_l, F_h, dstin, 48
+ stp1 C_l, C_h, dstend, -32
+ stp1 D_l, D_h, dstend, -16
+ copy_exit
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp1 D_l, D_h, src
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp1 A_l, A_h, src, 16
+ stp1 D_l, D_h, dstin
+ ldp1 B_l, B_h, src, 32
+ ldp1 C_l, C_h, src, 48
+ ldp1_pre D_l, D_h, src, 64
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp1 A_l, A_h, dst, 16
+ ldp1 A_l, A_h, src, 16
+ stp1 B_l, B_h, dst, 32
+ ldp1 B_l, B_h, src, 32
+ stp1 C_l, C_h, dst, 48
+ ldp1 C_l, C_h, src, 48
+ stp1_pre D_l, D_h, dst, 64
+ ldp1_pre D_l, D_h, src, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp1 E_l, E_h, srcend, -64
+ stp1 A_l, A_h, dst, 16
+ ldp1 A_l, A_h, srcend, -48
+ stp1 B_l, B_h, dst, 32
+ ldp1 B_l, B_h, srcend, -32
+ stp1 C_l, C_h, dst, 48
+ ldp1 C_l, C_h, srcend, -16
+ stp1 D_l, D_h, dst, 64
+ stp1 E_l, E_h, dstend, -64
+ stp1 A_l, A_h, dstend, -48
+ stp1 B_l, B_h, dstend, -32
+ stp1 C_l, C_h, dstend, -16
+ copy_exit
+
+ .p2align 4
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dstend to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp1 D_l, D_h, srcend, -16
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp1 A_l, A_h, srcend, -16
+ stp1 D_l, D_h, dstend, -16
+ ldp1 B_l, B_h, srcend, -32
+ ldp1 C_l, C_h, srcend, -48
+ ldp1_pre D_l, D_h, srcend, -64
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp1 A_l, A_h, dstend, -16
+ ldp1 A_l, A_h, srcend, -16
+ stp1 B_l, B_h, dstend, -32
+ ldp1 B_l, B_h, srcend, -32
+ stp1 C_l, C_h, dstend, -48
+ ldp1 C_l, C_h, srcend, -48
+ stp1_pre D_l, D_h, dstend, -64
+ ldp1_pre D_l, D_h, srcend, -64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp1 G_l, G_h, src, 48
+ stp1 A_l, A_h, dstend, -16
+ ldp1 A_l, A_h, src, 32
+ stp1 B_l, B_h, dstend, -32
+ ldp1 B_l, B_h, src, 16
+ stp1 C_l, C_h, dstend, -48
+ ldp1 C_l, C_h, src
+ stp1 D_l, D_h, dstend, -64
+ stp1 G_l, G_h, dstin, 48
+ stp1 A_l, A_h, dstin, 32
+ stp1 B_l, B_h, dstin, 16
+ stp1 C_l, C_h, dstin
+ copy_exit
diff --git a/arch/arm64/lib/copy_template_user.S b/arch/arm64/lib/copy_template_user.S
new file mode 100644
index 000000000000..3db24dcdab05
--- /dev/null
+++ b/arch/arm64/lib/copy_template_user.S
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#define L(l) .L ## l
+
+ alternative_if_not ARM64_HAS_UAO
+ b L(copy_non_uao)
+ alternative_else_nop_endif
+#include "copy_template.S"
+
+#define ldp1 ldp1_nuao
+#define ldp1_pre ldp1_pre_nuao
+#define stp1 stp1_nuao
+#define stp1_pre stp1_pre_nuao
+#define ldr1 ldr1_nuao
+#define str1 str1_nuao
+#define ldrb1 ldrb1_nuao
+#define strb1 strb1_nuao
+#define ldrb1_reg ldrb1_nuao_reg
+#define strb1_reg strb1_nuao_reg
+
+L(copy_non_uao):
+#undef L
+#define L(l) .Lnuao ## l
+#include "copy_template.S"
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 4ec59704b8f2..6b4742cac083 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,49 +19,111 @@
* Returns:
* x0 - bytes not copied
*/
- .macro ldrb1 reg, ptr, val
- ldrb \reg, [\ptr], \val
+ .macro ldrb1 reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
.endm
- .macro strb1 reg, ptr, val
- uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
+ .macro strb1 reg, ptr, offset=0
+ 8888: sttrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro ldrh1 reg, ptr, val
- ldrh \reg, [\ptr], \val
+ .macro ldrb1_reg reg, ptr, offset
+ ldrb \reg, [\ptr, \offset]
.endm
- .macro strh1 reg, ptr, val
- uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
+ .macro strb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: sttrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro ldr1 reg, ptr, val
- ldr \reg, [\ptr], \val
+ .macro ldr1 reg, ptr, offset=0
+ ldr \reg, [\ptr, \offset]
.endm
- .macro str1 reg, ptr, val
- uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
+ .macro str1 reg, ptr, offset=0
+ 8888: sttr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm
- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ ldp \regA, \regB, [\ptr, \offset]
.endm
- .macro stp1 reg1, reg2, ptr, val
- uao_stp 9998f, \reg1, \reg2, \ptr, \val
+ .macro stp1 regA, regB, ptr, offset=0
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro ldp1_pre regA, regB, ptr, offset
+ ldp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro stp1_pre regA, regB, ptr, offset
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro ldrb1_nuao reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
+ .endm
+
+ .macro strb1_nuao reg, ptr, offset=0
+ 8888: strb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldrb1_nuao_reg reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
+ .endm
+
+ .macro strb1_nuao_reg reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldr1_nuao reg, ptr, offset=0
+ ldr \reg, [\ptr, \offset]
+ .endm
+
+ .macro str1_nuao reg, ptr, offset=0
+ 8888: str \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldp1_nuao regA, regB, ptr, offset=0
+ ldp \regA, \regB, [\ptr, \offset]
+ .endm
+
+ .macro ldp1_pre_nuao regA, regB, ptr, offset
+ ldp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro stp1_nuao regA, regB, ptr, offset=0
+ 8888: stp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_pre_nuao regA, regB, ptr, offset
+ 8888: stp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro copy_exit
+ b .Luaccess_finish
.endm
-end .req x5
SYM_FUNC_START(__arch_copy_to_user)
- add end, x0, x2
-#include "copy_template.S"
+#include "copy_template_user.S"
+.Luaccess_finish:
mov x0, #0
ret
SYM_FUNC_END(__arch_copy_to_user)
EXPORT_SYMBOL(__arch_copy_to_user)
-
- .section .fixup,"ax"
- .align 2
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_user_fixup.S b/arch/arm64/lib/copy_user_fixup.S
new file mode 100644
index 000000000000..32fae9e2e799
--- /dev/null
+++ b/arch/arm64/lib/copy_user_fixup.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+addr .req x15
+.section .fixup,"ax"
+.align 2
+9998:
+ // If the faulting address (addr) falls in the src range then it was
+ // a load that failed, otherwise it was a store
+ cmp addr, src
+ ccmp addr, srcend, #0x0, ge
+ csel x0, srcend, dstend, lt
+ sub x0, x0, addr
+ ret
+
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index e0bf83d556f2..c24925aef236 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -24,43 +24,56 @@
* Returns:
* x0 - dest
*/
- .macro ldrb1 reg, ptr, val
- ldrb \reg, [\ptr], \val
+ #define L(l) .L ## l
+
+ .macro ldrb1 reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
+ .endm
+
+ .macro strb1 reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldr1 reg, ptr, offset=0
+ ldr \reg, [\ptr, \offset]
.endm
- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
+ .macro str1 reg, ptr, offset=0
+ str \reg, [\ptr, \offset]
.endm
- .macro ldrh1 reg, ptr, val
- ldrh \reg, [\ptr], \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ ldp \regA, \regB, [\ptr, \offset]
.endm
- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
+ .macro stp1 regA, regB, ptr, offset=0
+ stp \regA, \regB, [\ptr, \offset]
.endm
- .macro ldr1 reg, ptr, val
- ldr \reg, [\ptr], \val
+ .macro ldrb1_reg reg, ptr, offset
+ ldrb1 \reg, \ptr, \offset
.endm
- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
+ .macro strb1_reg reg, ptr, offset
+ strb1 \reg, \ptr, \offset
.endm
- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
+ .macro ldp1_pre regA, regB, ptr, offset
+ ldp \regA, \regB, [\ptr, \offset]!
.endm
- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
+ .macro stp1_pre regA, regB, ptr, offset
+ stp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro copy_exit
+ ret
.endm
.weak memcpy
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_PI(memcpy)
#include "copy_template.S"
- ret
SYM_FUNC_END_PI(memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
--
2.17.1
next prev parent reply other threads:[~2020-09-14 15:12 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-09-14 15:09 [PATCH v5 00/14] Optimise and update memcpy, user copy and string routines Oliver Swede
2020-09-14 15:09 ` [PATCH v5 01/14] arm64: Allow passing fault address to fixup handlers Oliver Swede
2020-09-14 15:09 ` [PATCH v5 02/14] arm64: kprobes: Drop open-coded exception fixup Oliver Swede
2020-09-14 15:09 ` [PATCH v5 03/14] arm64: Import latest version of Cortex Strings' memcmp Oliver Swede
2020-09-14 15:09 ` [PATCH v5 04/14] arm64: Import latest version of Cortex Strings' memmove Oliver Swede
2020-09-14 15:09 ` [PATCH v5 05/14] arm64: Import latest version of Cortex Strings' strcmp Oliver Swede
2020-09-14 15:09 ` [PATCH v5 06/14] arm64: Import latest version of Cortex Strings' strlen Oliver Swede
2020-09-14 15:09 ` [PATCH v5 07/14] arm64: Import latest version of Cortex Strings' strncmp Oliver Swede
2020-09-14 15:09 ` Oliver Swede [this message]
2021-06-01 10:03 ` [PATCH v5 08/14] arm64: Import latest optimization of memcpy Sunil Kovvuri
2021-06-01 12:06 ` Robin Murphy
2021-06-01 12:31 ` Sunil Kovvuri
2021-06-03 8:45 ` David Laight
2020-09-14 15:09 ` [PATCH v5 09/14] arm64: Tidy up _asm_extable_faultaddr usage Oliver Swede
2020-09-14 15:09 ` [PATCH v5 10/14] arm64: usercopy: Store the arguments on stack Oliver Swede
2020-09-14 15:09 ` [PATCH v5 11/14] arm64: usercopy: Check for overlapping buffers in fixup Oliver Swede
2020-09-14 15:09 ` [PATCH v5 12/14] arm64: usercopy: Add intermediate fixup routine Oliver Swede
2020-09-14 15:09 ` [PATCH v5 13/14] arm64: usercopy: Add conclusive " Oliver Swede
2020-09-14 15:09 ` [PATCH v5 14/14] arm64: usercopy: Reduce overhead in fixup Oliver Swede
2020-09-14 15:17 [PATCH v5 00/14] Optimise and update memcpy, user copy and string routines Oliver Swede
2020-09-14 15:17 ` [PATCH v5 08/14] arm64: Import latest optimization of memcpy Oliver Swede
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200914150958.2200-9-oli.swede@arm.com \
--to=oli.swede@arm.com \
--cc=catalin.marinas@arm.com \
--cc=linux-arm-kernel@lists.indradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=robin.murphy@arm.com \
--cc=will@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.