From: Robin Murphy <robin.murphy@arm.com>
To: will@kernel.org, catalin.marinas@arm.com
Cc: sam.tebbs@arm.com, linux-arm-kernel@lists.infradead.org
Subject: [PATCH 6/8] arm64: Import latest version of Cortex Strings' strlen
Date: Tue, 15 Oct 2019 16:50:01 +0100 [thread overview]
Message-ID: <c89c35f2898351ad4c744263eb300aa134c5d640.1571073960.git.robin.murphy@arm.com> (raw)
In-Reply-To: <cover.1571073960.git.robin.murphy@arm.com>
From: Sam Tebbs <sam.tebbs@arm.com>
Import latest version of Cortex Strings' strlen function.
The upstream source is src/aarch64/strlen.S as of commit eb80ac77a6cd
in https://git.linaro.org/toolchain/cortex-strings.git.
Signed-off-by: Sam Tebbs <sam.tebbs@arm.com>
[ rm: update attribution, expand commit message ]
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
arch/arm64/lib/strlen.S | 249 +++++++++++++++++++++++++++-------------
1 file changed, 169 insertions(+), 80 deletions(-)
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 87b0cb066915..e404edd6068c 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -1,13 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2015 Linaro Limited. All rights reserved.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
+ * This code is based on glibc Cortex Strings work originally authored by
+ * Linaro, found at:
*
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * https://git.linaro.org/toolchain/cortex-strings.git
*/
#include <linux/linkage.h>
@@ -23,93 +21,184 @@
*/
/* Arguments and results. */
-srcin .req x0
-len .req x0
+#define srcin x0
+#define len x0
/* Locals and temporaries. */
-src .req x1
-data1 .req x2
-data2 .req x3
-data2a .req x4
-has_nul1 .req x5
-has_nul2 .req x6
-tmp1 .req x7
-tmp2 .req x8
-tmp3 .req x9
-tmp4 .req x10
-zeroones .req x11
-pos .req x12
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define L(l) .L ## l
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 16 bytes, we align src so don't need
+ further page cross checks, and process 32 bytes per iteration
+ using the fast NUL check. If we encounter non-ASCII characters,
+ fallback to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
WEAK(strlen)
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /*
- * NUL detection works on the principle that (X - 1) & (~X) & 0x80
- * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- * can be done in parallel across the entire word.
- */
- /*
- * The inner loop deals with two Dwords at a time. This has a
- * slightly higher start-up cost, but we should win quite quickly,
- * especially on cores with a high number of issue slots per
- * cycle, as we get much better parallelism out of the operations.
- */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+ byte-swap the data now so has_null1/2 will be correct. */
+CPU_BE(rev data1, data1)
+CPU_BE(rev data2, data2)
sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+ orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lloop
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
- sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
- /*
- * For big-endian, carry propagation (if the final byte in the
- * string is 0x01) means we cannot use has_nul directly. The
- * easiest way to get the correct byte is to byte-swap the data
- * and calculate the syndrome a second time.
- */
-CPU_BE( rev data2, data2 )
-CPU_BE( sub tmp1, data2, zeroones )
-CPU_BE( orr tmp2, data2, #REP8_7f )
-CPU_BE( bic has_nul2, tmp1, tmp2 )
-
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
ret
-.Lmisaligned:
- cmp tmp1, #8
- neg tmp1, tmp1
- ldp data1, data2, [src], #16
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- mov tmp2, #~0
- /* Big-endian. Early bytes are at MSB. */
-CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
- /* Little-endian. Early bytes are at LSB. */
-CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+.Lpage_cross_entry:
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+CPU_BE(csel data1, data1, data2, cc)
+CPU_BE(rev data1, data1)
+CPU_BE(sub tmp1, data1, zeroones)
+CPU_BE(orr tmp2, data1, REP8_7f)
+CPU_BE(bic has_nul1, tmp1, tmp2)
+CPU_LE(csel has_nul1, has_nul1, has_nul2, cc)
+ sub len, src, srcin
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
+ /* Big-endian. Early bytes are at MSB. */
+CPU_BE(lsr tmp1, tmp4, tmp1) /* Shift (tmp1 & 63). */
+ /* Little-endian. Early bytes are at LSB. */
+CPU_LE(lsl tmp1, tmp4, tmp1) /* Shift (tmp1 & 63). */
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
ENDPIPROC(strlen)
EXPORT_SYMBOL_NOKASAN(strlen)
--
2.21.0.dirty
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
next prev parent reply other threads:[~2019-10-15 15:52 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-10-15 15:49 [PATCH 0/8] arm64: Optimise and update memcpy, user copy and string routines Robin Murphy
2019-10-15 15:49 ` [PATCH 1/8] arm64: Allow passing fault address to fixup handlers Robin Murphy
2019-10-15 15:49 ` [PATCH 2/8] arm64: Import latest Cortex Strings memcpy implementation Robin Murphy
2019-10-15 15:49 ` [PATCH 3/8] arm64: Import latest version of Cortex Strings' memcmp Robin Murphy
2019-10-15 15:49 ` [PATCH 4/8] arm64: Import latest version of Cortex Strings' memmove Robin Murphy
2019-10-15 15:50 ` [PATCH 5/8] arm64: Import latest version of Cortex Strings' strcmp Robin Murphy
2019-10-15 15:50 ` Robin Murphy [this message]
2019-10-15 15:50 ` [PATCH 7/8] arm64: Import latest version of Cortex Strings' strncmp Robin Murphy
2019-10-15 15:50 ` [PATCH 8/8] arm64: Tidy up _asm_extable_faultaddr usage Robin Murphy
2019-10-17 11:21 ` [PATCH 0/8] arm64: Optimise and update memcpy, user copy and string routines Catalin Marinas
2019-10-18 7:54 ` Catalin Marinas
2019-10-18 9:28 ` Robin Murphy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c89c35f2898351ad4c744263eb300aa134c5d640.1571073960.git.robin.murphy@arm.com \
--to=robin.murphy@arm.com \
--cc=catalin.marinas@arm.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=sam.tebbs@arm.com \
--cc=will@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).