From: Bin Meng <bmeng.cn@gmail.com>
To: u-boot@lists.denx.de
Subject: [PATCH 1/2] riscv: Fix memmove and optimise memcpy when misalign
Date: Thu, 13 May 2021 16:46:17 +0800 [thread overview]
Message-ID: <20210513084618.2161331-1-bmeng.cn@gmail.com> (raw)
At present U-Boot SPL fails to boot on SiFive Unleashed board, due
to a load address misaligned exception happens when loading the FIT
image in spl_load_simple_fit(). The exception happens in memmove()
which is called by fdt_splice_().
Commit 8f0dc4cfd106 introduces an assembly version of memmove but
it does take misalignment into account (it checks if length is a
multiple of machine word size but pointers need also be aligned).
As a result it will generate misaligned load/store for the majority
of cases and causes significant performance regression on hardware
that traps misaligned load/store and emulate them using firmware.
The current behaviour of memcpy is that it checks if both src and
dest pointers are co-aligned (aka congruent modular SZ_REG). If
aligned, it will copy data word-by-word after first aligning
pointers to word boundary. If src and dst are not co-aligned,
however, byte-wise copy will be performed.
This patch was taken from the Linux kernel patch [1], which has not
been applied at the time being. It fixes the memmove and optimises
memcpy for misaligned cases. It will first align destination pointer
to word-boundary regardless whether src and dest are co-aligned or
not. If they indeed are, then wordwise copy is performed. If they
are not co-aligned, then it will load two adjacent words from src
and use shifts to assemble a full machine word. Some additional
assembly level micro-optimisation is also performed to ensure more
instructions can be compressed (e.g. prefer a0 to t6).
With this patch, U-Boot boots again on SiFive Unleashed board.
[1] https://patchwork.kernel.org/project/linux-riscv/patch/20210216225555.4976-1-gary at garyguo.net/
Fixes: 8f0dc4cfd106 ("riscv: assembler versions of memcpy, memmove, memset")
Signed-off-by: Bin Meng <bmeng.cn@gmail.com>
---
arch/riscv/lib/memcpy.S | 223 ++++++++++++++++++++++++---------------
arch/riscv/lib/memmove.S | 176 ++++++++++++++++++++----------
2 files changed, 257 insertions(+), 142 deletions(-)
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
index 51ab716253..00672c19ad 100644
--- a/arch/riscv/lib/memcpy.S
+++ b/arch/riscv/lib/memcpy.S
@@ -9,100 +9,151 @@
/* void *memcpy(void *, const void *, size_t) */
ENTRY(__memcpy)
WEAK(memcpy)
- move t6, a0 /* Preserve return value */
+ /* Save for return value */
+ mv t6, a0
- /* Defer to byte-oriented copy for small sizes */
- sltiu a3, a2, 128
- bnez a3, 4f
- /* Use word-oriented copy only if low-order bits match */
- andi a3, t6, SZREG-1
- andi a4, a1, SZREG-1
- bne a3, a4, 4f
+ /*
+ * Register allocation for code below:
+ * a0 - start of uncopied dst
+ * a1 - start of uncopied src
+ * t0 - end of uncopied dst
+ */
+ add t0, a0, a2
- beqz a3, 2f /* Skip if already aligned */
/*
- * Round to nearest double word-aligned address
- * greater than or equal to start address
+ * Use bytewise copy if too small.
+ *
+ * This threshold must be at least 2*SZREG to ensure at least one
+ * wordwise copy is performed. It is chosen to be 16 because it will
+ * save at least 7 iterations of bytewise copy, which pays off the
+ * fixed overhead.
*/
- andi a3, a1, ~(SZREG-1)
- addi a3, a3, SZREG
- /* Handle initial misalignment */
- sub a4, a3, a1
+ li a3, 16
+ bltu a2, a3, .Lbyte_copy_tail
+
+ /*
+ * Bytewise copy first to align a0 to word boundary.
+ */
+ addi a2, a0, SZREG-1
+ andi a2, a2, ~(SZREG-1)
+ beq a0, a2, 2f
1:
- lb a5, 0(a1)
- addi a1, a1, 1
- sb a5, 0(t6)
- addi t6, t6, 1
- bltu a1, a3, 1b
- sub a2, a2, a4 /* Update count */
+ lb a5, 0(a1)
+ addi a1, a1, 1
+ sb a5, 0(a0)
+ addi a0, a0, 1
+ bne a0, a2, 1b
+2:
+
+ /*
+ * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+ * aligned word-wise copy. Otherwise we need to perform misaligned
+ * word-wise copy.
+ */
+ andi a3, a1, SZREG-1
+ bnez a3, .Lmisaligned_word_copy
+ /* Unrolled wordwise copy */
+ addi t0, t0, -(16*SZREG-1)
+ bgeu a0, t0, 2f
+1:
+ REG_L a2, 0(a1)
+ REG_L a3, SZREG(a1)
+ REG_L a4, 2*SZREG(a1)
+ REG_L a5, 3*SZREG(a1)
+ REG_L a6, 4*SZREG(a1)
+ REG_L a7, 5*SZREG(a1)
+ REG_L t1, 6*SZREG(a1)
+ REG_L t2, 7*SZREG(a1)
+ REG_L t3, 8*SZREG(a1)
+ REG_L t4, 9*SZREG(a1)
+ REG_L t5, 10*SZREG(a1)
+ REG_S a2, 0(a0)
+ REG_S a3, SZREG(a0)
+ REG_S a4, 2*SZREG(a0)
+ REG_S a5, 3*SZREG(a0)
+ REG_S a6, 4*SZREG(a0)
+ REG_S a7, 5*SZREG(a0)
+ REG_S t1, 6*SZREG(a0)
+ REG_S t2, 7*SZREG(a0)
+ REG_S t3, 8*SZREG(a0)
+ REG_S t4, 9*SZREG(a0)
+ REG_S t5, 10*SZREG(a0)
+ REG_L a2, 11*SZREG(a1)
+ REG_L a3, 12*SZREG(a1)
+ REG_L a4, 13*SZREG(a1)
+ REG_L a5, 14*SZREG(a1)
+ REG_L a6, 15*SZREG(a1)
+ addi a1, a1, 16*SZREG
+ REG_S a2, 11*SZREG(a0)
+ REG_S a3, 12*SZREG(a0)
+ REG_S a4, 13*SZREG(a0)
+ REG_S a5, 14*SZREG(a0)
+ REG_S a6, 15*SZREG(a0)
+ addi a0, a0, 16*SZREG
+ bltu a0, t0, 1b
2:
- andi a4, a2, ~((16*SZREG)-1)
- beqz a4, 4f
- add a3, a1, a4
-3:
- REG_L a4, 0(a1)
- REG_L a5, SZREG(a1)
- REG_L a6, 2*SZREG(a1)
- REG_L a7, 3*SZREG(a1)
- REG_L t0, 4*SZREG(a1)
- REG_L t1, 5*SZREG(a1)
- REG_L t2, 6*SZREG(a1)
- REG_L t3, 7*SZREG(a1)
- REG_L t4, 8*SZREG(a1)
- REG_L t5, 9*SZREG(a1)
- REG_S a4, 0(t6)
- REG_S a5, SZREG(t6)
- REG_S a6, 2*SZREG(t6)
- REG_S a7, 3*SZREG(t6)
- REG_S t0, 4*SZREG(t6)
- REG_S t1, 5*SZREG(t6)
- REG_S t2, 6*SZREG(t6)
- REG_S t3, 7*SZREG(t6)
- REG_S t4, 8*SZREG(t6)
- REG_S t5, 9*SZREG(t6)
- REG_L a4, 10*SZREG(a1)
- REG_L a5, 11*SZREG(a1)
- REG_L a6, 12*SZREG(a1)
- REG_L a7, 13*SZREG(a1)
- REG_L t0, 14*SZREG(a1)
- REG_L t1, 15*SZREG(a1)
- addi a1, a1, 16*SZREG
- REG_S a4, 10*SZREG(t6)
- REG_S a5, 11*SZREG(t6)
- REG_S a6, 12*SZREG(t6)
- REG_S a7, 13*SZREG(t6)
- REG_S t0, 14*SZREG(t6)
- REG_S t1, 15*SZREG(t6)
- addi t6, t6, 16*SZREG
- bltu a1, a3, 3b
- andi a2, a2, (16*SZREG)-1 /* Update count */
-
-4:
- /* Handle trailing misalignment */
- beqz a2, 6f
- add a3, a1, a2
-
- /* Use word-oriented copy if co-aligned to word boundary */
- or a5, a1, t6
- or a5, a5, a3
- andi a5, a5, 3
- bnez a5, 5f
-7:
- lw a4, 0(a1)
- addi a1, a1, 4
- sw a4, 0(t6)
- addi t6, t6, 4
- bltu a1, a3, 7b
+ /* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
+ addi t0, t0, 15*SZREG
- ret
+ /* Wordwise copy */
+ bgeu a0, t0, 2f
+1:
+ REG_L a5, 0(a1)
+ addi a1, a1, SZREG
+ REG_S a5, 0(a0)
+ addi a0, a0, SZREG
+ bltu a0, t0, 1b
+2:
+ addi t0, t0, SZREG-1
-5:
- lb a4, 0(a1)
- addi a1, a1, 1
- sb a4, 0(t6)
- addi t6, t6, 1
- bltu a1, a3, 5b
-6:
+.Lbyte_copy_tail:
+ /*
+ * Bytewise copy anything left.
+ */
+ beq a0, t0, 2f
+1:
+ lb a5, 0(a1)
+ addi a1, a1, 1
+ sb a5, 0(a0)
+ addi a0, a0, 1
+ bne a0, t0, 1b
+2:
+
+ mv a0, t6
ret
+
+.Lmisaligned_word_copy:
+ /*
+ * Misaligned word-wise copy.
+ * For misaligned copy we still perform word-wise copy, but we need to
+ * use the value fetched from the previous iteration and do some shifts.
+ * This is safe because we wouldn't access more words than necessary.
+ */
+
+ /* Calculate shifts */
+ slli t3, a3, 3
+ sub t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+ /* Load the initial value and align a1 */
+ andi a1, a1, ~(SZREG-1)
+ REG_L a5, 0(a1)
+
+ addi t0, t0, -(SZREG-1)
+ /* At least one iteration will be executed here, no check */
+1:
+ srl a4, a5, t3
+ REG_L a5, SZREG(a1)
+ addi a1, a1, SZREG
+ sll a2, a5, t4
+ or a2, a2, a4
+ REG_S a2, 0(a0)
+ addi a0, a0, SZREG
+ bltu a0, t0, 1b
+
+ /* Update pointers to correct value */
+ addi t0, t0, SZREG-1
+ add a1, a1, a3
+
+ j .Lbyte_copy_tail
END(__memcpy)
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
index 07d1d2152b..fbe6701dbe 100644
--- a/arch/riscv/lib/memmove.S
+++ b/arch/riscv/lib/memmove.S
@@ -5,60 +5,124 @@
ENTRY(__memmove)
WEAK(memmove)
- move t0, a0
- move t1, a1
-
- beq a0, a1, exit_memcpy
- beqz a2, exit_memcpy
- srli t2, a2, 0x2
-
- slt t3, a0, a1
- beqz t3, do_reverse
-
- andi a2, a2, 0x3
- li t4, 1
- beqz t2, byte_copy
-
-word_copy:
- lw t3, 0(a1)
- addi t2, t2, -1
- addi a1, a1, 4
- sw t3, 0(a0)
- addi a0, a0, 4
- bnez t2, word_copy
- beqz a2, exit_memcpy
- j byte_copy
-
-do_reverse:
- add a0, a0, a2
- add a1, a1, a2
- andi a2, a2, 0x3
- li t4, -1
- beqz t2, reverse_byte_copy
-
-reverse_word_copy:
- addi a1, a1, -4
- addi t2, t2, -1
- lw t3, 0(a1)
- addi a0, a0, -4
- sw t3, 0(a0)
- bnez t2, reverse_word_copy
- beqz a2, exit_memcpy
-
-reverse_byte_copy:
- addi a0, a0, -1
- addi a1, a1, -1
-
-byte_copy:
- lb t3, 0(a1)
- addi a2, a2, -1
- sb t3, 0(a0)
- add a1, a1, t4
- add a0, a0, t4
- bnez a2, byte_copy
-
-exit_memcpy:
- move a0, t0
- move a1, t1
- ret
+ /*
+ * Here we determine if forward copy is possible. Forward copy is
+ * preferred to backward copy as it is more cache friendly.
+ *
+ * If a0 >= a1, t0 gives their distance, if t0 >= a2 then we can
+ * copy forward.
+ * If a0 < a1, we can always copy forward. This will make t0 negative,
+ * so a *unsigned* comparison will always have t0 >= a2.
+ *
+ * For forward copy we just delegate the task to memcpy.
+ */
+ sub t0, a0, a1
+ bltu t0, a2, 1f
+ tail __memcpy
+1:
+
+ /*
+ * Register allocation for code below:
+ * a0 - end of uncopied dst
+ * a1 - end of uncopied src
+ * t0 - start of uncopied dst
+ */
+ mv t0, a0
+ add a0, a0, a2
+ add a1, a1, a2
+
+ /*
+ * Use bytewise copy if too small.
+ *
+ * This threshold must be at least 2*SZREG to ensure at least one
+ * wordwise copy is performed. It is chosen to be 16 because it will
+ * save at least 7 iterations of bytewise copy, which pays off the
+ * fixed overhead.
+ */
+ li a3, 16
+ bltu a2, a3, .Lbyte_copy_tail
+
+ /*
+ * Bytewise copy first to align t0 to word boundary.
+ */
+ andi a2, a0, ~(SZREG-1)
+ beq a0, a2, 2f
+1:
+ addi a1, a1, -1
+ lb a5, 0(a1)
+ addi a0, a0, -1
+ sb a5, 0(a0)
+ bne a0, a2, 1b
+2:
+
+ /*
+ * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+ * aligned word-wise copy. Otherwise we need to perform misaligned
+ * word-wise copy.
+ */
+ andi a3, a1, SZREG-1
+ bnez a3, .Lmisaligned_word_copy
+
+ /* Wordwise copy */
+ addi t0, t0, SZREG-1
+ bleu a0, t0, 2f
+1:
+ addi a1, a1, -SZREG
+ REG_L a5, 0(a1)
+ addi a0, a0, -SZREG
+ REG_S a5, 0(a0)
+ bgtu a0, t0, 1b
+2:
+ addi t0, t0, -(SZREG-1)
+
+.Lbyte_copy_tail:
+ /*
+ * Bytewise copy anything left.
+ */
+ beq a0, t0, 2f
+1:
+ addi a1, a1, -1
+ lb a5, 0(a1)
+ addi a0, a0, -1
+ sb a5, 0(a0)
+ bne a0, t0, 1b
+2:
+
+ mv a0, t0
+ ret
+
+.Lmisaligned_word_copy:
+ /*
+ * Misaligned word-wise copy.
+ * For misaligned copy we still perform word-wise copy, but we need to
+ * use the value fetched from the previous iteration and do some shifts.
+ * This is safe because we wouldn't access more words than necessary.
+ */
+
+ /* Calculate shifts */
+ slli t3, a3, 3
+ sub t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+ /* Load the initial value and align a1 */
+ andi a1, a1, ~(SZREG-1)
+ REG_L a5, 0(a1)
+
+ addi t0, t0, SZREG-1
+ /* At least one iteration will be executed here, no check */
+1:
+ sll a4, a5, t4
+ addi a1, a1, -SZREG
+ REG_L a5, 0(a1)
+ srl a2, a5, t3
+ or a2, a2, a4
+ addi a0, a0, -SZREG
+ REG_S a2, 0(a0)
+ bgtu a0, t0, 1b
+
+ /* Update pointers to correct value */
+ addi t0, t0, -(SZREG-1)
+ add a1, a1, a3
+
+ j .Lbyte_copy_tail
+
END(__memmove)
--
2.25.1
next reply other threads:[~2021-05-13 8:46 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-05-13 8:46 Bin Meng [this message]
2021-05-13 8:46 ` [PATCH 2/2] riscv: Group assembly optimized implementation of memory routines into a submenu Bin Meng
2021-05-18 0:48 ` Leo Liang
2021-05-17 2:07 ` [PATCH 1/2] riscv: Fix memmove and optimise memcpy when misalign Bin Meng
2021-05-18 0:46 ` Leo Liang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210513084618.2161331-1-bmeng.cn@gmail.com \
--to=bmeng.cn@gmail.com \
--cc=u-boot@lists.denx.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.