All of lore.kernel.org
 help / color / mirror / Atom feed
From: Bin Meng <bmeng.cn@gmail.com>
To: u-boot@lists.denx.de
Subject: [PATCH 1/2] riscv: Fix memmove and optimise memcpy when misalign
Date: Thu, 13 May 2021 16:46:17 +0800	[thread overview]
Message-ID: <20210513084618.2161331-1-bmeng.cn@gmail.com> (raw)

At present U-Boot SPL fails to boot on SiFive Unleashed board, due
to a load address misaligned exception happens when loading the FIT
image in spl_load_simple_fit(). The exception happens in memmove()
which is called by fdt_splice_().

Commit 8f0dc4cfd106 introduces an assembly version of memmove but
it does take misalignment into account (it checks if length is a
multiple of machine word size but pointers need also be aligned).
As a result it will generate misaligned load/store for the majority
of cases and causes significant performance regression on hardware
that traps misaligned load/store and emulate them using firmware.

The current behaviour of memcpy is that it checks if both src and
dest pointers are co-aligned (aka congruent modular SZ_REG). If
aligned, it will copy data word-by-word after first aligning
pointers to word boundary. If src and dst are not co-aligned,
however, byte-wise copy will be performed.

This patch was taken from the Linux kernel patch [1], which has not
been applied at the time being. It fixes the memmove and optimises
memcpy for misaligned cases. It will first align destination pointer
to word-boundary regardless whether src and dest are co-aligned or
not. If they indeed are, then wordwise copy is performed. If they
are not co-aligned, then it will load two adjacent words from src
and use shifts to assemble a full machine word. Some additional
assembly level micro-optimisation is also performed to ensure more
instructions can be compressed (e.g. prefer a0 to t6).

With this patch, U-Boot boots again on SiFive Unleashed board.

[1] https://patchwork.kernel.org/project/linux-riscv/patch/20210216225555.4976-1-gary at garyguo.net/

Fixes: 8f0dc4cfd106 ("riscv: assembler versions of memcpy, memmove, memset")
Signed-off-by: Bin Meng <bmeng.cn@gmail.com>
---

 arch/riscv/lib/memcpy.S  | 223 ++++++++++++++++++++++++---------------
 arch/riscv/lib/memmove.S | 176 ++++++++++++++++++++----------
 2 files changed, 257 insertions(+), 142 deletions(-)

diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
index 51ab716253..00672c19ad 100644
--- a/arch/riscv/lib/memcpy.S
+++ b/arch/riscv/lib/memcpy.S
@@ -9,100 +9,151 @@
 /* void *memcpy(void *, const void *, size_t) */
 ENTRY(__memcpy)
 WEAK(memcpy)
-	move t6, a0  /* Preserve return value */
+	/* Save for return value */
+	mv	t6, a0
 
-	/* Defer to byte-oriented copy for small sizes */
-	sltiu a3, a2, 128
-	bnez a3, 4f
-	/* Use word-oriented copy only if low-order bits match */
-	andi a3, t6, SZREG-1
-	andi a4, a1, SZREG-1
-	bne a3, a4, 4f
+	/*
+	 * Register allocation for code below:
+	 * a0 - start of uncopied dst
+	 * a1 - start of uncopied src
+	 * t0 - end of uncopied dst
+	 */
+	add	t0, a0, a2
 
-	beqz a3, 2f  /* Skip if already aligned */
 	/*
-	 * Round to nearest double word-aligned address
-	 * greater than or equal to start address
+	 * Use bytewise copy if too small.
+	 *
+	 * This threshold must be at least 2*SZREG to ensure at least one
+	 * wordwise copy is performed. It is chosen to be 16 because it will
+	 * save at least 7 iterations of bytewise copy, which pays off the
+	 * fixed overhead.
 	 */
-	andi a3, a1, ~(SZREG-1)
-	addi a3, a3, SZREG
-	/* Handle initial misalignment */
-	sub a4, a3, a1
+	li	a3, 16
+	bltu	a2, a3, .Lbyte_copy_tail
+
+	/*
+	 * Bytewise copy first to align a0 to word boundary.
+	 */
+	addi	a2, a0, SZREG-1
+	andi	a2, a2, ~(SZREG-1)
+	beq	a0, a2, 2f
 1:
-	lb a5, 0(a1)
-	addi a1, a1, 1
-	sb a5, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 1b
-	sub a2, a2, a4  /* Update count */
+	lb	a5, 0(a1)
+	addi	a1, a1, 1
+	sb	a5, 0(a0)
+	addi	a0, a0, 1
+	bne	a0, a2, 1b
+2:
+
+	/*
+	 * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+	 * aligned word-wise copy. Otherwise we need to perform misaligned
+	 * word-wise copy.
+	 */
+	andi	a3, a1, SZREG-1
+	bnez	a3, .Lmisaligned_word_copy
 
+	/* Unrolled wordwise copy */
+	addi	t0, t0, -(16*SZREG-1)
+	bgeu	a0, t0, 2f
+1:
+	REG_L	a2,        0(a1)
+	REG_L	a3,    SZREG(a1)
+	REG_L	a4,  2*SZREG(a1)
+	REG_L	a5,  3*SZREG(a1)
+	REG_L	a6,  4*SZREG(a1)
+	REG_L	a7,  5*SZREG(a1)
+	REG_L	t1,  6*SZREG(a1)
+	REG_L	t2,  7*SZREG(a1)
+	REG_L	t3,  8*SZREG(a1)
+	REG_L	t4,  9*SZREG(a1)
+	REG_L	t5, 10*SZREG(a1)
+	REG_S	a2,        0(a0)
+	REG_S	a3,    SZREG(a0)
+	REG_S	a4,  2*SZREG(a0)
+	REG_S	a5,  3*SZREG(a0)
+	REG_S	a6,  4*SZREG(a0)
+	REG_S	a7,  5*SZREG(a0)
+	REG_S	t1,  6*SZREG(a0)
+	REG_S	t2,  7*SZREG(a0)
+	REG_S	t3,  8*SZREG(a0)
+	REG_S	t4,  9*SZREG(a0)
+	REG_S	t5, 10*SZREG(a0)
+	REG_L	a2, 11*SZREG(a1)
+	REG_L	a3, 12*SZREG(a1)
+	REG_L	a4, 13*SZREG(a1)
+	REG_L	a5, 14*SZREG(a1)
+	REG_L	a6, 15*SZREG(a1)
+	addi	a1, a1, 16*SZREG
+	REG_S	a2, 11*SZREG(a0)
+	REG_S	a3, 12*SZREG(a0)
+	REG_S	a4, 13*SZREG(a0)
+	REG_S	a5, 14*SZREG(a0)
+	REG_S	a6, 15*SZREG(a0)
+	addi	a0, a0, 16*SZREG
+	bltu	a0, t0, 1b
 2:
-	andi a4, a2, ~((16*SZREG)-1)
-	beqz a4, 4f
-	add a3, a1, a4
-3:
-	REG_L a4,       0(a1)
-	REG_L a5,   SZREG(a1)
-	REG_L a6, 2*SZREG(a1)
-	REG_L a7, 3*SZREG(a1)
-	REG_L t0, 4*SZREG(a1)
-	REG_L t1, 5*SZREG(a1)
-	REG_L t2, 6*SZREG(a1)
-	REG_L t3, 7*SZREG(a1)
-	REG_L t4, 8*SZREG(a1)
-	REG_L t5, 9*SZREG(a1)
-	REG_S a4,       0(t6)
-	REG_S a5,   SZREG(t6)
-	REG_S a6, 2*SZREG(t6)
-	REG_S a7, 3*SZREG(t6)
-	REG_S t0, 4*SZREG(t6)
-	REG_S t1, 5*SZREG(t6)
-	REG_S t2, 6*SZREG(t6)
-	REG_S t3, 7*SZREG(t6)
-	REG_S t4, 8*SZREG(t6)
-	REG_S t5, 9*SZREG(t6)
-	REG_L a4, 10*SZREG(a1)
-	REG_L a5, 11*SZREG(a1)
-	REG_L a6, 12*SZREG(a1)
-	REG_L a7, 13*SZREG(a1)
-	REG_L t0, 14*SZREG(a1)
-	REG_L t1, 15*SZREG(a1)
-	addi a1, a1, 16*SZREG
-	REG_S a4, 10*SZREG(t6)
-	REG_S a5, 11*SZREG(t6)
-	REG_S a6, 12*SZREG(t6)
-	REG_S a7, 13*SZREG(t6)
-	REG_S t0, 14*SZREG(t6)
-	REG_S t1, 15*SZREG(t6)
-	addi t6, t6, 16*SZREG
-	bltu a1, a3, 3b
-	andi a2, a2, (16*SZREG)-1  /* Update count */
-
-4:
-	/* Handle trailing misalignment */
-	beqz a2, 6f
-	add a3, a1, a2
-
-	/* Use word-oriented copy if co-aligned to word boundary */
-	or a5, a1, t6
-	or a5, a5, a3
-	andi a5, a5, 3
-	bnez a5, 5f
-7:
-	lw a4, 0(a1)
-	addi a1, a1, 4
-	sw a4, 0(t6)
-	addi t6, t6, 4
-	bltu a1, a3, 7b
+	/* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
+	addi	t0, t0, 15*SZREG
 
-	ret
+	/* Wordwise copy */
+	bgeu	a0, t0, 2f
+1:
+	REG_L	a5, 0(a1)
+	addi	a1, a1, SZREG
+	REG_S	a5, 0(a0)
+	addi	a0, a0, SZREG
+	bltu	a0, t0, 1b
+2:
+	addi	t0, t0, SZREG-1
 
-5:
-	lb a4, 0(a1)
-	addi a1, a1, 1
-	sb a4, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 5b
-6:
+.Lbyte_copy_tail:
+	/*
+	 * Bytewise copy anything left.
+	 */
+	beq	a0, t0, 2f
+1:
+	lb	a5, 0(a1)
+	addi	a1, a1, 1
+	sb	a5, 0(a0)
+	addi	a0, a0, 1
+	bne	a0, t0, 1b
+2:
+
+	mv	a0, t6
 	ret
+
+.Lmisaligned_word_copy:
+	/*
+	 * Misaligned word-wise copy.
+	 * For misaligned copy we still perform word-wise copy, but we need to
+	 * use the value fetched from the previous iteration and do some shifts.
+	 * This is safe because we wouldn't access more words than necessary.
+	 */
+
+	/* Calculate shifts */
+	slli	t3, a3, 3
+	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+	/* Load the initial value and align a1 */
+	andi	a1, a1, ~(SZREG-1)
+	REG_L	a5, 0(a1)
+
+	addi	t0, t0, -(SZREG-1)
+	/* At least one iteration will be executed here, no check */
+1:
+	srl	a4, a5, t3
+	REG_L	a5, SZREG(a1)
+	addi	a1, a1, SZREG
+	sll	a2, a5, t4
+	or	a2, a2, a4
+	REG_S	a2, 0(a0)
+	addi	a0, a0, SZREG
+	bltu	a0, t0, 1b
+
+	/* Update pointers to correct value */
+	addi	t0, t0, SZREG-1
+	add	a1, a1, a3
+
+	j	.Lbyte_copy_tail
 END(__memcpy)
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
index 07d1d2152b..fbe6701dbe 100644
--- a/arch/riscv/lib/memmove.S
+++ b/arch/riscv/lib/memmove.S
@@ -5,60 +5,124 @@
 
 ENTRY(__memmove)
 WEAK(memmove)
-        move    t0, a0
-        move    t1, a1
-
-        beq     a0, a1, exit_memcpy
-        beqz    a2, exit_memcpy
-        srli    t2, a2, 0x2
-
-        slt     t3, a0, a1
-        beqz    t3, do_reverse
-
-        andi    a2, a2, 0x3
-        li      t4, 1
-        beqz    t2, byte_copy
-
-word_copy:
-        lw      t3, 0(a1)
-        addi    t2, t2, -1
-        addi    a1, a1, 4
-        sw      t3, 0(a0)
-        addi    a0, a0, 4
-        bnez    t2, word_copy
-        beqz    a2, exit_memcpy
-        j       byte_copy
-
-do_reverse:
-        add     a0, a0, a2
-        add     a1, a1, a2
-        andi    a2, a2, 0x3
-        li      t4, -1
-        beqz    t2, reverse_byte_copy
-
-reverse_word_copy:
-        addi    a1, a1, -4
-        addi    t2, t2, -1
-        lw      t3, 0(a1)
-        addi    a0, a0, -4
-        sw      t3, 0(a0)
-        bnez    t2, reverse_word_copy
-        beqz    a2, exit_memcpy
-
-reverse_byte_copy:
-        addi    a0, a0, -1
-        addi    a1, a1, -1
-
-byte_copy:
-        lb      t3, 0(a1)
-        addi    a2, a2, -1
-        sb      t3, 0(a0)
-        add     a1, a1, t4
-        add     a0, a0, t4
-        bnez    a2, byte_copy
-
-exit_memcpy:
-        move a0, t0
-        move a1, t1
-        ret
+	/*
+	 * Here we determine if forward copy is possible. Forward copy is
+	 * preferred to backward copy as it is more cache friendly.
+	 *
+	 * If a0 >= a1, t0 gives their distance, if t0 >= a2 then we can
+	 *   copy forward.
+	 * If a0 < a1, we can always copy forward. This will make t0 negative,
+	 *   so a *unsigned* comparison will always have t0 >= a2.
+	 *
+	 * For forward copy we just delegate the task to memcpy.
+	 */
+	sub	t0, a0, a1
+	bltu	t0, a2, 1f
+	tail	__memcpy
+1:
+
+	/*
+	 * Register allocation for code below:
+	 * a0 - end of uncopied dst
+	 * a1 - end of uncopied src
+	 * t0 - start of uncopied dst
+	 */
+	mv	t0, a0
+	add	a0, a0, a2
+	add	a1, a1, a2
+
+	/*
+	 * Use bytewise copy if too small.
+	 *
+	 * This threshold must be at least 2*SZREG to ensure at least one
+	 * wordwise copy is performed. It is chosen to be 16 because it will
+	 * save at least 7 iterations of bytewise copy, which pays off the
+	 * fixed overhead.
+	 */
+	li	a3, 16
+	bltu	a2, a3, .Lbyte_copy_tail
+
+	/*
+	 * Bytewise copy first to align t0 to word boundary.
+	 */
+	andi	a2, a0, ~(SZREG-1)
+	beq	a0, a2, 2f
+1:
+	addi	a1, a1, -1
+	lb	a5, 0(a1)
+	addi	a0, a0, -1
+	sb	a5, 0(a0)
+	bne	a0, a2, 1b
+2:
+
+	/*
+	 * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+	 * aligned word-wise copy. Otherwise we need to perform misaligned
+	 * word-wise copy.
+	 */
+	andi	a3, a1, SZREG-1
+	bnez	a3, .Lmisaligned_word_copy
+
+	/* Wordwise copy */
+	addi	t0, t0, SZREG-1
+	bleu	a0, t0, 2f
+1:
+	addi	a1, a1, -SZREG
+	REG_L	a5, 0(a1)
+	addi	a0, a0, -SZREG
+	REG_S	a5, 0(a0)
+	bgtu	a0, t0, 1b
+2:
+	addi	t0, t0, -(SZREG-1)
+
+.Lbyte_copy_tail:
+	/*
+	 * Bytewise copy anything left.
+	 */
+	beq	a0, t0, 2f
+1:
+	addi	a1, a1, -1
+	lb	a5, 0(a1)
+	addi	a0, a0, -1
+	sb	a5, 0(a0)
+	bne	a0, t0, 1b
+2:
+
+	mv	a0, t0
+	ret
+
+.Lmisaligned_word_copy:
+	/*
+	 * Misaligned word-wise copy.
+	 * For misaligned copy we still perform word-wise copy, but we need to
+	 * use the value fetched from the previous iteration and do some shifts.
+	 * This is safe because we wouldn't access more words than necessary.
+	 */
+
+	/* Calculate shifts */
+	slli	t3, a3, 3
+	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+	/* Load the initial value and align a1 */
+	andi	a1, a1, ~(SZREG-1)
+	REG_L	a5, 0(a1)
+
+	addi	t0, t0, SZREG-1
+	/* At least one iteration will be executed here, no check */
+1:
+	sll	a4, a5, t4
+	addi	a1, a1, -SZREG
+	REG_L	a5, 0(a1)
+	srl	a2, a5, t3
+	or	a2, a2, a4
+	addi	a0, a0, -SZREG
+	REG_S	a2, 0(a0)
+	bgtu	a0, t0, 1b
+
+	/* Update pointers to correct value */
+	addi	t0, t0, -(SZREG-1)
+	add	a1, a1, a3
+
+	j	.Lbyte_copy_tail
+
 END(__memmove)
-- 
2.25.1

             reply	other threads:[~2021-05-13  8:46 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-13  8:46 Bin Meng [this message]
2021-05-13  8:46 ` [PATCH 2/2] riscv: Group assembly optimized implementation of memory routines into a submenu Bin Meng
2021-05-18  0:48   ` Leo Liang
2021-05-17  2:07 ` [PATCH 1/2] riscv: Fix memmove and optimise memcpy when misalign Bin Meng
2021-05-18  0:46 ` Leo Liang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210513084618.2161331-1-bmeng.cn@gmail.com \
    --to=bmeng.cn@gmail.com \
    --cc=u-boot@lists.denx.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.