All of lore.kernel.org
 help / color / mirror / Atom feed
From: Simon Glass <sjg@chromium.org>
To: u-boot@lists.denx.de
Subject: [U-Boot] [PATCH v2 02/12] x86: Add an accelerated memmove() function
Date: Wed,  5 Oct 2016 20:42:10 -0600	[thread overview]
Message-ID: <1475721740-15124-3-git-send-email-sjg@chromium.org> (raw)
In-Reply-To: <1475721740-15124-1-git-send-email-sjg@chromium.org>

Bring in a faster memmove() from Linux 4.7. This speeds up scrolling on the
display.

Signed-off-by: Simon Glass <sjg@chromium.org>
---

Changes in v2:
- Move the code into string.c
- Fix multi-line comments that should be single-line

 arch/x86/include/asm/string.h |   2 +-
 arch/x86/lib/string.c         | 161 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
index 0ad612f..38afd23 100644
--- a/arch/x86/include/asm/string.h
+++ b/arch/x86/include/asm/string.h
@@ -17,7 +17,7 @@ extern char * strchr(const char * s, int c);
 #define __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *, const void *, __kernel_size_t);
 
-#undef __HAVE_ARCH_MEMMOVE
+#define __HAVE_ARCH_MEMMOVE
 extern void * memmove(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMCHR
diff --git a/arch/x86/lib/string.c b/arch/x86/lib/string.c
index 6c66431..5343c2b 100644
--- a/arch/x86/lib/string.c
+++ b/arch/x86/lib/string.c
@@ -130,3 +130,164 @@ void *memcpy(void *dstpp, const void *srcpp, size_t len)
 
 	return dstpp;
 }
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+	int d0, d1, d2, d3, d4, d5;
+	char *ret = dest;
+	/* Overlap-safe copy of n bytes from src to dest; returns dest */
+	__asm__ __volatile__(
+		/* Fewer than 16 bytes: go straight to the tail code at 1: */
+		"cmp $0x10, %0\n\t"
+		"jb	1f\n\t"
+
+		/* src below dest: copy backward (2:), otherwise forward */
+		"cmp %2, %1\n\t"
+		"jb	2f\n\t"
+
+		/*
+		 * The movs instruction has a high startup latency, so
+		 * copies below 680 bytes use general registers instead.
+		 */
+		"cmp  $680, %0\n\t"
+		"jb 3f\n\t"
+		/* Use movs only if src and dest match in address bits 0-7 */
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 4f\n\t"
+		"3:\n\t"
+		"sub $0x10, %0\n\t"	/* bias, undone after the loop */
+
+		/* Copy 16 bytes per iteration, moving forward */
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov 2*4(%1), %3\n\t"
+		"mov 3*4(%1), %4\n\t"
+		"mov  %3, 2*4(%2)\n\t"
+		"mov  %4, 3*4(%2)\n\t"
+		"lea  0x10(%1), %1\n\t"
+		"lea  0x10(%2), %2\n\t"
+		"jae 3b\n\t"
+		"add $0x10, %0\n\t"
+		"jmp 1f\n\t"
+
+		/* Forward rep movsl; the last 4 bytes go through %3 */
+		".p2align 4\n\t"
+		"4:\n\t"
+		"mov -4(%1, %0), %3\n\t"
+		"lea -4(%2, %0), %4\n\t"
+		"shr $2, %0\n\t"
+		"rep movsl\n\t"
+		"mov %3, (%4)\n\t"
+		"jmp 11f\n\t"
+		/* Backward rep movsl; the first 4 bytes go through %3 */
+		".p2align 4\n\t"
+		"6:\n\t"
+		"mov (%1), %3\n\t"
+		"mov %2, %4\n\t"
+		"lea -4(%1, %0), %1\n\t"
+		"lea -4(%2, %0), %2\n\t"
+		"shr $2, %0\n\t"
+		"std\n\t"
+		"rep movsl\n\t"
+		"mov %3,(%4)\n\t"
+		"cld\n\t"
+		"jmp 11f\n\t"
+
+		/* Backward copy: same size and alignment tests as forward */
+		".p2align 4\n\t"
+		"2:\n\t"
+		"cmp  $680, %0\n\t"
+		"jb 5f\n\t"
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 6b\n\t"
+
+		/* Point src and dest just past the end of their buffers */
+		"5:\n\t"
+		"add %0, %1\n\t"
+		"add %0, %2\n\t"
+		"sub $0x10, %0\n\t"	/* bias, undone after the loop */
+
+		/* Copy 16 bytes per iteration, moving backward */
+		"7:\n\t"
+		"sub $0x10, %0\n\t"
+
+		"mov -1*4(%1), %3\n\t"
+		"mov -2*4(%1), %4\n\t"
+		"mov  %3, -1*4(%2)\n\t"
+		"mov  %4, -2*4(%2)\n\t"
+		"mov -3*4(%1), %3\n\t"
+		"mov -4*4(%1), %4\n\t"
+		"mov  %3, -3*4(%2)\n\t"
+		"mov  %4, -4*4(%2)\n\t"
+		"lea  -0x10(%1), %1\n\t"
+		"lea  -0x10(%2), %2\n\t"
+		"jae 7b\n\t"
+		/* Rewind src and dest to the start of the uncopied head */
+		"add $0x10, %0\n\t"
+		"sub %0, %1\n\t"
+		"sub %0, %2\n\t"
+
+		/* 8-15 bytes left: load first and last 8, then store both */
+		".p2align 4\n\t"
+		"1:\n\t"
+		"cmp $8, %0\n\t"
+		"jb 8f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov -2*4(%1, %0), %5\n\t"
+		"mov -1*4(%1, %0), %1\n\t"
+
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov  %5, -2*4(%2, %0)\n\t"
+		"mov  %1, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/* 4-7 bytes left: load first and last 4, then store both */
+		".p2align 4\n\t"
+		"8:\n\t"
+		"cmp $4, %0\n\t"
+		"jb 9f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov -1*4(%1, %0), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/* 2-3 bytes left: %w3/%w4 are the 16-bit views of %3/%4 */
+		".p2align 4\n\t"
+		"9:\n\t"
+		"cmp $2, %0\n\t"
+		"jb 10f\n\t"
+		"movw 0*2(%1), %w3\n\t"
+		"movw -1*2(%1, %0), %w4\n\t"
+		"movw %w3, 0*2(%2)\n\t"
+		"movw %w4, -1*2(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/* 0-1 byte left; %cl (low byte of the count) is free now */
+		".p2align 4\n\t"
+		"10:\n\t"
+		"cmp $1, %0\n\t"
+		"jb 11f\n\t"
+		"movb (%1), %%cl\n\t"
+		"movb %%cl, (%2)\n\t"
+		".p2align 4\n\t"
+		"11:"
+		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
+		  "=r" (d3), "=r" (d4), "=r"(d5)
+		: "0" (n),
+		 "1" (src),
+		 "2" (dest)
+		: "memory");
+
+	return ret;
+}
-- 
2.8.0.rc3.226.g39d4020

  parent reply	other threads:[~2016-10-06  2:42 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-06  2:42 [U-Boot] [PATCH v2 00/12] dm: x86: Improve vesa driver-model support Simon Glass
2016-10-06  2:42 ` [U-Boot] [PATCH v2 01/12] Revert "x86: broadwell: gpio: Remove the codes to set up pin control" Simon Glass
2016-10-08  4:23   ` Bin Meng
2016-10-06  2:42 ` Simon Glass [this message]
2016-10-08  2:25   ` [U-Boot] [PATCH v2 02/12] x86: Add an accelerated memmove() function Bin Meng
2016-10-08  4:23     ` Bin Meng
2016-10-08  5:53     ` Bin Meng
2016-10-10  2:05       ` Bin Meng
2016-10-13  0:03         ` Simon Glass
2016-10-06  2:42 ` [U-Boot] [PATCH v2 03/12] Fix return value in trailing_strtoln() Simon Glass
2016-10-08  4:23   ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 04/12] list: Add list_last_entry() to find the last entry Simon Glass
2016-10-08  4:23   ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 05/12] dm: core: Add a function to get a uclass name Simon Glass
2016-10-08  4:23   ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 06/12] x86: video: Fix typo in broadwell Kconfig Simon Glass
2016-10-08  4:23   ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 07/12] dm: x86: video: Add a driver-model driver for ivybridge graphics Simon Glass
2016-10-08  2:33   ` Bin Meng
2016-10-08  4:23     ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 08/12] dm: stdio: Allow lazy probing of video devices Simon Glass
2016-10-08  2:32   ` Bin Meng
2016-10-08  4:23     ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 09/12] dm: video: Add driver-model support to vesa graphics Simon Glass
2016-10-08  4:24   ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 10/12] x86: Adjust config to support DM_VIDEO Simon Glass
2016-10-08  4:24   ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 11/12] dm: x86: Move samus to use new driver model support Simon Glass
2016-10-08  4:24   ` Bin Meng
2016-10-06  2:42 ` [U-Boot] [PATCH v2 12/12] dm: x86: Move link to use driver model for video Simon Glass
2016-10-08  4:24   ` Bin Meng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1475721740-15124-3-git-send-email-sjg@chromium.org \
    --to=sjg@chromium.org \
    --cc=u-boot@lists.denx.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.