* [PATCH v4] ARM: Optimise copy_{from/to}_user for !CPU_USE_DOMAINS
@ 2018-10-29 8:06 Vincent Whitchurch
2018-10-29 13:49 ` Nicolas Pitre
0 siblings, 1 reply; 2+ messages in thread
From: Vincent Whitchurch @ 2018-10-29 8:06 UTC (permalink / raw)
To: linux-arm-kernel
ARMv6+ processors do not use CONFIG_CPU_USE_DOMAINS, so they use
privileged ldr/str instructions in copy_{from/to}_user. These routines
currently, and unnecessarily, use single ldr/str instructions; they can
use ldm/stm instructions instead, as memcpy does (but with appropriate
fixup tables).
This speeds up a "dd if=foo of=bar bs=32k" on a tmpfs filesystem by
about 4% on my Cortex-A9.
before:134217728 bytes (128.0MB) copied, 0.543848 seconds, 235.4MB/s
before:134217728 bytes (128.0MB) copied, 0.538610 seconds, 237.6MB/s
before:134217728 bytes (128.0MB) copied, 0.544356 seconds, 235.1MB/s
before:134217728 bytes (128.0MB) copied, 0.544364 seconds, 235.1MB/s
before:134217728 bytes (128.0MB) copied, 0.537130 seconds, 238.3MB/s
before:134217728 bytes (128.0MB) copied, 0.533443 seconds, 240.0MB/s
before:134217728 bytes (128.0MB) copied, 0.545691 seconds, 234.6MB/s
before:134217728 bytes (128.0MB) copied, 0.534695 seconds, 239.4MB/s
before:134217728 bytes (128.0MB) copied, 0.540561 seconds, 236.8MB/s
before:134217728 bytes (128.0MB) copied, 0.541025 seconds, 236.6MB/s
after:134217728 bytes (128.0MB) copied, 0.520445 seconds, 245.9MB/s
after:134217728 bytes (128.0MB) copied, 0.527846 seconds, 242.5MB/s
after:134217728 bytes (128.0MB) copied, 0.519510 seconds, 246.4MB/s
after:134217728 bytes (128.0MB) copied, 0.527231 seconds, 242.8MB/s
after:134217728 bytes (128.0MB) copied, 0.525030 seconds, 243.8MB/s
after:134217728 bytes (128.0MB) copied, 0.524236 seconds, 244.2MB/s
after:134217728 bytes (128.0MB) copied, 0.523659 seconds, 244.4MB/s
after:134217728 bytes (128.0MB) copied, 0.525018 seconds, 243.8MB/s
after:134217728 bytes (128.0MB) copied, 0.519249 seconds, 246.5MB/s
after:134217728 bytes (128.0MB) copied, 0.518527 seconds, 246.9MB/s
Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
---
v4: Do not modify str1b/ldr1b
v3: Explicitly add IT instructions to fix fault handling on Thumb-2
v2: Group *_SHIFT #defines with respective .macro implementations
arch/arm/include/asm/assembler.h | 6 ++++--
arch/arm/lib/copy_from_user.S | 23 ++++++++++++++++++++++-
arch/arm/lib/copy_to_user.S | 27 ++++++++++++++++++++++-----
3 files changed, 48 insertions(+), 8 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index b17ee03d280b..da16d31c7ef9 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -243,13 +243,15 @@
.endm
#endif
-#define USER(x...) \
+#define USERL(l, x...) \
9999: x; \
.pushsection __ex_table,"a"; \
.align 3; \
- .long 9999b,9001f; \
+ .long 9999b,l; \
.popsection
+#define USER(x...) USERL(9001f, x)
+
#ifdef CONFIG_SMP
#define ALT_SMP(instr...) \
9998: instr
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index a826df3d3814..3f79001830fe 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -34,12 +34,13 @@
* Number of bytes NOT copied.
*/
+#ifdef CONFIG_CPU_USE_DOMAINS
+
#ifndef CONFIG_THUMB2_KERNEL
#define LDR1W_SHIFT 0
#else
#define LDR1W_SHIFT 1
#endif
-#define STR1W_SHIFT 0
.macro ldr1w ptr reg abort
ldrusr \reg, \ptr, 4, abort=\abort
@@ -57,10 +58,30 @@
ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
.endm
+#else
+
+#define LDR1W_SHIFT 0
+
+ .macro ldr1w ptr reg abort
+ USERL(\abort, W(ldr) \reg, [\ptr], #4)
+ .endm
+
+ .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+ USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4})
+ .endm
+
+ .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
+ .endm
+
+#endif /* CONFIG_CPU_USE_DOMAINS */
+
.macro ldr1b ptr reg cond=al abort
ldrusr \reg, \ptr, 1, \cond, abort=\abort
.endm
+#define STR1W_SHIFT 0
+
.macro str1w ptr reg abort
W(str) \reg, [\ptr], #4
.endm
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index caf5019d8161..5cb28f6c7941 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -35,11 +35,6 @@
*/
#define LDR1W_SHIFT 0
-#ifndef CONFIG_THUMB2_KERNEL
-#define STR1W_SHIFT 0
-#else
-#define STR1W_SHIFT 1
-#endif
.macro ldr1w ptr reg abort
W(ldr) \reg, [\ptr], #4
@@ -57,6 +52,14 @@
ldr\cond\()b \reg, [\ptr], #1
.endm
+#ifdef CONFIG_CPU_USE_DOMAINS
+
+#ifndef CONFIG_THUMB2_KERNEL
+#define STR1W_SHIFT 0
+#else
+#define STR1W_SHIFT 1
+#endif
+
.macro str1w ptr reg abort
strusr \reg, \ptr, 4, abort=\abort
.endm
@@ -72,6 +75,20 @@
str1w \ptr, \reg8, \abort
.endm
+#else
+
+#define STR1W_SHIFT 0
+
+ .macro str1w ptr reg abort
+ USERL(\abort, W(str) \reg, [\ptr], #4)
+ .endm
+
+ .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ USERL(\abort, stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
+ .endm
+
+#endif /* CONFIG_CPU_USE_DOMAINS */
+
.macro str1b ptr reg cond=al abort
strusr \reg, \ptr, 1, \cond, abort=\abort
.endm
--
2.11.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH v4] ARM: Optimise copy_{from/to}_user for !CPU_USE_DOMAINS
2018-10-29 8:06 [PATCH v4] ARM: Optimise copy_{from/to}_user for !CPU_USE_DOMAINS Vincent Whitchurch
@ 2018-10-29 13:49 ` Nicolas Pitre
0 siblings, 0 replies; 2+ messages in thread
From: Nicolas Pitre @ 2018-10-29 13:49 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, 29 Oct 2018, Vincent Whitchurch wrote:
> ARMv6+ processors do not use CONFIG_CPU_USE_DOMAINS, so they use
> privileged ldr/str instructions in copy_{from/to}_user. These routines
> currently, and unnecessarily, use single ldr/str instructions; they can
> use ldm/stm instructions instead, as memcpy does (but with appropriate
> fixup tables).
>
> This speeds up a "dd if=foo of=bar bs=32k" on a tmpfs filesystem by
> about 4% on my Cortex-A9.
>
> before:134217728 bytes (128.0MB) copied, 0.543848 seconds, 235.4MB/s
> before:134217728 bytes (128.0MB) copied, 0.538610 seconds, 237.6MB/s
> before:134217728 bytes (128.0MB) copied, 0.544356 seconds, 235.1MB/s
> before:134217728 bytes (128.0MB) copied, 0.544364 seconds, 235.1MB/s
> before:134217728 bytes (128.0MB) copied, 0.537130 seconds, 238.3MB/s
> before:134217728 bytes (128.0MB) copied, 0.533443 seconds, 240.0MB/s
> before:134217728 bytes (128.0MB) copied, 0.545691 seconds, 234.6MB/s
> before:134217728 bytes (128.0MB) copied, 0.534695 seconds, 239.4MB/s
> before:134217728 bytes (128.0MB) copied, 0.540561 seconds, 236.8MB/s
> before:134217728 bytes (128.0MB) copied, 0.541025 seconds, 236.6MB/s
>
> after:134217728 bytes (128.0MB) copied, 0.520445 seconds, 245.9MB/s
> after:134217728 bytes (128.0MB) copied, 0.527846 seconds, 242.5MB/s
> after:134217728 bytes (128.0MB) copied, 0.519510 seconds, 246.4MB/s
> after:134217728 bytes (128.0MB) copied, 0.527231 seconds, 242.8MB/s
> after:134217728 bytes (128.0MB) copied, 0.525030 seconds, 243.8MB/s
> after:134217728 bytes (128.0MB) copied, 0.524236 seconds, 244.2MB/s
> after:134217728 bytes (128.0MB) copied, 0.523659 seconds, 244.4MB/s
> after:134217728 bytes (128.0MB) copied, 0.525018 seconds, 243.8MB/s
> after:134217728 bytes (128.0MB) copied, 0.519249 seconds, 246.5MB/s
> after:134217728 bytes (128.0MB) copied, 0.518527 seconds, 246.9MB/s
>
> Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Reviewed-by: Nicolas Pitre <nico@linaro.org>
> ---
> v4: Do not modify str1b/ldr1b
> v3: Explicitly add IT instructions to fix fault handling on Thumb-2
> v2: Group *_SHIFT #defines with respective .macro implementations
>
> arch/arm/include/asm/assembler.h | 6 ++++--
> arch/arm/lib/copy_from_user.S | 23 ++++++++++++++++++++++-
> arch/arm/lib/copy_to_user.S | 27 ++++++++++++++++++++++-----
> 3 files changed, 48 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index b17ee03d280b..da16d31c7ef9 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -243,13 +243,15 @@
> .endm
> #endif
>
> -#define USER(x...) \
> +#define USERL(l, x...) \
> 9999: x; \
> .pushsection __ex_table,"a"; \
> .align 3; \
> - .long 9999b,9001f; \
> + .long 9999b,l; \
> .popsection
>
> +#define USER(x...) USERL(9001f, x)
> +
> #ifdef CONFIG_SMP
> #define ALT_SMP(instr...) \
> 9998: instr
> diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
> index a826df3d3814..3f79001830fe 100644
> --- a/arch/arm/lib/copy_from_user.S
> +++ b/arch/arm/lib/copy_from_user.S
> @@ -34,12 +34,13 @@
> * Number of bytes NOT copied.
> */
>
> +#ifdef CONFIG_CPU_USE_DOMAINS
> +
> #ifndef CONFIG_THUMB2_KERNEL
> #define LDR1W_SHIFT 0
> #else
> #define LDR1W_SHIFT 1
> #endif
> -#define STR1W_SHIFT 0
>
> .macro ldr1w ptr reg abort
> ldrusr \reg, \ptr, 4, abort=\abort
> @@ -57,10 +58,30 @@
> ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
> .endm
>
> +#else
> +
> +#define LDR1W_SHIFT 0
> +
> + .macro ldr1w ptr reg abort
> + USERL(\abort, W(ldr) \reg, [\ptr], #4)
> + .endm
> +
> + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
> + USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4})
> + .endm
> +
> + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> + USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
> + .endm
> +
> +#endif /* CONFIG_CPU_USE_DOMAINS */
> +
> .macro ldr1b ptr reg cond=al abort
> ldrusr \reg, \ptr, 1, \cond, abort=\abort
> .endm
>
> +#define STR1W_SHIFT 0
> +
> .macro str1w ptr reg abort
> W(str) \reg, [\ptr], #4
> .endm
> diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
> index caf5019d8161..5cb28f6c7941 100644
> --- a/arch/arm/lib/copy_to_user.S
> +++ b/arch/arm/lib/copy_to_user.S
> @@ -35,11 +35,6 @@
> */
>
> #define LDR1W_SHIFT 0
> -#ifndef CONFIG_THUMB2_KERNEL
> -#define STR1W_SHIFT 0
> -#else
> -#define STR1W_SHIFT 1
> -#endif
>
> .macro ldr1w ptr reg abort
> W(ldr) \reg, [\ptr], #4
> @@ -57,6 +52,14 @@
> ldr\cond\()b \reg, [\ptr], #1
> .endm
>
> +#ifdef CONFIG_CPU_USE_DOMAINS
> +
> +#ifndef CONFIG_THUMB2_KERNEL
> +#define STR1W_SHIFT 0
> +#else
> +#define STR1W_SHIFT 1
> +#endif
> +
> .macro str1w ptr reg abort
> strusr \reg, \ptr, 4, abort=\abort
> .endm
> @@ -72,6 +75,20 @@
> str1w \ptr, \reg8, \abort
> .endm
>
> +#else
> +
> +#define STR1W_SHIFT 0
> +
> + .macro str1w ptr reg abort
> + USERL(\abort, W(str) \reg, [\ptr], #4)
> + .endm
> +
> + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> + USERL(\abort, stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
> + .endm
> +
> +#endif /* CONFIG_CPU_USE_DOMAINS */
> +
> .macro str1b ptr reg cond=al abort
> strusr \reg, \ptr, 1, \cond, abort=\abort
> .endm
> --
> 2.11.0
>
>
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2018-10-29 13:49 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-29 8:06 [PATCH v4] ARM: Optimise copy_{from/to}_user for !CPU_USE_DOMAINS Vincent Whitchurch
2018-10-29 13:49 ` Nicolas Pitre
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.