All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ARM: lib: use LDRD/STRD for data copy
@ 2012-03-19  7:02 Boojin Kim
  2012-03-19  8:55 ` Russell King - ARM Linux
                   ` (3 more replies)
  0 siblings, 4 replies; 30+ messages in thread
From: Boojin Kim @ 2012-03-19  7:02 UTC (permalink / raw)
  To: linux-arm-kernel

This patch uses LDRD/STRD, which load and store data in doubleword units,
for copying 8-word blocks of data.
This gives better performance than the LDM/STM that was used originally.

Signed-off-by: Boojin Kim <boojin.kim@samsung.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/lib/copy_from_user.S |   14 +++++++++-----
 arch/arm/lib/copy_template.S  |   10 ++++++----
 arch/arm/lib/copy_to_user.S   |   13 +++++++++----
 arch/arm/lib/memcpy.S         |   13 +++++++++----
 4 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index 66a477a..15d1e1c 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -51,11 +51,6 @@
 	ldr1w \ptr, \reg4, \abort
 	.endm

-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
-	ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
-	.endm
-
 	.macro ldr1b ptr reg cond=al abort
 	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
 	.endm
@@ -68,6 +63,15 @@
 	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm

+	.macro cpy8w dst src reg1 reg2 abort
+	.irp offset, #0, #8, #16, #24
+	ldr1w \src, \reg1, \abort
+	ldr1w \src, \reg2, \abort
+	strd \reg1, \reg2, [\dst, \offset]
+	.endr
+	add \dst, \dst, #32
+	.endm
+
 	.macro str1b ptr reg cond=al abort
 	str\cond\()b \reg, [\ptr], #1
 	.endm
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 805e3f8..72640aa 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -28,9 +28,8 @@
  *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
  *
  * ldr4w ptr reg1 reg2 reg3 reg4 abort
- * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
  *
- *	This loads four or eight words starting from 'ptr', stores them
+ *	This loads four words starting from 'ptr', stores them
  *	in provided registers and increments 'ptr' past those words.
  *	The'abort' argument is used for fixup tables.
  *
@@ -47,6 +46,10 @@
  *	Same as their ldr* counterparts, but data is stored to 'ptr' location
  *	rather than being loaded.
  *
+ * cpy8w dst src reg1 reg2 abort
+ *	This loads eight words starting from 'src' and stores them to 'dst'.
+ *	The 'abort' argument is used for fixup tables.
+ *
  * enter reg1 reg2
  *
  *	Preserve the provided registers on the stack plus any additional
@@ -97,9 +100,8 @@
 	PLD(	pld	[r1, #92]		)

 3:	PLD(	pld	[r1, #124]		)
-4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+4:		cpy8w   r0, r1, r4, r5, abort=20f
 		subs	r2, r2, #32
-		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
 		bge	3b
 	PLD(	cmn	r2, #96			)
 	PLD(	bge	4b			)
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index d066df6..9402a08 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -48,10 +48,6 @@
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
 	.endm

-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
-	.endm
-
 	.macro ldr1b ptr reg cond=al abort
 	ldr\cond\()b \reg, [\ptr], #1
 	.endm
@@ -71,6 +67,15 @@
 	str1w \ptr, \reg8, \abort
 	.endm

+	.macro cpy8w dst src reg1 reg2 abort
+	.irp offset, #0, #8, #16, #24
+	ldrd \reg1, \reg2, [\src, \offset]
+	str1w \dst, \reg1, \abort
+	str1w \dst, \reg2, \abort
+	.endr
+	add \src, \src, #32
+	.endm
+
 	.macro str1b ptr reg cond=al abort
 	strusr	\reg, \ptr, 1, \cond, abort=\abort
 	.endm
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index a9b9e22..25320c9 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -24,10 +24,6 @@
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
 	.endm

-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
-	.endm
-
 	.macro ldr1b ptr reg cond=al abort
 	ldr\cond\()b \reg, [\ptr], #1
 	.endm
@@ -40,6 +36,15 @@
 	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm

+	.macro cpy8w dst src reg1 reg2 abort
+	.irp offset, #0, #8, #16, #24
+	ldrd \reg1, \reg2, [\src, \offset]
+	strd \reg1, \reg2, [\dst, \offset]
+	.endr
+	add \src, \src, #32
+	add \dst, \dst, #32
+	.endm
+
 	.macro str1b ptr reg cond=al abort
 	str\cond\()b \reg, [\ptr], #1
 	.endm
--
1.7.1

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19  7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim
@ 2012-03-19  8:55 ` Russell King - ARM Linux
  2012-03-19 14:36   ` Rob Herring
  2012-03-19 14:10 ` Nicolas Pitre
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 30+ messages in thread
From: Russell King - ARM Linux @ 2012-03-19  8:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
> This patch uses LDRD/STRD that loads and stores data as DWORD unit
> for the copy of 8-words data.
> It brings better performance than LDRM/STRM that was used originally.

And what about CPUs that don't have ldrd/strd ?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19  7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim
  2012-03-19  8:55 ` Russell King - ARM Linux
@ 2012-03-19 14:10 ` Nicolas Pitre
  2012-03-20  0:05   ` Boojin Kim
  2012-03-27  0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim
  2012-03-27  0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim
  3 siblings, 1 reply; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-19 14:10 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 19 Mar 2012, Boojin Kim wrote:

> This patch uses LDRD/STRD that loads and stores data as DWORD unit
> for the copy of 8-words data.
> It brings better performance than LDRM/STRM that was used originally.
> 
> Signed-off-by: Boojin Kim <boojin.kim@samsung.com>
> Cc: Russell King <rmk+kernel@arm.linux.org.uk>

Firstly, you're breaking those CPUs without ldrd/strd support.

Secondly, you're breaking to_user/from_user copies when processor 
domains are not disabled.

Then, my question is why didn't you simply provide an alternative 
implementation of ldr8w/str8w using ldrd/strd instead of interleaving 
them?  Certainly that would have allowed you to benefit from SDRAM burst 
transfers which are typically aligned to d-cache lines, as well as 
locating the subs into the unavoidable result delay slot.

> ---
>  arch/arm/lib/copy_from_user.S |   14 +++++++++-----
>  arch/arm/lib/copy_template.S  |   10 ++++++----
>  arch/arm/lib/copy_to_user.S   |   13 +++++++++----
>  arch/arm/lib/memcpy.S         |   13 +++++++++----
>  4 files changed, 33 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
> index 66a477a..15d1e1c 100644
> --- a/arch/arm/lib/copy_from_user.S
> +++ b/arch/arm/lib/copy_from_user.S
> @@ -51,11 +51,6 @@
>  	ldr1w \ptr, \reg4, \abort
>  	.endm
> 
> -	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> -	ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
> -	ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
> -	.endm
> -
>  	.macro ldr1b ptr reg cond=al abort
>  	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
>  	.endm
> @@ -68,6 +63,15 @@
>  	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
>  	.endm
> 
> +	.macro cpy8w dst src reg1 reg2 abort
> +	.irp offset, #0, #8, #16, #24
> +	ldr1w \src, \reg1, \abort
> +	ldr1w \src, \reg2, \abort
> +	strd \reg1, \reg2, [\dst, \offset]
> +	.endr
> +	add \dst, \dst, #32
> +	.endm
> +
>  	.macro str1b ptr reg cond=al abort
>  	str\cond\()b \reg, [\ptr], #1
>  	.endm
> diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
> index 805e3f8..72640aa 100644
> --- a/arch/arm/lib/copy_template.S
> +++ b/arch/arm/lib/copy_template.S
> @@ -28,9 +28,8 @@
>   *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
>   *
>   * ldr4w ptr reg1 reg2 reg3 reg4 abort
> - * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
>   *
> - *	This loads four or eight words starting from 'ptr', stores them
> + *	This loads eight words starting from 'ptr', stores them
>   *	in provided registers and increments 'ptr' past those words.
>   *	The'abort' argument is used for fixup tables.
>   *
> @@ -47,6 +46,10 @@
>   *	Same as their ldr* counterparts, but data is stored to 'ptr' location
>   *	rather than being loaded.
>   *
> + * cpy8w src dst reg1 reg2 abort
> + *	This loads eight words starting from 'src' and stores them to 'dst'.
> + *	The 'abort' argument is used for fixup tables.
> + *
>   * enter reg1 reg2
>   *
>   *	Preserve the provided registers on the stack plus any additional
> @@ -97,9 +100,8 @@
>  	PLD(	pld	[r1, #92]		)
> 
>  3:	PLD(	pld	[r1, #124]		)
> -4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +4:		cpy8w   r0, r1, r4, r5, abort=20f
>  		subs	r2, r2, #32
> -		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
>  		bge	3b
>  	PLD(	cmn	r2, #96			)
>  	PLD(	bge	4b			)
> diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
> index d066df6..9402a08 100644
> --- a/arch/arm/lib/copy_to_user.S
> +++ b/arch/arm/lib/copy_to_user.S
> @@ -48,10 +48,6 @@
>  	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
>  	.endm
> 
> -	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> -	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> -	.endm
> -
>  	.macro ldr1b ptr reg cond=al abort
>  	ldr\cond\()b \reg, [\ptr], #1
>  	.endm
> @@ -71,6 +67,15 @@
>  	str1w \ptr, \reg8, \abort
>  	.endm
> 
> +	.macro cpy8w dst src reg1 reg2 abort
> +	.irp offset, #0, #8, #16, #24
> +	ldrd \reg1, \reg2, [\src, \offset]
> +	str1w \dst, \reg1, \abort
> +	str1w \dst, \reg2, \abort
> +	.endr
> +	add \src, \src, #32
> +	.endm
> +
>  	.macro str1b ptr reg cond=al abort
>  	strusr	\reg, \ptr, 1, \cond, abort=\abort
>  	.endm
> diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
> index a9b9e22..25320c9 100644
> --- a/arch/arm/lib/memcpy.S
> +++ b/arch/arm/lib/memcpy.S
> @@ -24,10 +24,6 @@
>  	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
>  	.endm
> 
> -	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> -	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> -	.endm
> -
>  	.macro ldr1b ptr reg cond=al abort
>  	ldr\cond\()b \reg, [\ptr], #1
>  	.endm
> @@ -40,6 +36,15 @@
>  	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
>  	.endm
> 
> +	.macro cpy8w dst src reg1 reg2 abort
> +	.irp offset, #0, #8, #16, #24
> +	ldrd \reg1, \reg2, [\src, \offset]
> +	strd \reg1, \reg2, [\dst, \offset]
> +	.endr
> +	add \src, \src, #32
> +	add \dst, \dst, #32
> +	.endm
> +
>  	.macro str1b ptr reg cond=al abort
>  	str\cond\()b \reg, [\ptr], #1
>  	.endm
> --
> 1.7.1
> 
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19  8:55 ` Russell King - ARM Linux
@ 2012-03-19 14:36   ` Rob Herring
  2012-03-19 15:41     ` Russell King - ARM Linux
  2012-03-20  0:21     ` Boojin Kim
  0 siblings, 2 replies; 30+ messages in thread
From: Rob Herring @ 2012-03-19 14:36 UTC (permalink / raw)
  To: linux-arm-kernel

On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
>> This patch uses LDRD/STRD that loads and stores data as DWORD unit
>> for the copy of 8-words data.
>> It brings better performance than LDRM/STRM that was used originally.
> 
> And what about CPUs that don't have ldrd/strd ?
> 

And what about CPUs that do have ldrd/strd but where it is slower than ldm/stm?
I'm pretty sure that is almost everything currently out there.

Rob

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 14:36   ` Rob Herring
@ 2012-03-19 15:41     ` Russell King - ARM Linux
  2012-03-19 16:34       ` Måns Rullgård
  2012-03-19 16:36       ` Rob Herring
  2012-03-20  0:21     ` Boojin Kim
  1 sibling, 2 replies; 30+ messages in thread
From: Russell King - ARM Linux @ 2012-03-19 15:41 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote:
> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
> > On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
> >> This patch uses LDRD/STRD that loads and stores data as DWORD unit
> >> for the copy of 8-words data.
> >> It brings better performance than LDRM/STRM that was used originally.
> > 
> > And what about CPUs that don't have ldrd/strd ?
> > 
> 
> And what about CPUs that do have ldrd/strd but is slower than ldm/stm?
> I'm pretty sure that is almost everything currently out there.

The double-word load/stores were introduced in ARMv6.  Some Intel based
CPUs prior to this have the support as well.  Everything else doesn't.

So that's nowhere close to 'almost everything'.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 15:41     ` Russell King - ARM Linux
@ 2012-03-19 16:34       ` Måns Rullgård
  2012-03-19 16:36       ` Rob Herring
  1 sibling, 0 replies; 30+ messages in thread
From: Måns Rullgård @ 2012-03-19 16:34 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King - ARM Linux <linux@arm.linux.org.uk> writes:

> On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote:
>> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
>> > On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
>> >> This patch uses LDRD/STRD that loads and stores data as DWORD unit
>> >> for the copy of 8-words data.
>> >> It brings better performance than LDRM/STRM that was used originally.
>> > 
>> > And what about CPUs that don't have ldrd/strd ?
>> > 
>> 
>> And what about CPUs that do have ldrd/strd but is slower than ldm/stm?
>> I'm pretty sure that is almost everything currently out there.

Care to give an example?  I can't find one.

> The double-word load/stores were introduced in ARMv6.

Not true.  LDRD/STRD were introduced in ARMv5TE.  ARMv6 relaxed the
alignment requirement of these instructions to 4 bytes from being
implementation defined 4 or 8 bytes in ARMv5TE.

-- 
Måns Rullgård
mans at mansr.com

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 15:41     ` Russell King - ARM Linux
  2012-03-19 16:34       ` Måns Rullgård
@ 2012-03-19 16:36       ` Rob Herring
  2012-03-19 16:53         ` Nicolas Pitre
                           ` (2 more replies)
  1 sibling, 3 replies; 30+ messages in thread
From: Rob Herring @ 2012-03-19 16:36 UTC (permalink / raw)
  To: linux-arm-kernel

On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote:
> On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote:
>> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
>>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
>>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit
>>>> for the copy of 8-words data.
>>>> It brings better performance than LDRM/STRM that was used originally.
>>>
>>> And what about CPUs that don't have ldrd/strd ?
>>>
>>
>> And what about CPUs that do have ldrd/strd but is slower than ldm/stm?
>> I'm pretty sure that is almost everything currently out there.
> 
> The double-word load/stores were introduced in ARMv6.  Some Intel based
> CPUs prior to this have the support as well.  Everything else doesn't.
> 
> So taht's nowhere close to 'almost everything'.

I meant of all platforms that support both instructions, ldm/stm will be
faster than ldrd/strd on almost all of them AFAIK. I don't think the
claim about being faster is true for a Cortex-A9 or anything prior.
Linaro folks have done some benchmarking in this area and would be
better to comment.

Rob

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 16:36       ` Rob Herring
@ 2012-03-19 16:53         ` Nicolas Pitre
  2012-03-19 17:11         ` Måns Rullgård
  2012-03-19 20:11         ` Michael Hope
  2 siblings, 0 replies; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-19 16:53 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 19 Mar 2012, Rob Herring wrote:

> On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote:
> > On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote:
> >> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
> >>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
> >>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit
> >>>> for the copy of 8-words data.
> >>>> It brings better performance than LDRM/STRM that was used originally.
> >>>
> >>> And what about CPUs that don't have ldrd/strd ?
> >>>
> >>
> >> And what about CPUs that do have ldrd/strd but is slower than ldm/stm?
> >> I'm pretty sure that is almost everything currently out there.
> > 
> > The double-word load/stores were introduced in ARMv6.  Some Intel based
> > CPUs prior to this have the support as well.  Everything else doesn't.
> > 
> > So taht's nowhere close to 'almost everything'.
> 
> I meant of all platforms that support both instructions, ldm/stm will be
> faster than ldrd/strd on almost all of them AFAIK. I don't think the
> claim about being faster is true for an CortexA9 or anything prior.
> Linaro folks have done some benchmarking in this area and would be
> better to comment.

And more importantly, the generic copy functions in the kernel are 
typically used for small copies in most cases, while people tend to 
benchmark copy functions with large buffers, leading to wrong decisions.
  
The functions worth optimizing for throughput are rather copy_page(), 
copy_user_page(), clear_page(), etc.  And not forgetting that some of 
them are invoked with a typical cache state for the involved memory.



Nicolas

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 16:36       ` Rob Herring
  2012-03-19 16:53         ` Nicolas Pitre
@ 2012-03-19 17:11         ` Måns Rullgård
  2012-03-19 20:11         ` Michael Hope
  2 siblings, 0 replies; 30+ messages in thread
From: Måns Rullgård @ 2012-03-19 17:11 UTC (permalink / raw)
  To: linux-arm-kernel

Rob Herring <robherring2@gmail.com> writes:

> On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote:
>> On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote:
>>> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
>>>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
>>>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit
>>>>> for the copy of 8-words data.
>>>>> It brings better performance than LDRM/STRM that was used originally.
>>>>
>>>> And what about CPUs that don't have ldrd/strd ?
>>>>
>>>
>>> And what about CPUs that do have ldrd/strd but is slower than ldm/stm?
>>> I'm pretty sure that is almost everything currently out there.
>> 
>> The double-word load/stores were introduced in ARMv6.  Some Intel based
>> CPUs prior to this have the support as well.  Everything else doesn't.
>> 
>> So taht's nowhere close to 'almost everything'.
>
> I meant of all platforms that support both instructions, ldm/stm will be
> faster than ldrd/strd on almost all of them AFAIK. I don't think the
> claim about being faster is true for an CortexA9 or anything prior.

The Cortex-A9 TRM insists ldrd and ldm should have the same timing.
However, measuring it suggests that ldm is in fact faster, at least in
some cases.

The Cortex-A8 TRM is a bit unclear, but measuring gives the same speed
for both.

The manuals for older cores suggest equivalent timing, but I don't have
any nearby to test.

-- 
Måns Rullgård
mans at mansr.com

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 16:36       ` Rob Herring
  2012-03-19 16:53         ` Nicolas Pitre
  2012-03-19 17:11         ` Måns Rullgård
@ 2012-03-19 20:11         ` Michael Hope
  2 siblings, 0 replies; 30+ messages in thread
From: Michael Hope @ 2012-03-19 20:11 UTC (permalink / raw)
  To: linux-arm-kernel

On 20 March 2012 05:36, Rob Herring <robherring2@gmail.com> wrote:
> On 03/19/2012 10:41 AM, Russell King - ARM Linux wrote:
>> On Mon, Mar 19, 2012 at 09:36:41AM -0500, Rob Herring wrote:
>>> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
>>>> On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
>>>>> This patch uses LDRD/STRD that loads and stores data as DWORD unit
>>>>> for the copy of 8-words data.
>>>>> It brings better performance than LDRM/STRM that was used originally.
>>>>
>>>> And what about CPUs that don't have ldrd/strd ?
>>>>
>>>
>>> And what about CPUs that do have ldrd/strd but is slower than ldm/stm?
>>> I'm pretty sure that is almost everything currently out there.
>>
>> The double-word load/stores were introduced in ARMv6.  Some Intel based
>> CPUs prior to this have the support as well.  Everything else doesn't.
>>
>> So taht's nowhere close to 'almost everything'.
>
> I meant of all platforms that support both instructions, ldm/stm will be
> faster than ldrd/strd on almost all of them AFAIK. I don't think the
> claim about being faster is true for an CortexA9 or anything prior.
> Linaro folks have done some benchmarking in this area and would be
> better to comment.

My understanding is that the A15 does well with LDRD and poorly with
LDM, all other cores do well with LDM, and the A9 at least does poorly
with LDRD.  I don't have numbers at hand to back it up.

FYI, here's ARM's Cortex-A15 LDRD based memcpy implementation:
 http://sourceware.org/ml/newlib/2011/msg00469.html

-- Michael

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 14:10 ` Nicolas Pitre
@ 2012-03-20  0:05   ` Boojin Kim
  0 siblings, 0 replies; 30+ messages in thread
From: Boojin Kim @ 2012-03-20  0:05 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas Pitre wrote:

> > This patch uses LDRD/STRD that loads and stores data as DWORD unit
> > for the copy of 8-words data.
> > It brings better performance than LDRM/STRM that was used originally.
> >
> > Signed-off-by: Boojin Kim <boojin.kim@samsung.com>
> > Cc: Russell King <rmk+kernel@arm.linux.org.uk>
>
> Firstly, you're breaking those CPUs without ldrd/strd support.
>
I missed that point. I will fix it in the next patch.

> Secondly, you're breaking to_user/from_user copies when processor
> domains are not disabled.
Can you explain it in detail? Which one breaks the to_user/from_user copies?
Thank you for your reply.

>
> Then, my question is why didn't you simply provide an alternative
> implementation of ldr8w/str8w using ldrd/strd instead of interleaving
> them?  Certainly that would have allowed you to benefit from SDRAM burst
> transfers which are typically aligned to d-cache lines, as well as
> locating the subs into the unavoidable result delay slot.
>
> > ---
> >  arch/arm/lib/copy_from_user.S |   14 +++++++++-----
> >  arch/arm/lib/copy_template.S  |   10 ++++++----
> >  arch/arm/lib/copy_to_user.S   |   13 +++++++++----
> >  arch/arm/lib/memcpy.S         |   13 +++++++++----
> >  4 files changed, 33 insertions(+), 17 deletions(-)
> >
> > diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
> > index 66a477a..15d1e1c 100644
> > --- a/arch/arm/lib/copy_from_user.S
> > +++ b/arch/arm/lib/copy_from_user.S
> > @@ -51,11 +51,6 @@
> >  	ldr1w \ptr, \reg4, \abort
> >  	.endm
> >
> > -	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> > -	ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
> > -	ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
> > -	.endm
> > -
> >  	.macro ldr1b ptr reg cond=al abort
> >  	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
> >  	.endm
> > @@ -68,6 +63,15 @@
> >  	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> >  	.endm
> >
> > +	.macro cpy8w dst src reg1 reg2 abort
> > +	.irp offset, #0, #8, #16, #24
> > +	ldr1w \src, \reg1, \abort
> > +	ldr1w \src, \reg2, \abort
> > +	strd \reg1, \reg2, [\dst, \offset]
> > +	.endr
> > +	add \dst, \dst, #32
> > +	.endm
> > +
> >  	.macro str1b ptr reg cond=al abort
> >  	str\cond\()b \reg, [\ptr], #1
> >  	.endm
> > diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
> > index 805e3f8..72640aa 100644
> > --- a/arch/arm/lib/copy_template.S
> > +++ b/arch/arm/lib/copy_template.S
> > @@ -28,9 +28,8 @@
> >   *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
> >   *
> >   * ldr4w ptr reg1 reg2 reg3 reg4 abort
> > - * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> >   *
> > - *	This loads four or eight words starting from 'ptr', stores them
> > + *	This loads eight words starting from 'ptr', stores them
> >   *	in provided registers and increments 'ptr' past those words.
> >   *	The'abort' argument is used for fixup tables.
> >   *
> > @@ -47,6 +46,10 @@
> >   *	Same as their ldr* counterparts, but data is stored to 'ptr' location
> >   *	rather than being loaded.
> >   *
> > + * cpy8w src dst reg1 reg2 abort
> > + *	This loads eight words starting from 'src' and stores them to 'dst'.
> > + *	The 'abort' argument is used for fixup tables.
> > + *
> >   * enter reg1 reg2
> >   *
> >   *	Preserve the provided registers on the stack plus any additional
> > @@ -97,9 +100,8 @@
> >  	PLD(	pld	[r1, #92]		)
> >
> >  3:	PLD(	pld	[r1, #124]		)
> > -4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +4:		cpy8w   r0, r1, r4, r5, abort=20f
> >  		subs	r2, r2, #32
> > -		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> >  		bge	3b
> >  	PLD(	cmn	r2, #96			)
> >  	PLD(	bge	4b			)
> > diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
> > index d066df6..9402a08 100644
> > --- a/arch/arm/lib/copy_to_user.S
> > +++ b/arch/arm/lib/copy_to_user.S
> > @@ -48,10 +48,6 @@
> >  	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
> >  	.endm
> >
> > -	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> > -	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> > -	.endm
> > -
> >  	.macro ldr1b ptr reg cond=al abort
> >  	ldr\cond\()b \reg, [\ptr], #1
> >  	.endm
> > @@ -71,6 +67,15 @@
> >  	str1w \ptr, \reg8, \abort
> >  	.endm
> >
> > +	.macro cpy8w dst src reg1 reg2 abort
> > +	.irp offset, #0, #8, #16, #24
> > +	ldrd \reg1, \reg2, [\src, \offset]
> > +	str1w \dst, \reg1, \abort
> > +	str1w \dst, \reg2, \abort
> > +	.endr
> > +	add \src, \src, #32
> > +	.endm
> > +
> >  	.macro str1b ptr reg cond=al abort
> >  	strusr	\reg, \ptr, 1, \cond, abort=\abort
> >  	.endm
> > diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
> > index a9b9e22..25320c9 100644
> > --- a/arch/arm/lib/memcpy.S
> > +++ b/arch/arm/lib/memcpy.S
> > @@ -24,10 +24,6 @@
> >  	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
> >  	.endm
> >
> > -	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> > -	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> > -	.endm
> > -
> >  	.macro ldr1b ptr reg cond=al abort
> >  	ldr\cond\()b \reg, [\ptr], #1
> >  	.endm
> > @@ -40,6 +36,15 @@
> >  	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> >  	.endm
> >
> > +	.macro cpy8w dst src reg1 reg2 abort
> > +	.irp offset, #0, #8, #16, #24
> > +	ldrd \reg1, \reg2, [\src, \offset]
> > +	strd \reg1, \reg2, [\dst, \offset]
> > +	.endr
> > +	add \src, \src, #32
> > +	add \dst, \dst, #32
> > +	.endm
> > +
> >  	.macro str1b ptr reg cond=al abort
> >  	str\cond\()b \reg, [\ptr], #1
> >  	.endm
> > --
> > 1.7.1
> >
> >
> >
> > _______________________________________________
> > linux-arm-kernel mailing list
> > linux-arm-kernel at lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
> >
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] ARM: lib: use LDRD/STRD for data copy
  2012-03-19 14:36   ` Rob Herring
  2012-03-19 15:41     ` Russell King - ARM Linux
@ 2012-03-20  0:21     ` Boojin Kim
  1 sibling, 0 replies; 30+ messages in thread
From: Boojin Kim @ 2012-03-20  0:21 UTC (permalink / raw)
  To: linux-arm-kernel

Rob Herring wrote:


> On 03/19/2012 03:55 AM, Russell King - ARM Linux wrote:
> > On Mon, Mar 19, 2012 at 04:02:48PM +0900, Boojin Kim wrote:
> >> This patch uses LDRD/STRD that loads and stores data as DWORD unit
> >> for the copy of 8-words data.
> >> It brings better performance than LDRM/STRM that was used originally.
> >
> > And what about CPUs that don't have ldrd/strd ?
> >
>
> And what about CPUs that do have ldrd/strd but is slower than ldm/stm?
> I'm pretty sure that is almost everything currently out there.

Actually I didn't measure the memcpy performance on all ARM SoCs.
I just measured it with internal memcpy() benchmark on Cortex-A9 and Cortex-A15.
ldrd/strd is faster than ldm/stm on cortex-a15.
And it's similar on cortex-a9.
I will check again whether this patch gives a meaningful improvement on the targeted ARM SoCs.

Thank you for your reply.
>
> Rob
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size
  2012-03-19  7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim
  2012-03-19  8:55 ` Russell King - ARM Linux
  2012-03-19 14:10 ` Nicolas Pitre
@ 2012-03-27  0:26 ` Boojin Kim
  2012-03-27  2:35   ` Nicolas Pitre
  2012-03-27  0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim
  3 siblings, 1 reply; 30+ messages in thread
From: Boojin Kim @ 2012-03-27  0:26 UTC (permalink / raw)
  To: linux-arm-kernel

This patch adds the optimized memcpy() for the architecture that has 64 byte PLD size.

Signed-off-by: Boojin Kim <boojin.kim@samsung.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/Kconfig             |    7 ++++++
 arch/arm/lib/copy_template.S |   44 +++++++++++++++++++++++++++++++++--------
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 8fec56d..ba306b3 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1132,6 +1132,13 @@ config ARM_NR_BANKS
 	default 16 if ARCH_EP93XX
 	default 8

+config ARM_PLD_SIZE
+	int
+	default 64 if ARCH_EXYNOS5
+	default 32
+	help
+	  Configure preload size used on memcpy(). Select 64 for cortex-a15.
+
 config IWMMXT
 	bool "Enable iWMMXt support"
 	depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 805e3f8..7dc5b8c 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -66,6 +66,7 @@
  *	than one 32bit instruction in Thumb-2)
  */

+#define PLDSIZE	(CONFIG_ARM_PLD_SIZE)

 		enter	r4, lr

@@ -90,19 +91,44 @@
 	CALGN(	add	pc, r4, ip		)

 	PLD(	pld	[r1, #0]		)
-2:	PLD(	subs	r2, r2, #96		)
-	PLD(	pld	[r1, #28]		)
+
+#if (PLDSIZE == 64)
+2:	PLD(	cmp	r2, #32)
+	PLD(	blt	.32cpy)
+.64cpy:	PLD(	subs	r2, r2, #(PLDSIZE*3+32)	)
+	PLD(	pld	[r1, #PLDSIZE-4]	)
 	PLD(	blt	4f			)
-	PLD(	pld	[r1, #60]		)
-	PLD(	pld	[r1, #92]		)
+	PLD(	pld	[r1, #PLDSIZE*2-4]	)
+	PLD(	pld	[r1, #PLDSIZE*3-4]	)
+
+3:	PLD(	pld	[r1, #PLDSIZE*4-4]	)
+4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #PLDSIZE
+		bge	3b
+	PLD(	cmn	r2, #(PLDSIZE*3)	)
+	PLD(	bge	4b			)
+	PLD(	cmn	r2, #(PLDSIZE*4-32)	)
+	PLD(	blt	5f)
+.32cpy:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+#else
+2:	PLD(	subs	r2, r2, #(PLDSIZE*3)	)
+	PLD(	pld	[r1, #(PLDSIZE-4)]	)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #(PLDSIZE*2-4)]	)
+	PLD(	pld	[r1, #(PLDSIZE*3-4)]	)

-3:	PLD(	pld	[r1, #124]		)
-4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		subs	r2, r2, #32
-		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+3:	PLD(	pld	[r1, #(PLDSIZE*4-4)]	)
+4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #PLDSIZE
+		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
 		bge	3b
-	PLD(	cmn	r2, #96			)
+	PLD(	cmn	r2, #(PLDSIZE*3)	)
 	PLD(	bge	4b			)
+#endif

 5:		ands	ip, r2, #28
 		rsb	ip, ip, #32
--
1.7.1

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy
  2012-03-19  7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim
                   ` (2 preceding siblings ...)
  2012-03-27  0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim
@ 2012-03-27  0:27 ` Boojin Kim
  2012-03-27  7:40   ` Russell King - ARM Linux
  3 siblings, 1 reply; 30+ messages in thread
From: Boojin Kim @ 2012-03-27  0:27 UTC (permalink / raw)
  To: linux-arm-kernel

This patch uses LDRD/STRD, which load and store data in doubleword units.
It brings better performance than LDM/STM on Cortex-A15.

Signed-off-by: Boojin Kim <boojin.kim@samsung.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/lib/copy_from_user.S |    9 +++++++++
 arch/arm/lib/copy_template.S  |   14 ++++++++------
 arch/arm/lib/copy_to_user.S   |    9 +++++++++
 arch/arm/lib/memcpy.S         |    9 +++++++++
 4 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index 66a477a..dd1fe01 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -68,6 +68,15 @@
 	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm

+	.macro cpy8w dst src reg1 reg2 abort
+	.irp offset, #0, #8, #16, #24
+	ldr1w \src, \reg1, \abort
+	ldr1w \src, \reg2, \abort
+	strd \reg1, \reg2, [\dst, \offset]
+	.endr
+	add \dst, \dst, #32
+	.endm
+
 	.macro str1b ptr reg cond=al abort
 	str\cond\()b \reg, [\ptr], #1
 	.endm
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 7dc5b8c..a2dd5e2 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -47,6 +47,11 @@
  *	Same as their ldr* counterparts, but data is stored to 'ptr' location
  *	rather than being loaded.
  *
+ * cpy8w dst src reg1 reg2 abort
+ *
+ *	This loads eight words starting from 'src' and stores them to 'dst'.
+ *	The 'abort' argument is used for fixup tables.
+ *
  * enter reg1 reg2
  *
  *	Preserve the provided registers on the stack plus any additional
@@ -102,18 +107,15 @@
 	PLD(	pld	[r1, #PLDSIZE*3-4]	)

 3:	PLD(	pld	[r1, #PLDSIZE*4-4]	)
-4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+4:		cpy8w   r0, r1, r4, r5, abort=20f
+		cpy8w   r0, r1, r4, r5, abort=20f
 		subs	r2, r2, #PLDSIZE
 		bge	3b
 	PLD(	cmn	r2, #(PLDSIZE*3)	)
 	PLD(	bge	4b			)
 	PLD(	cmn	r2, #(PLDSIZE*4-32)	)
 	PLD(	blt	5f)
-.32cpy:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+32copy:		cpy8w   r0, r1, r4, r5, abort=20f
 #else
 2:	PLD(	subs	r2, r2, #(PLDSIZE*3)	)
 	PLD(	pld	[r1, #(PLDSIZE-4)]	)
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index d066df6..fc8ea7a 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -71,6 +71,15 @@
 	str1w \ptr, \reg8, \abort
 	.endm

+	.macro cpy8w dst src reg1 reg2 abort
+	.irp offset, #0, #8, #16, #24
+	ldrd \reg1, \reg2, [\src, \offset]
+	str1w \dst, \reg1, \abort
+	str1w \dst, \reg2, \abort
+	.endr
+	add \src, \src, #32
+	.endm
+
 	.macro str1b ptr reg cond=al abort
 	strusr	\reg, \ptr, 1, \cond, abort=\abort
 	.endm
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index a9b9e22..5b4ca72 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -40,6 +40,15 @@
 	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm

+	.macro cpy8w dst src reg1 reg2 abort
+	.irp offset, #0, #8, #16, #24
+	ldrd \reg1, \reg2, [\src, \offset]
+	strd \reg1, \reg2, [\dst, \offset]
+	.endr
+	add \src, \src, #32
+	add \dst, \dst, #32
+	.endm
+
 	.macro str1b ptr reg cond=al abort
 	str\cond\()b \reg, [\ptr], #1
 	.endm
--
1.7.1

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size
  2012-03-27  0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim
@ 2012-03-27  2:35   ` Nicolas Pitre
  2012-03-28  0:28     ` Boojin Kim
  0 siblings, 1 reply; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-27  2:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 27 Mar 2012, Boojin Kim wrote:

> This patch adds the optimized memcpy() for the architecture that has 64 byte PLD size.
> 
> Signed-off-by: Boojin Kim <boojin.kim@samsung.com>
> Cc: Russell King <rmk+kernel@arm.linux.org.uk>

This creates quite convoluted code.  If this is worth doing, we'll have 
to find a cleaner way to do this.

Could you please provide performance measurement numbers with and 
without this patch, and similarly for the next patch?

Did you try enabling the cache alignment code?  What performance 
difference if any did you see?

> ---
>  arch/arm/Kconfig             |    7 ++++++
>  arch/arm/lib/copy_template.S |   44 +++++++++++++++++++++++++++++++++--------
>  2 files changed, 42 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 8fec56d..ba306b3 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1132,6 +1132,13 @@ config ARM_NR_BANKS
>  	default 16 if ARCH_EP93XX
>  	default 8
> 
> +config ARM_PLD_SIZE
> +	int
> +	default 64 if ARCH_EXYNOS5
> +	default 32
> +	help
> +	  Configure preload size used on memcpy(). Select 64 for cortex-a15.
> +
>  config IWMMXT
>  	bool "Enable iWMMXt support"
>  	depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4
> diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
> index 805e3f8..7dc5b8c 100644
> --- a/arch/arm/lib/copy_template.S
> +++ b/arch/arm/lib/copy_template.S
> @@ -66,6 +66,7 @@
>   *	than one 32bit instruction in Thumb-2)
>   */
> 
> +#define PLDSIZE	(CONFIG_ARM_PLD_SIZE)
> 
>  		enter	r4, lr
> 
> @@ -90,19 +91,44 @@
>  	CALGN(	add	pc, r4, ip		)
> 
>  	PLD(	pld	[r1, #0]		)
> -2:	PLD(	subs	r2, r2, #96		)
> -	PLD(	pld	[r1, #28]		)
> +
> +#if (PLDSIZE == 64)
> +2:	PLD(	cmp	r2, #32)
> +	PLD(	blt	.32cpy)
> +.64cpy:	PLD(	subs	r2, r2, #(PLDSIZE*3+32)	)
> +	PLD(	pld	[r1, #PLDSIZE-4]	)
>  	PLD(	blt	4f			)
> -	PLD(	pld	[r1, #60]		)
> -	PLD(	pld	[r1, #92]		)
> +	PLD(	pld	[r1, #PLDSIZE*2-4]	)
> +	PLD(	pld	[r1, #PLDSIZE*3-4]	)
> +
> +3:	PLD(	pld	[r1, #PLDSIZE*4-4]	)
> +4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		subs	r2, r2, #PLDSIZE
> +		bge	3b
> +	PLD(	cmn	r2, #(PLDSIZE*3)	)
> +	PLD(	bge	4b			)
> +	PLD(	cmn	r2, #(PLDSIZE*4-32)	)
> +	PLD(	blt	5f)
> +.32cpy:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +#else
> +2:	PLD(	subs	r2, r2, #(PLDSIZE*3)	)
> +	PLD(	pld	[r1, #(PLDSIZE-4)]	)
> +	PLD(	blt	4f			)
> +	PLD(	pld	[r1, #(PLDSIZE*2-4)]	)
> +	PLD(	pld	[r1, #(PLDSIZE*3-4)]	)
> 
> -3:	PLD(	pld	[r1, #124]		)
> -4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> -		subs	r2, r2, #32
> -		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +3:	PLD(	pld	[r1, #(PLDSIZE*4-4)]	)
> +4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		subs	r2, r2, #PLDSIZE
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
>  		bge	3b
> -	PLD(	cmn	r2, #96			)
> +	PLD(	cmn	r2, #(PLDSIZE*3)	)
>  	PLD(	bge	4b			)
> +#endif
> 
>  5:		ands	ip, r2, #28
>  		rsb	ip, ip, #32
> --
> 1.7.1
> 
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy
  2012-03-27  0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim
@ 2012-03-27  7:40   ` Russell King - ARM Linux
  2012-03-28  0:19     ` Boojin Kim
  0 siblings, 1 reply; 30+ messages in thread
From: Russell King - ARM Linux @ 2012-03-27  7:40 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Mar 27, 2012 at 09:27:52AM +0900, Boojin Kim wrote:
> This patch uses LDRD/STRD that loads and stores data as DWORD unit.
> It brings better performance than LDRM/STRM with cortex-a15.

Why should I bother looking at this rubbish?  You've been told before
that using ldrd and strd unconditionally is not acceptable.  Stop
wasting people's review time.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy
  2012-03-27  7:40   ` Russell King - ARM Linux
@ 2012-03-28  0:19     ` Boojin Kim
  2012-03-28  4:10       ` Boojin Kim
  0 siblings, 1 reply; 30+ messages in thread
From: Boojin Kim @ 2012-03-28  0:19 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King wrote:
> Sent: Tuesday, March 27, 2012 4:41 PM
> To: Boojin Kim
> Cc: linux-arm-kernel at lists.infradead.org; 'Catalin Marinas'; 'Nicolas Pitre';
> kgene.kim at samsung.com
> Subject: Re: [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy
>
> On Tue, Mar 27, 2012 at 09:27:52AM +0900, Boojin Kim wrote:
> > This patch uses LDRD/STRD that loads and stores data as DWORD unit.
> > It brings better performance than LDRM/STRM with cortex-a15.
>
> Why should I bother looking at this rubbish?  You've been told before
> that using ldrd and strd unconditionally is not acceptable.  Stop
> wasting peoples review time.
This patch brings better memcpy results on Cortex-a15.
Please see following result. I measured it on cortex-a15.
The 2nd column is the default memcpy.
The 3rd column is memcpy using ldrd/strd with this patch.
The 4th column is memcpy using ldrd/strd plus the PLD optimization from my 1st patch.
===================================================================
Memcpy performance (unit: size: Bytes, results: MBps)
===================================================================
size		default	ldrd/strd	ldrd/strd + PLD opti
===================================================================
64		1245.615434	1565.004006		1565.004006
128		1743.861607	2393.535539		2491.230867
256		2199.46509	3212.376645		3487.723214
512		2569.901316	4137.976695		4479.644495
1024		2880.715339	4245.923913		5250.336022
2048		3623.608534	4752.128954		5365.728022
4096		4120.516878	5119.593709		5710.891813
8192		4431.366988	5126.312336		5440.45961
16384		4603.712434	5040.322581		5529.016277
32768		4559.381383	4712.002413		5238.893546
65536		3483.446661	3513.802215		3516.965843
131072		3495.623479	3498.460677		3506.31136
262144		3484.02921	3475.987876		3499.783013
524288		3427.662608	3430.037525		3454.637159
1048576	2263.903195	2225.9222		2458.911587
2097152	1732.182125	1703.940362		1833.96223
4194304	1713.663165	1708.351146		1781.780052
===================================================================
I think it brings meaningful results at cache boundaries, so I tried it again.
I did see your review, so I made this patch effective only on Cortex-A15, when the machine selects it.
Thanks for your time and review :)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size
  2012-03-27  2:35   ` Nicolas Pitre
@ 2012-03-28  0:28     ` Boojin Kim
  2012-03-28  5:23       ` Nicolas Pitre
  0 siblings, 1 reply; 30+ messages in thread
From: Boojin Kim @ 2012-03-28  0:28 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas wrote:

> This creates quite convoluted code.  If this is worth doing, we'll have
> to find a cleaner way to do this.
>
> Could you please provide performance measurement numbers with and
> without this patch, and similarly for the next patch?
>
> Did you try enabling the cache alignment code?  What performance
> difference if any did you see?
My patch brings about 10% better results at cache boundaries.
A 64-byte PLD size makes cache usage more efficient on machines that have a 64-byte cache line.
Also, which part of the code is convoluted? Can you explain it in more detail?
Thank you for your review.

>
> > ---
> >  arch/arm/Kconfig             |    7 ++++++
> >  arch/arm/lib/copy_template.S |   44 +++++++++++++++++++++++++++++++++--------
> >  2 files changed, 42 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index 8fec56d..ba306b3 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -1132,6 +1132,13 @@ config ARM_NR_BANKS
> >  	default 16 if ARCH_EP93XX
> >  	default 8
> >
> > +config ARM_PLD_SIZE
> > +	int
> > +	default 64 if ARCH_EXYNOS5
> > +	default 32
> > +	help
> > +	  Configure preload size used on memcpy(). Select 64 for cortex-a15.
> > +
> >  config IWMMXT
> >  	bool "Enable iWMMXt support"
> >  	depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4
> > diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
> > index 805e3f8..7dc5b8c 100644
> > --- a/arch/arm/lib/copy_template.S
> > +++ b/arch/arm/lib/copy_template.S
> > @@ -66,6 +66,7 @@
> >   *	than one 32bit instruction in Thumb-2)
> >   */
> >
> > +#define PLDSIZE	(CONFIG_ARM_PLD_SIZE)
> >
> >  		enter	r4, lr
> >
> > @@ -90,19 +91,44 @@
> >  	CALGN(	add	pc, r4, ip		)
> >
> >  	PLD(	pld	[r1, #0]		)
> > -2:	PLD(	subs	r2, r2, #96		)
> > -	PLD(	pld	[r1, #28]		)
> > +
> > +#if (PLDSIZE == 64)
> > +2:	PLD(	cmp	r2, #32)
> > +	PLD(	blt	.32cpy)
> > +.64cpy:	PLD(	subs	r2, r2, #(PLDSIZE*3+32)	)
> > +	PLD(	pld	[r1, #PLDSIZE-4]	)
> >  	PLD(	blt	4f			)
> > -	PLD(	pld	[r1, #60]		)
> > -	PLD(	pld	[r1, #92]		)
> > +	PLD(	pld	[r1, #PLDSIZE*2-4]	)
> > +	PLD(	pld	[r1, #PLDSIZE*3-4]	)
> > +
> > +3:	PLD(	pld	[r1, #PLDSIZE*4-4]	)
> > +4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +		subs	r2, r2, #PLDSIZE
> > +		bge	3b
> > +	PLD(	cmn	r2, #(PLDSIZE*3)	)
> > +	PLD(	bge	4b			)
> > +	PLD(	cmn	r2, #(PLDSIZE*4-32)	)
> > +	PLD(	blt	5f)
> > +.32cpy:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +#else
> > +2:	PLD(	subs	r2, r2, #(PLDSIZE*3)	)
> > +	PLD(	pld	[r1, #(PLDSIZE-4)]	)
> > +	PLD(	blt	4f			)
> > +	PLD(	pld	[r1, #(PLDSIZE*2-4)]	)
> > +	PLD(	pld	[r1, #(PLDSIZE*3-4)]	)
> >
> > -3:	PLD(	pld	[r1, #124]		)
> > -4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > -		subs	r2, r2, #32
> > -		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +3:	PLD(	pld	[r1, #(PLDSIZE*4-4)]	)
> > +4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> > +		subs	r2, r2, #PLDSIZE
> > +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> >  		bge	3b
> > -	PLD(	cmn	r2, #96			)
> > +	PLD(	cmn	r2, #(PLDSIZE*3)	)
> >  	PLD(	bge	4b			)
> > +#endif
> >
> >  5:		ands	ip, r2, #28
> >  		rsb	ip, ip, #32
> > --
> > 1.7.1
> >
> >
> >
> > _______________________________________________
> > linux-arm-kernel mailing list
> > linux-arm-kernel at lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
> >
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy
  2012-03-28  0:19     ` Boojin Kim
@ 2012-03-28  4:10       ` Boojin Kim
  0 siblings, 0 replies; 30+ messages in thread
From: Boojin Kim @ 2012-03-28  4:10 UTC (permalink / raw)
  To: linux-arm-kernel

Boojin Kim wrote:
> > Cc: linux-arm-kernel at lists.infradead.org; 'Catalin Marinas'; 'Nicolas Pitre';
> > kgene.kim at samsung.com
> > Subject: Re: [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy
> >
> > On Tue, Mar 27, 2012 at 09:27:52AM +0900, Boojin Kim wrote:
> > > This patch uses LDRD/STRD that loads and stores data as DWORD unit.
> > > It brings better performance than LDRM/STRM with cortex-a15.
> >
> > Why should I bother looking at this rubbish?  You've been told before
> > that using ldrd and strd unconditionally is not acceptable.  Stop
> > wasting peoples review time.
> This patch brings better memcpy results on Cortex-a15.
Additionally, the following is ARM's memcpy implementation, which mentions that LDRD/STRD is better for Cortex-A15:
http://sourceware.org/ml/newlib/2011/msg00469.html
I'm trying to optimize memcpy for Cortex-A15 because it seems to be worthwhile.
Thank you.

> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size
  2012-03-28  0:28     ` Boojin Kim
@ 2012-03-28  5:23       ` Nicolas Pitre
  2012-03-29  4:00         ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre
  0 siblings, 1 reply; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-28  5:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 28 Mar 2012, Boojin Kim wrote:

> Nicolas wrote:
> 
> > This creates quite convoluted code.  If this is worth doing, we'll have
> > to find a cleaner way to do this.
> >
> > Could you please provide performance measurement numbers with and
> > without this patch, and similarly for the next patch?
> >
> > Did you try enabling the cache alignment code?  What performance
> > difference if any did you see?
> My patch brings about 10% better result on cache boundary.
> 64bytes PLD size makes the cache efficiency be higher on machines that has 64byte cache line.
> And, Which one is convoluted code? Can you explain it more detail?

Yes, I will.  I now have reworked this code to be extensible and still 
as clean as possible.  I'm not going to post it right away though, given 
that it is late and I prefer to have another look at it after I had some 
sleep.


Nicolas

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 0/4] memcpy optimized with strd/ldrd
  2012-03-28  5:23       ` Nicolas Pitre
@ 2012-03-29  4:00         ` Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre
                             ` (3 more replies)
  0 siblings, 4 replies; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-29  4:00 UTC (permalink / raw)
  To: linux-arm-kernel


Here's my version.  Lightly tested.
I have no A15 hardware to run any performance comparison though.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 1/4] ARM: copy_template.S: move some registers around
  2012-03-29  4:00         ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre
@ 2012-03-29  4:00           ` Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop Nicolas Pitre
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-29  4:00 UTC (permalink / raw)
  To: linux-arm-kernel

From: Nicolas Pitre <nicolas.pitre@linaro.org>

The copy length is held in r2, making it difficult to use a consecutive
set of registers starting on an even register number as required by the
LDRD and STRD instructions.  Let's move the length to lr instead, and
adjust affected code accordingly.

Functionally speaking, this patch is a no-op.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm/lib/copy_template.S |   92 +++++++++++++++++++++---------------------
 1 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 805e3f8fb0..7244dcef0d 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -69,7 +69,7 @@
 
 		enter	r4, lr
 
-		subs	r2, r2, #4
+		subs	lr, r2, #4
 		blt	8f
 		ands	ip, r0, #3
 	PLD(	pld	[r1, #0]		)
@@ -77,34 +77,34 @@
 		ands	ip, r1, #3
 		bne	10f
 
-1:		subs	r2, r2, #(28)
+1:		subs	lr, lr, #(28)
 		stmfd	sp!, {r5 - r8}
 		blt	5f
 
 	CALGN(	ands	ip, r0, #31		)
 	CALGN(	rsb	r3, ip, #32		)
-	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	sbcnes	r4, r3, lr		)  @ C is always set here
 	CALGN(	bcs	2f			)
 	CALGN(	adr	r4, 6f			)
-	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	subs	lr, lr, r3		)  @ C gets set
 	CALGN(	add	pc, r4, ip		)
 
 	PLD(	pld	[r1, #0]		)
-2:	PLD(	subs	r2, r2, #96		)
+2:	PLD(	subs	lr, lr, #96		)
 	PLD(	pld	[r1, #28]		)
 	PLD(	blt	4f			)
 	PLD(	pld	[r1, #60]		)
 	PLD(	pld	[r1, #92]		)
 
 3:	PLD(	pld	[r1, #124]		)
-4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		subs	r2, r2, #32
-		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+4:		ldr8w	r1, r2, r3, r4, r5, r6, r7, r8, ip, abort=20f
+		subs	lr, lr, #32
+		str8w	r0, r2, r3, r4, r5, r6, r7, r8, ip, abort=20f
 		bge	3b
-	PLD(	cmn	r2, #96			)
+	PLD(	cmn	lr, #96			)
 	PLD(	bge	4b			)
 
-5:		ands	ip, r2, #28
+5:		ands	ip, lr, #28
 		rsb	ip, ip, #32
 #if LDR1W_SHIFT > 0
 		lsl	ip, ip, #LDR1W_SHIFT
@@ -115,13 +115,13 @@
 		.rept	(1 << LDR1W_SHIFT)
 		W(nop)
 		.endr
+		ldr1w	r1, r2, abort=20f
 		ldr1w	r1, r3, abort=20f
 		ldr1w	r1, r4, abort=20f
 		ldr1w	r1, r5, abort=20f
 		ldr1w	r1, r6, abort=20f
 		ldr1w	r1, r7, abort=20f
 		ldr1w	r1, r8, abort=20f
-		ldr1w	r1, lr, abort=20f
 
 #if LDR1W_SHIFT < STR1W_SHIFT
 		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
@@ -133,73 +133,75 @@
 		.rept	(1 << STR1W_SHIFT)
 		W(nop)
 		.endr
+		str1w	r0, r2, abort=20f
 		str1w	r0, r3, abort=20f
 		str1w	r0, r4, abort=20f
 		str1w	r0, r5, abort=20f
 		str1w	r0, r6, abort=20f
 		str1w	r0, r7, abort=20f
 		str1w	r0, r8, abort=20f
-		str1w	r0, lr, abort=20f
 
 	CALGN(	bcs	2b			)
 
 7:		ldmfd	sp!, {r5 - r8}
 
-8:		movs	r2, r2, lsl #31
-		ldr1b	r1, r3, ne, abort=21f
+8:		movs	lr, lr, lsl #31
+		ldr1b	r1, r2, ne, abort=21f
+		ldr1b	r1, r3, cs, abort=21f
 		ldr1b	r1, r4, cs, abort=21f
-		ldr1b	r1, ip, cs, abort=21f
-		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r2, ne, abort=21f
+		str1b	r0, r3, cs, abort=21f
 		str1b	r0, r4, cs, abort=21f
-		str1b	r0, ip, cs, abort=21f
 
 		exit	r4, pc
 
 9:		rsb	ip, ip, #4
 		cmp	ip, #2
-		ldr1b	r1, r3, gt, abort=21f
-		ldr1b	r1, r4, ge, abort=21f
-		ldr1b	r1, lr, abort=21f
-		str1b	r0, r3, gt, abort=21f
-		str1b	r0, r4, ge, abort=21f
-		subs	r2, r2, ip
-		str1b	r0, lr, abort=21f
+		ldr1b	r1, r2, gt, abort=21f
+		ldr1b	r1, r3, ge, abort=21f
+		ldr1b	r1, r4, abort=21f
+		str1b	r0, r2, gt, abort=21f
+		str1b	r0, r3, ge, abort=21f
+		subs	lr, lr, ip
+		str1b	r0, r4, abort=21f
 		blt	8b
 		ands	ip, r1, #3
 		beq	1b
 
 10:		bic	r1, r1, #3
 		cmp	ip, #2
-		ldr1w	r1, lr, abort=21f
+		ldr1w	r1, ip, abort=21f
 		beq	17f
 		bgt	18f
 
 
 		.macro	forward_copy_shift pull push
 
-		subs	r2, r2, #28
+		subs	lr, lr, #28
 		blt	14f
 
-	CALGN(	ands	ip, r0, #31		)
-	CALGN(	rsb	ip, ip, #32		)
-	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
-	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	ands	r3, r0, #31		)
+	CALGN(	rsb	r3, r3, #32		)
+	CALGN(	sbcnes	r4, r3, lr		)  @ C is always set here
+	CALGN(	subcc	lr, lr, r3		)
 	CALGN(	bcc	15f			)
 
 11:		stmfd	sp!, {r5 - r9}
 
 	PLD(	pld	[r1, #0]		)
-	PLD(	subs	r2, r2, #96		)
+	PLD(	subs	lr, lr, #96		)
 	PLD(	pld	[r1, #28]		)
 	PLD(	blt	13f			)
 	PLD(	pld	[r1, #60]		)
 	PLD(	pld	[r1, #92]		)
 
 12:	PLD(	pld	[r1, #124]		)
-13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
-		mov	r3, lr, pull #\pull
-		subs	r2, r2, #32
-		ldr4w	r1, r8, r9, ip, lr, abort=19f
+13:		ldr4w	r1, r3, r4, r5, r6, abort=19f
+		mov	r2, ip, pull #\pull
+		subs	lr, lr, #32
+		ldr4w	r1, r7, r8, r9, ip, abort=19f
+		orr	r2, r2, r3, push #\push
+		mov	r3, r3, pull #\pull
 		orr	r3, r3, r4, push #\push
 		mov	r4, r4, pull #\pull
 		orr	r4, r4, r5, push #\push
@@ -213,25 +215,23 @@
 		orr	r8, r8, r9, push #\push
 		mov	r9, r9, pull #\pull
 		orr	r9, r9, ip, push #\push
-		mov	ip, ip, pull #\pull
-		orr	ip, ip, lr, push #\push
-		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		str8w	r0, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f
 		bge	12b
-	PLD(	cmn	r2, #96			)
+	PLD(	cmn	lr, #96			)
 	PLD(	bge	13b			)
 
 		ldmfd	sp!, {r5 - r9}
 
-14:		ands	ip, r2, #28
+14:		ands	r3, lr, #28
 		beq	16f
 
-15:		mov	r3, lr, pull #\pull
-		ldr1w	r1, lr, abort=21f
-		subs	ip, ip, #4
-		orr	r3, r3, lr, push #\push
-		str1w	r0, r3, abort=21f
+15:		mov	r2, ip, pull #\pull
+		ldr1w	r1, ip, abort=21f
+		subs	r3, r3, #4
+		orr	r2, r2, ip, push #\push
+		str1w	r0, r2, abort=21f
 		bgt	15b
-	CALGN(	cmp	r2, #0			)
+	CALGN(	cmp	lr, #0			)
 	CALGN(	bge	11b			)
 
 16:		sub	r1, r1, #(\push / 8)
-- 
1.7.9.rc2

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop
  2012-03-29  4:00         ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre
@ 2012-03-29  4:00           ` Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy Nicolas Pitre
  3 siblings, 0 replies; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-29  4:00 UTC (permalink / raw)
  To: linux-arm-kernel

From: Nicolas Pitre <nicolas.pitre@linaro.org>

Let's rework the unaligned copy loop to enforce a range of contiguous
registers starting from an even register, and to use a single ldr8w
construct instead of two ldr4w's.  There are no users of ldr4w anymore,
so its various definitions are removed.

By using one additional temporary register, it is possible to have the
same register set for the loads and the stores, and to make the loop
friendlier to superscalar CPUs at the same time.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm/lib/copy_from_user.S |   11 +++----
 arch/arm/lib/copy_template.S  |   57 ++++++++++++++++++++---------------------
 arch/arm/lib/copy_to_user.S   |    4 ---
 arch/arm/lib/memcpy.S         |    4 ---
 4 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index 66a477a3e3..d1df0ec62b 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -44,16 +44,15 @@
 	ldrusr	\reg, \ptr, 4, abort=\abort
 	.endm
 
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	ldr1w \ptr, \reg1, \abort
 	ldr1w \ptr, \reg2, \abort
 	ldr1w \ptr, \reg3, \abort
 	ldr1w \ptr, \reg4, \abort
-	.endm
-
-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
-	ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
+	ldr1w \ptr, \reg5, \abort
+	ldr1w \ptr, \reg6, \abort
+	ldr1w \ptr, \reg7, \abort
+	ldr1w \ptr, \reg8, \abort
 	.endm
 
 	.macro ldr1b ptr reg cond=al abort
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 7244dcef0d..84e94cd48c 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -27,10 +27,9 @@
  *	This loads one word from 'ptr', stores it in 'reg' and increments
  *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
  *
- * ldr4w ptr reg1 reg2 reg3 reg4 abort
  * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
  *
- *	This loads four or eight words starting from 'ptr', stores them
+ *	This loads eight words starting from 'ptr', stores them
  *	in provided registers and increments 'ptr' past those words.
  *	The'abort' argument is used for fixup tables.
  *
@@ -63,7 +62,7 @@
  *
  *	Correction to be applied to the "ip" register when branching into
  *	the ldr1w or str1w instructions (some of these macros may expand to
- *	than one 32bit instruction in Thumb-2)
+ *	more than one 32bit instruction in Thumb-2)
  */
 
 
@@ -170,7 +169,7 @@
 
 10:		bic	r1, r1, #3
 		cmp	ip, #2
-		ldr1w	r1, ip, abort=21f
+		ldr1w	r1, r2, abort=21f
 		beq	17f
 		bgt	18f
 
@@ -178,6 +177,7 @@
 		.macro	forward_copy_shift pull push
 
 		subs	lr, lr, #28
+		mov	ip, r2, pull #\pull
 		blt	14f
 
 	CALGN(	ands	r3, r0, #31		)
@@ -186,7 +186,7 @@
 	CALGN(	subcc	lr, lr, r3		)
 	CALGN(	bcc	15f			)
 
-11:		stmfd	sp!, {r5 - r9}
+11:		stmfd	sp!, {r5 - sl}
 
 	PLD(	pld	[r1, #0]		)
 	PLD(	subs	lr, lr, #96		)
@@ -196,40 +196,39 @@
 	PLD(	pld	[r1, #92]		)
 
 12:	PLD(	pld	[r1, #124]		)
-13:		ldr4w	r1, r3, r4, r5, r6, abort=19f
-		mov	r2, ip, pull #\pull
+13:		ldr8w	r1, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f
 		subs	lr, lr, #32
-		ldr4w	r1, r7, r8, r9, ip, abort=19f
-		orr	r2, r2, r3, push #\push
-		mov	r3, r3, pull #\pull
-		orr	r3, r3, r4, push #\push
-		mov	r4, r4, pull #\pull
-		orr	r4, r4, r5, push #\push
-		mov	r5, r5, pull #\pull
-		orr	r5, r5, r6, push #\push
-		mov	r6, r6, pull #\pull
-		orr	r6, r6, r7, push #\push
-		mov	r7, r7, pull #\pull
-		orr	r7, r7, r8, push #\push
-		mov	r8, r8, pull #\pull
-		orr	r8, r8, r9, push #\push
-		mov	r9, r9, pull #\pull
-		orr	r9, r9, ip, push #\push
+		mov	sl, r2, pull #\pull
+		orr	r2, ip, r2, push #\push
+		mov	ip, r3, pull #\pull
+		orr	r3, sl, r3, push #\push
+		mov	sl, r4, pull #\pull
+		orr	r4, ip, r4, push #\push
+		mov	ip, r5, pull #\pull
+		orr	r5, sl, r5, push #\push
+		mov	sl, r6, pull #\pull
+		orr	r6, ip, r6, push #\push
+		mov	ip, r7, pull #\pull
+		orr	r7, sl, r7, push #\push
+		mov	sl, r8, pull #\pull
+		orr	r8, ip, r8, push #\push
+		mov	ip, r9, pull #\pull
+		orr	r9, sl, r9, push #\push
 		str8w	r0, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f
 		bge	12b
 	PLD(	cmn	lr, #96			)
 	PLD(	bge	13b			)
 
-		ldmfd	sp!, {r5 - r9}
+		ldmfd	sp!, {r5 - sl}
 
 14:		ands	r3, lr, #28
 		beq	16f
 
-15:		mov	r2, ip, pull #\pull
-		ldr1w	r1, ip, abort=21f
+15:		ldr1w	r1, r2, abort=21f
 		subs	r3, r3, #4
-		orr	r2, r2, ip, push #\push
-		str1w	r0, r2, abort=21f
+		orr	r4, ip, r2, push #\push
+		mov	ip, r2, pull #\pull
+		str1w	r0, r4, abort=21f
 		bgt	15b
 	CALGN(	cmp	lr, #0			)
 	CALGN(	bge	11b			)
@@ -255,7 +254,7 @@
  */
 
 	.macro	copy_abort_preamble
-19:	ldmfd	sp!, {r5 - r9}
+19:	ldmfd	sp!, {r5 - sl}
 	b	21f
 20:	ldmfd	sp!, {r5 - r8}
 21:
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index d066df686e..a83bc04365 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -44,10 +44,6 @@
 	W(ldr) \reg, [\ptr], #4
 	.endm
 
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
-	.endm
-
 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index a9b9e2287a..adbccc6e2d 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -20,10 +20,6 @@
 	W(ldr) \reg, [\ptr], #4
 	.endm
 
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
-	.endm
-
 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm
-- 
1.7.9.rc2

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors
  2012-03-29  4:00         ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop Nicolas Pitre
@ 2012-03-29  4:00           ` Nicolas Pitre
  2012-03-29  4:00           ` [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy Nicolas Pitre
  3 siblings, 0 replies; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-29  4:00 UTC (permalink / raw)
  To: linux-arm-kernel

From: Nicolas Pitre <nicolas.pitre@linaro.org>

Let's enforce a range of contiguous registers with the remaining ldr8w
and str8w accessors.  An additional register needs to be preserved to
achieve this, although it is not strictly necessary otherwise, but this
will allow for greater flexibility in the accessor implementation.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm/lib/copy_template.S |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 84e94cd48c..f6f42c3330 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -77,7 +77,7 @@
 		bne	10f
 
 1:		subs	lr, lr, #(28)
-		stmfd	sp!, {r5 - r8}
+		stmfd	sp!, {r5 - r9}
 		blt	5f
 
 	CALGN(	ands	ip, r0, #31		)
@@ -96,9 +96,9 @@
 	PLD(	pld	[r1, #92]		)
 
 3:	PLD(	pld	[r1, #124]		)
-4:		ldr8w	r1, r2, r3, r4, r5, r6, r7, r8, ip, abort=20f
+4:		ldr8w	r1, r2, r3, r4, r5, r6, r7, r8, r9, abort=20f
 		subs	lr, lr, #32
-		str8w	r0, r2, r3, r4, r5, r6, r7, r8, ip, abort=20f
+		str8w	r0, r2, r3, r4, r5, r6, r7, r8, r9, abort=20f
 		bge	3b
 	PLD(	cmn	lr, #96			)
 	PLD(	bge	4b			)
@@ -142,7 +142,7 @@
 
 	CALGN(	bcs	2b			)
 
-7:		ldmfd	sp!, {r5 - r8}
+7:		ldmfd	sp!, {r5 - r9}
 
 8:		movs	lr, lr, lsl #31
 		ldr1b	r1, r2, ne, abort=21f
@@ -256,7 +256,7 @@
 	.macro	copy_abort_preamble
 19:	ldmfd	sp!, {r5 - sl}
 	b	21f
-20:	ldmfd	sp!, {r5 - r8}
+20:	ldmfd	sp!, {r5 - r9}
 21:
 	.endm
 
-- 
1.7.9.rc2

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy
  2012-03-29  4:00         ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre
                             ` (2 preceding siblings ...)
  2012-03-29  4:00           ` [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors Nicolas Pitre
@ 2012-03-29  4:00           ` Nicolas Pitre
  3 siblings, 0 replies; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-29  4:00 UTC (permalink / raw)
  To: linux-arm-kernel

From: Nicolas Pitre <nicolas.pitre@linaro.org>

Because STRD requires a 64-bit aligned destination pointer, we
unconditionally enable the cache alignment code.

The same concern applies to LDRD, but there we conditionally execute
either the LDRD sequence or the LDM fallback depending on the source
pointer alignment.

Obviously, this could be optimized further by duplicating each loop, at
the cost of increased code size.  Convincing benchmarks would be in order
before doing so.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm/Kconfig              |    9 +++++++++
 arch/arm/lib/copy_from_user.S |   15 ++++++++++++++-
 arch/arm/lib/copy_template.S  |    3 +++
 arch/arm/lib/copy_to_user.S   |   11 ++++++++++-
 arch/arm/lib/memcpy.S         |   26 ++++++++++++++++++++++++--
 5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 5098564d58..b87069730a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1826,6 +1826,15 @@ config UACCESS_WITH_MEMCPY
 	  However, if the CPU data cache is using a write-allocate mode,
 	  this option is unlikely to provide any performance gain.
 
+config USE_LDRDSTRD_OVER_LDMSTM
+	bool "Use 64-bit access instructions to optimize memory copy"
+	depends on CPU_V7
+	help
+	  Some processors, notably the Cortex-A15, are known to perform
+	  better when accessing memory using LDRD/STRD instructions instead
+	  of LDM/STM.  Select this to optimize memory copy routines
+	  accordingly.
+
 config SECCOMP
 	bool
 	prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index d1df0ec62b..375cbbf0e5 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -40,6 +40,12 @@
 #endif
 #define STR1W_SHIFT	0
 
+#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM
+/* Enforce destination cache line alignment */
+#undef CALGN
+#define CALGN(x...) x
+#endif
+
 	.macro ldr1w ptr reg abort
 	ldrusr	\reg, \ptr, 4, abort=\abort
 	.endm
@@ -64,7 +70,14 @@
 	.endm
 
 	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM
+	strd	\reg1, \reg2, [\ptr], #8
+	strd	\reg3, \reg4, [\ptr], #8
+	strd	\reg5, \reg6, [\ptr], #8
+	strd	\reg7, \reg8, [\ptr], #8
+#else
+	stmia	\ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#endif
 	.endm
 
 	.macro str1b ptr reg cond=al abort
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index f6f42c3330..6a9823d51f 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -63,6 +63,9 @@
  *	Correction to be applied to the "ip" register when branching into
  *	the ldr1w or str1w instructions (some of these macros may expand to
  *	more than one 32bit instruction in Thumb-2)
+ *
+ * Note: ldr8w is the only accessor that is allowed to change the
+ * condition code. 
  */
 
 
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index a83bc04365..11534edea1 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -45,7 +45,16 @@
 	.endm
 
 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM
+	tst	\ptr, #7
+	ldreqd	\reg1, \reg2, [\ptr], #8
+	ldreqd	\reg3, \reg4, [\ptr], #8
+	ldreqd	\reg5, \reg6, [\ptr], #8
+	ldreqd	\reg7, \reg8, [\ptr], #8
+	ldmneia	\ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#else
+	ldmia	\ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#endif
 	.endm
 
 	.macro ldr1b ptr reg cond=al abort
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index adbccc6e2d..db49a300c8 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -16,12 +16,27 @@
 #define LDR1W_SHIFT	0
 #define STR1W_SHIFT	0
 
+#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM
+/* Enforce destination cache line alignment */
+#undef CALGN
+#define CALGN(x...) x
+#endif
+
 	.macro ldr1w ptr reg abort
 	W(ldr) \reg, [\ptr], #4
 	.endm
 
 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM
+	tst	\ptr, #7
+	ldreqd	\reg1, \reg2, [\ptr], #8
+	ldreqd	\reg3, \reg4, [\ptr], #8
+	ldreqd	\reg5, \reg6, [\ptr], #8
+	ldreqd	\reg7, \reg8, [\ptr], #8
+	ldmneia	\ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#else
+	ldmia	\ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#endif
 	.endm
 
 	.macro ldr1b ptr reg cond=al abort
@@ -33,7 +48,14 @@
 	.endm
 
 	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#ifdef CONFIG_USE_LDRDSTRD_OVER_LDMSTM
+	strd	\reg1, \reg2, [\ptr], #8
+	strd	\reg3, \reg4, [\ptr], #8
+	strd	\reg5, \reg6, [\ptr], #8
+	strd	\reg7, \reg8, [\ptr], #8
+#else
+	stmia	\ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+#endif
 	.endm
 
 	.macro str1b ptr reg cond=al abort
-- 
1.7.9.rc2

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 0/4] memcpy optimized with strd/ldrd
  2012-04-03 14:48       ` Nicolas Pitre
@ 2012-04-26  7:35         ` Boojin Kim
  0 siblings, 0 replies; 30+ messages in thread
From: Boojin Kim @ 2012-04-26  7:35 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas Pitre wrote:
> Sent: Tuesday, April 03, 2012 11:49 PM
> To: Boojin Kim
> Cc: linux-arm-kernel at lists.infradead.org
> Subject: RE: [PATCH 0/4] memcpy optimized with strd/ldrd
>
> On Tue, 3 Apr 2012, Boojin Kim wrote:
>
> > Nicolas Pitre wrote:
> >
> > > > >
> > > > > Here's my version.  Lightly tested.
> > > > > I have no A15 hardware to run any performance comparison though.
> > > > >
> > > > I'm reviewing and testing your patch. But, My other work disturbs to reviewing it.
> > > > I will give you feedback soon within this week.
> > > > Wait a little more.
> > > > And, Thanks for your patches. :)
> > >
> > > FYI, it occurred to me that some corner cases might not be quite right
> > > with regards to alignment for the STRD instruction.  It seems that the
> > > hardware on which I tested it (Marvell Dove CPU) apparently copes with
> > > misaligned STRD's when they're still 32-bit aligned.  So I need to run
> > > this code through a real validation harness on different hardware.
> >
> > It's sad, but the performance result wasn't better after adapting your patch.
> > I think something in patches 1~3 causes the performance decrease.
>
> If you could identify which patch is responsible that would be helpful.
Sorry for the late response. I've been very busy these days. Y_Y
I checked your patches, and the 1st patch causes the performance drop.
The transmit time for a 4KB memcpy is 489ns. After applying the 1st patch, the transmit time is 578ns.
Performance also drops by about 10% for memcpy of other small sizes.
I hope this is helpful for you.
Thanks,
>
> Thanks.
>
>
> Nicolas
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 0/4] memcpy optimized with strd/ldrd
  2012-04-03  8:07     ` Boojin Kim
@ 2012-04-03 14:48       ` Nicolas Pitre
  2012-04-26  7:35         ` Boojin Kim
  0 siblings, 1 reply; 30+ messages in thread
From: Nicolas Pitre @ 2012-04-03 14:48 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 3 Apr 2012, Boojin Kim wrote:

> Nicolas Pitre wrote:
> 
> > > >
> > > > Here's my version.  Lightly tested.
> > > > I have no A15 hardware to run any performance comparison though.
> > > >
> > > I'm reviewing and testing your patch. But, My other work disturbs to reviewing it.
> > > I will give you feedback soon within this week.
> > > Wait a little more.
> > > And, Thanks for your patches. :)
> >
> > FYI, it occurred to me that some corner cases might not be quite right
> > with regards to alignment for the STRD instruction.  It seems that the
> > hardware on which I tested it (Marvell Dove CPU) apparently copes with
> > misaligned STRD's when they're still 32-bit aligned.  So I need to run
> > this code through a real validation harness on different hardware.
> 
> It's sad, but the performance result wasn't better after adapting your patch.
> I think something in patches 1~3 causes the performance decrease.

If you could identify which patch is responsible that would be helpful.

Thanks.


Nicolas

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 0/4] memcpy optimized with strd/ldrd
  2012-03-30 13:19   ` Nicolas Pitre
@ 2012-04-03  8:07     ` Boojin Kim
  2012-04-03 14:48       ` Nicolas Pitre
  0 siblings, 1 reply; 30+ messages in thread
From: Boojin Kim @ 2012-04-03  8:07 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas Pitre wrote:

> > >
> > > Here's my version.  Lightly tested.
> > > I have no A15 hardware to run any performance comparison though.
> > >
> > I'm reviewing and testing your patch. But, My other work disturbs to reviewing it.
> > I will give you feedback soon within this week.
> > Wait a little more.
> > And, Thanks for your patches. :)
>
> FYI, it occurred to me that some corner cases might not be quite right
> with regards to alignment for the STRD instruction.  It seems that the
> hardware on which I tested it (Marvell Dove CPU) apparently copes with
> misaligned STRD's when they're still 32-bit aligned.  So I need to run
> this code through a real validation harness on different hardware.

Sadly, the performance result wasn't better after applying your patches.
I think something in patches 1~3 causes the performance decrease.
Thanks :)

>
>
> Nicolas
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 0/4] memcpy optimized with strd/ldrd
  2012-03-30 11:41 ` [PATCH 0/4] memcpy optimized with strd/ldrd Boojin Kim
@ 2012-03-30 13:19   ` Nicolas Pitre
  2012-04-03  8:07     ` Boojin Kim
  0 siblings, 1 reply; 30+ messages in thread
From: Nicolas Pitre @ 2012-03-30 13:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 30 Mar 2012, Boojin Kim wrote:

> Nicolas Pitre wrote:
> >
> >
> > Here's my version.  Lightly tested.
> > I have no A15 hardware to run any performance comparison though.
> >
> I'm reviewing and testing your patch. But, My other work disturbs to reviewing it.
> I will give you feedback soon within this week.
> Wait a little more.
> And, Thanks for your patches. :)

FYI, it occurred to me that some corner cases might not be quite right
with regard to alignment for the STRD instruction.  It seems that the
hardware on which I tested it (Marvell Dove CPU) apparently copes with
misaligned STRD's when they're still 32-bit aligned.  So I need to run
this code through a real validation harness on different hardware.


Nicolas

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 0/4] memcpy optimized with strd/ldrd
       [not found] <03e101cd0e07$eec39f10$cc4add30$@com>
@ 2012-03-30 11:41 ` Boojin Kim
  2012-03-30 13:19   ` Nicolas Pitre
  0 siblings, 1 reply; 30+ messages in thread
From: Boojin Kim @ 2012-03-30 11:41 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas Pitre wrote:
>
>
> Here's my version.  Lightly tested.
> I have no A15 hardware to run any performance comparison though.
>
I'm reviewing and testing your patches, but my other work is delaying the review.
I will give you feedback within this week.
Please wait a little longer.
And thanks for your patches. :)

>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2012-04-26  7:35 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-19  7:02 [PATCH] ARM: lib: use LDRD/STRD for data copy Boojin Kim
2012-03-19  8:55 ` Russell King - ARM Linux
2012-03-19 14:36   ` Rob Herring
2012-03-19 15:41     ` Russell King - ARM Linux
2012-03-19 16:34       ` Måns Rullgård
2012-03-19 16:36       ` Rob Herring
2012-03-19 16:53         ` Nicolas Pitre
2012-03-19 17:11         ` Måns Rullgård
2012-03-19 20:11         ` Michael Hope
2012-03-20  0:21     ` Boojin Kim
2012-03-19 14:10 ` Nicolas Pitre
2012-03-20  0:05   ` Boojin Kim
2012-03-27  0:26 ` [PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size Boojin Kim
2012-03-27  2:35   ` Nicolas Pitre
2012-03-28  0:28     ` Boojin Kim
2012-03-28  5:23       ` Nicolas Pitre
2012-03-29  4:00         ` [PATCH 0/4] memcpy optimized with strd/ldrd Nicolas Pitre
2012-03-29  4:00           ` [PATCH 1/4] ARM: copy_template.S: move some registers around Nicolas Pitre
2012-03-29  4:00           ` [PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop Nicolas Pitre
2012-03-29  4:00           ` [PATCH 3/4] ARM: copy_template.S: enforce contigous register set with memory accessors Nicolas Pitre
2012-03-29  4:00           ` [PATCH 4/4] ARM: option to select LDRD/STRD optimized memory copy Nicolas Pitre
2012-03-27  0:27 ` [PATCH 2/2] ARM: lib: use LDRD/STRD for data copy Boojin Kim
2012-03-27  7:40   ` Russell King - ARM Linux
2012-03-28  0:19     ` Boojin Kim
2012-03-28  4:10       ` Boojin Kim
     [not found] <03e101cd0e07$eec39f10$cc4add30$@com>
2012-03-30 11:41 ` [PATCH 0/4] memcpy optimized with strd/ldrd Boojin Kim
2012-03-30 13:19   ` Nicolas Pitre
2012-04-03  8:07     ` Boojin Kim
2012-04-03 14:48       ` Nicolas Pitre
2012-04-26  7:35         ` Boojin Kim

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.