* [RFC] mixture of cleanups to cache-v7.S
@ 2015-04-02 22:49 Russell King - ARM Linux
  2015-04-02 22:57 ` Russell King - ARM Linux
  2015-04-10 13:27 ` [RFC] mixture of cleanups to cache-v7.S Catalin Marinas
  0 siblings, 2 replies; 28+ messages in thread
From: Russell King - ARM Linux @ 2015-04-02 22:49 UTC (permalink / raw)
  To: linux-arm-kernel

Several cleanups are in the patch below... I'll separate them out, but
I'd like to hear comments on them.  Basically:

1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
   use movw and movt when loading large constants, rather than using
   "ldr rd,=constant"

2. we can do a much more efficient check for the erratum in
   v7_flush_dcache_louis than we were doing - rather than putting the
   work-around code in the fast path, we can re-organise this such that
   we only try to run the workaround code if the LoU field is zero.

3. shift the bitfield we want to extract in the CLIDR to the appropriate
   bit position prior to masking; this reduces the complexity of the
   code, particularly with the SMP differences in v7_flush_dcache_louis.

4. pre-shift the Cortex A9 MIDR value to be checked, and shift the
   actual MIDR to lose the bottom four revision bits.

5. as the v7_flush_dcache_louis code is now more efficient, I see no
   reason not to enable this workaround by default - if people really
   want it disabled, they can still choose that option.  This is in
   addition to Versatile Express enabling it.  Given the memory
   corruption that can result from not having this erratum worked
   around, I think it's only sane to encourage enabling it, even though
   it only affects r0pX CPUs.

One obvious issue comes up here though - in the case that the LoU bits
are validly zero, we merely return from v7_flush_dcache_louis with no
DSB or ISB.  However v7_flush_dcache_all always has a DSB and ISB at the
end, even if LoC is zero.  Is this an intentional difference, or should
v7_flush_dcache_louis always end with a DSB+ISB ?

I haven't tested this patch yet... so there's no sign-off on it.

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2eb6de9465bf..c26dfef393cd 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1139,6 +1139,7 @@ config ARM_ERRATA_742231
 config ARM_ERRATA_643719
 	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
 	depends on CPU_V7 && SMP
+	default y
 	help
 	  This option enables the workaround for the 643719 Cortex-A9 (prior to
 	  r1p0) erratum. On affected cores the LoUIS bit field of the CLIDR
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index b966656d2c2d..1010bebe05eb 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -36,10 +36,10 @@ ENTRY(v7_invalidate_l1)
        mcr     p15, 2, r0, c0, c0, 0
        mrc     p15, 1, r0, c0, c0, 0
 
-       ldr     r1, =0x7fff
+       movw    r1, #0x7fff
        and     r2, r1, r0, lsr #13
 
-       ldr     r1, =0x3ff
+       movw    r1, #0x3ff
 
        and     r3, r1, r0, lsr #3      @ NumWays - 1
        add     r2, r2, #1              @ NumSets
@@ -90,21 +90,20 @@ ENDPROC(v7_flush_icache_all)
 ENTRY(v7_flush_dcache_louis)
 	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
-	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
-	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
+ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
+ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
+	ands	r3, r3, #7 << 1			@ extract LoU field from clidr
+	bne	start_flush_levels		@ LoU != 0, start flushing
 #ifdef CONFIG_ARM_ERRATA_643719
-	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
-	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
-	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
-	biceq	r2, r2, #0x0000000f             @ clear minor revision number
-	teqeq	r2, r1                          @ test for errata affected core and if so...
-	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
+ALT_SMP(mrc	p15, 0, r2, c0, c0, 0)		@ read main ID register
+ALT_UP( ret	lr)				@ LoUU is zero, so nothing to do
+	movw	r1, #:lower16:(0x410fc090 >> 4)	@ ID of ARM Cortex A9 r0p?
+	movt	r1, #:upper16:(0x410fc090 >> 4)
+	teq	r1, r2, lsr #4			@ test for errata affected core and if so...
+	moveq	r3, #1 << 1			@ fix LoUIS value
+	beq	start_flush_levels		@ start flushing cache levels
 #endif
-	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
-	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
-	reteq	lr				@ return if level == 0
-	mov	r10, #0				@ r10 (starting level) = 0
-	b	flush_levels			@ start flushing cache levels
+	ret	lr
 ENDPROC(v7_flush_dcache_louis)
 
 /*
@@ -119,9 +118,10 @@ ENDPROC(v7_flush_dcache_louis)
 ENTRY(v7_flush_dcache_all)
 	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
-	ands	r3, r0, #0x7000000		@ extract loc from clidr
-	mov	r3, r3, lsr #23			@ left align loc bit field
+	mov	r3, r0, lsr #23			@ align LoC
+	ands	r3, r3, #7 << 1			@ extract loc from clidr
 	beq	finished			@ if loc is 0, then no need to clean
+start_flush_levels:
 	mov	r10, #0				@ start clean at cache level 0
 flush_levels:
 	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
@@ -140,10 +140,10 @@ flush_levels:
 #endif
 	and	r2, r1, #7			@ extract the length of the cache lines
 	add	r2, r2, #4			@ add 4 (line length offset)
-	ldr	r4, =0x3ff
+	movw	r4, #0x3ff
 	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
 	clz	r5, r4				@ find bit position of way size increment
-	ldr	r7, =0x7fff
+	movw	r7, #0x7fff
 	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
 loop1:
 	mov	r9, r7				@ create working copy of max index

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

* [RFC] mixture of cleanups to cache-v7.S
  2015-04-02 22:49 [RFC] mixture of cleanups to cache-v7.S Russell King - ARM Linux
@ 2015-04-02 22:57 ` Russell King - ARM Linux
  2015-04-03 10:08   ` Russell King - ARM Linux
  2015-04-10 13:27 ` [RFC] mixture of cleanups to cache-v7.S Catalin Marinas
  1 sibling, 1 reply; 28+ messages in thread
From: Russell King - ARM Linux @ 2015-04-02 22:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> Several cleanups are in the patch below... I'll separate them out, but
> I'd like to hear comments on them.  Basically:
> 
> 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
>    use movw and movt when loading large constants, rather than using
>    "ldr rd,=constant"
> 
> 2. we can do a much more efficient check for the erratum in
>    v7_flush_dcache_louis than we were doing - rather than putting the
>    work-around code in the fast path, we can re-organise this such that
>    we only try to run the workaround code if the LoU field is zero.
> 
> 3. shift the bitfield we want to extract in the CLIDR to the appropriate
>    bit position prior to masking; this reduces the complexity of the
>    code, particularly with the SMP differences in v7_flush_dcache_louis.
> 
> 4. pre-shift the Cortex A9 MIDR value to be checked, and shift the
>    actual MIDR to lose the bottom four revision bits.
> 
> 5. as the v7_flush_dcache_louis code is now more efficient, I see no
>    reason not to enable this workaround by default - if people really
>    want it disabled, they can still choose that option.  This is in
>    addition to Versatile Express enabling it.  Given the memory
>    corruption that can result from not having this erratum worked
>    around, I think it's only sane to encourage enabling it, even though
>    it only affects r0pX CPUs.
> 
> One obvious issue comes up here though - in the case that the LoU bits
> are validly zero, we merely return from v7_flush_dcache_louis with no
> DSB or ISB.  However v7_flush_dcache_all always has a DSB and ISB at the
> end, even if LoC is zero.  Is this an intentional difference, or should
> v7_flush_dcache_louis always end with a DSB+ISB ?

I should point out that if the DSB+ISB is needed, then the code can
instead become as below - basically, we just move the CLIDR into the
appropriate position and call start_flush_levels, which does the DMB,
applies the mask to extract the appropriate field, and then decides
whether it has any levels to process.
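
In other words, the two entry points end up sharing their tail, and the
control flow becomes roughly this (a sketch only, not the exact code):

	@ v7_flush_dcache_louis:
	mrc	p15, 1, r0, c0, c0, 1		@ read CLIDR
	mov	r3, r0, lsr #20/#26		@ LoUIS/LoUU into bits [3:1]
	...					@ erratum 643719 fixup, if any
	b	start_flush_levels

	@ v7_flush_dcache_all:
	mrc	p15, 1, r0, c0, c0, 1		@ read CLIDR
	mov	r3, r0, lsr #23			@ LoC into bits [3:1]
start_flush_levels:
	dmb					@ now common to both callers
	ands	r3, r3, #7 << 1			@ extract the field
	beq	finished			@ exits via the DSB+ISB
	mov	r10, #0				@ start at cache level 0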

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2eb6de9465bf..c26dfef393cd 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1139,6 +1139,7 @@ config ARM_ERRATA_742231
 config ARM_ERRATA_643719
 	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
 	depends on CPU_V7 && SMP
+	default y
 	help
 	  This option enables the workaround for the 643719 Cortex-A9 (prior to
 	  r1p0) erratum. On affected cores the LoUIS bit field of the CLIDR
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index b966656d2c2d..14bfdd584385 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -36,10 +36,10 @@ ENTRY(v7_invalidate_l1)
        mcr     p15, 2, r0, c0, c0, 0
        mrc     p15, 1, r0, c0, c0, 0
 
-       ldr     r1, =0x7fff
+       movw    r1, #0x7fff
        and     r2, r1, r0, lsr #13
 
-       ldr     r1, =0x3ff
+       movw    r1, #0x3ff
 
        and     r3, r1, r0, lsr #3      @ NumWays - 1
        add     r2, r2, #1              @ NumSets
@@ -88,23 +88,20 @@ ENDPROC(v7_flush_icache_all)
  */
 
 ENTRY(v7_flush_dcache_louis)
-	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
-	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
-	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
+ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
+ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
 #ifdef CONFIG_ARM_ERRATA_643719
-	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
-	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
-	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
-	biceq	r2, r2, #0x0000000f             @ clear minor revision number
-	teqeq	r2, r1                          @ test for errata affected core and if so...
-	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
+ALT_SMP(ands	r3, r3, #7 << 1)		@ extract LoU field from clidr
+ALT_UP(	b	start_flush_levels)
+	bne	start_flush_levels		@ LoU != 0, start flushing
+	mrc	p15, 0, r2, c0, c0, 0		@ read main ID register
+	movw	r1, #:lower16:(0x410fc090 >> 4)	@ ID of ARM Cortex A9 r0p?
+	movt	r1, #:upper16:(0x410fc090 >> 4)
+	teq	r1, r2, lsr #4			@ test for errata affected core and if so...
+	moveq	r3, #1 << 1			@ fix LoUIS value
 #endif
-	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
-	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
-	reteq	lr				@ return if level == 0
-	mov	r10, #0				@ r10 (starting level) = 0
-	b	flush_levels			@ start flushing cache levels
+	b	start_flush_levels		@ start flushing cache levels
 ENDPROC(v7_flush_dcache_louis)
 
 /*
@@ -117,10 +114,11 @@ ENDPROC(v7_flush_dcache_louis)
  *	- mm    - mm_struct describing address space
  */
 ENTRY(v7_flush_dcache_all)
-	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
-	ands	r3, r0, #0x7000000		@ extract loc from clidr
-	mov	r3, r3, lsr #23			@ left align loc bit field
+	mov	r3, r0, lsr #23			@ align LoC
+start_flush_levels:
+	dmb					@ ensure ordering with previous memory accesses
+	ands	r3, r3, #7 << 1			@ extract loc from clidr
 	beq	finished			@ if loc is 0, then no need to clean
 	mov	r10, #0				@ start clean at cache level 0
 flush_levels:
@@ -140,10 +138,10 @@ flush_levels:
 #endif
 	and	r2, r1, #7			@ extract the length of the cache lines
 	add	r2, r2, #4			@ add 4 (line length offset)
-	ldr	r4, =0x3ff
+	movw	r4, #0x3ff
 	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
 	clz	r5, r4				@ find bit position of way size increment
-	ldr	r7, =0x7fff
+	movw	r7, #0x7fff
 	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
 loop1:
 	mov	r9, r7				@ create working copy of max index

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

* [RFC] mixture of cleanups to cache-v7.S
  2015-04-02 22:57 ` Russell King - ARM Linux
@ 2015-04-03 10:08   ` Russell King - ARM Linux
  2015-04-03 10:53     ` Russell King - ARM Linux
                       ` (7 more replies)
  0 siblings, 8 replies; 28+ messages in thread
From: Russell King - ARM Linux @ 2015-04-03 10:08 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Apr 02, 2015 at 11:57:59PM +0100, Russell King - ARM Linux wrote:
> On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> > Several cleanups are in the patch below... I'll separate them out, but
> > I'd like to hear comments on them.  Basically:
> > 
> > 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
> >    use movw and movt when loading large constants, rather than using
> >    "ldr rd,=constant"
> > 
> > 2. we can do a much more efficient check for the erratum in
> >    v7_flush_dcache_louis than we were doing - rather than putting the
> >    work-around code in the fast path, we can re-organise this such that
> >    we only try to run the workaround code if the LoU field is zero.
> > 
> > 3. shift the bitfield we want to extract in the CLIDR to the appropriate
> >    bit position prior to masking; this reduces the complexity of the
> >    code, particularly with the SMP differences in v7_flush_dcache_louis.
> > 
> > 4. pre-shift the Cortex A9 MIDR value to be checked, and shift the
> >    actual MIDR to lose the bottom four revision bits.
> > 
> > 5. as the v7_flush_dcache_louis code is now more efficient, I see no
> >    reason not to enable this workaround by default - if people really
> >    want it disabled, they can still choose that option.  This is in
> >    addition to Versatile Express enabling it.  Given the memory
> >    corruption that can result from not having this erratum worked
> >    around, I think it's only sane to encourage enabling it, even
> >    though it only affects r0pX CPUs.
> > 
> > One obvious issue comes up here though - in the case that the LoU bits
> > are validly zero, we merely return from v7_flush_dcache_louis with no
> > DSB or ISB.  However v7_flush_dcache_all always has a DSB and ISB at the
> > end, even if LoC is zero.  Is this an intentional difference, or should
> > v7_flush_dcache_louis always end with a DSB+ISB ?
> 
> I should point out that if the DSB+ISB is needed, then the code can
> instead become as below - basically, we just move the CLIDR into the
> appropriate position and call start_flush_levels, which does the DMB,
> applies the mask to extract the appropriate field, and then decides
> whether it has any levels to process.

I've now tested this patch on the Versatile Express and SDP4430, and
both seem to work fine with the patch below.

> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 2eb6de9465bf..c26dfef393cd 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1139,6 +1139,7 @@ config ARM_ERRATA_742231
>  config ARM_ERRATA_643719
>  	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
>  	depends on CPU_V7 && SMP
> +	default y
>  	help
>  	  This option enables the workaround for the 643719 Cortex-A9 (prior to
>  	  r1p0) erratum. On affected cores the LoUIS bit field of the CLIDR
> diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> index b966656d2c2d..14bfdd584385 100644
> --- a/arch/arm/mm/cache-v7.S
> +++ b/arch/arm/mm/cache-v7.S
> @@ -36,10 +36,10 @@ ENTRY(v7_invalidate_l1)
>         mcr     p15, 2, r0, c0, c0, 0
>         mrc     p15, 1, r0, c0, c0, 0
>  
> -       ldr     r1, =0x7fff
> +       movw    r1, #0x7fff
>         and     r2, r1, r0, lsr #13
>  
> -       ldr     r1, =0x3ff
> +       movw    r1, #0x3ff
>  
>         and     r3, r1, r0, lsr #3      @ NumWays - 1
>         add     r2, r2, #1              @ NumSets
> @@ -88,23 +88,20 @@ ENDPROC(v7_flush_icache_all)
>   */
>  
>  ENTRY(v7_flush_dcache_louis)
> -	dmb					@ ensure ordering with previous memory accesses
>  	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
> -	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
> -	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
> +ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
> +ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
>  #ifdef CONFIG_ARM_ERRATA_643719
> -	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
> -	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
> -	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
> -	biceq	r2, r2, #0x0000000f             @ clear minor revision number
> -	teqeq	r2, r1                          @ test for errata affected core and if so...
> -	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
> +ALT_SMP(ands	r3, r3, #7 << 1)		@ extract LoU field from clidr
> +ALT_UP(	b	start_flush_levels)
> +	bne	start_flush_levels		@ LoU != 0, start flushing
> +	mrc	p15, 0, r2, c0, c0, 0		@ read main ID register
> +	movw	r1, #:lower16:(0x410fc090 >> 4)	@ ID of ARM Cortex A9 r0p?
> +	movt	r1, #:upper16:(0x410fc090 >> 4)
> +	teq	r1, r2, lsr #4			@ test for errata affected core and if so...
> +	moveq	r3, #1 << 1			@ fix LoUIS value
>  #endif
> -	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
> -	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
> -	reteq	lr				@ return if level == 0
> -	mov	r10, #0				@ r10 (starting level) = 0
> -	b	flush_levels			@ start flushing cache levels
> +	b	start_flush_levels		@ start flushing cache levels
>  ENDPROC(v7_flush_dcache_louis)
>  
>  /*
> @@ -117,10 +114,11 @@ ENDPROC(v7_flush_dcache_louis)
>   *	- mm    - mm_struct describing address space
>   */
>  ENTRY(v7_flush_dcache_all)
> -	dmb					@ ensure ordering with previous memory accesses
>  	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
> -	ands	r3, r0, #0x7000000		@ extract loc from clidr
> -	mov	r3, r3, lsr #23			@ left align loc bit field
> +	mov	r3, r0, lsr #23			@ align LoC
> +start_flush_levels:
> +	dmb					@ ensure ordering with previous memory accesses
> +	ands	r3, r3, #7 << 1			@ extract loc from clidr
>  	beq	finished			@ if loc is 0, then no need to clean
>  	mov	r10, #0				@ start clean at cache level 0
>  flush_levels:
> @@ -140,10 +138,10 @@ flush_levels:
>  #endif
>  	and	r2, r1, #7			@ extract the length of the cache lines
>  	add	r2, r2, #4			@ add 4 (line length offset)
> -	ldr	r4, =0x3ff
> +	movw	r4, #0x3ff
>  	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
>  	clz	r5, r4				@ find bit position of way size increment
> -	ldr	r7, =0x7fff
> +	movw	r7, #0x7fff
>  	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
>  loop1:
>  	mov	r9, r7				@ create working copy of max index
> 
> -- 
> FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
> according to speedtest.net.
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

* [RFC] mixture of cleanups to cache-v7.S
  2015-04-03 10:08   ` Russell King - ARM Linux
@ 2015-04-03 10:53     ` Russell King - ARM Linux
  2015-04-03 10:54     ` [PATCH 1/7] ARM: cache-v7: use movw/movt instructions Russell King
                       ` (6 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: Russell King - ARM Linux @ 2015-04-03 10:53 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 03, 2015 at 11:08:48AM +0100, Russell King - ARM Linux wrote:
> On Thu, Apr 02, 2015 at 11:57:59PM +0100, Russell King - ARM Linux wrote:
> > On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> > > Several cleanups are in the patch below... I'll separate them out, but
> > > I'd like to hear comments on them.  Basically:
> > > 
> > > 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
> > >    use movw and movt when loading large constants, rather than using
> > >    "ldr rd,=constant"
> > > 
> > > 2. we can do a much more efficient check for the erratum in
> > >    v7_flush_dcache_louis than we were doing - rather than putting the
> > >    work-around code in the fast path, we can re-organise this such that
> > >    we only try to run the workaround code if the LoU field is zero.
> > > 
> > > 3. shift the bitfield we want to extract in the CLIDR to the appropriate
> > >    bit position prior to masking; this reduces the complexity of the
> > >    code, particularly with the SMP differences in v7_flush_dcache_louis.
> > > 
> > > 4. pre-shift the Cortex A9 MIDR value to be checked, and shift the
> > >    actual MIDR to lose the bottom four revision bits.
> > > 
> > > 5. as the v7_flush_dcache_louis code is now more efficient, I see no
> > >    reason not to enable this workaround by default - if people really
> > >    want it disabled, they can still choose that option.  This is in
> > >    addition to Versatile Express enabling it.  Given the memory
> > >    corruption that can result from not having this erratum worked
> > >    around, I think it's only sane to encourage enabling it, even
> > >    though it only affects r0pX CPUs.
> > > 
> > > One obvious issue comes up here though - in the case that the LoU bits
> > > are validly zero, we merely return from v7_flush_dcache_louis with no
> > > DSB or ISB.  However v7_flush_dcache_all always has a DSB and ISB at the
> > > end, even if LoC is zero.  Is this an intentional difference, or should
> > > v7_flush_dcache_louis always end with a DSB+ISB ?
> > 
> > I should point out that if the DSB+ISB is needed, then the code can
> > instead become as below - basically, we just move the CLIDR into the
> > appropriate position and call start_flush_levels, which does the DMB,
> > applies the mask to extract the appropriate field, and then decides
> > whether it has any levels to process.
> 
> I've now tested this patch on the Versatile Express and SDP4430, and
> both seem to work fine with the patch below.

Here are the patches broken out.  My intention is to put the first
five, which should be entirely non-controversial, into my for-next
branch.  For the last two, I'll wait until I hear back from you after
the Easter break.

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

* [PATCH 1/7] ARM: cache-v7: use movw/movt instructions
  2015-04-03 10:08   ` Russell King - ARM Linux
  2015-04-03 10:53     ` Russell King - ARM Linux
@ 2015-04-03 10:54     ` Russell King
  2015-04-09 17:10       ` Catalin Marinas
  2015-04-03 10:54     ` [PATCH 2/7] ARM: cache-v7: shift CLIDR to extract appropriate field before masking Russell King
                       ` (5 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Russell King @ 2015-04-03 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

We always build cache-v7.S for ARMv7, so we can use the ARMv7 movw and
movt instructions, each of which encodes a 16-bit immediate, to build
large constants rather than loading them from a literal pool.
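
For illustration, here is the change for one of the constants (a
sketch, not lifted from the patch):

	@ before: the assembler places 0x410fc090 in a nearby literal
	@ pool and emits a PC-relative load - a data access just to
	@ fetch a constant:
	ldr	r1, =0x410fc090

	@ after: two move-immediate instructions, no memory access:
	movw	r1, #:lower16:0x410fc090	@ r1 = 0x0000c090
	movt	r1, #:upper16:0x410fc090	@ r1 = 0x410fc090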

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v7.S | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index b966656d2c2d..30c81e7d6aaa 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -36,10 +36,10 @@ ENTRY(v7_invalidate_l1)
        mcr     p15, 2, r0, c0, c0, 0
        mrc     p15, 1, r0, c0, c0, 0
 
-       ldr     r1, =0x7fff
+       movw    r1, #0x7fff
        and     r2, r1, r0, lsr #13
 
-       ldr     r1, =0x3ff
+       movw    r1, #0x3ff
 
        and     r3, r1, r0, lsr #3      @ NumWays - 1
        add     r2, r2, #1              @ NumSets
@@ -95,7 +95,8 @@ ENTRY(v7_flush_dcache_louis)
 #ifdef CONFIG_ARM_ERRATA_643719
 	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
 	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
-	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
+	movweq	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
+	movteq	r1, #:upper16:0x410fc090
 	biceq	r2, r2, #0x0000000f             @ clear minor revision number
 	teqeq	r2, r1                          @ test for errata affected core and if so...
 	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
@@ -140,10 +141,10 @@ flush_levels:
 #endif
 	and	r2, r1, #7			@ extract the length of the cache lines
 	add	r2, r2, #4			@ add 4 (line length offset)
-	ldr	r4, =0x3ff
+	movw	r4, #0x3ff
 	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
 	clz	r5, r4				@ find bit position of way size increment
-	ldr	r7, =0x7fff
+	movw	r7, #0x7fff
 	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
 loop1:
 	mov	r9, r7				@ create working copy of max index
-- 
1.8.3.1

* [PATCH 2/7] ARM: cache-v7: shift CLIDR to extract appropriate field before masking
  2015-04-03 10:08   ` Russell King - ARM Linux
  2015-04-03 10:53     ` Russell King - ARM Linux
  2015-04-03 10:54     ` [PATCH 1/7] ARM: cache-v7: use movw/movt instructions Russell King
@ 2015-04-03 10:54     ` Russell King
  2015-04-09 17:09       ` Catalin Marinas
  2015-04-03 10:54     ` [PATCH 3/7] ARM: cache-v7: consolidate initialisation of cache level index Russell King
                       ` (4 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Russell King @ 2015-04-03 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

Rather than have code which masks and then shifts, such as:

	mrc     p15, 1, r0, c0, c0, 1
ALT_SMP(ands	r3, r0, #7 << 21)
ALT_UP( ands	r3, r0, #7 << 27)
ALT_SMP(mov	r3, r3, lsr #20)
ALT_UP(	mov	r3, r3, lsr #26)

re-arrange this as a shift and then mask.  The masking is the same for
each field which we want to extract, so this allows the mask to be
shared amongst code paths:

	mrc     p15, 1, r0, c0, c0, 1
ALT_SMP(mov	r3, r0, lsr #20)
ALT_UP(	mov	r3, r0, lsr #26)
	ands	r3, r3, #7 << 1

Use this method for the LoUIS, LoUU and LoC fields.
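
Spelling out the bit positions for the SMP case (a worked example, not
part of the patch): LoUIS lives in CLIDR[23:21], so shifting right by
20 rather than 21 leaves the field in bits [3:1]:

	mrc	p15, 1, r0, c0, c0, 1	@ r0 = CLIDR
	mov	r3, r0, lsr #20		@ r3[3:1] = LoUIS
	ands	r3, r3, #7 << 1		@ r3 = LoUIS * 2

which is the form the flush loop wants anyway, since it works with the
cache level already multiplied by two.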

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v7.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 30c81e7d6aaa..d062f8cbb886 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -90,8 +90,9 @@ ENDPROC(v7_flush_icache_all)
 ENTRY(v7_flush_dcache_louis)
 	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
-	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
-	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
+ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
+ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
+	ands	r3, r3, #7 << 1 		@ extract LoU*2 field from clidr
 #ifdef CONFIG_ARM_ERRATA_643719
 	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
 	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
@@ -99,10 +100,8 @@ ENTRY(v7_flush_dcache_louis)
 	movteq	r1, #:upper16:0x410fc090
 	biceq	r2, r2, #0x0000000f             @ clear minor revision number
 	teqeq	r2, r1                          @ test for errata affected core and if so...
-	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
+	moveqs	r3, #1 << 1			@   fix LoUIS value (and set flags state to 'ne')
 #endif
-	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
-	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
 	reteq	lr				@ return if level == 0
 	mov	r10, #0				@ r10 (starting level) = 0
 	b	flush_levels			@ start flushing cache levels
@@ -120,8 +119,8 @@ ENDPROC(v7_flush_dcache_louis)
 ENTRY(v7_flush_dcache_all)
 	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
-	ands	r3, r0, #0x7000000		@ extract loc from clidr
-	mov	r3, r3, lsr #23			@ left align loc bit field
+	mov	r3, r0, lsr #23			@ move LoC into position
+	ands	r3, r3, #7 << 1			@ extract LoC*2 from clidr
 	beq	finished			@ if loc is 0, then no need to clean
 	mov	r10, #0				@ start clean at cache level 0
 flush_levels:
-- 
1.8.3.1

* [PATCH 3/7] ARM: cache-v7: consolidate initialisation of cache level index
  2015-04-03 10:08   ` Russell King - ARM Linux
                       ` (2 preceding siblings ...)
  2015-04-03 10:54     ` [PATCH 2/7] ARM: cache-v7: shift CLIDR to extract appropriate field before masking Russell King
@ 2015-04-03 10:54     ` Russell King
  2015-04-09 17:11       ` Catalin Marinas
  2015-04-03 10:54     ` [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis Russell King
                       ` (3 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Russell King @ 2015-04-03 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

v7_flush_cache_louis and v7_flush_dcache_all both begin the
flush_levels loop with r10 initialised to zero.  In each case, this
is done immediately prior to entering the loop.  Branch to this
instruction in v7_flush_dcache_all from v7_flush_cache_louis and
eliminate the unnecessary initialisation in v7_flush_cache_louis.
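
I.e. (a sketch of the result, not the patch itself):

v7_flush_dcache_louis:
	...
	b	start_flush_levels	@ was: mov r10, #0; b flush_levels

v7_flush_dcache_all:
	...
	beq	finished
start_flush_levels:			@ new shared label
	mov	r10, #0			@ the one remaining initialisation
flush_levels:
	...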

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v7.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index d062f8cbb886..5b5d0c00bca7 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -103,8 +103,7 @@ ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
 	moveqs	r3, #1 << 1			@   fix LoUIS value (and set flags state to 'ne')
 #endif
 	reteq	lr				@ return if level == 0
-	mov	r10, #0				@ r10 (starting level) = 0
-	b	flush_levels			@ start flushing cache levels
+	b	start_flush_levels		@ start flushing cache levels
 ENDPROC(v7_flush_dcache_louis)
 
 /*
@@ -122,6 +121,7 @@ ENTRY(v7_flush_dcache_all)
 	mov	r3, r0, lsr #23			@ move LoC into position
 	ands	r3, r3, #7 << 1			@ extract LoC*2 from clidr
 	beq	finished			@ if loc is 0, then no need to clean
+start_flush_levels:
 	mov	r10, #0				@ start clean at cache level 0
 flush_levels:
 	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
-- 
1.8.3.1

* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-03 10:08   ` Russell King - ARM Linux
                       ` (3 preceding siblings ...)
  2015-04-03 10:54     ` [PATCH 3/7] ARM: cache-v7: consolidate initialisation of cache level index Russell King
@ 2015-04-03 10:54     ` Russell King
  2015-04-09  8:13       ` Arnd Bergmann
  2015-04-09 17:15       ` Catalin Marinas
  2015-04-03 10:54     ` [PATCH 5/7] ARM: cache-v7: optimise test for Cortex A9 r0pX devices Russell King
                       ` (2 subsequent siblings)
  7 siblings, 2 replies; 28+ messages in thread
From: Russell King @ 2015-04-03 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

Optimise the branches so that, for the majority of unaffected devices,
we avoid the erratum work-around code path entirely by branching to
start_flush_levels early.
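
I.e. the common case now looks like this (a sketch):

	ands	r3, r3, #7 << 1		@ extract LoU*2 field from clidr
	bne	start_flush_levels	@ taken whenever LoU != 0
	@ only the LoU == 0 case reaches the MIDR read and the
	@ Cortex-A9 r0pX test; unaffected cores then simply ret lr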

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v7.S | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 5b5d0c00bca7..793d061b4dce 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -93,17 +93,18 @@ ENTRY(v7_flush_dcache_louis)
 ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
 ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
 	ands	r3, r3, #7 << 1 		@ extract LoU*2 field from clidr
+	bne	start_flush_levels		@ LoU != 0, start flushing
 #ifdef CONFIG_ARM_ERRATA_643719
-	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
-	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
-	movweq	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
-	movteq	r1, #:upper16:0x410fc090
-	biceq	r2, r2, #0x0000000f             @ clear minor revision number
-	teqeq	r2, r1                          @ test for errata affected core and if so...
-	moveqs	r3, #1 << 1			@   fix LoUIS value (and set flags state to 'ne')
+ALT_SMP(mrc	p15, 0, r2, c0, c0, 0)		@ read main ID register
+ALT_UP(	ret	lr)				@ LoUU is zero, so nothing to do
+	movw	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
+	movt	r1, #:upper16:0x410fc090
+	bic	r2, r2, #0x0000000f             @ clear minor revision number
+	teq	r2, r1                          @ test for errata affected core and if so...
+	moveq	r3, #1 << 1			@   fix LoUIS value
+	beq	start_flush_levels		@   start flushing cache levels
 #endif
-	reteq	lr				@ return if level == 0
-	b	start_flush_levels		@ start flushing cache levels
+	ret	lr
 ENDPROC(v7_flush_dcache_louis)
 
 /*
-- 
1.8.3.1

* [PATCH 5/7] ARM: cache-v7: optimise test for Cortex A9 r0pX devices
  2015-04-03 10:08   ` Russell King - ARM Linux
                       ` (4 preceding siblings ...)
  2015-04-03 10:54     ` [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis Russell King
@ 2015-04-03 10:54     ` Russell King
  2015-04-09 17:20       ` Catalin Marinas
  2015-04-03 10:54     ` [PATCH 6/7] ARM: enable ARM errata 643719 workaround by default Russell King
  2015-04-03 10:54     ` [PATCH 7/7] ARM: cache-v7: further cleanups (and fix?) Russell King
  7 siblings, 1 reply; 28+ messages in thread
From: Russell King @ 2015-04-03 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

Eliminate one unnecessary instruction from this test by pre-shifting
the Cortex A9 ID - we can shift the actual ID in the teq instruction,
thereby discarding the pX revision bits of the ID at no cost.
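
A worked example (not part of the patch): an affected r0p1 part has
MIDR 0x410fc091, so

	r1         = 0x410fc090 >> 4 = 0x0410fc09	@ via movw/movt
	r2 lsr #4  = 0x410fc091 >> 4 = 0x0410fc09

and the single teq matches every r0pX revision without the separate
bic to clear the revision field.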

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v7.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 793d061b4dce..a134d8a13d00 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -97,10 +97,9 @@ ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
 #ifdef CONFIG_ARM_ERRATA_643719
 ALT_SMP(mrc	p15, 0, r2, c0, c0, 0)		@ read main ID register
 ALT_UP(	ret	lr)				@ LoUU is zero, so nothing to do
-	movw	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
-	movt	r1, #:upper16:0x410fc090
-	bic	r2, r2, #0x0000000f             @ clear minor revision number
-	teq	r2, r1                          @ test for errata affected core and if so...
+	movw	r1, #:lower16:(0x410fc090 >> 4)	@ ID of ARM Cortex A9 r0p?
+	movt	r1, #:upper16:(0x410fc090 >> 4)
+	teq	r1, r2, lsr #4			@ test for errata affected core and if so...
 	moveq	r3, #1 << 1			@   fix LoUIS value
 	beq	start_flush_levels		@   start flushing cache levels
 #endif
-- 
1.8.3.1

* [PATCH 6/7] ARM: enable ARM errata 643719 workaround by default
  2015-04-03 10:08   ` Russell King - ARM Linux
                       ` (5 preceding siblings ...)
  2015-04-03 10:54     ` [PATCH 5/7] ARM: cache-v7: optimise test for Cortex A9 r0pX devices Russell King
@ 2015-04-03 10:54     ` Russell King
  2015-04-09 17:21       ` Catalin Marinas
  2015-04-03 10:54     ` [PATCH 7/7] ARM: cache-v7: further cleanups (and fix?) Russell King
  7 siblings, 1 reply; 28+ messages in thread
From: Russell King @ 2015-04-03 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

The effects of not having the ARM erratum 643719 workaround enabled on
affected CPUs can be very confusing and hard to debug.  Rather than
leave this to chance, enable the workaround by default.  Now that we
have rearranged the code, it should have a low impact on the majority
of CPUs.
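
Note that "default y" is still only a default; anyone who really wants
the workaround off can override it, e.g. in a config fragment
(illustration only):

# CONFIG_ARM_ERRATA_643719 is not set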

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2eb6de9465bf..c26dfef393cd 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1139,6 +1139,7 @@ config ARM_ERRATA_742231
 config ARM_ERRATA_643719
 	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
 	depends on CPU_V7 && SMP
+	default y
 	help
 	  This option enables the workaround for the 643719 Cortex-A9 (prior to
 	  r1p0) erratum. On affected cores the LoUIS bit field of the CLIDR
-- 
1.8.3.1

* [PATCH 7/7] ARM: cache-v7: further cleanups (and fix?)
  2015-04-03 10:08   ` Russell King - ARM Linux
                       ` (6 preceding siblings ...)
  2015-04-03 10:54     ` [PATCH 6/7] ARM: enable ARM errata 643719 workaround by default Russell King
@ 2015-04-03 10:54     ` Russell King
  7 siblings, 0 replies; 28+ messages in thread
From: Russell King @ 2015-04-03 10:54 UTC (permalink / raw)
  To: linux-arm-kernel

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v7.S | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index a134d8a13d00..51d6a336bac2 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -88,22 +88,20 @@ ENDPROC(v7_flush_icache_all)
  */
 
 ENTRY(v7_flush_dcache_louis)
-	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
 ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
 ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
-	ands	r3, r3, #7 << 1 		@ extract LoU*2 field from clidr
-	bne	start_flush_levels		@ LoU != 0, start flushing
 #ifdef CONFIG_ARM_ERRATA_643719
-ALT_SMP(mrc	p15, 0, r2, c0, c0, 0)		@ read main ID register
-ALT_UP(	ret	lr)				@ LoUU is zero, so nothing to do
+ALT_SMP(ands	r3, r3, #7 << 1)		@ extract LoU*2 field from clidr
+ALT_UP(	b	start_flush_levels)
+	bne	start_flush_levels		@ LoU != 0, start flushing
+	mrc	p15, 0, r2, c0, c0, 0		@ read main ID register
 	movw	r1, #:lower16:(0x410fc090 >> 4)	@ ID of ARM Cortex A9 r0p?
 	movt	r1, #:upper16:(0x410fc090 >> 4)
 	teq	r1, r2, lsr #4			@ test for errata affected core and if so...
-	moveq	r3, #1 << 1			@   fix LoUIS value
-	beq	start_flush_levels		@   start flushing cache levels
+	moveq	r3, #1 << 1			@ fix LoUIS value
 #endif
-	ret	lr
+	b	start_flush_levels		@ start flushing cache levels
 ENDPROC(v7_flush_dcache_louis)
 
 /*
@@ -116,12 +114,12 @@ ENDPROC(v7_flush_dcache_louis)
  *	- mm    - mm_struct describing address space
  */
 ENTRY(v7_flush_dcache_all)
-	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
 	mov	r3, r0, lsr #23			@ move LoC into position
-	ands	r3, r3, #7 << 1			@ extract LoC*2 from clidr
-	beq	finished			@ if loc is 0, then no need to clean
 start_flush_levels:
+	dmb					@ ensure ordering with previous memory accesses
+	ands	r3, r3, #7 << 1			@ extract field from clidr
+	beq	finished			@ if loc is 0, then no need to clean
 	mov	r10, #0				@ start clean at cache level 0
 flush_levels:
 	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
-- 
1.8.3.1

* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-03 10:54     ` [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis Russell King
@ 2015-04-09  8:13       ` Arnd Bergmann
  2015-04-09  8:21         ` Russell King - ARM Linux
  2015-04-09 17:15       ` Catalin Marinas
  1 sibling, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2015-04-09  8:13 UTC (permalink / raw)
  To: linux-arm-kernel

On Friday 03 April 2015 11:54:32 Russell King wrote:
> diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> index 5b5d0c00bca7..793d061b4dce 100644
> --- a/arch/arm/mm/cache-v7.S
> +++ b/arch/arm/mm/cache-v7.S
> @@ -93,17 +93,18 @@ ENTRY(v7_flush_dcache_louis)
>  ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
>  ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
>  	ands	r3, r3, #7 << 1 		@ extract LoU*2 field from clidr
> +	bne	start_flush_levels		@ LoU != 0, start flushing
>  #ifdef CONFIG_ARM_ERRATA_643719
> -	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
> -	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
> -	movweq	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
> -	movteq	r1, #:upper16:0x410fc090
> -	biceq	r2, r2, #0x0000000f             @ clear minor revision number
> -	teqeq	r2, r1                          @ test for errata affected core and if so...
> -	moveqs	r3, #1 << 1			@   fix LoUIS value (and set flags state to 'ne')
> +ALT_SMP(mrc	p15, 0, r2, c0, c0, 0)		@ read main ID register
> +ALT_UP(	ret	lr)				@ LoUU is zero, so nothing to do
> +	movw	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?

With this in linux-next, I get a build failure on randconfig kernels with
THUMB2_KERNEL enabled:

arch/arm/mm/cache-v7.S: Assembler messages:
arch/arm/mm/cache-v7.S:99: Error: ALT_UP() content must assemble to exactly 4 bytes

Any idea for a method that will work with all combinations of SMP/UP
and ARM/THUMB? The best I could come up with was to add an extra 'mov r0,r0',
but that gets rather ugly as you then have to do it only for THUMB2.
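
(For reference, the 2-byte alternative here is presumably the
"ALT_UP( ret lr)" added above: under Thumb-2 the assembler picks the
16-bit encoding of "bx lr", while the SMP/UP fixup code assumes each
alternative is exactly one 4-byte word:

	@ ARM:     bx lr  ->  0xe12fff1e	- 4 bytes
	@ Thumb-2: bx lr  ->      0x4770	- 2 bytes

encodings shown for illustration.)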

	Arnd

* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-09  8:13       ` Arnd Bergmann
@ 2015-04-09  8:21         ` Russell King - ARM Linux
  2015-04-09 10:29           ` Arnd Bergmann
  2015-04-09 17:17           ` Catalin Marinas
  0 siblings, 2 replies; 28+ messages in thread
From: Russell King - ARM Linux @ 2015-04-09  8:21 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Apr 09, 2015 at 10:13:06AM +0200, Arnd Bergmann wrote:
> On Friday 03 April 2015 11:54:32 Russell King wrote:
> > diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> > index 5b5d0c00bca7..793d061b4dce 100644
> > --- a/arch/arm/mm/cache-v7.S
> > +++ b/arch/arm/mm/cache-v7.S
> > @@ -93,17 +93,18 @@ ENTRY(v7_flush_dcache_louis)
> >  ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
> >  ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
> >  	ands	r3, r3, #7 << 1 		@ extract LoU*2 field from clidr
> > +	bne	start_flush_levels		@ LoU != 0, start flushing
> >  #ifdef CONFIG_ARM_ERRATA_643719
> > -	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
> > -	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
> > -	movweq	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
> > -	movteq	r1, #:upper16:0x410fc090
> > -	biceq	r2, r2, #0x0000000f             @ clear minor revision number
> > -	teqeq	r2, r1                          @ test for errata affected core and if so...
> > -	moveqs	r3, #1 << 1			@   fix LoUIS value (and set flags state to 'ne')
> > +ALT_SMP(mrc	p15, 0, r2, c0, c0, 0)		@ read main ID register
> > +ALT_UP(	ret	lr)				@ LoUU is zero, so nothing to do
> > +	movw	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
> 
> With this in linux-next, I get a build failure on randconfig kernels with
> THUMB2_KERNEL enabled:
> 
> arch/arm/mm/cache-v7.S: Assembler messages:
> arch/arm/mm/cache-v7.S:99: Error: ALT_UP() content must assemble to exactly 4 bytes
> 
> Any idea for a method that will work with all combinations of SMP/UP
> and ARM/THUMB? The best I could come up with was to add an extra 'mov r0,r0',
> but that gets rather ugly as you then have to do it only for THUMB2.

How about we make ALT_UP() add the additional padding?  Something like
this maybe?

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index f67fd3afebdf..79f421796aab 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -237,6 +237,9 @@
 	.pushsection ".alt.smp.init", "a"			;\
 	.long	9998b						;\
 9997:	instr							;\
+	.if . - 9997b == 2					;\
+		nop						;\
+	.endif
 	.if . - 9997b != 4					;\
 		.error "ALT_UP() content must assemble to exactly 4 bytes";\
 	.endif							;\


-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-09  8:21         ` Russell King - ARM Linux
@ 2015-04-09 10:29           ` Arnd Bergmann
  2015-04-09 13:46             ` Russell King - ARM Linux
  2015-04-09 17:17           ` Catalin Marinas
  1 sibling, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2015-04-09 10:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Thursday 09 April 2015 09:21:16 Russell King - ARM Linux wrote:
> On Thu, Apr 09, 2015 at 10:13:06AM +0200, Arnd Bergmann wrote:
> > 
> > With this in linux-next, I get a build failure on randconfig kernels with
> > THUMB2_KERNEL enabled:
> > 
> > arch/arm/mm/cache-v7.S: Assembler messages:
> > arch/arm/mm/cache-v7.S:99: Error: ALT_UP() content must assemble to exactly 4 bytes
> > 
> > Any idea for a method that will work with all combinations of SMP/UP
> > and ARM/THUMB? The best I could come up with was to add an extra 'mov r0,r0',
> > but that gets rather ugly as you then have to do it only for THUMB2.
> 
> How about we make ALT_UP() add the additional padding?  Something like
> this maybe?
> 
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index f67fd3afebdf..79f421796aab 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -237,6 +237,9 @@
>  	.pushsection ".alt.smp.init", "a"			;\
>  	.long	9998b						;\
>  9997:	instr							;\
> +	.if . - 9997b == 2					;\
> +		nop						;\
> +	.endif
>  	.if . - 9997b != 4					;\
>  		.error "ALT_UP() content must assemble to exactly 4 bytes";\
>  	.endif							;\
> 

This looks like a good solution, and works fine after adding the
missing ';\' characters behind the .endif.
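
I.e. with the new lines reading:

	.if . - 9997b == 2					;\
		nop						;\
	.endif							;\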

I don't expect any problems but I'm doing some more randconfig builds
now with this patch, and if you don't hear back today, feel free to add

Acked-by: Arnd Bergmann <arnd@arndb.de>

Thanks!

	Arnd

* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-09 10:29           ` Arnd Bergmann
@ 2015-04-09 13:46             ` Russell King - ARM Linux
  2015-04-09 17:26               ` Catalin Marinas
  0 siblings, 1 reply; 28+ messages in thread
From: Russell King - ARM Linux @ 2015-04-09 13:46 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Apr 09, 2015 at 12:29:12PM +0200, Arnd Bergmann wrote:
> On Thursday 09 April 2015 09:21:16 Russell King - ARM Linux wrote:
> > On Thu, Apr 09, 2015 at 10:13:06AM +0200, Arnd Bergmann wrote:
> > > 
> > > With this in linux-next, I get a build failure on randconfig kernels with
> > > THUMB2_KERNEL enabled:
> > > 
> > > arch/arm/mm/cache-v7.S: Assembler messages:
> > > arch/arm/mm/cache-v7.S:99: Error: ALT_UP() content must assemble to exactly 4 bytes
> > > 
> > > Any idea for a method that will work with all combinations of SMP/UP
> > > and ARM/THUMB? The best I could come up with was to add an extra 'mov r0,r0',
> > > but that gets rather ugly as you then have to do it only for THUMB2.
> > 
> > How about we make ALT_UP() add the additional padding?  Something like
> > this maybe?
> > 
> > diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> > index f67fd3afebdf..79f421796aab 100644
> > --- a/arch/arm/include/asm/assembler.h
> > +++ b/arch/arm/include/asm/assembler.h
> > @@ -237,6 +237,9 @@
> >  	.pushsection ".alt.smp.init", "a"			;\
> >  	.long	9998b						;\
> >  9997:	instr							;\
> > +	.if . - 9997b == 2					;\
> > +		nop						;\
> > +	.endif
> >  	.if . - 9997b != 4					;\
> >  		.error "ALT_UP() content must assemble to exactly 4 bytes";\
> >  	.endif							;\
> > 
> 
> This looks like a good solution, and works fine after adding the
> missing ';\' characters behind the .endif.
> 
> I don't expect any problems but I'm doing some more randconfig builds
> now with this patch, and if you don't hear back today, feel free to add
> 
> Acked-by: Arnd Bergmann <arnd@arndb.de>

Thanks.  I'm intending to merge this too - we could go a bit
further, but I don't want to at the moment...

8<====
From: Russell King <rmk+kernel@arm.linux.org.uk>
Subject: [PATCH] ARM: remove uses of ALT_UP(W(...))

Since ALT_UP() now pads a 16-bit Thumb instruction out to 4 bytes, we
no longer need to code this manually.  Remove the instances of
ALT_UP(W()).

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/assembler.h |  4 ----
 arch/arm/lib/bitops.h            |  4 ++--
 arch/arm/mm/cache-v7.S           | 10 +++++-----
 arch/arm/mm/tlb-v7.S             |  2 +-
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 186270b3e194..ed823dcc8296 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -283,11 +283,7 @@
 #else
 #error Incompatible SMP platform
 #endif
-	.ifeqs "\mode","arm"
 	ALT_UP(nop)
-	.else
-	ALT_UP(W(nop))
-	.endif
 #endif
 	.endm
 
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index 7d807cfd8ef5..0f2055fe21af 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -14,7 +14,7 @@ UNWIND(	.fnstart	)
 #if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
 	.arch_extension	mp
 	ALT_SMP(W(pldw)	[r1])
-	ALT_UP(W(nop))
+	ALT_UP(nop)
 #endif
 	mov	r3, r2, lsl r3
 1:	ldrex	r2, [r1]
@@ -41,7 +41,7 @@ UNWIND(	.fnstart	)
 #if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
 	.arch_extension	mp
 	ALT_SMP(W(pldw)	[r1])
-	ALT_UP(W(nop))
+	ALT_UP(nop)
 #endif
 1:	ldrex	r2, [r1]
 	ands	r0, r2, r3		@ save old value of bit
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 51d6a336bac2..21f084bdf60a 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -273,7 +273,7 @@ ENTRY(v7_coherent_user_range)
 	bic	r12, r0, r3
 #ifdef CONFIG_ARM_ERRATA_764369
 	ALT_SMP(W(dsb))
-	ALT_UP(W(nop))
+ALT_UP(	nop)
 #endif
 1:
  USER(	mcr	p15, 0, r12, c7, c11, 1	)	@ clean D line to the point of unification
@@ -326,7 +326,7 @@ ENTRY(v7_flush_kern_dcache_area)
 	bic	r0, r0, r3
 #ifdef CONFIG_ARM_ERRATA_764369
 	ALT_SMP(W(dsb))
-	ALT_UP(W(nop))
+ALT_UP(	nop)
 #endif
 1:
 	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D line / unified line
@@ -354,7 +354,7 @@ v7_dma_inv_range:
 	bic	r0, r0, r3
 #ifdef CONFIG_ARM_ERRATA_764369
 	ALT_SMP(W(dsb))
-	ALT_UP(W(nop))
+ALT_UP(	nop)
 #endif
 	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
 
@@ -381,7 +381,7 @@ v7_dma_clean_range:
 	bic	r0, r0, r3
 #ifdef CONFIG_ARM_ERRATA_764369
 	ALT_SMP(W(dsb))
-	ALT_UP(W(nop))
+ALT_UP(	nop)
 #endif
 1:
 	mcr	p15, 0, r0, c7, c10, 1		@ clean D / U line
@@ -403,7 +403,7 @@ ENTRY(v7_dma_flush_range)
 	bic	r0, r0, r3
 #ifdef CONFIG_ARM_ERRATA_764369
 	ALT_SMP(W(dsb))
-	ALT_UP(W(nop))
+ALT_UP(	nop)
 #endif
 1:
 	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
diff --git a/arch/arm/mm/tlb-v7.S b/arch/arm/mm/tlb-v7.S
index e5101a3bc57c..58261e0415c2 100644
--- a/arch/arm/mm/tlb-v7.S
+++ b/arch/arm/mm/tlb-v7.S
@@ -41,7 +41,7 @@ ENTRY(v7wbi_flush_user_tlb_range)
 	asid	r3, r3				@ mask ASID
 #ifdef CONFIG_ARM_ERRATA_720789
 	ALT_SMP(W(mov)	r3, #0	)
-	ALT_UP(W(nop)		)
+ALT_UP(	nop)
 #endif
 	orr	r0, r3, r0, lsl #PAGE_SHIFT	@ Create initial MVA
 	mov	r1, r1, lsl #PAGE_SHIFT
-- 
1.8.3.1


-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

* [PATCH 2/7] ARM: cache-v7: shift CLIDR to extract appropriate field before masking
  2015-04-03 10:54     ` [PATCH 2/7] ARM: cache-v7: shift CLIDR to extract appropriate field before masking Russell King
@ 2015-04-09 17:09       ` Catalin Marinas
  0 siblings, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:09 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 03, 2015 at 11:54:22AM +0100, Russell King wrote:
> Rather than have code which masks and then shifts, such as:
> 
> 	mrc     p15, 1, r0, c0, c0, 1
> ALT_SMP(ands	r3, r0, #7 << 21)
> ALT_UP( ands	r3, r0, #7 << 27)
> ALT_SMP(mov	r3, r3, lsr #20)
> ALT_UP(	mov	r3, r3, lsr #26)
> 
> re-arrange this as a shift and then mask.  The masking is the same for
> each field which we want to extract, so this allows the mask to be
> shared amongst code paths:
> 
> 	mrc     p15, 1, r0, c0, c0, 1
> ALT_SMP(mov	r3, r0, lsr #20)
> ALT_UP(	mov	r3, r0, lsr #26)
> 	ands	r3, r3, #7 << 1

It looks fine.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

* [PATCH 1/7] ARM: cache-v7: use movw/movt instructions
  2015-04-03 10:54     ` [PATCH 1/7] ARM: cache-v7: use movw/movt instructions Russell King
@ 2015-04-09 17:10       ` Catalin Marinas
  0 siblings, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:10 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 03, 2015 at 11:54:17AM +0100, Russell King wrote:
> We always build cache-v7.S for ARMv7, so we can use the ARMv7 16-bit
> move instructions to load large constants, rather than using constants
> in a literal pool.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

* [PATCH 3/7] ARM: cache-v7: consolidate initialisation of cache level index
  2015-04-03 10:54     ` [PATCH 3/7] ARM: cache-v7: consolidate initialisation of cache level index Russell King
@ 2015-04-09 17:11       ` Catalin Marinas
  0 siblings, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:11 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 03, 2015 at 11:54:27AM +0100, Russell King wrote:
> v7_flush_cache_louis and v7_flush_dcache_all both begin the
> flush_levels loop with r10 initialised to zero.  In each case, this
> is done immediately prior to entering the loop.  Branch to this
> instruction in v7_flush_dcache_all from v7_flush_cache_louis and
> eliminate the unnecessary initialisation in v7_flush_cache_louis.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-03 10:54     ` [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis Russell King
  2015-04-09  8:13       ` Arnd Bergmann
@ 2015-04-09 17:15       ` Catalin Marinas
  1 sibling, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:15 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 03, 2015 at 11:54:32AM +0100, Russell King wrote:
> Optimise the branches so that, for the majority of unaffected devices,
> we avoid the erratum work-around code path entirely by branching to
> start_flush_levels early.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-09  8:21         ` Russell King - ARM Linux
  2015-04-09 10:29           ` Arnd Bergmann
@ 2015-04-09 17:17           ` Catalin Marinas
  1 sibling, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:17 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Apr 09, 2015 at 09:21:16AM +0100, Russell King - ARM Linux wrote:
> On Thu, Apr 09, 2015 at 10:13:06AM +0200, Arnd Bergmann wrote:
> > On Friday 03 April 2015 11:54:32 Russell King wrote:
> > > diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> > > index 5b5d0c00bca7..793d061b4dce 100644
> > > --- a/arch/arm/mm/cache-v7.S
> > > +++ b/arch/arm/mm/cache-v7.S
> > > @@ -93,17 +93,18 @@ ENTRY(v7_flush_dcache_louis)
> > >  ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
> > >  ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
> > >  	ands	r3, r3, #7 << 1 		@ extract LoU*2 field from clidr
> > > +	bne	start_flush_levels		@ LoU != 0, start flushing
> > >  #ifdef CONFIG_ARM_ERRATA_643719
> > > -	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
> > > -	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
> > > -	movweq	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
> > > -	movteq	r1, #:upper16:0x410fc090
> > > -	biceq	r2, r2, #0x0000000f             @ clear minor revision number
> > > -	teqeq	r2, r1                          @ test for errata affected core and if so...
> > > -	moveqs	r3, #1 << 1			@   fix LoUIS value (and set flags state to 'ne')
> > > +ALT_SMP(mrc	p15, 0, r2, c0, c0, 0)		@ read main ID register
> > > +ALT_UP(	ret	lr)				@ LoUU is zero, so nothing to do
> > > +	movw	r1, #:lower16:0x410fc090	@ ID of ARM Cortex A9 r0p?
> > 
> > With this in linux-next, I get a build failure on randconfig kernels with
> > THUMB2_KERNEL enabled:
> > 
> > arch/arm/mm/cache-v7.S: Assembler messages:
> > arch/arm/mm/cache-v7.S:99: Error: ALT_UP() content must assemble to exactly 4 bytes
> > 
> > Any idea for a method that will work with all combinations of SMP/UP
> > and ARM/THUMB? The best I could come up with was to add an extra 'mov r0,r0',
> > but that gets rather ugly as you then have to do it only for THUMB2.
> 
> How about we make ALT_UP() add the additional padding?  Something like
> this maybe?
> 
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index f67fd3afebdf..79f421796aab 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -237,6 +237,9 @@
>  	.pushsection ".alt.smp.init", "a"			;\
>  	.long	9998b						;\
>  9997:	instr							;\
> +	.if . - 9997b == 2					;\
> +		nop						;\
> +	.endif							;\
>  	.if . - 9997b != 4					;\
>  		.error "ALT_UP() content must assemble to exactly 4 bytes";\
>  	.endif							;\

I wonder whether, as a general rule, it's better to use the 4-byte wide
instruction where possible instead of the additional nop. Anyway, this
can be left to the ALT_* macros' user.
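
For instance, Thumb-2 allows the encoding width to be selected
explicitly (a minimal sketch, assuming GNU as unified syntax):

	nop		@ 16-bit encoding, 2 bytes
	nop.w		@ 32-bit encoding, 4 bytes - what W(nop) expands to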

-- 
Catalin


* [PATCH 5/7] ARM: cache-v7: optimise test for Cortex A9 r0pX devices
  2015-04-03 10:54     ` [PATCH 5/7] ARM: cache-v7: optimise test for Cortex A9 r0pX devices Russell King
@ 2015-04-09 17:20       ` Catalin Marinas
  0 siblings, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:20 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 03, 2015 at 11:54:37AM +0100, Russell King wrote:
> Eliminate one unnecessary instruction from this test by pre-shifting
> the Cortex A9 ID - we can shift the actual ID in the teq instruction,
> thereby losing the pX revision bits of the ID at no cost.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
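
(Worked example, assuming the MIDR revision field sits in bits [3:0]:
0x410fc090 >> 4 = 0x0410fc09, and an affected part such as r0p2 gives
0x410fc092 >> 4 = 0x0410fc09 as well, so the teq matches any r0pX core.)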

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>


* [PATCH 6/7] ARM: enable ARM errata 643719 workaround by default
  2015-04-03 10:54     ` [PATCH 6/7] ARM: enable ARM errata 643719 workaround by default Russell King
@ 2015-04-09 17:21       ` Catalin Marinas
  0 siblings, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:21 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 03, 2015 at 11:54:42AM +0100, Russell King wrote:
> The effects of not having the ARM errata 643719 workaround enabled on
> affected CPUs can be very confusing and hard to debug.  Rather than
> leave this to chance, enable this workaround by default.  Now that we
> have rearranged the code, it should have a low impact on the majority
> of CPUs.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Acked-by: Catalin Marinas <catalin.marinas@arm.com>


* [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis
  2015-04-09 13:46             ` Russell King - ARM Linux
@ 2015-04-09 17:26               ` Catalin Marinas
  0 siblings, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-09 17:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Apr 09, 2015 at 02:46:56PM +0100, Russell King - ARM Linux wrote:
> diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
> index 7d807cfd8ef5..0f2055fe21af 100644
> --- a/arch/arm/lib/bitops.h
> +++ b/arch/arm/lib/bitops.h
> @@ -14,7 +14,7 @@ UNWIND(	.fnstart	)
>  #if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
>  	.arch_extension	mp
>  	ALT_SMP(W(pldw)	[r1])
> -	ALT_UP(W(nop))
> +	ALT_UP(nop)

So for cases like this, we end up with two NOPs on Thumb-2 UP. Depending
on the microarchitecture implementation, we may have two instructions in
the pipeline to execute instead of one.
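
(Illustration of the resulting Thumb-2 UP case, assuming the padding
change to ALT_UP() discussed earlier in the thread:

	nop		@ 16-bit nop from ALT_UP(nop)
	nop		@ 16-bit padding nop appended by the macro
)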

-- 
Catalin


* [RFC] mixture of cleanups to cache-v7.S
  2015-04-02 22:49 [RFC] mixture of cleanups to cache-v7.S Russell King - ARM Linux
  2015-04-02 22:57 ` Russell King - ARM Linux
@ 2015-04-10 13:27 ` Catalin Marinas
  2015-04-10 14:11   ` Lorenzo Pieralisi
  1 sibling, 1 reply; 28+ messages in thread
From: Catalin Marinas @ 2015-04-10 13:27 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> Several cleanups are in the patch below... I'll separate them out, but
> I'd like to hear comments on them.  Basically:
> 
> 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
>    use movw and movt when loading large constants, rather than using
>    "ldr rd,=constant"
> 
> 2. we can do a much more efficient check for the errata in
>    v7_flush_dcache_louis than we were doing - rather than putting the
>    work-around code in the fast path, we can re-organise this such that
>    we only try to run the workaround code if the LoU field is zero.
> 
> 3. shift the bitfield we want to extract in the CLIDR to the appropriate
>    bit position prior to masking; this reduces the complexity of the
>    code, particularly with the SMP differences in v7_flush_dcache_louis.
> 
> 4. pre-shift the Cortex A9 MIDR value to be checked, and shift the
>    actual MIDR to lose the bottom four revision bits.
> 
> 5. as the v7_flush_dcache_louis code is more optimal, I see no reason not
>    to enable this workaround by default now - if people really want it to
>    be disabled, they can still choose that option.  This is in addition
>    to Versatile Express enabling it.  Given the memory corrupting abilities
>    of not having this errata enabled, I think it's only sane that it's
>    something that should be encouraged to be enabled, even though it only
>    affects r0pX CPUs.
> 
> One obvious issue comes up here though - in the case that the LoU bits
> are validly zero, we merely return from v7_flush_dcache_louis with no
> DSB or ISB.  However v7_flush_dcache_all always has a DSB or ISB at the
> end, even if LoC is zero.  Is this an intentional difference, or should
> v7_flush_dcache_louis always end with a DSB+ISB ?

Cc'ing Lorenzo since he added this code.

The DSB+ISB at the end is usually meant to wait for the completion of
the cache maintenance operation. If there is no cache maintenance
required, you don't need the barriers (nor the DMB at the beginning of
the function), unless the semantics of this function always imply a
barrier. I assume not, since we have a DSB anyway before issuing the WFI
to put the CPU into a sleep state, but I'll wait for Lorenzo to confirm.
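
(Sketch of the usual completion idiom, for illustration only:

	mcr	p15, 0, r11, c7, c14, 2	@ DCCISW: clean+invalidate by set/way
	...
	dsb				@ wait for the maintenance to complete
	isb				@ then resynchronise the pipeline
)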

As for the v7_flush_dcache_all always having the barriers, I guess
no-one is expecting a v7 CPU without caches.

-- 
Catalin


* [RFC] mixture of cleanups to cache-v7.S
  2015-04-10 13:27 ` [RFC] mixture of cleanups to cache-v7.S Catalin Marinas
@ 2015-04-10 14:11   ` Lorenzo Pieralisi
  2015-04-10 14:26     ` Russell King - ARM Linux
  0 siblings, 1 reply; 28+ messages in thread
From: Lorenzo Pieralisi @ 2015-04-10 14:11 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 10, 2015 at 02:27:42PM +0100, Catalin Marinas wrote:
> On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> > Several cleanups are in the patch below... I'll separate them out, but
> > I'd like to hear comments on them.  Basically:
> > 
> > 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
> >    use movw and movt when loading large constants, rather than using
> >    "ldr rd,=constant"
> > 
> > 2. we can do a much more efficient check for the errata in
> >    v7_flush_dcache_louis than we were doing - rather than putting the
> >    work-around code in the fast path, we can re-organise this such that
> >    we only try to run the workaround code if the LoU field is zero.
> > 
> > 3. shift the bitfield we want to extract in the CLIDR to the appropriate
> >    bit position prior to masking; this reduces the complexity of the
> >    code, particularly with the SMP differences in v7_flush_dcache_louis.
> > 
> > 4. pre-shift the Cortex A9 MIDR value to be checked, and shift the
> >    actual MIDR to lose the bottom four revision bits.
> > 
> > 5. as the v7_flush_dcache_louis code is more optimal, I see no reason not
> >    to enable this workaround by default now - if people really want it to
> >    be disabled, they can still choose that option.  This is in addition
> >    to Versatile Express enabling it.  Given the memory corrupting abilities
> >    of not having this errata enabled, I think it's only sane that it's
> >    something that should be encouraged to be enabled, even though it only
> >    affects r0pX CPUs.
> > 
> > One obvious issue comes up here though - in the case that the LoU bits
> > are validly zero, we merely return from v7_flush_dcache_louis with no
> > DSB or ISB.  However v7_flush_dcache_all always has a DSB or ISB at the
> > end, even if LoC is zero.  Is this an intentional difference, or should
> > v7_flush_dcache_louis always end with a DSB+ISB ?
> 
> Cc'ing Lorenzo since he added this code.
> 
> The DSB+ISB at the end is usually meant to wait for the completion of
> the cache maintenance operation. If there is no cache maintenance
> required, you don't need the barriers (nor the DMB at the beginning of
> the function), unless the semantics of this function always imply
> barrier. I assume not since we have a DSB anyway before issuing the WFI
> to put the CPU into a sleep state but I'll wait for Lorenzo to confirm.

I do not think the cache functions always imply barrier semantics;
as you said, the barriers are there to ensure completion of the cache
operations, and if there is nothing to flush the barriers should not
be there.

> As for the v7_flush_dcache_all always having the barriers, I guess
> no-one is expecting a v7 CPU without caches.

Well, yes, actually I'd rather remove the barriers when LoC is 0 than
add them to the LoUIS API in case there is nothing to flush; I do not
think that's a problem in the first place.

Lorenzo


* [RFC] mixture of cleanups to cache-v7.S
  2015-04-10 14:11   ` Lorenzo Pieralisi
@ 2015-04-10 14:26     ` Russell King - ARM Linux
  2015-04-10 15:16       ` Catalin Marinas
  2015-04-10 15:37       ` Lorenzo Pieralisi
  0 siblings, 2 replies; 28+ messages in thread
From: Russell King - ARM Linux @ 2015-04-10 14:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 10, 2015 at 03:11:05PM +0100, Lorenzo Pieralisi wrote:
> On Fri, Apr 10, 2015 at 02:27:42PM +0100, Catalin Marinas wrote:
> > As for the v7_flush_dcache_all always having the barriers, I guess
> > no-one is expecting a v7 CPU without caches.
> 
> Well, yes, actually I'd rather remove the barriers if LoC is 0 instead
> of adding them to LoUIS API in case there is nothing to flush, I do not
> think that's a problem in the first place.

Maybe you should read through the patch set before declaring there to
be no problem.

There may be no problem as the code stands right now, and we can bury
our heads in the sand and continue as though there's no problem
whatsoever, and we'd be right to, but...

We need to have 643719 enabled for all platforms.  Right now, the code
in cache-v7 surrounding that workaround is pretty crap - enabling the
option places the workaround into the execution path whether it applies
or not.

What this patch series does is rearrange stuff to avoid the workaround
where it's possible to do so, while cleaning up a lot of the crap coding
that was done here in the first place.

In doing this, I spotted that if we _solve_ the above issue by making
both functions behave in the same way, we get more common code paths.

So, while the issue doesn't _cause_ a problem, fixing the disparity
between the two functions is worth it just to be able to have a
flush_levels implementation which behaves the same no matter how it's
called - one which always has a dmb before it, and (possibly) always
the dsb+isb afterwards - or only has the dsb+isb afterwards if it's
done some cache flushing.

Look at the resulting code after applying all the patches, and I'm sure
you'd agree that it's a lot better than the crap which is currently
there.  To save you having to look up the files and apply the patches,
here are the old and new versions:

============= old ====================
ENTRY(v7_flush_dcache_louis)
        dmb                                     @ ensure ordering with previous memory accesses
        mrc     p15, 1, r0, c0, c0, 1           @ read clidr, r0 = clidr
        ALT_SMP(ands    r3, r0, #(7 << 21))     @ extract LoUIS from clidr
        ALT_UP(ands     r3, r0, #(7 << 27))     @ extract LoUU from clidr
#ifdef CONFIG_ARM_ERRATA_643719
        ALT_SMP(mrceq   p15, 0, r2, c0, c0, 0)  @ read main ID register
        ALT_UP(reteq    lr)                     @ LoUU is zero, so nothing to do
        ldreq   r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
        biceq   r2, r2, #0x0000000f             @ clear minor revision number
        teqeq   r2, r1                          @ test for errata affected core and if so...
        orreqs  r3, #(1 << 21)                  @   fix LoUIS value (and set flags state to 'ne')
#endif
        ALT_SMP(mov     r3, r3, lsr #20)        @ r3 = LoUIS * 2
        ALT_UP(mov      r3, r3, lsr #26)        @ r3 = LoUU * 2
        reteq   lr                              @ return if level == 0
        mov     r10, #0                         @ r10 (starting level) = 0
        b       flush_levels                    @ start flushing cache levels

ENTRY(v7_flush_dcache_all)
        dmb                                     @ ensure ordering with previous memory accesses
        mrc     p15, 1, r0, c0, c0, 1           @ read clidr
        ands    r3, r0, #0x7000000              @ extract loc from clidr
        mov     r3, r3, lsr #23                 @ left align loc bit field
        beq     finished                        @ if loc is 0, then no need to clean
        mov     r10, #0                         @ start clean at cache level 0
flush_levels:
        add     r2, r10, r10, lsr #1            @ work out 3x current cache level
        mov     r1, r0, lsr r2                  @ extract cache type bits from clidr
...
============= new ====================
ENTRY(v7_flush_dcache_louis)
        mrc     p15, 1, r0, c0, c0, 1           @ read clidr, r0 = clidr
ALT_SMP(mov     r3, r0, lsr #20)                @ move LoUIS into position
ALT_UP( mov     r3, r0, lsr #26)                @ move LoUU into position
#ifdef CONFIG_ARM_ERRATA_643719
ALT_SMP(ands    r3, r3, #7 << 1)                @ extract LoU*2 field from clidr
ALT_UP( b       start_flush_levels)
        bne     start_flush_levels              @ LoU != 0, start flushing
        mrc     p15, 0, r2, c0, c0, 0           @ read main ID register
        movw    r1, #:lower16:(0x410fc090 >> 4) @ ID of ARM Cortex A9 r0p?
        movt    r1, #:upper16:(0x410fc090 >> 4)
        teq     r1, r2, lsr #4                  @ test for errata affected core and if so...
        moveq   r3, #1 << 1                     @ fix LoUIS value (and set flags state to 'ne')
#endif
        b       start_flush_levels              @ start flushing cache levels

ENTRY(v7_flush_dcache_all)
        mrc     p15, 1, r0, c0, c0, 1           @ read clidr
        mov     r3, r0, lsr #23                 @ move LoC into position
start_flush_levels:
        dmb                                     @ ensure ordering with previous memory accesses
        ands    r3, r3, #7 << 1                 @ extract field from clidr
        beq     finished                        @ if loc is 0, then no need to clean
        mov     r10, #0                         @ start clean at cache level 0
flush_levels:
        add     r2, r10, r10, lsr #1            @ work out 3x current cache level
        mov     r1, r0, lsr r2                  @ extract cache type bits from clidr
...

Which do you think is better, the old version or the new version?

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.


* [RFC] mixture of cleanups to cache-v7.S
  2015-04-10 14:26     ` Russell King - ARM Linux
@ 2015-04-10 15:16       ` Catalin Marinas
  2015-04-10 15:37       ` Lorenzo Pieralisi
  1 sibling, 0 replies; 28+ messages in thread
From: Catalin Marinas @ 2015-04-10 15:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 10, 2015 at 03:26:55PM +0100, Russell King - ARM Linux wrote:
> On Fri, Apr 10, 2015 at 03:11:05PM +0100, Lorenzo Pieralisi wrote:
> > On Fri, Apr 10, 2015 at 02:27:42PM +0100, Catalin Marinas wrote:
> > > As for the v7_flush_dcache_all always having the barriers, I guess
> > > no-one is expecting a v7 CPU without caches.
> > 
> > Well, yes, actually I'd rather remove the barriers when LoC is 0 than
> > add them to the LoUIS API in case there is nothing to flush; I do not
> > think that's a problem in the first place.
> 
> Maybe you should read through the patch set before declaring there to
> be no problem.

I don't think anyone was disputing the code clean-up, only whether the
cache flushing functions need to have barrier semantics even when there
isn't any level to flush. Looking through the code, flush_cache_louis()
is used in cpu_die(), where in some circumstances a DSB would be needed
(like the CPU being killed from another CPU rather than by a WFI
execution). To avoid duplicating barriers in cpu_die(), and since in
practice LoUIS is at least 1 (unless you have unified caches at level 1),
we could always require them in v7_flush_dcache_louis.
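
(Hypothetical caller sketch, assuming v7_flush_dcache_louis gained
barrier semantics - the caller's own dsb before wfi would then be
redundant:

	bl	v7_flush_dcache_louis	@ flush and wait for completion
	wfi				@ enter low-power state
)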

> ENTRY(v7_flush_dcache_louis)
>         mrc     p15, 1, r0, c0, c0, 1           @ read clidr, r0 = clidr
> ALT_SMP(mov     r3, r0, lsr #20)                @ move LoUIS into position
> ALT_UP( mov     r3, r0, lsr #26)                @ move LoUU into position
> #ifdef CONFIG_ARM_ERRATA_643719
> ALT_SMP(ands    r3, r3, #7 << 1)                @ extract LoU*2 field from clidr
> ALT_UP( b       start_flush_levels)
>         bne     start_flush_levels              @ LoU != 0, start flushing
>         mrc     p15, 0, r2, c0, c0, 0           @ read main ID register
>         movw    r1, #:lower16:(0x410fc090 >> 4) @ ID of ARM Cortex A9 r0p?
>         movt    r1, #:upper16:(0x410fc090 >> 4)
>         teq     r1, r2, lsr #4                  @ test for errata affected core and if so...
>         moveq   r3, #1 << 1                     @ fix LoUIS value (and set flags state to 'ne')
> #endif
>         b       start_flush_levels              @ start flushing cache levels
> 
> ENTRY(v7_flush_dcache_all)
>         mrc     p15, 1, r0, c0, c0, 1           @ read clidr
>         mov     r3, r0, lsr #23                 @ move LoC into position
> start_flush_levels:
>         dmb                                     @ ensure ordering with previous memory accesses
>         ands    r3, r3, #7 << 1                 @ extract field from clidr
>         beq     finished                        @ if loc is 0, then no need to clean
>         mov     r10, #0                         @ start clean at cache level 0
> flush_levels:
>         add     r2, r10, r10, lsr #1            @ work out 3x current cache level
>         mov     r1, r0, lsr r2                  @ extract cache type bits from clidr

Since the DMB is only needed for ordering prior memory accesses with the
cache maintenance itself, you could move it just before 'flush_levels'
(rather theoretical, as I'm not aware of any ARMv7 CPUs with a unified
level 1 cache).
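
Roughly, that reordering might look like this (untested sketch):

start_flush_levels:
	ands	r3, r3, #7 << 1			@ extract field from clidr
	beq	finished			@ if the level is 0, nothing to clean
	mov	r10, #0				@ start clean at cache level 0
	dmb					@ order prior accesses with the maintenance
flush_levels:
	...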

With that said, you can add my reviewed-by on all patches:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>


* [RFC] mixture of cleanups to cache-v7.S
  2015-04-10 14:26     ` Russell King - ARM Linux
  2015-04-10 15:16       ` Catalin Marinas
@ 2015-04-10 15:37       ` Lorenzo Pieralisi
  1 sibling, 0 replies; 28+ messages in thread
From: Lorenzo Pieralisi @ 2015-04-10 15:37 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Apr 10, 2015 at 03:26:55PM +0100, Russell King - ARM Linux wrote:
> On Fri, Apr 10, 2015 at 03:11:05PM +0100, Lorenzo Pieralisi wrote:
> > On Fri, Apr 10, 2015 at 02:27:42PM +0100, Catalin Marinas wrote:
> > > As for the v7_flush_dcache_all always having the barriers, I guess
> > > no-one is expecting a v7 CPU without caches.
> > 
> > Well, yes, actually I'd rather remove the barriers when LoC is 0 than
> > add them to the LoUIS API in case there is nothing to flush; I do not
> > think that's a problem in the first place.
> 
> Maybe you should read through the patch set before declaring there to
> be no problem.
> 
> There may be no problem as the code stands right now, and we can bury
> our heads in the sand and continue as through there's no problem what
> so ever, and we'd be right to, but...
> 
> We need to have 643719 enabled for all platforms.  Right now, the code
> in cache-v7 surrounding that workaround is pretty crap - enabling the
> option places the workaround into the execution path whether it applies
> or not.
> 
> What this patch series does is rearrange stuff to avoid the workaround
> where it's possible to do so, while cleaning up a lot of the crap coding
> that was done here in the first place.
> 
> In doing this, I spotted that if we _solve_ the above issue by making
> both functions behave in the same way, we get more common code paths.

I certainly agree it is a good clean-up; I was referring to the barriers
and the barrier semantics.

> So, while the issue doesn't _cause_ a problem, fixing the disparity
> between the two functions is worth it just to be able to have a
> flush_levels implementation which behaves the same no matter how it's
> called - one which always has a dmb before it, and (possibly) always
> the dsb+isb afterwards - or only has the dsb+isb afterwards if it's
> done some cache flushing.

In actual fact the latter is what's required; for practical purposes
the former solution is acceptable to fix the disparity.

> Look at the resulting code after applying all the patches, and I'm sure
> you'd agree that it's a lot better than the crap which is currently
> there.  To save you having to look up the files and apply the patches,
> here are the old and new versions:
> 
> ============= old ====================
> ENTRY(v7_flush_dcache_louis)
>         dmb                                     @ ensure ordering with previous memory accesses
>         mrc     p15, 1, r0, c0, c0, 1           @ read clidr, r0 = clidr
>         ALT_SMP(ands    r3, r0, #(7 << 21))     @ extract LoUIS from clidr
>         ALT_UP(ands     r3, r0, #(7 << 27))     @ extract LoUU from clidr
> #ifdef CONFIG_ARM_ERRATA_643719
>         ALT_SMP(mrceq   p15, 0, r2, c0, c0, 0)  @ read main ID register
>         ALT_UP(reteq    lr)                     @ LoUU is zero, so nothing to do
>         ldreq   r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
>         biceq   r2, r2, #0x0000000f             @ clear minor revision number
>         teqeq   r2, r1                          @ test for errata affected core and if so...
>         orreqs  r3, #(1 << 21)                  @   fix LoUIS value (and set flags state to 'ne')
> #endif
>         ALT_SMP(mov     r3, r3, lsr #20)        @ r3 = LoUIS * 2
>         ALT_UP(mov      r3, r3, lsr #26)        @ r3 = LoUU * 2
>         reteq   lr                              @ return if level == 0
>         mov     r10, #0                         @ r10 (starting level) = 0
>         b       flush_levels                    @ start flushing cache levels
> 
> ENTRY(v7_flush_dcache_all)
>         dmb                                     @ ensure ordering with previous memory accesses
>         mrc     p15, 1, r0, c0, c0, 1           @ read clidr
>         ands    r3, r0, #0x7000000              @ extract loc from clidr
>         mov     r3, r3, lsr #23                 @ left align loc bit field
>         beq     finished                        @ if loc is 0, then no need to clean
>         mov     r10, #0                         @ start clean at cache level 0
> flush_levels:
>         add     r2, r10, r10, lsr #1            @ work out 3x current cache level
>         mov     r1, r0, lsr r2                  @ extract cache type bits from clidr
> ...
> ============= new ====================
> ENTRY(v7_flush_dcache_louis)
>         mrc     p15, 1, r0, c0, c0, 1           @ read clidr, r0 = clidr
> ALT_SMP(mov     r3, r0, lsr #20)                @ move LoUIS into position
> ALT_UP( mov     r3, r0, lsr #26)                @ move LoUU into position
> #ifdef CONFIG_ARM_ERRATA_643719
> ALT_SMP(ands    r3, r3, #7 << 1)                @ extract LoU*2 field from clidr
> ALT_UP( b       start_flush_levels)
>         bne     start_flush_levels              @ LoU != 0, start flushing
>         mrc     p15, 0, r2, c0, c0, 0           @ read main ID register
>         movw    r1, #:lower16:(0x410fc090 >> 4) @ ID of ARM Cortex A9 r0p?
>         movt    r1, #:upper16:(0x410fc090 >> 4)
>         teq     r1, r2, lsr #4                  @ test for errata affected core and if so...
>         moveq   r3, #1 << 1                     @ fix LoUIS value (and set flags state to 'ne')
> #endif
>         b       start_flush_levels              @ start flushing cache levels
> 
> ENTRY(v7_flush_dcache_all)
>         mrc     p15, 1, r0, c0, c0, 1           @ read clidr
>         mov     r3, r0, lsr #23                 @ move LoC into position
> start_flush_levels:
>         dmb                                     @ ensure ordering with previous memory accesses
>         ands    r3, r3, #7 << 1                 @ extract field from clidr
>         beq     finished                        @ if loc is 0, then no need to clean
>         mov     r10, #0                         @ start clean at cache level 0
> flush_levels:
>         add     r2, r10, r10, lsr #1            @ work out 3x current cache level
>         mov     r1, r0, lsr r2                  @ extract cache type bits from clidr
> ...
> 
> Which do you think is better, the old version or the new version?

The old version - joking ;). I was not referring to the clean-up, which
is a good thing on its own; we just have to agree on the barrier
semantics in case the cache flushing level turns out to be 0, see above.

Thanks,
Lorenzo


end of thread

Thread overview: 28+ messages
2015-04-02 22:49 [RFC] mixture of cleanups to cache-v7.S Russell King - ARM Linux
2015-04-02 22:57 ` Russell King - ARM Linux
2015-04-03 10:08   ` Russell King - ARM Linux
2015-04-03 10:53     ` Russell King - ARM Linux
2015-04-03 10:54     ` [PATCH 1/7] ARM: cache-v7: use movw/movt instructions Russell King
2015-04-09 17:10       ` Catalin Marinas
2015-04-03 10:54     ` [PATCH 2/7] ARM: cache-v7: shift CLIDR to extract appropriate field before masking Russell King
2015-04-09 17:09       ` Catalin Marinas
2015-04-03 10:54     ` [PATCH 3/7] ARM: cache-v7: consolidate initialisation of cache level index Russell King
2015-04-09 17:11       ` Catalin Marinas
2015-04-03 10:54     ` [PATCH 4/7] ARM: cache-v7: optimise branches in v7_flush_cache_louis Russell King
2015-04-09  8:13       ` Arnd Bergmann
2015-04-09  8:21         ` Russell King - ARM Linux
2015-04-09 10:29           ` Arnd Bergmann
2015-04-09 13:46             ` Russell King - ARM Linux
2015-04-09 17:26               ` Catalin Marinas
2015-04-09 17:17           ` Catalin Marinas
2015-04-09 17:15       ` Catalin Marinas
2015-04-03 10:54     ` [PATCH 5/7] ARM: cache-v7: optimise test for Cortex A9 r0pX devices Russell King
2015-04-09 17:20       ` Catalin Marinas
2015-04-03 10:54     ` [PATCH 6/7] ARM: enable ARM errata 643719 workaround by default Russell King
2015-04-09 17:21       ` Catalin Marinas
2015-04-03 10:54     ` [PATCH 7/7] ARM: cache-v7: further cleanups (and fix?) Russell King
2015-04-10 13:27 ` [RFC] mixture of cleanups to cache-v7.S Catalin Marinas
2015-04-10 14:11   ` Lorenzo Pieralisi
2015-04-10 14:26     ` Russell King - ARM Linux
2015-04-10 15:16       ` Catalin Marinas
2015-04-10 15:37       ` Lorenzo Pieralisi
