All of lore.kernel.org
 help / color / mirror / Atom feed
* read_cpuid_id() in arch/arm/kernel/setup.c
@ 2015-03-13 16:03 Mason
  2015-03-13 16:19 ` Russell King - ARM Linux
  2015-03-13 16:56 ` Mark Rutland
  0 siblings, 2 replies; 13+ messages in thread
From: Mason @ 2015-03-13 16:03 UTC (permalink / raw)
  To: linux-arm-kernel

Hello everyone,

As far as I can tell, read_cpuid_id() resolves to read_cpuid(CPUID_ID)
which resolves to mrc 15, 0, rN, cr0, cr0, {0}

Consider this:

/*
  * The CPU ID never changes at run time, so we might as well tell the
  * compiler that it's constant.  Use this function to read the CPU ID
  * rather than directly reading processor_id or read_cpuid() directly.
  */
static inline unsigned int __attribute_const__ read_cpuid_id(void)
{
	return read_cpuid(CPUID_ID);
}

Despite the comment and attribute, my compiler(*) still reloads the
value every time.

(*) gcc version 4.9.3 20141031 (prerelease) (Linaro GCC 2014.11)

e.g.

static int __get_cpu_architecture(void)
{
	int cpu_arch;

	unsigned int id = read_cpuid_id();
	if ((id & 0x0008f000) == 0) {
		cpu_arch = CPU_ARCH_UNKNOWN;
	} else if ((id & 0x0008f000) == 0x00007000) {
		cpu_arch = (id & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
	} else if ((id & 0x00080000) == 0x00000000) {
		cpu_arch = (id >> 16) & 7;
		if (cpu_arch)
			cpu_arch += CPU_ARCH_ARMv3;
	} else if ((id & 0x000f0000) == 0x000f0000) {

resolves to

c01fec74:       ee10cf10        mrc     15, 0, ip, cr0, cr0, {0}
c01fec78:       e21cca8f        ands    ip, ip, #585728 ; 0x8f000
c01fec7c:       e34c3023        movt    r3, #49187      ; 0xc023
c01fec80:       e5837008        str     r7, [r3, #8]
c01fec84:       e50b304c        str     r3, [fp, #-76]  ; 0x4c
c01fec88:       0a000022        beq     c01fed18 <setup_arch+0xe4>
c01fec8c:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
c01fec90:       e2033a8f        and     r3, r3, #585728 ; 0x8f000
c01fec94:       e3530a07        cmp     r3, #28672      ; 0x7000
c01fec98:       1a000004        bne     c01fecb0 <setup_arch+0x7c>
c01fec9c:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
c01feca0:       e3130502        tst     r3, #8388608    ; 0x800000
c01feca4:       13a0c003        movne   ip, #3
c01feca8:       03a0c001        moveq   ip, #1
c01fecac:       ea000019        b       c01fed18 <setup_arch+0xe4>
c01fecb0:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
c01fecb4:       e3130702        tst     r3, #524288     ; 0x80000


So I thought it would be nice to give the poor compiler a break,
and just stuff the result in a local variable:

--- setup.c	2015-03-03 18:04:59.000000000 +0100
+++ setup.foo.c	2015-03-13 16:26:56.413380663 +0100
@@ -237,15 +237,16 @@
  {
  	int cpu_arch;
  
-	if ((read_cpuid_id() & 0x0008f000) == 0) {
+	unsigned int id = read_cpuid_id();
+	if ((id & 0x0008f000) == 0) {
  		cpu_arch = CPU_ARCH_UNKNOWN;
-	} else if ((read_cpuid_id() & 0x0008f000) == 0x00007000) {
-		cpu_arch = (read_cpuid_id() & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
-	} else if ((read_cpuid_id() & 0x00080000) == 0x00000000) {
-		cpu_arch = (read_cpuid_id() >> 16) & 7;
+	} else if ((id & 0x0008f000) == 0x00007000) {
+		cpu_arch = (id & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
+	} else if ((id & 0x00080000) == 0x00000000) {
+		cpu_arch = (id >> 16) & 7;
  		if (cpu_arch)
  			cpu_arch += CPU_ARCH_ARMv3;
-	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
+	} else if ((id & 0x000f0000) == 0x000f0000) {
  		unsigned int mmfr0;
  
  		/* Revised CPUID format. Read the Memory Model Feature


which compiles to

c01fec74:       ee102f10        mrc     15, 0, r2, cr0, cr0, {0}
c01fec78:       e212ca8f        ands    ip, r2, #585728 ; 0x8f000
c01fec7c:       e34c3023        movt    r3, #49187      ; 0xc023
c01fec80:       e5837008        str     r7, [r3, #8]
c01fec84:       e50b304c        str     r3, [fp, #-76]  ; 0x4c
c01fec88:       0a00001c        beq     c01fed00 <setup_arch+0xcc>
c01fec8c:       e35c0a07        cmp     ip, #28672      ; 0x7000
c01fec90:       1a000003        bne     c01feca4 <setup_arch+0x70>
c01fec94:       e3120502        tst     r2, #8388608    ; 0x800000
c01fec98:       13a0c003        movne   ip, #3
c01fec9c:       03a0c001        moveq   ip, #1
c01feca0:       ea000016        b       c01fed00 <setup_arch+0xcc>
c01feca4:       e3120702        tst     r2, #524288     ; 0x80000


Is this nano-optimization worth considering?

Regards.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 16:03 read_cpuid_id() in arch/arm/kernel/setup.c Mason
@ 2015-03-13 16:19 ` Russell King - ARM Linux
  2015-03-13 16:39   ` Mason
  2015-03-13 16:56 ` Mark Rutland
  1 sibling, 1 reply; 13+ messages in thread
From: Russell King - ARM Linux @ 2015-03-13 16:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Mar 13, 2015 at 05:03:52PM +0100, Mason wrote:
> Hello everyone,
> 
> As far as I can tell, read_cpuid_id() resolves to read_cpuid(CPUID_ID)
> which resolves to mrc 15, 0, rN, cr0, cr0, {0}
> 
> Consider this:
> 
> /*
>  * The CPU ID never changes at run time, so we might as well tell the
>  * compiler that it's constant.  Use this function to read the CPU ID
>  * rather than directly reading processor_id or read_cpuid() directly.
>  */
> static inline unsigned int __attribute_const__ read_cpuid_id(void)
> {
> 	return read_cpuid(CPUID_ID);
> }
> 
> Despite the comment and attribute, my compiler(*) still reloads the
> value every time.
> 
> (*) gcc version 4.9.3 20141031 (prerelease) (Linaro GCC 2014.11)
> 
> e.g.
> 
> static int __get_cpu_architecture(void)
> {
> 	int cpu_arch;
> 
> 	unsigned int id = read_cpuid_id();
> 	if ((id & 0x0008f000) == 0) {
> 		cpu_arch = CPU_ARCH_UNKNOWN;
> 	} else if ((id & 0x0008f000) == 0x00007000) {
> 		cpu_arch = (id & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
> 	} else if ((id & 0x00080000) == 0x00000000) {
> 		cpu_arch = (id >> 16) & 7;
> 		if (cpu_arch)
> 			cpu_arch += CPU_ARCH_ARMv3;
> 	} else if ((id & 0x000f0000) == 0x000f0000) {
> 
> resolves to
> 
> c01fec74:       ee10cf10        mrc     15, 0, ip, cr0, cr0, {0}
> c01fec78:       e21cca8f        ands    ip, ip, #585728 ; 0x8f000
> c01fec7c:       e34c3023        movt    r3, #49187      ; 0xc023
> c01fec80:       e5837008        str     r7, [r3, #8]
> c01fec84:       e50b304c        str     r3, [fp, #-76]  ; 0x4c
> c01fec88:       0a000022        beq     c01fed18 <setup_arch+0xe4>
> c01fec8c:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
> c01fec90:       e2033a8f        and     r3, r3, #585728 ; 0x8f000
> c01fec94:       e3530a07        cmp     r3, #28672      ; 0x7000
> c01fec98:       1a000004        bne     c01fecb0 <setup_arch+0x7c>
> c01fec9c:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
> c01feca0:       e3130502        tst     r3, #8388608    ; 0x800000
> c01feca4:       13a0c003        movne   ip, #3
> c01feca8:       03a0c001        moveq   ip, #1
> c01fecac:       ea000019        b       c01fed18 <setup_arch+0xe4>
> c01fecb0:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
> c01fecb4:       e3130702        tst     r3, #524288     ; 0x80000
> 
> 
> So I thought it would be nice to give the poor compiler a break,
> and just stuff the result in a local variable:

NAK.

Your compiler behaviour is different to mine (stock gcc 4.7.4):

 4f8:   e1a0c00d        mov     ip, sp
 4fc:   e92ddff0        push    {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr, pc}
 500:   e24cb004        sub     fp, ip, #4
 504:   e24dd00c        sub     sp, sp, #12
 508:   ee105f10        mrc     15, 0, r5, cr0, cr0, {0} <== load id
 50c:   e1a07000        mov     r7, r0
 510:   e1a00005        mov     r0, r5			<== r5 = id
 514:   ebfffffe        bl      0 <lookup_processor_type>
 518:   e2504000        subs    r4, r0, #0
 51c:   1a000003        bne     530 <setup_arch+0x38>
 520:   e59f063c        ldr     r0, [pc, #1596] ; b64 <setup_arch+0x66c>
 524:   e1a01005        mov     r1, r5
 528:   ebfffffe        bl      0 <printk>
 52c:   eafffffe        b       52c <setup_arch+0x34>
 530:   e5948020        ldr     r8, [r4, #32]
 534:   e2153a8f        ands    r3, r5, #585728 ; 0x8f000 <== r5 = id
 538:   e59f2628        ldr     r2, [pc, #1576] ; b68 <setup_arch+0x670>
 538:   e59f2628        ldr     r2, [pc, #1576] ; b68 <setup_arch+0x670>
 53c:   e5828000        str     r8, [r2]
 540:   0a00001f        beq     5c4 <setup_arch+0xcc>
 544:   e3530a07        cmp     r3, #28672      ; 0x7000
 548:   1a000003        bne     55c <setup_arch+0x64>
 54c:   e3150502        tst     r5, #8388608    ; 0x800000 <== r5 = id
 550:   03a03001        moveq   r3, #1
 554:   13a03003        movne   r3, #3
 558:   ea000019        b       5c4 <setup_arch+0xcc>
 55c:   e3150702        tst     r5, #524288     ; 0x80000 <== r5 = id
 560:   1a000003        bne     574 <setup_arch+0x7c>
 564:   e7e23855        ubfx    r3, r5, #16, #3		<== r5 = id
 568:   e3530000        cmp     r3, #0
 56c:   12833001        addne   r3, r3, #1
 570:   ea000013        b       5c4 <setup_arch+0xcc>
 574:   e205580f        and     r5, r5, #983040 ; 0xf0000 <== r5 = id
 578:   e355080f        cmp     r5, #983040     ; 0xf0000
 57c:   13a03000        movne   r3, #0
 580:   1a00000f        bne     5c4 <setup_arch+0xcc>
...

The point here is that the compiler is free to optimise the code as it
sees fit - if it decides that the register pressure from having the
value cached in a register is too high, it can decide to spill the
cached value, and reload it from CP15 as and when it needs to.  That
is an advantage.

It seems that GCC 4.7.4 optimises better than Linaro's 4.9.3.  In fact,
it looks like Linaro's 4.9.3 is rather buggy as far as optimisation
goes.

Later compilers aren't always better.

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 16:19 ` Russell King - ARM Linux
@ 2015-03-13 16:39   ` Mason
  2015-03-13 16:45     ` Russell King - ARM Linux
  0 siblings, 1 reply; 13+ messages in thread
From: Mason @ 2015-03-13 16:39 UTC (permalink / raw)
  To: linux-arm-kernel

On 13/03/2015 17:19, Russell King - ARM Linux wrote:
> On Fri, Mar 13, 2015 at 05:03:52PM +0100, Mason wrote:
>> Hello everyone,
>>
>> As far as I can tell, read_cpuid_id() resolves to read_cpuid(CPUID_ID)
>> which resolves to mrc 15, 0, rN, cr0, cr0, {0}
>>
>> Consider this:
>>
>> /*
>>   * The CPU ID never changes at run time, so we might as well tell the
>>   * compiler that it's constant.  Use this function to read the CPU ID
>>   * rather than directly reading processor_id or read_cpuid() directly.
>>   */
>> static inline unsigned int __attribute_const__ read_cpuid_id(void)
>> {
>> 	return read_cpuid(CPUID_ID);
>> }
>>
>> Despite the comment and attribute, my compiler(*) still reloads the
>> value every time.
>>
>> (*) gcc version 4.9.3 20141031 (prerelease) (Linaro GCC 2014.11)
>>
>> e.g.
>>
>> static int __get_cpu_architecture(void)
>> {
>> 	int cpu_arch;
>>
>> 	unsigned int id = read_cpuid_id();
>> 	if ((id & 0x0008f000) == 0) {
>> 		cpu_arch = CPU_ARCH_UNKNOWN;
>> 	} else if ((id & 0x0008f000) == 0x00007000) {
>> 		cpu_arch = (id & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
>> 	} else if ((id & 0x00080000) == 0x00000000) {
>> 		cpu_arch = (id >> 16) & 7;
>> 		if (cpu_arch)
>> 			cpu_arch += CPU_ARCH_ARMv3;
>> 	} else if ((id & 0x000f0000) == 0x000f0000) {
>>
>> resolves to
>>
>> c01fec74:       ee10cf10        mrc     15, 0, ip, cr0, cr0, {0}
>> c01fec78:       e21cca8f        ands    ip, ip, #585728 ; 0x8f000
>> c01fec7c:       e34c3023        movt    r3, #49187      ; 0xc023
>> c01fec80:       e5837008        str     r7, [r3, #8]
>> c01fec84:       e50b304c        str     r3, [fp, #-76]  ; 0x4c
>> c01fec88:       0a000022        beq     c01fed18 <setup_arch+0xe4>
>> c01fec8c:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
>> c01fec90:       e2033a8f        and     r3, r3, #585728 ; 0x8f000
>> c01fec94:       e3530a07        cmp     r3, #28672      ; 0x7000
>> c01fec98:       1a000004        bne     c01fecb0 <setup_arch+0x7c>
>> c01fec9c:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
>> c01feca0:       e3130502        tst     r3, #8388608    ; 0x800000
>> c01feca4:       13a0c003        movne   ip, #3
>> c01feca8:       03a0c001        moveq   ip, #1
>> c01fecac:       ea000019        b       c01fed18 <setup_arch+0xe4>
>> c01fecb0:       ee103f10        mrc     15, 0, r3, cr0, cr0, {0}
>> c01fecb4:       e3130702        tst     r3, #524288     ; 0x80000
>>
>>
>> So I thought it would be nice to give the poor compiler a break,
>> and just stuff the result in a local variable:
>
> NAK.
>
> Your compiler behaviour is different to mine (stock gcc 4.7.4):
>
>   4f8:   e1a0c00d        mov     ip, sp
>   4fc:   e92ddff0        push    {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr, pc}
>   500:   e24cb004        sub     fp, ip, #4
>   504:   e24dd00c        sub     sp, sp, #12
>   508:   ee105f10        mrc     15, 0, r5, cr0, cr0, {0} <== load id
>   50c:   e1a07000        mov     r7, r0
>   510:   e1a00005        mov     r0, r5			<== r5 = id
>   514:   ebfffffe        bl      0 <lookup_processor_type>
>   518:   e2504000        subs    r4, r0, #0
>   51c:   1a000003        bne     530 <setup_arch+0x38>
>   520:   e59f063c        ldr     r0, [pc, #1596] ; b64 <setup_arch+0x66c>
>   524:   e1a01005        mov     r1, r5
>   528:   ebfffffe        bl      0 <printk>
>   52c:   eafffffe        b       52c <setup_arch+0x34>
>   530:   e5948020        ldr     r8, [r4, #32]
>   534:   e2153a8f        ands    r3, r5, #585728 ; 0x8f000 <== r5 = id
>   538:   e59f2628        ldr     r2, [pc, #1576] ; b68 <setup_arch+0x670>
>   538:   e59f2628        ldr     r2, [pc, #1576] ; b68 <setup_arch+0x670>
>   53c:   e5828000        str     r8, [r2]
>   540:   0a00001f        beq     5c4 <setup_arch+0xcc>
>   544:   e3530a07        cmp     r3, #28672      ; 0x7000
>   548:   1a000003        bne     55c <setup_arch+0x64>
>   54c:   e3150502        tst     r5, #8388608    ; 0x800000 <== r5 = id
>   550:   03a03001        moveq   r3, #1
>   554:   13a03003        movne   r3, #3
>   558:   ea000019        b       5c4 <setup_arch+0xcc>
>   55c:   e3150702        tst     r5, #524288     ; 0x80000 <== r5 = id
>   560:   1a000003        bne     574 <setup_arch+0x7c>
>   564:   e7e23855        ubfx    r3, r5, #16, #3		<== r5 = id
>   568:   e3530000        cmp     r3, #0
>   56c:   12833001        addne   r3, r3, #1
>   570:   ea000013        b       5c4 <setup_arch+0xcc>
>   574:   e205580f        and     r5, r5, #983040 ; 0xf0000 <== r5 = id
>   578:   e355080f        cmp     r5, #983040     ; 0xf0000
>   57c:   13a03000        movne   r3, #0
>   580:   1a00000f        bne     5c4 <setup_arch+0xcc>
> ...
>
> The point here is that the compiler is free to optimise the code as it
> sees fit - if it decides that the register pressure from having the
> value cached in a register is too high, it can decide to spill the
> cached value, and reload it from CP15 as and when it needs to.  That
> is an advantage.

Good point. I hadn't thought of that.

Do you know the latency of an mrc instruction? (compared to a mov)

> It seems that GCC 4.7.4 optimises better than Linaro's 4.9.3.  In fact,
> it looks like Linaro's 4.9.3 is rather buggy as far as optimisation
> goes.
>
> Later compilers aren't always better.

I did NOT expect that. Compiler optimizations passes are so fragile.

Anyway, here's another proposed nano-improvement ;-)
(This one is code factorization)

--- setup.c	2015-03-03 18:04:59.000000000 +0100
+++ setup.bar.c	2015-03-13 17:29:23.800668429 +0100
@@ -246,12 +246,9 @@
  		if (cpu_arch)
  			cpu_arch += CPU_ARCH_ARMv3;
  	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
-		unsigned int mmfr0;
-
  		/* Revised CPUID format. Read the Memory Model Feature
  		 * Register 0 and check for VMSAv7 or PMSAv7 */
-		asm("mrc	p15, 0, %0, c0, c1, 4"
-		    : "=r" (mmfr0));
+		unsigned int mmfr0 = read_cpuid_ext(CPUID_EXT_MMFR0);
  		if ((mmfr0 & 0x0000000f) >= 0x00000003 ||
  		    (mmfr0 & 0x000000f0) >= 0x00000030)
  			cpu_arch = CPU_ARCH_ARMv7;


This one looks good, doesn't it? :-)

Regards.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 16:39   ` Mason
@ 2015-03-13 16:45     ` Russell King - ARM Linux
  2015-03-13 17:06       ` Mason
  2015-03-15 17:40       ` Mason
  0 siblings, 2 replies; 13+ messages in thread
From: Russell King - ARM Linux @ 2015-03-13 16:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Mar 13, 2015 at 05:39:23PM +0100, Mason wrote:
> Good point. I hadn't thought of that.
> 
> Do you know the latency of an mrc instruction? (compared to a mov)

Not offhand.  It'll be different for different CPUs, but it's probably
not far off mov.

> >It seems that GCC 4.7.4 optimises better than Linaro's 4.9.3.  In fact,
> >it looks like Linaro's 4.9.3 is rather buggy as far as optimisation
> >goes.
> >
> >Later compilers aren't always better.
> 
> I did NOT expect that. Compiler optimizations passes are so fragile.

You're learning :)

> Anyway, here's another proposed nano-improvement ;-)
> (This one is code factorization)
> 
> --- setup.c	2015-03-03 18:04:59.000000000 +0100
> +++ setup.bar.c	2015-03-13 17:29:23.800668429 +0100
> @@ -246,12 +246,9 @@
>  		if (cpu_arch)
>  			cpu_arch += CPU_ARCH_ARMv3;
>  	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
> -		unsigned int mmfr0;
> -
>  		/* Revised CPUID format. Read the Memory Model Feature
>  		 * Register 0 and check for VMSAv7 or PMSAv7 */
> -		asm("mrc	p15, 0, %0, c0, c1, 4"
> -		    : "=r" (mmfr0));
> +		unsigned int mmfr0 = read_cpuid_ext(CPUID_EXT_MMFR0);
>  		if ((mmfr0 & 0x0000000f) >= 0x00000003 ||
>  		    (mmfr0 & 0x000000f0) >= 0x00000030)
>  			cpu_arch = CPU_ARCH_ARMv7;
> 
> 
> This one looks good, doesn't it? :-)

Yes, this one I like - and it probably fixes a potential latent bug
where the compiler was free to re-order that mrc outside of the if()
statement.

Please wrap it up as a normal submission, thanks.

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 16:03 read_cpuid_id() in arch/arm/kernel/setup.c Mason
  2015-03-13 16:19 ` Russell King - ARM Linux
@ 2015-03-13 16:56 ` Mark Rutland
  2015-03-13 17:02   ` Russell King - ARM Linux
  1 sibling, 1 reply; 13+ messages in thread
From: Mark Rutland @ 2015-03-13 16:56 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

> /*
>   * The CPU ID never changes at run time, so we might as well tell the
>   * compiler that it's constant.  Use this function to read the CPU ID
>   * rather than directly reading processor_id or read_cpuid() directly.
>   */
> static inline unsigned int __attribute_const__ read_cpuid_id(void)
> {
> 	return read_cpuid(CPUID_ID);
> }
> 
> Despite the comment and attribute, my compiler(*) still reloads the
> value every time.

In the presence of big.LITTLE the comment and premise of the
optimisation here is wrong. A thread can migrate between cores of
differing microarchitectures.

So __attribute_const__ is simply broken, and we probably need to do
something like the black magic SP hazarding hack we do for the tpidr
percpu accesses (only expecting this to be called in non-preemptible
context) if it makes sense to allow the value to be cached.

> (*) gcc version 4.9.3 20141031 (prerelease) (Linaro GCC 2014.11)
> 
> e.g.
> 
> static int __get_cpu_architecture(void)
> {
> 	int cpu_arch;
> 
> 	unsigned int id = read_cpuid_id();
> 	if ((id & 0x0008f000) == 0) {
> 		cpu_arch = CPU_ARCH_UNKNOWN;
> 	} else if ((id & 0x0008f000) == 0x00007000) {
> 		cpu_arch = (id & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
> 	} else if ((id & 0x00080000) == 0x00000000) {
> 		cpu_arch = (id >> 16) & 7;
> 		if (cpu_arch)
> 			cpu_arch += CPU_ARCH_ARMv3;
> 	} else if ((id & 0x000f0000) == 0x000f0000) {

[...]

> So I thought it would be nice to give the poor compiler a break,
> and just stuff the result in a local variable:
> 
> --- setup.c	2015-03-03 18:04:59.000000000 +0100
> +++ setup.foo.c	2015-03-13 16:26:56.413380663 +0100
> @@ -237,15 +237,16 @@
>   {
>   	int cpu_arch;
>   
> -	if ((read_cpuid_id() & 0x0008f000) == 0) {
> +	unsigned int id = read_cpuid_id();
> +	if ((id & 0x0008f000) == 0) {
>   		cpu_arch = CPU_ARCH_UNKNOWN;
> -	} else if ((read_cpuid_id() & 0x0008f000) == 0x00007000) {
> -		cpu_arch = (read_cpuid_id() & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
> -	} else if ((read_cpuid_id() & 0x00080000) == 0x00000000) {
> -		cpu_arch = (read_cpuid_id() >> 16) & 7;
> +	} else if ((id & 0x0008f000) == 0x00007000) {
> +		cpu_arch = (id & (1 << 23)) ? CPU_ARCH_ARMv4T : CPU_ARCH_ARMv3;
> +	} else if ((id & 0x00080000) == 0x00000000) {
> +		cpu_arch = (id >> 16) & 7;
>   		if (cpu_arch)
>   			cpu_arch += CPU_ARCH_ARMv3;
> -	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
> +	} else if ((id & 0x000f0000) == 0x000f0000) {

[...]

> Is this nano-optimization worth considering?

This is only ever called at early boot time in setup_processor, so I'm
not sure I see the point in making changes for optimisation's sake --
this is far from a hot path.

Howevever it would be possible to make __get_cpu_architecture a little
more legible (we should be able to return early in each of the cases),
and having single read of read_cpuid_id() at the start would make the
lines a little shorter.

Mark.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 16:56 ` Mark Rutland
@ 2015-03-13 17:02   ` Russell King - ARM Linux
  2015-03-13 18:26     ` Mark Rutland
  0 siblings, 1 reply; 13+ messages in thread
From: Russell King - ARM Linux @ 2015-03-13 17:02 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Mar 13, 2015 at 04:56:00PM +0000, Mark Rutland wrote:
> In the presence of big.LITTLE the comment and premise of the
> optimisation here is wrong. A thread can migrate between cores of
> differing microarchitectures.
> 
> So __attribute_const__ is simply broken, and we probably need to do
> something like the black magic SP hazarding hack we do for the tpidr
> percpu accesses (only expecting this to be called in non-preemptible
> context) if it makes sense to allow the value to be cached.

Yes, it's true that with big.LITTLE, lots of stuff is broken in this
regard, and you are partially right, but you are not entirely right
either.

It's not the reading of the CPU ID which is the problem, it's the
reading _and_ use of derived results which is a problem.

What this basically means is that in the presence of big.LITTLE, we
need __get_cpu_architecture() as a whole, and whatever makes use of
its return value to /all/ be non-preemptible... and probably a lot
more code too.

-- 
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 16:45     ` Russell King - ARM Linux
@ 2015-03-13 17:06       ` Mason
  2015-03-15 17:40       ` Mason
  1 sibling, 0 replies; 13+ messages in thread
From: Mason @ 2015-03-13 17:06 UTC (permalink / raw)
  To: linux-arm-kernel

On 13/03/2015 17:45, Russell King - ARM Linux wrote:
> On Fri, Mar 13, 2015 at 05:39:23PM +0100, Mason wrote:
>> Good point. I hadn't thought of that.
>>
>> Do you know the latency of an mrc instruction? (compared to a mov)
>
> Not offhand.  It'll be different for different CPUs, but it's probably
> not far off mov.
>
>>> It seems that GCC 4.7.4 optimises better than Linaro's 4.9.3.  In fact,
>>> it looks like Linaro's 4.9.3 is rather buggy as far as optimisation
>>> goes.
>>>
>>> Later compilers aren't always better.
>>
>> I did NOT expect that. Compiler optimizations passes are so fragile.
>
> You're learning :)
>
>> Anyway, here's another proposed nano-improvement ;-)
>> (This one is code factorization)
>>
>> --- setup.c	2015-03-03 18:04:59.000000000 +0100
>> +++ setup.bar.c	2015-03-13 17:29:23.800668429 +0100
>> @@ -246,12 +246,9 @@
>>   		if (cpu_arch)
>>   			cpu_arch += CPU_ARCH_ARMv3;
>>   	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
>> -		unsigned int mmfr0;
>> -
>>   		/* Revised CPUID format. Read the Memory Model Feature
>>   		 * Register 0 and check for VMSAv7 or PMSAv7 */
>> -		asm("mrc	p15, 0, %0, c0, c1, 4"
>> -		    : "=r" (mmfr0));
>> +		unsigned int mmfr0 = read_cpuid_ext(CPUID_EXT_MMFR0);
>>   		if ((mmfr0 & 0x0000000f) >= 0x00000003 ||
>>   		    (mmfr0 & 0x000000f0) >= 0x00000030)
>>   			cpu_arch = CPU_ARCH_ARMv7;
>>
>>
>> This one looks good, doesn't it? :-)
>
> Yes, this one I like - and it probably fixes a potential latent bug
> where the compiler was free to re-order that mrc outside of the if()
> statement.
>
> Please wrap it up as a normal submission, thanks.

How exciting, my first kernel patch :-)

I'll take a closer look at other similar refactoring opportunities.

Can I send the patch inline, with "scissors and perforation" delimiter?

Do I have to base it on the latest 4.0 release candidate, or can you
rebase it as needed?

Also, you didn't like "caching" the value of read_cpuid_id() in
a local variable, but it's done in feat_v6_fixup. Do you want me
to submit a patch changing that, or is it not worth it?

Regards.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 17:02   ` Russell King - ARM Linux
@ 2015-03-13 18:26     ` Mark Rutland
  0 siblings, 0 replies; 13+ messages in thread
From: Mark Rutland @ 2015-03-13 18:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Mar 13, 2015 at 05:02:51PM +0000, Russell King - ARM Linux wrote:
> On Fri, Mar 13, 2015 at 04:56:00PM +0000, Mark Rutland wrote:
> > In the presence of big.LITTLE the comment and premise of the
> > optimisation here is wrong. A thread can migrate between cores of
> > differing microarchitectures.
> > 
> > So __attribute_const__ is simply broken, and we probably need to do
> > something like the black magic SP hazarding hack we do for the tpidr
> > percpu accesses (only expecting this to be called in non-preemptible
> > context) if it makes sense to allow the value to be cached.
> 
> Yes, it's true that with big.LITTLE, lots of stuff is broken in this
> regard, and you are partially right, but you are not entirely right
> either.
> 
> It's not the reading of the CPU ID which is the problem, it's the
> reading _and_ use of derived results which is a problem.

I don't dispute that, and my comment was not meant to imply that.

> What this basically means is that in the presence of big.LITTLE, we
> need __get_cpu_architecture() as a whole, and whatever makes use of
> its return value to /all/ be non-preemptible... and probably a lot
> more code too.

Sure; this is exactly like using percpu variables, as I mentioned above.
The criticial section at which it is meaningful to read, perform some
work, and write back needs to be bound to a particular CPU.

I guess you could allow for preemption so long as the thread stayed
bound to the same CPU -- the ID registers aren't going to change on the
same CPU because another thread was running for a while. I don't know if
it would be possible to do any black magic hazarding to allow for that
though.

Mark.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-13 16:45     ` Russell King - ARM Linux
  2015-03-13 17:06       ` Mason
@ 2015-03-15 17:40       ` Mason
  2015-03-16  8:44         ` Mason
  1 sibling, 1 reply; 13+ messages in thread
From: Mason @ 2015-03-15 17:40 UTC (permalink / raw)
  To: linux-arm-kernel

On 13/03/2015 17:45, Russell King - ARM Linux wrote:

> Yes, this one I like - and it probably fixes a potential latent bug
> where the compiler was free to re-order that mrc outside of the if()
> statement.
> 
> Please wrap it up as a normal submission, thanks.

Proposed patch at the end of this message.

I'm now puzzling over why it's required to have "memory"
in read_cpuid_ext's clobber list, and not in read_cpuid's?

Regards.


-- >8 --
Date: Sun, 15 Mar 2015 17:59:53 +0100
Subject: [PATCH] Use read_cpuid_ext() macro instead of inline asm

In commit 067e710b9a982a92cc8294d7fa0f1e924c65bba1, Paul Walmsley fixed
read_cpuid_ext() and added the following comment.

    The memory clobber prevents gcc 4.5 from reordering the mrc before
    any is_smp() tests, which can cause undefined instruction aborts on
    ARM1136 r0 due to the missing extended CP15 registers.
---
 arch/arm/kernel/setup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index e55408e..1d60beb 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -246,12 +246,9 @@ static int __get_cpu_architecture(void)
 		if (cpu_arch)
 			cpu_arch += CPU_ARCH_ARMv3;
 	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
-		unsigned int mmfr0;
-
 		/* Revised CPUID format. Read the Memory Model Feature
 		 * Register 0 and check for VMSAv7 or PMSAv7 */
-		asm("mrc	p15, 0, %0, c0, c1, 4"
-		    : "=r" (mmfr0));
+		unsigned int mmfr0 = read_cpuid_ext(CPUID_EXT_MMFR0);
 		if ((mmfr0 & 0x0000000f) >= 0x00000003 ||
 		    (mmfr0 & 0x000000f0) >= 0x00000030)
 			cpu_arch = CPU_ARCH_ARMv7;
-- 
2.3.2

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-15 17:40       ` Mason
@ 2015-03-16  8:44         ` Mason
  2015-03-16 16:54           ` Paul Walmsley
  0 siblings, 1 reply; 13+ messages in thread
From: Mason @ 2015-03-16  8:44 UTC (permalink / raw)
  To: linux-arm-kernel

On 15/03/2015 18:40, Mason wrote:

> On 13/03/2015 17:45, Russell King - ARM Linux wrote:
> 
>> Yes, this one I like - and it probably fixes a potential latent bug
>> where the compiler was free to re-order that mrc outside of the if()
>> statement.
>>
>> Please wrap it up as a normal submission, thanks.
> 
> Proposed patch at the end of this message.
> 
> I'm now puzzling over why it's required to have "memory"
> in read_cpuid_ext's clobber list, and not in read_cpuid's?

Same player shoot again.

-- >8 --
Date: Sun, 15 Mar 2015 17:59:53 +0100
Subject: [PATCH] Use read_cpuid_ext() macro instead of inline asm

In commit 067e710b9a98 ("ARM: 7801/1: prevent gcc 4.5 from reordering extended
CP15 reads above is_smp() test") Paul Walmsley fixed read_cpuid_ext() and added
the following comment.

    The memory clobber prevents gcc 4.5 from reordering the mrc before
    any is_smp() tests, which can cause undefined instruction aborts on
    ARM1136 r0 due to the missing extended CP15 registers.

Signed-off-by: Mason <slash.tmp@free.fr>
---
 arch/arm/kernel/setup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index e55408e..1d60beb 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -246,12 +246,9 @@ static int __get_cpu_architecture(void)
 		if (cpu_arch)
 			cpu_arch += CPU_ARCH_ARMv3;
 	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
-		unsigned int mmfr0;
-
 		/* Revised CPUID format. Read the Memory Model Feature
 		 * Register 0 and check for VMSAv7 or PMSAv7 */
-		asm("mrc	p15, 0, %0, c0, c1, 4"
-		    : "=r" (mmfr0));
+		unsigned int mmfr0 = read_cpuid_ext(CPUID_EXT_MMFR0);
 		if ((mmfr0 & 0x0000000f) >= 0x00000003 ||
 		    (mmfr0 & 0x000000f0) >= 0x00000030)
 			cpu_arch = CPU_ARCH_ARMv7;
-- 
2.3.2

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-16  8:44         ` Mason
@ 2015-03-16 16:54           ` Paul Walmsley
  2015-03-16 22:17             ` Mason
  0 siblings, 1 reply; 13+ messages in thread
From: Paul Walmsley @ 2015-03-16 16:54 UTC (permalink / raw)
  To: linux-arm-kernel

Hi

On Mon, 16 Mar 2015, Mason wrote:

> On 15/03/2015 18:40, Mason wrote:
> 
> > On 13/03/2015 17:45, Russell King - ARM Linux wrote:
> > 
> >> Yes, this one I like - and it probably fixes a potential latent bug
> >> where the compiler was free to re-order that mrc outside of the if()
> >> statement.
> >>
> >> Please wrap it up as a normal submission, thanks.
> > 
> > Proposed patch at the end of this message.
> > 
> > I'm now puzzling over why it's required to have "memory"
> > in read_cpuid_ext's clobber list, and not in read_cpuid's?
> 
> Same player shoot again.

Reviewed-by: Paul Walmsley <paul@pwsan.com>

Looks reasonable to me.  I'd suggest updating the patch message to 
describe your change, and why it's needed.  Consider something like:

---

Convert the open-coded MMFR0 register read in __get_cpu_architecture() to 
use the read_cpuid_ext() macro.  This shortens the function and ensures 
that a memory clobber is used on the coprocessor read instruction.  The 
memory clobber works around a bug in gcc 4.5.  gcc 4.5 can reorder 
coprocessor read instructions with respect to other code, disregarding 
potential side-effects of the coprocessor read.

---

Once you've got something that you're happy with, and have reposted it to 
the public lists, I believe the next step will be for you to post it to 
rmk's patch tracker at:

http://www.arm.linux.org.uk/developer/patches/



- Paul

> 
> -- >8 --
> Date: Sun, 15 Mar 2015 17:59:53 +0100
> Subject: [PATCH] Use read_cpuid_ext() macro instead of inline asm
> 
> In commit 067e710b9a98 ("ARM: 7801/1: prevent gcc 4.5 from reordering extended
> CP15 reads above is_smp() test") Paul Walmsley fixed read_cpuid_ext() and added
> the following comment.
> 
>     The memory clobber prevents gcc 4.5 from reordering the mrc before
>     any is_smp() tests, which can cause undefined instruction aborts on
>     ARM1136 r0 due to the missing extended CP15 registers.
> 
> Signed-off-by: Mason <slash.tmp@free.fr>
> ---
>  arch/arm/kernel/setup.c | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index e55408e..1d60beb 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -246,12 +246,9 @@ static int __get_cpu_architecture(void)
>  		if (cpu_arch)
>  			cpu_arch += CPU_ARCH_ARMv3;
>  	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
> -		unsigned int mmfr0;
> -
>  		/* Revised CPUID format. Read the Memory Model Feature
>  		 * Register 0 and check for VMSAv7 or PMSAv7 */
> -		asm("mrc	p15, 0, %0, c0, c1, 4"
> -		    : "=r" (mmfr0));
> +		unsigned int mmfr0 = read_cpuid_ext(CPUID_EXT_MMFR0);
>  		if ((mmfr0 & 0x0000000f) >= 0x00000003 ||
>  		    (mmfr0 & 0x000000f0) >= 0x00000030)
>  			cpu_arch = CPU_ARCH_ARMv7;
> -- 
> 2.3.2
> 


- Paul

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-16 16:54           ` Paul Walmsley
@ 2015-03-16 22:17             ` Mason
  2015-03-16 23:30               ` Mason
  0 siblings, 1 reply; 13+ messages in thread
From: Mason @ 2015-03-16 22:17 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Paul,

On 16/03/2015 17:54, Paul Walmsley wrote:

> On Mon, 16 Mar 2015, Mason wrote:
> 
>> On 15/03/2015 18:40, Mason wrote:
>>
>>> On 13/03/2015 17:45, Russell King - ARM Linux wrote:
>>>
>>>> Yes, this one I like - and it probably fixes a potential latent bug
>>>> where the compiler was free to re-order that mrc outside of the if()
>>>> statement.
>>>>
>>>> Please wrap it up as a normal submission, thanks.
>>>
>>> Proposed patch at the end of this message.
>>>
>>> I'm now puzzling over why it's required to have "memory"
>>> in read_cpuid_ext's clobber list, and not in read_cpuid's?
> 
> Reviewed-by: Paul Walmsley <paul@pwsan.com>
> 
> Looks reasonable to me.  I'd suggest updating the patch message to 
> describe your change, and why it's needed.  Consider something like:
> 
> ---
> 
> Convert the open-coded MMFR0 register read in __get_cpu_architecture() to 
> use the read_cpuid_ext() macro.  This shortens the function and ensures 
> that a memory clobber is used on the coprocessor read instruction.  The 
> memory clobber works around a bug in gcc 4.5.  gcc 4.5 can reorder 
> coprocessor read instructions with respect to other code, disregarding 
> potential side-effects of the coprocessor read.

To be honest, the reason I wrote the patch in the first place
was merely to fix the code duplication! ;-)

I wasn't aware of the latent-bug issue until Russel mentioned
it. So I didn't want to put too much emphasis on that part,
since it didn't come from me, and it is well-documented in
your own commit, which I referenced.

Do you know why it was necessary to fix read_cpuid_ext and
not read_cpuid? I would think that the same problem affects
both macros.

> Once you've got something that you're happy with, and have reposted it to 
> the public lists, I believe the next step will be for you to post it to 
> rmk's patch tracker at:
> 
> http://www.arm.linux.org.uk/developer/patches/

Oh, I didn't know about that part. It's not mentioned in
https://www.kernel.org/doc/Documentation/SubmittingPatches

Thanks for the review, and for mentioning the tracker.

Ah yes, now I see this:
http://www.arm.linux.org.uk/mailinglists/faq.php#p1

Will post an (hopefully) improved commit message ASAP.

Regards.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* read_cpuid_id() in arch/arm/kernel/setup.c
  2015-03-16 22:17             ` Mason
@ 2015-03-16 23:30               ` Mason
  0 siblings, 0 replies; 13+ messages in thread
From: Mason @ 2015-03-16 23:30 UTC (permalink / raw)
  To: linux-arm-kernel

On 16/03/2015 23:17, Mason wrote:

> Will post an (hopefully) improved commit message ASAP.

-- >8 --
Date: Sun, 15 Mar 2015 17:59:53 +0100
Subject: [PATCH] Use read_cpuid_ext() macro instead of inline asm

Replace inline asm statement in __get_cpu_architecture() with equivalent macro
invocation, i.e. read_cpuid_ext(CPUID_EXT_MMFR0);

As an added bonus, this squashes a potential bug described by Paul Walmsley
in commit 067e710b9a98 ("ARM: 7801/1: prevent gcc 4.5 from reordering extended
CP15 reads above is_smp() test").

Signed-off-by: Mason <slash.tmp@free.fr>
---
 arch/arm/kernel/setup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index e55408e..1d60beb 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -246,12 +246,9 @@ static int __get_cpu_architecture(void)
 		if (cpu_arch)
 			cpu_arch += CPU_ARCH_ARMv3;
 	} else if ((read_cpuid_id() & 0x000f0000) == 0x000f0000) {
-		unsigned int mmfr0;
-
 		/* Revised CPUID format. Read the Memory Model Feature
 		 * Register 0 and check for VMSAv7 or PMSAv7 */
-		asm("mrc	p15, 0, %0, c0, c1, 4"
-		    : "=r" (mmfr0));
+		unsigned int mmfr0 = read_cpuid_ext(CPUID_EXT_MMFR0);
 		if ((mmfr0 & 0x0000000f) >= 0x00000003 ||
 		    (mmfr0 & 0x000000f0) >= 0x00000030)
 			cpu_arch = CPU_ARCH_ARMv7;
-- 
2.3.2

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2015-03-16 23:30 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-13 16:03 read_cpuid_id() in arch/arm/kernel/setup.c Mason
2015-03-13 16:19 ` Russell King - ARM Linux
2015-03-13 16:39   ` Mason
2015-03-13 16:45     ` Russell King - ARM Linux
2015-03-13 17:06       ` Mason
2015-03-15 17:40       ` Mason
2015-03-16  8:44         ` Mason
2015-03-16 16:54           ` Paul Walmsley
2015-03-16 22:17             ` Mason
2015-03-16 23:30               ` Mason
2015-03-13 16:56 ` Mark Rutland
2015-03-13 17:02   ` Russell King - ARM Linux
2015-03-13 18:26     ` Mark Rutland

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.