linux-kernel.vger.kernel.org archive mirror
* [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
@ 2018-11-29 14:29 Jose Abreu
  2018-11-29 14:38 ` David Laight
  2018-11-29 21:16 ` Vineet Gupta
  0 siblings, 2 replies; 12+ messages in thread
From: Jose Abreu @ 2018-11-29 14:29 UTC (permalink / raw)
  To: linux-snps-arc, linux-kernel
  Cc: Jose Abreu, Vineet Gupta, Alexey Brodkin, Joao Pinto,
	Vitor Soares, David Laight

Some ARC CPUs do not support unaligned loads/stores. Currently, the
generic implementation of reads{b/w/l}()/writes{b/w/l}() is used on ARC.
This can lead to malfunction of some drivers, as the generic functions
do a plain dereference of a pointer that can be unaligned.

Let's use the {get/put}_unaligned() helpers instead of a plain pointer
dereference in order to fix this.

Changes from v1:
- Check if buffer is already aligned (David)
- Remove 64 bit mention (Alexey)

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Tested-by: Vitor Soares <soares@synopsys.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Alexey Brodkin <abrodkin@synopsys.com>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: David Laight <David.Laight@ACULAB.COM>
---
 arch/arc/include/asm/io.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/arch/arc/include/asm/io.h b/arch/arc/include/asm/io.h
index c22b181e8206..949759a45cff 100644
--- a/arch/arc/include/asm/io.h
+++ b/arch/arc/include/asm/io.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <asm/byteorder.h>
 #include <asm/page.h>
+#include <asm/unaligned.h>
 
 #ifdef CONFIG_ISA_ARCV2
 #include <asm/barrier.h>
@@ -94,6 +95,34 @@ static inline u32 __raw_readl(const volatile void __iomem *addr)
 	return w;
 }
 
+#define __raw_readsx(t,f) \
+static inline void __raw_reads##f(const volatile void __iomem *addr, \
+				  void *buffer, unsigned int count) \
+{ \
+	if (count) { \
+		const unsigned long bptr = (unsigned long)buffer; \
+		u##t *buf = buffer; \
+\
+		do { \
+			u##t x = __raw_read##f(addr); \
+\
+			/* Some ARC CPUs don't support unaligned accesses */ \
+			if (bptr % ((t) / 8)) { \
+				put_unaligned(x, buf++); \
+			} else { \
+				*buf++ = x; \
+			} \
+		} while (--count); \
+	} \
+}
+
+#define __raw_readsb __raw_readsb
+__raw_readsx(8, b);
+#define __raw_readsw __raw_readsw
+__raw_readsx(16, w);
+#define __raw_readsl __raw_readsl
+__raw_readsx(32, l);
+
 #define __raw_writeb __raw_writeb
 static inline void __raw_writeb(u8 b, volatile void __iomem *addr)
 {
@@ -126,6 +155,32 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
 
 }
 
+#define __raw_writesx(t,f) \
+static inline void __raw_writes##f(volatile void __iomem *addr, \
+				   const void *buffer, unsigned int count) \
+{ \
+	if (count) { \
+		const unsigned long bptr = (unsigned long)buffer; \
+		const u##t *buf = buffer; \
+\
+		do { \
+			/* Some ARC CPUs don't support unaligned accesses */ \
+			if (bptr % ((t) / 8)) { \
+				__raw_write##f(get_unaligned(buf++), addr); \
+			} else { \
+				__raw_write##f(*buf++, addr); \
+			} \
+		} while (--count); \
+	} \
+}
+
+#define __raw_writesb __raw_writesb
+__raw_writesx(8, b);
+#define __raw_writesw __raw_writesw
+__raw_writesx(16, w);
+#define __raw_writesl __raw_writesl
+__raw_writesx(32, l);
+
 /*
  * MMIO can also get buffered/optimized in micro-arch, so barriers needed
  * Based on ARM model for the typical use case
@@ -141,10 +196,16 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
 #define readb(c)		({ u8  __v = readb_relaxed(c); __iormb(); __v; })
 #define readw(c)		({ u16 __v = readw_relaxed(c); __iormb(); __v; })
 #define readl(c)		({ u32 __v = readl_relaxed(c); __iormb(); __v; })
+#define readsb(p,d,l)		({ __raw_readsb(p,d,l); __iormb(); })
+#define readsw(p,d,l)		({ __raw_readsw(p,d,l); __iormb(); })
+#define readsl(p,d,l)		({ __raw_readsl(p,d,l); __iormb(); })
 
 #define writeb(v,c)		({ __iowmb(); writeb_relaxed(v,c); })
 #define writew(v,c)		({ __iowmb(); writew_relaxed(v,c); })
 #define writel(v,c)		({ __iowmb(); writel_relaxed(v,c); })
+#define writesb(p,d,l)		({ __iowmb(); __raw_writesb(p,d,l); })
+#define writesw(p,d,l)		({ __iowmb(); __raw_writesw(p,d,l); })
+#define writesl(p,d,l)		({ __iowmb(); __raw_writesl(p,d,l); })
 
 /*
  * Relaxed API for drivers which can handle barrier ordering themselves
-- 
2.7.4




* RE: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-29 14:29 [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}() Jose Abreu
@ 2018-11-29 14:38 ` David Laight
  2018-11-29 14:42   ` Jose Abreu
  2018-11-29 21:16 ` Vineet Gupta
  1 sibling, 1 reply; 12+ messages in thread
From: David Laight @ 2018-11-29 14:38 UTC (permalink / raw)
  To: 'Jose Abreu', linux-snps-arc, linux-kernel
  Cc: Vineet Gupta, Alexey Brodkin, Joao Pinto, Vitor Soares

From: Jose Abreu
> Sent: 29 November 2018 14:29
> 
> Some ARC CPUs do not support unaligned loads/stores. Currently, the
> generic implementation of reads{b/w/l}()/writes{b/w/l}() is used on ARC.
> This can lead to malfunction of some drivers, as the generic functions
> do a plain dereference of a pointer that can be unaligned.
> 
> Let's use the {get/put}_unaligned() helpers instead of a plain pointer
> dereference in order to fix this.
...
> +#define __raw_readsx(t,f) \
> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
> +				  void *buffer, unsigned int count) \
> +{ \
> +	if (count) { \
> +		const unsigned long bptr = (unsigned long)buffer; \
> +		u##t *buf = buffer; \
> +\
> +		do { \
> +			u##t x = __raw_read##f(addr); \
> +\
> +			/* Some ARC CPUs don't support unaligned accesses */ \
> +			if (bptr % ((t) / 8)) { \
> +				put_unaligned(x, buf++); \
> +			} else { \
> +				*buf++ = x; \
> +			} \
> +		} while (--count); \
> +	} \
> +}

Does the compiler move the alignment test outside the loop?
You really want two copies of the loop body.
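
Something along these lines, perhaps (untested sketch, keeping the
shape of your macro, with the test hoisted and the body duplicated):

#define __raw_readsx(t,f) \
static inline void __raw_reads##f(const volatile void __iomem *addr, \
				  void *buffer, unsigned int count) \
{ \
	u##t *buf = buffer; \
\
	if (!count) \
		return; \
\
	/* Some ARC CPUs don't support unaligned accesses */ \
	if ((unsigned long)buffer % ((t) / 8)) { \
		do { \
			put_unaligned(__raw_read##f(addr), buf++); \
		} while (--count); \
	} else { \
		do { \
			*buf++ = __raw_read##f(addr); \
		} while (--count); \
	} \
}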

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)



* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-29 14:38 ` David Laight
@ 2018-11-29 14:42   ` Jose Abreu
  2018-11-29 16:13     ` Jose Abreu
  0 siblings, 1 reply; 12+ messages in thread
From: Jose Abreu @ 2018-11-29 14:42 UTC (permalink / raw)
  To: David Laight, linux-snps-arc, linux-kernel
  Cc: Vineet Gupta, Alexey Brodkin, Joao Pinto, Vitor Soares

On 29-11-2018 14:38, David Laight wrote:
> From: Jose Abreu
>> Sent: 29 November 2018 14:29
>>
>> Some ARC CPUs do not support unaligned loads/stores. Currently, the
>> generic implementation of reads{b/w/l}()/writes{b/w/l}() is used on ARC.
>> This can lead to malfunction of some drivers, as the generic functions
>> do a plain dereference of a pointer that can be unaligned.
>>
>> Let's use the {get/put}_unaligned() helpers instead of a plain pointer
>> dereference in order to fix this.
> ...
>> +#define __raw_readsx(t,f) \
>> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
>> +				  void *buffer, unsigned int count) \
>> +{ \
>> +	if (count) { \
>> +		const unsigned long bptr = (unsigned long)buffer; \
>> +		u##t *buf = buffer; \
>> +\
>> +		do { \
>> +			u##t x = __raw_read##f(addr); \
>> +\
>> +			/* Some ARC CPUs don't support unaligned accesses */ \
>> +			if (bptr % ((t) / 8)) { \
>> +				put_unaligned(x, buf++); \
>> +			} else { \
>> +				*buf++ = x; \
>> +			} \
>> +		} while (--count); \
>> +	} \
>> +}
> Does the compiler move the alignment test outside the loop?
> You really want two copies of the loop body.

Hmm, I would expect so because the if condition takes two const
args ... I will try to check that.

Thanks and Best Regards,
Jose Miguel Abreu

>
> 	David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
>



* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-29 14:42   ` Jose Abreu
@ 2018-11-29 16:13     ` Jose Abreu
  2018-11-29 21:20       ` Arnd Bergmann
  0 siblings, 1 reply; 12+ messages in thread
From: Jose Abreu @ 2018-11-29 16:13 UTC (permalink / raw)
  To: David Laight, linux-snps-arc, linux-kernel
  Cc: Vineet Gupta, Alexey Brodkin, Joao Pinto, Vitor Soares

On 29-11-2018 14:42, Jose Abreu wrote:
> On 29-11-2018 14:38, David Laight wrote:
>> From: Jose Abreu
>>> Sent: 29 November 2018 14:29
>>>
>>> Some ARC CPUs do not support unaligned loads/stores. Currently, the
>>> generic implementation of reads{b/w/l}()/writes{b/w/l}() is used on ARC.
>>> This can lead to malfunction of some drivers, as the generic functions
>>> do a plain dereference of a pointer that can be unaligned.
>>>
>>> Let's use the {get/put}_unaligned() helpers instead of a plain pointer
>>> dereference in order to fix this.
>> ...
>>> +#define __raw_readsx(t,f) \
>>> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
>>> +				  void *buffer, unsigned int count) \
>>> +{ \
>>> +	if (count) { \
>>> +		const unsigned long bptr = (unsigned long)buffer; \
>>> +		u##t *buf = buffer; \
>>> +\
>>> +		do { \
>>> +			u##t x = __raw_read##f(addr); \
>>> +\
>>> +			/* Some ARC CPUs don't support unaligned accesses */ \
>>> +			if (bptr % ((t) / 8)) { \
>>> +				put_unaligned(x, buf++); \
>>> +			} else { \
>>> +				*buf++ = x; \
>>> +			} \
>>> +		} while (--count); \
>>> +	} \
>>> +}
>> Does the compiler move the alignment test outside the loop?
>> You really want two copies of the loop body.
> Hmm, I would expect so because the if condition takes two const
> args ... I will try to check that.

And it did optimize :)

Sample C Source:
--->8--
static noinline void test_readsl(char *buf, int len)
{
        readsl(0xdeadbeef, buf, len);
}
--->8---

And the disassembly:
--->8---
00000e88 <test_readsl>:
 e88:    breq.d r1,0,eac <0xeac>       /* if (count) */
 e8c:    and r2,r0,3

 e90:    mov_s lp_count,r1            /* r1 = count */
 e92:    brne r2,0,eb0 <0xeb0>        /* if (bptr % ((t) / 8)) */

 e96:    sub r0,r0,4
 e9a:    nop_s
 
 e9c:    lp eac <0xeac>                /* first loop */
 ea0:    ld r2,[0xdeadbeef]
 ea8:    st.a r2,[r0,4]
 eac:    j_s [blink]
 eae:    nop_s

 eb0:    lp ed6 <0xed6>                /* second loop */
 eb4:    ld r2,[0xdeadbeef]
 ebc:    lsr r5,r2,8
 ec0:    lsr r4,r2,16
 ec4:    lsr r3,r2,24
 ec8:    stb_s r2,[r0,0]
 eca:    stb r5,[r0,1]
 ece:    stb r4,[r0,2]
 ed2:    stb_s r3,[r0,3]
 ed4:    add_s r0,r0,4
 ed6:    j_s [blink]

--->8---

See how the if condition added in this version is checked at
<test_readsl+0xe92>, and how one of two different loops is then taken.

Thanks and Best Regards,
Jose Miguel Abreu

>
> Thanks and Best Regards,
> Jose Miguel Abreu
>
>> 	David
>>
>> -
>> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
>> Registration No: 1397386 (Wales)
>>



* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-29 14:29 [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}() Jose Abreu
  2018-11-29 14:38 ` David Laight
@ 2018-11-29 21:16 ` Vineet Gupta
  1 sibling, 0 replies; 12+ messages in thread
From: Vineet Gupta @ 2018-11-29 21:16 UTC (permalink / raw)
  To: Jose Abreu, linux-snps-arc, linux-kernel
  Cc: Alexey Brodkin, Joao Pinto, Vitor Soares, David Laight

On 11/29/18 6:29 AM, Jose Abreu wrote:
> Some ARC CPUs do not support unaligned loads/stores. Currently, the
> generic implementation of reads{b/w/l}()/writes{b/w/l}() is used on ARC.
> This can lead to malfunction of some drivers, as the generic functions
> do a plain dereference of a pointer that can be unaligned.
>
> Let's use the {get/put}_unaligned() helpers instead of a plain pointer
> dereference in order to fix this.

Please add a few lines of comment about what the accessor does in general - maybe
copy-paste from the generic file if needed.

> +#define __raw_readsx(t,f) \
> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
> +				  void *buffer, unsigned int count) \
> +{ \
> +	if (count) { \

Could you reduce 1 level of indentation by checking for !count and returning early?

> +		const unsigned long bptr = (unsigned long)buffer; \
> +		u##t *buf = buffer; \
> +\
> +		do { \
> +			u##t x = __raw_read##f(addr); \
> +\
> +			/* Some ARC CPUs don't support unaligned accesses */ \
> +			if (bptr % ((t) / 8)) { \

This math is loop invariant, so instead of using bptr here, have something like
is_aligned = ((unsigned long)buffer % ((t) / 8)) outside the loop. A reasonable
compiler already does that, but this one is easier on the eyes.

> +				put_unaligned(x, buf++); \
> +			} else { \
> +				*buf++ = x; \
> +			} \
> +		} while (--count); \
> +	} \
> +}

Per your later post the compiler is doing a good job here, but sometimes it
might not. Adding a condition inside a loop is not a good idea in general. Better
to code the 2 loops separately.

> +
> +#define __raw_readsb __raw_readsb
> +__raw_readsx(8, b);

The trailing ; is not needed

> +#define __raw_readsw __raw_readsw
> +__raw_readsx(16, w);
> +#define __raw_readsl __raw_readsl
> +__raw_readsx(32, l);

> +
>  #define __raw_writeb __raw_writeb
>  static inline void __raw_writeb(u8 b, volatile void __iomem *addr)
>  {
> @@ -126,6 +155,32 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
>  
>  }
>  
> +#define __raw_writesx(t,f) \
> +static inline void __raw_writes##f(volatile void __iomem *addr, \
> +				   const void *buffer, unsigned int count) \
> +{ \
> +	if (count) { \
> +		const unsigned long bptr = (unsigned long)buffer; \
> +		const u##t *buf = buffer; \
> +\
> +		do { \
> +			/* Some ARC CPUs don't support unaligned accesses */ \
> +			if (bptr % ((t) / 8)) { \
> +				__raw_write##f(get_unaligned(buf++), addr); \
> +			} else { \
> +				__raw_write##f(*buf++, addr); \
> +			} \
> +		} while (--count); \
> +	} \
> +}

Same as for read !
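
Something like below is what I have in mind for the write side -
completely untested sketch, with the early return and the loop
invariant hoisted out:

#define __raw_writesx(t,f) \
static inline void __raw_writes##f(volatile void __iomem *addr, \
				   const void *buffer, unsigned int count) \
{ \
	const int is_aligned = !((unsigned long)buffer % ((t) / 8)); \
	const u##t *buf = buffer; \
\
	if (!count) \
		return; \
\
	/* Some ARC CPUs don't support unaligned accesses */ \
	if (is_aligned) { \
		do { \
			__raw_write##f(*buf++, addr); \
		} while (--count); \
	} else { \
		do { \
			__raw_write##f(get_unaligned(buf++), addr); \
		} while (--count); \
	} \
}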

> +
> +#define __raw_writesb __raw_writesb
> +__raw_writesx(8, b);
> +#define __raw_writesw __raw_writesw
> +__raw_writesx(16, w);
> +#define __raw_writesl __raw_writesl
> +__raw_writesx(32, l);

Ditto !



* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-29 16:13     ` Jose Abreu
@ 2018-11-29 21:20       ` Arnd Bergmann
  2018-11-30  8:56         ` Jose Abreu
  0 siblings, 1 reply; 12+ messages in thread
From: Arnd Bergmann @ 2018-11-29 21:20 UTC (permalink / raw)
  To: jose.abreu
  Cc: David Laight, open list:SYNOPSYS ARC ARCHITECTURE,
	Linux Kernel Mailing List, Vineet Gupta, alexey.brodkin,
	Joao Pinto, Vitor Soares

On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:

> --->8--
> static noinline void test_readsl(char *buf, int len)
> {
>         readsl(0xdeadbeef, buf, len);
> }
> --->8---
>
> And the disassembly:
> --->8---
> 00000e88 <test_readsl>:
>  e88:    breq.d r1,0,eac <0xeac>       /* if (count) */
>  e8c:    and r2,r0,3
>
>  e90:    mov_s lp_count,r1            /* r1 = count */
>  e92:    brne r2,0,eb0 <0xeb0>        /* if (bptr % ((t) / 8)) */
>
>  e96:    sub r0,r0,4
>  e9a:    nop_s
>
>  e9c:    lp eac <0xeac>                /* first loop */
>  ea0:    ld r2,[0xdeadbeef]
>  ea8:    st.a r2,[r0,4]
>  eac:    j_s [blink]
>  eae:    nop_s
>
>  eb0:    lp ed6 <0xed6>                /* second loop */
>  eb4:    ld r2,[0xdeadbeef]
>  ebc:    lsr r5,r2,8
>  ec0:    lsr r4,r2,16
>  ec4:    lsr r3,r2,24
>  ec8:    stb_s r2,[r0,0]
>  eca:    stb r5,[r0,1]
>  ece:    stb r4,[r0,2]
>  ed2:    stb_s r3,[r0,3]
>  ed4:    add_s r0,r0,4
>  ed6:    j_s [blink]
>
> --->8---
>
> See how the if condition added in this version is checked at
> <test_readsl+0xe92>, and how one of two different loops is then taken.

This looks good to me. I wonder what the result is for CPUs
that /do/ support unaligned accesses. Normally put_unaligned()
should fall back to a simple store in that case, but I'm not
sure it can fold the two stores back into one and skip the
alignment check. Probably not worth overoptimizing for that
case (the MMIO access latency should be much higher than
anything you could gain here), but I'm still curious about
how well our get/put_unaligned macros work.
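
For reference, the generic fallback is (roughly, from memory - see
include/linux/unaligned/packed_struct.h) a store through a __packed
struct, which the compiler is free to emit as a plain word store on
such CPUs:

/* The __packed struct tells the compiler the pointer may be
 * misaligned; it then emits whatever access sequence the target
 * supports - a single store where unaligned accesses are legal. */
struct __una_u32 { u32 x; } __packed;

static inline void __put_unaligned_cpu32(u32 val, void *p)
{
	struct __una_u32 *ptr = (struct __una_u32 *)p;

	ptr->x = val;
}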

       Arnd


* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-29 21:20       ` Arnd Bergmann
@ 2018-11-30  8:56         ` Jose Abreu
  2018-11-30 13:44           ` Arnd Bergmann
  0 siblings, 1 reply; 12+ messages in thread
From: Jose Abreu @ 2018-11-30  8:56 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: David Laight, open list:SYNOPSYS ARC ARCHITECTURE,
	Linux Kernel Mailing List, Vineet Gupta, alexey.brodkin,
	Joao Pinto, Vitor Soares

On 29-11-2018 21:20, Arnd Bergmann wrote:
> On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:
>
>> --->8--
>> static noinline void test_readsl(char *buf, int len)
>> {
>>         readsl(0xdeadbeef, buf, len);
>> }
>> --->8---
>>
>> And the disassembly:
>> --->8---
>> 00000e88 <test_readsl>:
>>  e88:    breq.d r1,0,eac <0xeac>       /* if (count) */
>>  e8c:    and r2,r0,3
>>
>>  e90:    mov_s lp_count,r1            /* r1 = count */
>>  e92:    brne r2,0,eb0 <0xeb0>        /* if (bptr % ((t) / 8)) */
>>
>>  e96:    sub r0,r0,4
>>  e9a:    nop_s
>>
>>  e9c:    lp eac <0xeac>                /* first loop */
>>  ea0:    ld r2,[0xdeadbeef]
>>  ea8:    st.a r2,[r0,4]
>>  eac:    j_s [blink]
>>  eae:    nop_s
>>
>>  eb0:    lp ed6 <0xed6>                /* second loop */
>>  eb4:    ld r2,[0xdeadbeef]
>>  ebc:    lsr r5,r2,8
>>  ec0:    lsr r4,r2,16
>>  ec4:    lsr r3,r2,24
>>  ec8:    stb_s r2,[r0,0]
>>  eca:    stb r5,[r0,1]
>>  ece:    stb r4,[r0,2]
>>  ed2:    stb_s r3,[r0,3]
>>  ed4:    add_s r0,r0,4
>>  ed6:    j_s [blink]
>>
>> --->8---
>>
>> See how the if condition added in this version is checked at
>> <test_readsl+0xe92>, and how one of two different loops is then taken.
> This looks good to me. I wonder what the result is for CPUs
> that /do/ support unaligned accesses. Normally put_unaligned()
> should fall back to a simple store in that case, but I'm not
> sure it can fold the two stores back into one and skip the
> alignment check. Probably not worth overoptimizing for that
> case (the MMIO access latency should be much higher than
> anything you could gain here), but I'm still curious about
> how well our get/put_unaligned macros work.

Here is disassembly for an ARC CPU that supports unaligned accesses:

-->8---
00000d48 <test_readsl>:
 d48:    breq_s r1,0,28            /* if (count) */
 d4a:    tst    r0,0x3
 d4e:    bne_s 32                /* if (bptr % ((t) / 8)) */
 
 d50:    ld r2,[0xdeadbeef]        /* first loop */
 d58:    sub_s r1,r1,0x1
 d5a:    tst_s r1,r1
 d5c:    bne.d -12
 d60:    st.ab r2,[r0,4]
 
 d64:    dmb    0x1                    /* common exit point */
 d68:    j_s    [blink]
 d6a:    nop_s
 
 d6c:    ld r2,[0xdeadbeef]        /* second loop */
 d74:    sub_s r1,r1,0x1
 d76:    tst_s r1,r1
 d78:    bne.d -12
 d7c:    st.ab r2,[r0,4]

 d80:    b_s -28                    /* jmp to 0xd64 */
 d82:    nop_s
--->8---

Notice how the first and second loops are exactly equal ...

Thanks and Best Regards,
Jose Miguel Abreu

>
>        Arnd



* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-30  8:56         ` Jose Abreu
@ 2018-11-30 13:44           ` Arnd Bergmann
  2018-11-30 13:57             ` David Laight
  0 siblings, 1 reply; 12+ messages in thread
From: Arnd Bergmann @ 2018-11-30 13:44 UTC (permalink / raw)
  To: jose.abreu
  Cc: David Laight, open list:SYNOPSYS ARC ARCHITECTURE,
	Linux Kernel Mailing List, Vineet Gupta, alexey.brodkin,
	Joao Pinto, Vitor Soares

On Fri, Nov 30, 2018 at 9:57 AM Jose Abreu <jose.abreu@synopsys.com> wrote:
> On 29-11-2018 21:20, Arnd Bergmann wrote:
> > On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:
> >> See how the if condition added in this version is checked at
> >> <test_readsl+0xe92>, and how one of two different loops is then taken.
> > This looks good to me. I wonder what the result is for CPUs
> > that /do/ support unaligned accesses. Normally put_unaligned()
> > should fall back to a simple store in that case, but I'm not
> > sure it can fold the two stores back into one and skip the
> > alignment check. Probably not worth overoptimizing for that
> > case (the MMIO access latency should be much higher than
> > anything you could gain here), but I'm still curious about
> > how well our get/put_unaligned macros work.
>
> Here is disassembly for an ARC CPU that supports unaligned accesses:
>
> -->8---
> 00000d48 <test_readsl>:
>  d48:    breq_s r1,0,28            /* if (count) */
>  d4a:    tst    r0,0x3
>  d4e:    bne_s 32                /* if (bptr % ((t) / 8)) */
>
>  d50:    ld r2,[0xdeadbeef]        /* first loop */
>  d58:    sub_s r1,r1,0x1
>  d5a:    tst_s r1,r1
>  d5c:    bne.d -12
>  d60:    st.ab r2,[r0,4]
>
>  d64:    dmb    0x1                    /* common exit point */
>  d68:    j_s    [blink]
>  d6a:    nop_s
>
>  d6c:    ld r2,[0xdeadbeef]        /* second loop */
>  d74:    sub_s r1,r1,0x1
>  d76:    tst_s r1,r1
>  d78:    bne.d -12
>  d7c:    st.ab r2,[r0,4]
>
>  d80:    b_s -28                    /* jmp to 0xd64 */
>  d82:    nop_s
> --->8---
>
> Notice how the first and second loops are exactly equal ...

Ok, so it's halfway there: it managed to optimize the unaligned
case correctly, but it failed to notice that both sides are
identical now.
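
If it ever matters, the test could also be short-circuited at build
time so that such configurations compile only one loop. An untested
sketch, assuming HAVE_EFFICIENT_UNALIGNED_ACCESS is the right knob
to key off here:

#define __raw_readsx(t,f) \
static inline void __raw_reads##f(const volatile void __iomem *addr, \
				  void *buffer, unsigned int count) \
{ \
	u##t *buf = buffer; \
\
	if (!count) \
		return; \
\
	/* Compile-time true when the arch advertises efficient \
	 * unaligned accesses, so the put_unaligned() loop below \
	 * becomes dead code and gets dropped entirely. */ \
	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || \
	    !((unsigned long)buffer % ((t) / 8))) { \
		do { \
			*buf++ = __raw_read##f(addr); \
		} while (--count); \
	} else { \
		do { \
			put_unaligned(__raw_read##f(addr), buf++); \
		} while (--count); \
	} \
}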

      Arnd


* RE: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-30 13:44           ` Arnd Bergmann
@ 2018-11-30 13:57             ` David Laight
  2018-11-30 19:00               ` Vineet Gupta
  0 siblings, 1 reply; 12+ messages in thread
From: David Laight @ 2018-11-30 13:57 UTC (permalink / raw)
  To: 'Arnd Bergmann', jose.abreu
  Cc: open list:SYNOPSYS ARC ARCHITECTURE, Linux Kernel Mailing List,
	Vineet Gupta, alexey.brodkin, Joao Pinto, Vitor Soares

From: Arnd Bergmann
> Sent: 30 November 2018 13:44
> 
> On Fri, Nov 30, 2018 at 9:57 AM Jose Abreu <jose.abreu@synopsys.com> wrote:
> > On 29-11-2018 21:20, Arnd Bergmann wrote:
> > > On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:
> > >> See how the if condition added in this version is checked at
> > >> <test_readsl+0xe92>, and how one of two different loops is then taken.
> > > This looks good to me. I wonder what the result is for CPUs
> > > that /do/ support unaligned accesses. Normally put_unaligned()
> > > should fall back to a simple store in that case, but I'm not
> > > sure it can fold the two stores back into one and skip the
> > > alignment check. Probably not worth overoptimizing for that
> > > case (the MMIO access latency should be much higher than
> > > anything you could gain here), but I'm still curious about
> > > how well our get/put_unaligned macros work.
> >
> > Here is disassembly for an ARC CPU that supports unaligned accesses:
> >
> > -->8---
> > 00000d48 <test_readsl>:
> >  d48:    breq_s r1,0,28            /* if (count) */
> >  d4a:    tst    r0,0x3
> >  d4e:    bne_s 32                /* if (bptr % ((t) / 8)) */
> >
> >  d50:    ld r2,[0xdeadbeef]        /* first loop */
> >  d58:    sub_s r1,r1,0x1
> >  d5a:    tst_s r1,r1
> >  d5c:    bne.d -12
> >  d60:    st.ab r2,[r0,4]
> >
> >  d64:    dmb    0x1                    /* common exit point */
> >  d68:    j_s    [blink]
> >  d6a:    nop_s
> >
> >  d6c:    ld r2,[0xdeadbeef]        /* second loop */
> >  d74:    sub_s r1,r1,0x1
> >  d76:    tst_s r1,r1
> >  d78:    bne.d -12
> >  d7c:    st.ab r2,[r0,4]
> >
> >  d80:    b_s -28                    /* jmp to 0xd64 */
> >  d82:    nop_s
> > --->8---
> >
> > Notice how the first and second loops are exactly equal ...
> 
> Ok, so it's halfway there: it managed to optimize the unaligned
> case correctly, but it failed to notice that both sides are
> identical now.

There're even identical opcodes...
The barrier() (etc) in the asm output probably stopped the optimisation.

It also seems to have used a different type of loop to the
other example, probably less efficient.
(Not that I'm an expert on ARC opcodes.)

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-30 13:57             ` David Laight
@ 2018-11-30 19:00               ` Vineet Gupta
  2018-12-03 10:10                 ` David Laight
  0 siblings, 1 reply; 12+ messages in thread
From: Vineet Gupta @ 2018-11-30 19:00 UTC (permalink / raw)
  To: David Laight, 'Arnd Bergmann', jose.abreu
  Cc: open list:SYNOPSYS ARC ARCHITECTURE, Linux Kernel Mailing List,
	Vineet Gupta, alexey.brodkin, Joao Pinto, Vitor Soares

On 11/30/18 5:57 AM, David Laight wrote:
> There're even identical opcodes...
> The barrier() (etc) in the asm output probably stopped the optimisation.
>
> It also seems to have used a different type of loop to the
> other example, probably less efficient.
> (Not that I'm an expert on ARC opcodes.)

The difference is due to the ISA and the ensuing ARC gcc backends. ARCompact-based
cores don't support unaligned access and the loop there was a ZOL (zero-overhead
loop). In ARCv2-based cores, the gcc backend has been tweaked to generate fewer
ZOLs, hence you see the more canonical tst-and-branch style loop.

-Vineet


* RE: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-11-30 19:00               ` Vineet Gupta
@ 2018-12-03 10:10                 ` David Laight
  2018-12-03 17:31                   ` Vineet Gupta
  0 siblings, 1 reply; 12+ messages in thread
From: David Laight @ 2018-12-03 10:10 UTC (permalink / raw)
  To: 'Vineet Gupta', 'Arnd Bergmann', jose.abreu
  Cc: open list:SYNOPSYS ARC ARCHITECTURE, Linux Kernel Mailing List,
	alexey.brodkin, Joao Pinto, Vitor Soares

From: Vineet Gupta
...
> > It also seems to have used a different type of loop to the
> > other example, probably less efficient.
> > (Not that I'm an expert on ARC opcodes.)
> 
> The difference is due to the ISA and the ensuing ARC gcc backends. ARCompact-based
> cores don't support unaligned access and the loop there was a ZOL (zero-overhead
> loop). In ARCv2-based cores, the gcc backend has been tweaked to generate fewer
> ZOLs, hence you see the more canonical tst-and-branch style loop.

Is this another case of the hardware implementing 'hardware' loop
instructions that execute slower than ones made of simple instructions?

The worst example has to be the x86 'loop' (dec cx and jump nz)
instruction, which is microcoded on Intel CPUs.
That makes it very difficult to use the new adcx/adox instructions to
get two dependency chains through a loop.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)



* Re: [PATCH v2] ARC: io.h: Implement reads{x}()/writes{x}()
  2018-12-03 10:10                 ` David Laight
@ 2018-12-03 17:31                   ` Vineet Gupta
  0 siblings, 0 replies; 12+ messages in thread
From: Vineet Gupta @ 2018-12-03 17:31 UTC (permalink / raw)
  To: David Laight, 'Arnd Bergmann', jose.abreu
  Cc: open list:SYNOPSYS ARC ARCHITECTURE, Linux Kernel Mailing List,
	alexey.brodkin, Joao Pinto, Vitor Soares

On 12/3/18 2:10 AM, David Laight wrote:
> From: Vineet Gupta
> ...
>>> It also seems to have used a different type of loop to the
>>> other example, probably less efficient.
>>> (Not that I'm an expert on ARC opcodes.)
>> The difference is due to the ISA and the ensuing ARC gcc backends. ARCompact-based
>> cores don't support unaligned access and the loop there was a ZOL (zero-overhead
>> loop). In ARCv2-based cores, the gcc backend has been tweaked to generate fewer
>> ZOLs, hence you see the more canonical tst-and-branch style loop.
> Is this another case of the hardware implementing 'hardware' loop
> instructions that execute slower than ones made of simple instructions?

Not really. ZOLs allow for hardware loops with no instruction/cycle overhead in
general. However, as micro-arches get more complicated, there are newer "gizmos"
added to the machinery which sometimes make it harder for the compilers to
optimize for all the cases. The ARCv2 ISA has a new DBNZ instruction (similar to
the x86 one you refer to below) to implement loops, and that is preferred over
the ZOL.

> The worst example has to be the x86 'loop' (dec cx and jump nz)
> instruction, which is microcoded on Intel CPUs.
> That makes it very difficult to use the new adcx/adox instructions to
> get two dependency chains through a loop.

