* [PATCH 1/2] ARCv2: LIB: memset: fix doing prefetchw outside of buffer
From: Eugeniy Paltsev @ 2019-01-14 15:16 UTC
  To: linux-snps-arc, Vineet Gupta
  Cc: linux-kernel, Alexey Brodkin, Eugeniy Paltsev

The current ARCv2 memset implementation may execute the 'prefetchw'
instruction for an address which lies outside of the memset area.
That leaves one modified (dirty) cache line outside of the memset
area, which may lead to data corruption if this area is used
for DMA IO.
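
A minimal C model of that failure mode (illustrative only; the
prefetch-ahead step and names are assumptions, not the actual asm):

    #include <stddef.h>
    #include <stdint.h>

    /* The store loops prefetch one step ahead of the write pointer,
     * so the final iteration can dirty a cache line that the stores
     * themselves never touch. */
    static int prefetchw_overshoots(uintptr_t buf, size_t len,
                                    uintptr_t p, size_t step)
    {
        uintptr_t target = p + step;    /* e.g. prefetchw [r3, 32] */
        return target >= buf + len;     /* beyond the memset area */
    }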

Another issue is that the current ARCv2 memset implementation
may execute the 'prealloc' instruction for an L1 cache line which
doesn't fully belong to the memset area in the case of a 128B L1 D$
line length. That leads to data corruption.
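
For 'prealloc' the constraint is stricter, since it establishes the
whole line in the cache zero-filled without reading memory. A sketch
of the safety condition (names and the parameterized line size are
illustrative, not from the kernel sources):

    #include <stddef.h>
    #include <stdint.h>

    /* prealloc destroys every byte of the target line, so the whole
     * line must lie inside [buf, buf + len); with 128B lines a target
     * that is only 64B-aligned can pull in 64B beyond the area. */
    static int prealloc_is_safe(uintptr_t buf, size_t len,
                                uintptr_t target, size_t line)
    {
        uintptr_t start = target & ~(uintptr_t)(line - 1);
        return start >= buf && start + line <= buf + len;
    }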

Fix the use of the prefetchw/prealloc instructions in the case of a
64B L1 data cache line (the default case), and don't use prefetch*
instructions for the other possible L1 data cache line lengths
(32B and 128B).

Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
---
 arch/arc/lib/memset-archs.S | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
index 62ad4bcb841a..c7717832336f 100644
--- a/arch/arc/lib/memset-archs.S
+++ b/arch/arc/lib/memset-archs.S
@@ -7,11 +7,32 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/cache.h>
 
 #undef PREALLOC_NOT_AVAIL
 
+/*
+ * The memset implementation below is optimized to use prefetchw and prealloc
+ * instructions in case of a CPU with a 64B L1 data cache line
+ * (L1_CACHE_SHIFT == 6). If you want to implement an optimized memset for
+ * the other possible L1 data cache line lengths (32B and 128B), rewrite the
+ * code carefully, checking that no prefetchw/prealloc instruction is issued
+ * for L1 cache lines which don't belong to the memset area.
+ */
+#if L1_CACHE_SHIFT!=6
+# define PREALLOC_INSTR(...)
+# define PREFETCHW_INSTR(...)
+#else  /* L1_CACHE_SHIFT!=6 */
+# define PREFETCHW_INSTR(...)	prefetchw __VA_ARGS__
+# ifdef PREALLOC_NOT_AVAIL
+#  define PREALLOC_INSTR(...)	prefetchw __VA_ARGS__
+# else
+#  define PREALLOC_INSTR(...)	prealloc __VA_ARGS__
+# endif
+#endif /* L1_CACHE_SHIFT!=6 */
+
 ENTRY_CFI(memset)
-	prefetchw [r0]		; Prefetch the write location
+	PREFETCHW_INSTR([r0])	; Prefetch the first write location
 	mov.f	0, r2
 ;;; if size is zero
 	jz.d	[blink]
@@ -48,11 +69,7 @@ ENTRY_CFI(memset)
 
 	lpnz	@.Lset64bytes
 	;; LOOP START
-#ifdef PREALLOC_NOT_AVAIL
-	prefetchw [r3, 64]	;Prefetch the next write location
-#else
-	prealloc  [r3, 64]
-#endif
+	PREALLOC_INSTR([r3, 64]) ;Prefetch the next write location
 #ifdef CONFIG_ARC_HAS_LL64
 	std.ab	r4, [r3, 8]
 	std.ab	r4, [r3, 8]
@@ -85,7 +102,6 @@ ENTRY_CFI(memset)
 	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
 	lpnz	.Lset32bytes
 	;; LOOP START
-	prefetchw   [r3, 32]	;Prefetch the next write location
 #ifdef CONFIG_ARC_HAS_LL64
 	std.ab	r4, [r3, 8]
 	std.ab	r4, [r3, 8]
-- 
2.14.5



* [PATCH 2/2] ARCv2: LIB: memset: move st instruction series to macros
From: Eugeniy Paltsev @ 2019-01-14 15:16 UTC
  To: linux-snps-arc, Vineet Gupta
  Cc: linux-kernel, Alexey Brodkin, Eugeniy Paltsev

Move the 32-byte store instruction series into a macro to make the code
more readable.

Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
---
 arch/arc/lib/memset-archs.S | 63 +++++++++++++++------------------------------
 1 file changed, 21 insertions(+), 42 deletions(-)

diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
index c7717832336f..4a0d562d23cd 100644
--- a/arch/arc/lib/memset-archs.S
+++ b/arch/arc/lib/memset-archs.S
@@ -31,6 +31,24 @@
 # endif
 #endif /* L1_CACHE_SHIFT!=6 */
 
+.macro MEMSET_32BYTE		val, adr
+#ifdef CONFIG_ARC_HAS_LL64
+	std.ab	\val, [\adr, 8]
+	std.ab	\val, [\adr, 8]
+	std.ab	\val, [\adr, 8]
+	std.ab	\val, [\adr, 8]
+#else
+	st.ab	\val, [\adr, 4]
+	st.ab	\val, [\adr, 4]
+	st.ab	\val, [\adr, 4]
+	st.ab	\val, [\adr, 4]
+	st.ab	\val, [\adr, 4]
+	st.ab	\val, [\adr, 4]
+	st.ab	\val, [\adr, 4]
+	st.ab	\val, [\adr, 4]
+#endif
+.endm
+
 ENTRY_CFI(memset)
 	PREFETCHW_INSTR([r0])	; Prefetch the first write location
 	mov.f	0, r2
@@ -70,53 +88,14 @@ ENTRY_CFI(memset)
 	lpnz	@.Lset64bytes
 	;; LOOP START
 	PREALLOC_INSTR([r3, 64]) ;Prefetch the next write location
-#ifdef CONFIG_ARC_HAS_LL64
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
+	MEMSET_32BYTE	r4, r3
+	MEMSET_32BYTE	r4, r3
 .Lset64bytes:
 
 	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
 	lpnz	.Lset32bytes
 	;; LOOP START
-#ifdef CONFIG_ARC_HAS_LL64
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
+	MEMSET_32BYTE	r4, r3
 .Lset32bytes:
 
 	and.f	lp_count, r2, 0x1F ;Last remaining 31 bytes
-- 
2.14.5



* Re: [PATCH 1/2] ARCv2: LIB: memset: fix doing prefetchw outside of buffer
From: Vineet Gupta @ 2019-01-15  1:09 UTC
  To: Eugeniy Paltsev, linux-snps-arc; +Cc: linux-kernel, Alexey Brodkin

On 1/14/19 7:17 AM, Eugeniy Paltsev wrote:
> The current ARCv2 memset implementation may execute the 'prefetchw'
> instruction for an address which lies outside of the memset area.
> That leaves one modified (dirty) cache line outside of the memset
> area, which may lead to data corruption if this area is used
> for DMA IO.
>
> Another issue is that the current ARCv2 memset implementation
> may execute the 'prealloc' instruction for an L1 cache line which
> doesn't fully belong to the memset area in the case of a 128B L1 D$
> line length. That leads to data corruption.
>
> Fix the use of the prefetchw/prealloc instructions in the case of a
> 64B L1 data cache line (the default case), and don't use prefetch*
> instructions for the other possible L1 data cache line lengths
> (32B and 128B).
>
> Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
> ---
>  arch/arc/lib/memset-archs.S | 30 +++++++++++++++++++++++-------
>  1 file changed, 23 insertions(+), 7 deletions(-)
>
> diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
> index 62ad4bcb841a..c7717832336f 100644
> --- a/arch/arc/lib/memset-archs.S
> +++ b/arch/arc/lib/memset-archs.S
> @@ -7,11 +7,32 @@
>   */
>  
>  #include <linux/linkage.h>
> +#include <asm/cache.h>
>  
>  #undef PREALLOC_NOT_AVAIL
>  
> +/*
> + * The memset implementation below is optimized to use prefetchw and prealloc
> + * instructions in case of a CPU with a 64B L1 data cache line
> + * (L1_CACHE_SHIFT == 6). If you want to implement an optimized memset for
> + * the other possible L1 data cache line lengths (32B and 128B), rewrite the
> + * code carefully, checking that no prefetchw/prealloc instruction is issued
> + * for L1 cache lines which don't belong to the memset area.

Good point. FWIW, it is possible to support those non-common line lengths by
using L1_CACHE_SHIFT etc. in the asm code below, but I agree it's not worth
the trouble.

> + */
> +#if L1_CACHE_SHIFT!=6
> +# define PREALLOC_INSTR(...)
> +# define PREFETCHW_INSTR(...)
> +#else  /* L1_CACHE_SHIFT!=6 */
> +# define PREFETCHW_INSTR(...)	prefetchw __VA_ARGS__
> +# ifdef PREALLOC_NOT_AVAIL
> +#  define PREALLOC_INSTR(...)	prefetchw __VA_ARGS__
> +# else
> +#  define PREALLOC_INSTR(...)	prealloc __VA_ARGS__
> +# endif
> +#endif /* L1_CACHE_SHIFT!=6 */
> +
>  ENTRY_CFI(memset)
> -	prefetchw [r0]		; Prefetch the write location
> +	PREFETCHW_INSTR([r0])	; Prefetch the first write location
>  	mov.f	0, r2
>  ;;; if size is zero
>  	jz.d	[blink]
> @@ -48,11 +69,7 @@ ENTRY_CFI(memset)
>  
>  	lpnz	@.Lset64bytes
>  	;; LOOP START
> -#ifdef PREALLOC_NOT_AVAIL
> -	prefetchw [r3, 64]	;Prefetch the next write location
> -#else
> -	prealloc  [r3, 64]
> -#endif
> +	PREALLOC_INSTR([r3, 64]) ;Prefetch the next write location

These are not solving the issue - I'd break this up and move these bits to your
next patch.

>  #ifdef CONFIG_ARC_HAS_LL64
>  	std.ab	r4, [r3, 8]
>  	std.ab	r4, [r3, 8]
> @@ -85,7 +102,6 @@ ENTRY_CFI(memset)
>  	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
>  	lpnz	.Lset32bytes
>  	;; LOOP START
> -	prefetchw   [r3, 32]	;Prefetch the next write location

So the real fix for the issue at hand is this line. I'll keep this for the fix
and beef up the changelog. The thing is, the existing code was already skipping
the last 64B from the main loop (thus avoiding prefetching the next line), but
the prefetchw reintroduced in the last 32B loop spoiled the party. That
prefetchw was pointless anyway.
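
For example (a hypothetical run; sizes assumed): with 96 bytes left
for the 32B loop, its third and last iteration would issue prefetchw
on buf + 96 == buf + len, i.e. on the line right after the buffer:

    #include <stddef.h>
    #include <stdint.h>

    /* C model of the removed prefetchw (not the actual asm) */
    static void set32_loop_model(uintptr_t buf, size_t len /* 96 */)
    {
        for (size_t done = 0; done + 32 <= len; done += 32) {
            uintptr_t target = buf + done + 32; /* prefetchw [r3, 32] */
            (void)target;   /* last pass: target == buf + len */
        }
    }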

-Vineet


* Re: [PATCH 1/2] ARCv2: LIB: memset: fix doing prefetchw outside of buffer
From: Eugeniy Paltsev @ 2019-01-15 12:00 UTC
  To: Vineet Gupta, linux-snps-arc; +Cc: linux-kernel, Alexey Brodkin

Hi Vineet,

On Tue, 2019-01-15 at 01:09 +0000, Vineet Gupta wrote:
> On 1/14/19 7:17 AM, Eugeniy Paltsev wrote:
> > The current ARCv2 memset implementation may execute the 'prefetchw'
> > instruction for an address which lies outside of the memset area.
> > That leaves one modified (dirty) cache line outside of the memset
> > area, which may lead to data corruption if this area is used
> > for DMA IO.
> > 
> > Another issue is that the current ARCv2 memset implementation
> > may execute the 'prealloc' instruction for an L1 cache line which
> > doesn't fully belong to the memset area in the case of a 128B L1 D$
> > line length. That leads to data corruption.
> > 
> > Fix the use of the prefetchw/prealloc instructions in the case of a
> > 64B L1 data cache line (the default case), and don't use prefetch*
> > instructions for the other possible L1 data cache line lengths
> > (32B and 128B).
> > 
> > Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
> > ---
> >  arch/arc/lib/memset-archs.S | 30 +++++++++++++++++++++++-------
> >  1 file changed, 23 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
> > index 62ad4bcb841a..c7717832336f 100644
> > --- a/arch/arc/lib/memset-archs.S
> > +++ b/arch/arc/lib/memset-archs.S
> > @@ -7,11 +7,32 @@
> >   */
> >  
> >  #include <linux/linkage.h>
> > +#include <asm/cache.h>
> >  
> >  #undef PREALLOC_NOT_AVAIL
> >  
> > +/*
> > + * The memset implementation below is optimized to use prefetchw and prealloc
> > + * instructions in case of a CPU with a 64B L1 data cache line
> > + * (L1_CACHE_SHIFT == 6). If you want to implement an optimized memset for
> > + * the other possible L1 data cache line lengths (32B and 128B), rewrite the
> > + * code carefully, checking that no prefetchw/prealloc instruction is issued
> > + * for L1 cache lines which don't belong to the memset area.
> 
> Good point. FWIW, it is possible to support those non-common line lengths by
> using L1_CACHE_SHIFT etc. in the asm code below, but I agree it's not worth
> the trouble.
> 
> > + */
> > +#if L1_CACHE_SHIFT!=6
> > +# define PREALLOC_INSTR(...)
> > +# define PREFETCHW_INSTR(...)
> > +#else  /* L1_CACHE_SHIFT!=6 */
> > +# define PREFETCHW_INSTR(...)	prefetchw __VA_ARGS__
> > +# ifdef PREALLOC_NOT_AVAIL
> > +#  define PREALLOC_INSTR(...)	prefetchw __VA_ARGS__
> > +# else
> > +#  define PREALLOC_INSTR(...)	prealloc __VA_ARGS__
> > +# endif
> > +#endif /* L1_CACHE_SHIFT!=6 */
> > +
> >  ENTRY_CFI(memset)
> > -	prefetchw [r0]		; Prefetch the write location
> > +	PREFETCHW_INSTR([r0])	; Prefetch the first write location
> >  	mov.f	0, r2
> >  ;;; if size is zero
> >  	jz.d	[blink]
> > @@ -48,11 +69,7 @@ ENTRY_CFI(memset)
> >  
> >  	lpnz	@.Lset64bytes
> >  	;; LOOP START
> > -#ifdef PREALLOC_NOT_AVAIL
> > -	prefetchw [r3, 64]	;Prefetch the next write location
> > -#else
> > -	prealloc  [r3, 64]
> > -#endif
> > +	PREALLOC_INSTR([r3, 64]) ;Prefetch the next write location
> 
> These are not solving the issue - I'd break this up and move these bits to your
> next patch.

Actually these are solving another issue - the current implementation may
execute the 'prealloc' instruction for an L1 cache line which doesn't fully
belong to the memset area in the case of a 128B L1 D$ line length. As
'prealloc' fills the cache line with zeros, this leads to data corruption.
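
A worked example of that case (addresses assumed for illustration):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* memset area [0x1000, 0x10c0), 128B D$ lines: the 64B-loop
         * pass writing at 0x1040 issues prealloc [r3, 64] on 0x1080 */
        uintptr_t end = 0x1000 + 192, line = 128, target = 0x1080;
        uintptr_t lstart = target & ~(line - 1);    /* 0x1080 */
        uintptr_t lend = lstart + line;             /* 0x1100 */

        if (lend > end) /* 64 bytes at [0x10c0, 0x1100) get zeroed */
            printf("prealloc clobbers %#lx..%#lx outside the area\n",
                   (unsigned long)end, (unsigned long)lend);
        return 0;
    }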

So I would rather keep these changes in this 'fix' patch.


BTW, I forgot again to add Cc: stable@vger.kernel.org; could you add it for me
when applying the patch?
Thanks.


> >  #ifdef CONFIG_ARC_HAS_LL64
> >  	std.ab	r4, [r3, 8]
> >  	std.ab	r4, [r3, 8]
> > @@ -85,7 +102,6 @@ ENTRY_CFI(memset)
> >  	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
> >  	lpnz	.Lset32bytes
> >  	;; LOOP START
> > -	prefetchw   [r3, 32]	;Prefetch the next write location
> 
> So the real fix for the issue at hand is this line. I'll keep this for the fix
> and beef up the changelog. The thing is, the existing code was already skipping
> the last 64B from the main loop (thus avoiding prefetching the next line), but
> the prefetchw reintroduced in the last 32B loop spoiled the party. That
> prefetchw was pointless anyway.
> 
> -Vineet
-- 
 Eugeniy Paltsev


* Re: [PATCH 1/2] ARCv2: LIB: memset: fix doing prefetchw outside of buffer
From: Vineet Gupta @ 2019-01-15 16:58 UTC
  To: Eugeniy Paltsev, linux-snps-arc; +Cc: linux-kernel, Alexey Brodkin

On 1/15/19 4:00 AM, Eugeniy Paltsev wrote:
>>> -#ifdef PREALLOC_NOT_AVAIL
>>> -	prefetchw [r3, 64]	;Prefetch the next write location
>>> -#else
>>> -	prealloc  [r3, 64]
>>> -#endif
>>> +	PREALLOC_INSTR([r3, 64]) ;Prefetch the next write location
>> These are not solving the issue - I'd break this up and move these bits to your
>> next patch.
> Actually these are solving another issue - the current implementation may
> execute the 'prealloc' instruction for an L1 cache line which doesn't fully
> belong to the memset area in the case of a 128B L1 D$ line length. As
> 'prealloc' fills the cache line with zeros, this leads to data corruption.
>
> So I would rather keep these changes in this 'fix' patch.

Fair enough. Although technically it is a different bug which we haven't
actually hit; it was found by code review.

> BTW, I forgot again to add Cc: stable@vger.kernel.org; could you add it for me
> when applying the patch?
> Thanks.

Yes, I did that already!

-Vineet

