linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH] ppc32 copy_to_user dcbt fixup
       [not found] <1z8Na-5hH-1@gated-at.bofh.it>
@ 2004-03-13 14:39 ` Danjel McGougan
  2004-03-14 22:35   ` Bryan Rittmeyer
  0 siblings, 1 reply; 9+ messages in thread
From: Danjel McGougan @ 2004-03-13 14:39 UTC (permalink / raw)
  To: linux-kernel

Bryan Rittmeyer wrote:
> copy_tofrom_user and copy_page use dcbt to prefetch source data [1].
> Since at least 2.4.17, these functions have been prefetching
> beyond the end of the source buffer, leading to two problems:
> 
> 1. Subtly broken software cache coherency. If the area following src
> was invalidate_dcache_range'd prior to submitting for DMA,
> an out-of-bounds dcbt from copy_to_user of a separate slab object
> may read in the area before DMA completion. When the DMA does complete,
> data will not be loaded from RAM because stale data is already in cache.
> Thus you get a corrupt network packet, bogus audio capture, etc.
> 
[snip]

I am no expert on the ppc arch, but in my humble opinion it seems 
strange to invalidate the dcache *before* the memory-writing 
DMA-transaction. The obvious solution is to invalidate the dcache 
*after* DMA completion. It seems hard to guarantee that nobody will 
touch the memory area in question during the DMA.

Just some clue-less comments from a linux-kernel lurker.

Regards,
Danjel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] ppc32 copy_to_user dcbt fixup
  2004-03-13 14:39 ` [PATCH] ppc32 copy_to_user dcbt fixup Danjel McGougan
@ 2004-03-14 22:35   ` Bryan Rittmeyer
  0 siblings, 0 replies; 9+ messages in thread
From: Bryan Rittmeyer @ 2004-03-14 22:35 UTC (permalink / raw)
  To: Danjel McGougan; +Cc: linux-kernel

On Sat, Mar 13, 2004 at 03:39:57PM +0100, Danjel McGougan wrote:
> The obvious solution is to invalidate the dcache 
> *after* DMA completion. It seems hard to guarantee that nobody will 
> touch the memory area in question during the DMA.

You need to invalidate prior to submitting for DMA. Otherwise if the
region is dirty the CPU may write back after DMA has begun, causing a 
data corrupting race.

Invalidating before _and_ after is expensive; better to fix the misreads.

-Bryan


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] ppc32 copy_to_user dcbt fixup
  2004-03-13  9:11 ` Eugene Surovegin
@ 2004-03-16  1:59   ` Bryan Rittmeyer
  0 siblings, 0 replies; 9+ messages in thread
From: Bryan Rittmeyer @ 2004-03-16  1:59 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev list, Paul Mackerras, Benjamin Herrenschmidt

On Sat, Mar 13, 2004 at 01:11:10AM -0800, Eugene Surovegin wrote:
> I reported this problem on -embedded list half a year ago.
> This is already fixed in 2.4 tree, not sure about 2.6

Thanks. The fix in linuxppc-2.4 is cleaner than my prior patch.
Here's a forward port to linuxppc-2.5-benh:

--- linuxppc-2.5-benh/arch/ppc/lib/string.S.orig	2004-02-19 18:08:13.000000000 -0800
+++ linuxppc-2.5-benh/arch/ppc/lib/string.S	2004-03-15 17:05:38.000000000 -0800
@@ -436,48 +436,57 @@
 73:	stwu	r9,4(r6)
 	bdnz	72b
 
+	.section __ex_table,"a"
+	.align	2
+	.long	70b,100f
+	.long	71b,101f
+	.long	72b,102f
+	.long	73b,103f
+	.text
+
 58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
 	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
 	li	r11,4
 	beq	63f
 
-#if !defined(CONFIG_8xx)
+#ifdef CONFIG_8xx
+	/* Don't use prefetch on 8xx */
+	mtctr	r0
+53:	COPY_16_BYTES_WITHEX(0)
+	bdnz	53b
+
+#else /* not CONFIG_8xx */
 	/* Here we decide how far ahead to prefetch the source */
+	li	r3,4
+	cmpwi	r0,1
+	li	r7,0
+	ble	114f
+	li	r7,1
 #if MAX_COPY_PREFETCH > 1
 	/* Heuristically, for large transfers we prefetch
 	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
 	   we prefetch 1 cacheline ahead. */
 	cmpwi	r0,MAX_COPY_PREFETCH
-	li	r7,1
-	li	r3,4
-	ble	111f
+	ble	112f
 	li	r7,MAX_COPY_PREFETCH
-111:	mtctr	r7
-112:	dcbt	r3,r4
+112:	mtctr	r7
+111:	dcbt	r3,r4
 	addi	r3,r3,CACHELINE_BYTES
-	bdnz	112b
-#else /* MAX_COPY_PREFETCH == 1 */
-	li	r3,CACHELINE_BYTES + 4
-	dcbt	r11,r4
-#endif /* MAX_COPY_PREFETCH */
-#endif /* CONFIG_8xx */
-
-	mtctr	r0
-53:
-#if !defined(CONFIG_8xx)
+	bdnz	111b
+#else
 	dcbt	r3,r4
+	addi	r3,r3,CACHELINE_BYTES
+#endif /* MAX_COPY_PREFETCH > 1 */
+
+114:	subf	r8,r7,r0
+	mr	r0,r7
+	mtctr	r8
+
+53:	dcbt	r3,r4
 54:	dcbz	r11,r6
-#endif
-/* had to move these to keep extable in order */
 	.section __ex_table,"a"
 	.align	2
-	.long	70b,100f
-	.long	71b,101f
-	.long	72b,102f
-	.long	73b,103f
-#if !defined(CONFIG_8xx)
 	.long	54b,105f
-#endif
 	.text
 /* the main body of the cacheline loop */
 	COPY_16_BYTES_WITHEX(0)
@@ -495,6 +504,11 @@
 #endif
 #endif
 	bdnz	53b
+	cmpwi	r0,0
+	li	r3,4
+	li	r7,0
+	bne	114b
+#endif /* CONFIG_8xx */	
 
 63:	srwi.	r0,r5,2
 	mtctr	r0
--- linuxppc-2.5-benh/arch/ppc/kernel/misc.S.orig	2004-03-15 17:20:13.000000000 -0800
+++ linuxppc-2.5-benh/arch/ppc/kernel/misc.S	2004-03-15 17:35:22.000000000 -0800
@@ -783,9 +783,18 @@
 _GLOBAL(copy_page)
 	addi	r3,r3,-4
 	addi	r4,r4,-4
+
+#ifdef CONFIG_8xx
+	/* don't use prefetch on 8xx */
+    	li	r0,4096/L1_CACHE_LINE_SIZE
+	mtctr	r0
+1:	COPY_16_BYTES
+	bdnz	1b
+	blr
+
+#else	/* not 8xx, we can prefetch */
 	li	r5,4
 
-#ifndef CONFIG_8xx
 #if MAX_COPY_PREFETCH > 1
 	li	r0,MAX_COPY_PREFETCH
 	li	r11,4
@@ -793,19 +802,17 @@
 11:	dcbt	r11,r4
 	addi	r11,r11,L1_CACHE_LINE_SIZE
 	bdnz	11b
-#else /* MAX_L1_COPY_PREFETCH == 1 */
+#else /* MAX_COPY_PREFETCH == 1 */
 	dcbt	r5,r4
 	li	r11,L1_CACHE_LINE_SIZE+4
-#endif /* MAX_L1_COPY_PREFETCH */
-#endif /* CONFIG_8xx */
-
-	li	r0,4096/L1_CACHE_LINE_SIZE
+#endif /* MAX_COPY_PREFETCH */
+	li	r0,4096/L1_CACHE_LINE_SIZE - MAX_COPY_PREFETCH
+	crclr	4*cr0+eq
+2:
 	mtctr	r0
 1:
-#ifndef CONFIG_8xx
 	dcbt	r11,r4
 	dcbz	r5,r3
-#endif
 	COPY_16_BYTES
 #if L1_CACHE_LINE_SIZE >= 32
 	COPY_16_BYTES
@@ -821,6 +828,12 @@
 #endif
 #endif
 	bdnz	1b
+	beqlr
+	crnot	4*cr0+eq,4*cr0+eq
+	li	r0,MAX_COPY_PREFETCH
+	li	r11,4
+	b	2b
+#endif	/* CONFIG_8xx */
 	blr
 
 /*


-Bryan


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] ppc32 copy_to_user dcbt fixup
  2004-03-13  4:50 ` Benjamin Herrenschmidt
  2004-03-13  7:49   ` Bryan Rittmeyer
@ 2004-03-15  8:38   ` Segher Boessenkool
  1 sibling, 0 replies; 9+ messages in thread
From: Segher Boessenkool @ 2004-03-15  8:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev list, Linux Kernel list, Bryan Rittmeyer, Paul Mackerras

> It would be wise to make the dcbz as long as
> possible in "advance" before the actual write to the cache line.

And use dcbzl instead, if available...


Segher


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] ppc32 copy_to_user dcbt fixup
  2004-03-13  4:15 Bryan Rittmeyer
  2004-03-13  4:50 ` Benjamin Herrenschmidt
@ 2004-03-13  9:11 ` Eugene Surovegin
  2004-03-16  1:59   ` Bryan Rittmeyer
  1 sibling, 1 reply; 9+ messages in thread
From: Eugene Surovegin @ 2004-03-13  9:11 UTC (permalink / raw)
  To: Bryan Rittmeyer
  Cc: linux-kernel, linuxppc-dev list, Paul Mackerras, Benjamin Herrenschmidt

On Fri, Mar 12, 2004 at 08:15:47PM -0800, Bryan Rittmeyer wrote:
> copy_tofrom_user and copy_page use dcbt to prefetch source data [1].
> Since at least 2.4.17, these functions have been prefetching
> beyond the end of the source buffer, leading to two problems:
> 
> 1. Subtly broken software cache coherency. If the area following src
> was invalidate_dcache_range'd prior to submitting for DMA,
> an out-of-bounds dcbt from copy_to_user of a separate slab object
> may read in the area before DMA completion. When the DMA does complete,
> data will not be loaded from RAM because stale data is already in cache.
> Thus you get a corrupt network packet, bogus audio capture, etc.
> 

I reported this problem on -embedded list half a year ago.

This is already fixed in 2.4 tree, not sure about 2.6

Eugene.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] ppc32 copy_to_user dcbt fixup
  2004-03-13  7:49   ` Bryan Rittmeyer
@ 2004-03-13  8:36     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2004-03-13  8:36 UTC (permalink / raw)
  To: Bryan Rittmeyer; +Cc: Linux Kernel list, linuxppc-dev list, Paul Mackerras

On Sat, 2004-03-13 at 18:49, Bryan Rittmeyer wrote:
> On Sat, Mar 13, 2004 at 03:50:03PM +1100, Benjamin Herrenschmidt wrote:
> > It would be wise to make the dcbz as long as possible in "advance"
> > before the actual write to the cache line.
> 
> I guess we could try "pre-dcbz" ala the dcbt prefetch code.
> Even dcbz right before stw is probably cheaper than RAM
> loading data that will be totally overwritten. It's hard to
> lose eliminating bus I/O (especially reads).

Except if the target is already in the cache... difficult choice.

> Any comments on the dcbt prefetch patch?

Didn't have time to look at it in details yet, maybe not before
monday or tuesday

Ben



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] ppc32 copy_to_user dcbt fixup
  2004-03-13  4:50 ` Benjamin Herrenschmidt
@ 2004-03-13  7:49   ` Bryan Rittmeyer
  2004-03-13  8:36     ` Benjamin Herrenschmidt
  2004-03-15  8:38   ` Segher Boessenkool
  1 sibling, 1 reply; 9+ messages in thread
From: Bryan Rittmeyer @ 2004-03-13  7:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Linux Kernel list, linuxppc-dev list, Paul Mackerras

On Sat, Mar 13, 2004 at 03:50:03PM +1100, Benjamin Herrenschmidt wrote:
> It would be wise to make the dcbz as long as possible in "advance"
> before the actual write to the cache line.

I guess we could try "pre-dcbz" ala the dcbt prefetch code.
Even dcbz right before stw is probably cheaper than RAM
loading data that will be totally overwritten. It's hard to
lose eliminating bus I/O (especially reads).

Any comments on the dcbt prefetch patch?

-Bryan


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] ppc32 copy_to_user dcbt fixup
  2004-03-13  4:15 Bryan Rittmeyer
@ 2004-03-13  4:50 ` Benjamin Herrenschmidt
  2004-03-13  7:49   ` Bryan Rittmeyer
  2004-03-15  8:38   ` Segher Boessenkool
  2004-03-13  9:11 ` Eugene Surovegin
  1 sibling, 2 replies; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2004-03-13  4:50 UTC (permalink / raw)
  To: Bryan Rittmeyer; +Cc: Linux Kernel list, linuxppc-dev list, Paul Mackerras


> [1] csum_partial_copy_generic does not use dcbt/dcbz despite being
> scorching hot in TCP workloads. I'm cooking up another patch to
> dcb?ize it.
> 
> [2] http://staidm.org/linux/ppc/copy_dcbt/copyuser-microbench.tar.bz2

I'll have a good look at it asap. Also, keep in mind that between the
dcbz and the time you can actually write to that cache line, some CPUs
may need some time to complete the L1 dcbz operation, which can lead
to poor performances, at least I've been told this is the case on
POWER3, maybe others. It would be wise to make the dcbz as long as
possible in "advance" before the actual write to the cache line.

Ben.



^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] ppc32 copy_to_user dcbt fixup
@ 2004-03-13  4:15 Bryan Rittmeyer
  2004-03-13  4:50 ` Benjamin Herrenschmidt
  2004-03-13  9:11 ` Eugene Surovegin
  0 siblings, 2 replies; 9+ messages in thread
From: Bryan Rittmeyer @ 2004-03-13  4:15 UTC (permalink / raw)
  To: linux-kernel; +Cc: linuxppc-dev list, Paul Mackerras, Benjamin Herrenschmidt

[-- Attachment #1: Type: text/plain, Size: 1374 bytes --]

copy_tofrom_user and copy_page use dcbt to prefetch source data [1].
Since at least 2.4.17, these functions have been prefetching
beyond the end of the source buffer, leading to two problems:

1. Subtly broken software cache coherency. If the area following src
was invalidate_dcache_range'd prior to submitting for DMA,
an out-of-bounds dcbt from copy_to_user of a separate slab object
may read in the area before DMA completion. When the DMA does complete,
data will not be loaded from RAM because stale data is already in cache.
Thus you get a corrupt network packet, bogus audio capture, etc.

This problem probably does not affect hardware coherent systems
(all Apple machines?). However:

2. The extra 'dcbt' wastes bus bandwidth. Worst case: on a 128 byte copy,
we currently dcbt 256 bytes. These extra loads trash cache, potentially
causing writeback of more useful data.

The attached patch attempts to reign in dcbt prefetching at the end of
copies such that we do not read beyond the src area. This change fixes 
DMA data corruption on software coherent systems and improves
performance slightly in my lame microbenchmark [2].

[1] csum_partial_copy_generic does not use dcbt/dcbz despite being
scorching hot in TCP workloads. I'm cooking up another patch to
dcb?ize it. 

[2] http://staidm.org/linux/ppc/copy_dcbt/copyuser-microbench.tar.bz2

Comments?

-Bryan


[-- Attachment #2: dcbt.patch --]
[-- Type: text/plain, Size: 2133 bytes --]

--- linuxppc-2.5-benh/arch/ppc/lib/string.S~	2004-03-12 14:06:50.000000000 -0800
+++ linuxppc-2.5-benh/arch/ppc/lib/string.S	2004-03-12 16:26:09.000000000 -0800
@@ -443,16 +443,16 @@
 
 #if !defined(CONFIG_8xx)
 	/* Here we decide how far ahead to prefetch the source */
+	li	r12,1
 #if MAX_COPY_PREFETCH > 1
 	/* Heuristically, for large transfers we prefetch
 	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
 	   we prefetch 1 cacheline ahead. */
 	cmpwi	r0,MAX_COPY_PREFETCH
-	li	r7,1
 	li	r3,4
 	ble	111f
-	li	r7,MAX_COPY_PREFETCH
-111:	mtctr	r7
+	li	r12,MAX_COPY_PREFETCH
+111:	mtctr	r12
 112:	dcbt	r3,r4
 	addi	r3,r3,CACHELINE_BYTES
 	bdnz	112b
@@ -462,9 +462,29 @@
 #endif /* MAX_COPY_PREFETCH */
 #endif /* CONFIG_8xx */
 
+	/* don't run dcbt on cachelines outside our src area.
+       it increases bus traffic, may displace useful data,
+       and busts software cache coherency. those factors
+       are typically worse than the extra branch.
+
+       if r5 == 0, then we have to stop dcbt when ctr <= r12.  
+       if r5 != 0 (partial bytes at end) we should do an extra
+       dcbt for them--kmalloc etc will not put multiple
+       objects within a single cacheline
+
+       -bryan at staidm org
+    */
+	cmpwi	r5,0
+	beq		52f
+	addi	r12,r12,1
+
+52:
 	mtctr	r0
 53:
 #if !defined(CONFIG_8xx)
+	mfctr	r7
+	cmplw	0,r7,r12
+	ble		54f
 	dcbt	r3,r4
 54:	dcbz	r11,r6
 #endif
--- linuxppc-2.5-benh/arch/ppc/kernel/misc.S~	2004-02-04 13:35:34.000000000 -0800
+++ linuxppc-2.5-benh/arch/ppc/kernel/misc.S	2004-03-12 17:07:52.000000000 -0800
@@ -787,15 +787,16 @@
 
 #ifndef CONFIG_8xx
 #if MAX_COPY_PREFETCH > 1
-	li	r0,MAX_COPY_PREFETCH
+	li	r12,MAX_COPY_PREFETCH
 	li	r11,4
-	mtctr	r0
+	mtctr	r12
 11:	dcbt	r11,r4
 	addi	r11,r11,L1_CACHE_LINE_SIZE
 	bdnz	11b
 #else /* MAX_L1_COPY_PREFETCH == 1 */
 	dcbt	r5,r4
 	li	r11,L1_CACHE_LINE_SIZE+4
+	li	r12,1
 #endif /* MAX_L1_COPY_PREFETCH */
 #endif /* CONFIG_8xx */
 
@@ -803,8 +804,11 @@
 	mtctr	r0
 1:
 #ifndef CONFIG_8xx
+	mfctr	r7
+	cmplw	0,r7,r12
+	ble		2f
 	dcbt	r11,r4
-	dcbz	r5,r3
+2:	dcbz	r5,r3
 #endif
 	COPY_16_BYTES
 #if L1_CACHE_LINE_SIZE >= 32

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2004-03-16  2:02 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1z8Na-5hH-1@gated-at.bofh.it>
2004-03-13 14:39 ` [PATCH] ppc32 copy_to_user dcbt fixup Danjel McGougan
2004-03-14 22:35   ` Bryan Rittmeyer
2004-03-13  4:15 Bryan Rittmeyer
2004-03-13  4:50 ` Benjamin Herrenschmidt
2004-03-13  7:49   ` Bryan Rittmeyer
2004-03-13  8:36     ` Benjamin Herrenschmidt
2004-03-15  8:38   ` Segher Boessenkool
2004-03-13  9:11 ` Eugene Surovegin
2004-03-16  1:59   ` Bryan Rittmeyer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).