* [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
@ 2008-06-19  7:53 Mark Nelson
  2008-06-19 14:43 ` Arnd Bergmann
  0 siblings, 1 reply; 19+ messages in thread
From: Mark Nelson @ 2008-06-19  7:53 UTC (permalink / raw)
  To: linuxppc-dev, cbe-oss-dev; +Cc: Gunnar von Boehn, Michael Ellerman

/*
 * Copyright (C) 2008 Gunnar von Boehn, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 *
 * __copy_tofrom_user routine optimized for CELL-BE-PPC
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit.
 * CELL: 1st level data cache = 32K - 2nd level data cache = 512K
 * - 3rd level data cache = 0K
 * With no 3rd level cache, misses go straight to memory, so to improve
 * copy performance we need to prefetch source data far ahead to hide
 * that latency.
 * For best performance, instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 *
 * The code below is loop unrolled for the CELL cache line size of
 * 128 bytes.
 *
 */

#include <asm/processor.h>
#include <asm/ppc_asm.h>

#define PREFETCH_AHEAD 6
#define ZERO_AHEAD 4
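
/*
 * The main copy loop below runs PREFETCH_AHEAD 128-byte lines behind
 * its dcbt prefetch stream, and ZERO_AHEAD lines behind the dcbz
 * stream that establishes destination lines in the cache without
 * first reading them from memory.
 */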

        .align  7
_GLOBAL(__copy_tofrom_user)
	dcbt	0,r4		/* Prefetch ONE SRC cacheline */

	std     r5,-8(r1)	/* remember size */

	cmpldi	cr1,r5,16	/* is size < 16 ? */
	mr	r6,r3
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry */
	clrldi	r8,r8,64-4	/* align to 16-byte boundary */
	sub     r7,r4,r3
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7 */
	subf	r5,r8,r5

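/*
 * mtcrf above put the low 4 bits of r8 into cr7; each bf below tests
 * a single bit, so the 1/2/4/8-byte head copies run without any
 * microcoded record-form ("andi.") instructions.
 */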
	bf	cr7*4+3,1f
20:	lbzx	r0,r7,r6	/* copy 1 byte */
60:	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
21:	lhzx	r0,r7,r6	/* copy 2 byte */
61:	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
22:	lwzx	r0,r7,r6	/* copy 4 byte */
62:	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
23:	ldx	r0,r7,r6	/* copy 8 byte */
63:	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6

.Ldst_aligned:

	cmpdi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu */
	addi	r4,r4,-8	/* prepare for ldu */

	clrldi  r7,r7,64-7	/* align to cacheline boundary */
	ble+	cr5,.Llessthancacheline


	cmpldi	cr6,r7,0
	subf	r5,r7,r5
	srdi	r7,r7,4		/* divide size by 16 */
	srdi	r10,r5,7	/* number of cache lines to copy */


	cmpldi	r10,0
	li	r11,0			/* number of cachelines to copy with prefetch */
	beq	.Lnocacheprefetch

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8		/* prefetch distance */
	ble	.Llessthanmaxprefetch

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD
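/*
 * More lines than PREFETCH_AHEAD: the first r11 lines are copied with
 * an in-loop dcbt; the final PREFETCH_AHEAD lines, already covered by
 * the prefetch stream, are copied without one in .Lloop2 below.
 */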
.Llessthanmaxprefetch:

	mtctr	r10
.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC
.Lnocacheprefetch:


	mtctr	r7
	cmpldi	cr1,r5,128
	clrldi  r5,r5,64-7

	beq	cr6,.Lcachelinealigned
.Laligntocacheline:
24:	ld 	r9,0x08(r4)
25:	ldu	r7,0x10(r4)
64:	std	r9,0x08(r6)
65:	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:				/* copy whole cache lines */


	blt- 	cr1,.Llessthancacheline		/* size <128 */

.Louterloop:
	cmpdi	r11,0
	mtctr	r11
	beq-	.Lendloop

	li	r11,128*ZERO_AHEAD +8		/* DCBZ dist */

.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop: 				/* Copy aligned body */
	dcbt	r12,r4			/* PREFETCH SOURCE cache lines ahead */
26:	ld      r9, 0x08(r4)
4000:	dcbz	r11,r6
27:	ld      r7, 0x10(r4)    	/* 4 register stride copy */
28:	ld      r8, 0x18(r4)		/* 4 are optimal to hide 1st level cache latency */
29:	ld      r0, 0x20(r4)
66:	std     r9, 0x08(r6)
67:	std     r7, 0x10(r6)
68:	std     r8, 0x18(r6)
69:	std     r0, 0x20(r6)
30:	ld      r9, 0x28(r4)
31:	ld      r7, 0x30(r4)
32:	ld      r8, 0x38(r4)
33:	ld      r0, 0x40(r4)
70:	std     r9, 0x28(r6)
71:	std     r7, 0x30(r6)
72:	std     r8, 0x38(r6)
73:	std     r0, 0x40(r6)
34:	ld      r9, 0x48(r4)
35:	ld      r7, 0x50(r4)
36:	ld      r8, 0x58(r4)
37:	ld      r0, 0x60(r4)
74:	std     r9, 0x48(r6)
75:	std     r7, 0x50(r6)
76:	std     r8, 0x58(r6)
77:	std     r0, 0x60(r6)
38:	ld      r9, 0x68(r4)
39:	ld      r7, 0x70(r4)
40:	ld      r8, 0x78(r4)
41:	ldu     r0, 0x80(r4)
78:	std     r9, 0x68(r6)
79:	std     r7, 0x70(r6)
80:	std     r8, 0x78(r6)
81:	stdu    r0, 0x80(r6)

	bdnz    .Lloop
.Lendloop:


	cmpdi	r10,0
	sldi	r10,r10,2		/* adjust from 128 to 32 byte stride */
	beq-	.Lendloop2
	mtctr	r10
.Lloop2: 				/* Copy aligned body */
42:	ld      r9, 0x08(r4)
43:	ld      r7, 0x10(r4)
44:	ld      r8, 0x18(r4)
45:	ldu     r0, 0x20(r4)
82:	std     r9, 0x08(r6)
83:	std     r7, 0x10(r6)
84:	std     r8, 0x18(r6)
85:	stdu    r0, 0x20(r6)

	bdnz    .Lloop2

.Lendloop2:


.Llessthancacheline:		/* less than a cache line left to do? */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* divide size by 16 */
	blt-	.Ldo_lt16
	mtctr	r7
.Lcopy_remaining:
46:	ld 	r8,0x08(r4)
47:	ldu	r7,0x10(r4)
86:	std	r8,0x08(r6)
87:	stdu	r7,0x10(r6)
	bdnz	.Lcopy_remaining


.Ldo_lt16:			/* fewer than 16 bytes left? */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15) */
	beq	sp1		/* nothing left to copy */
	addi	r4,r4,8
	addi	r6,r6,8
.Lshortcopy:			/* SIMPLE COPY to handle size <= 15 bytes */
	mtcrf	0x01,r5
	sub     r7,r4,r6
	bf-	cr7*4+0,sp8
48:	ldx	r0,r7,r6	/* copy 8 byte */
88:	std	r0,0(r6)
	addi	r6,r6,8
sp8:
	bf	cr7*4+1,sp4
49:	lwzx	r0,r7,r6	/* copy 4 byte */
89:	stw	r0,0(r6)
	addi	r6,r6,4
sp4:
	bf	cr7*4+2,sp2
50:	lhzx	r0,r7,r6	/* copy 2 byte */
90:	sth	r0,0(r6)
	addi	r6,r6,2
sp2:
	bf	cr7*4+3,sp1
51:	lbzx	r0,r7,r6	/* copy 1 byte */
91:	stb	r0,0(r6)
sp1:
	li	r3,0
	blr




/*
 * Exception handlers follow.
 * We have to return the number of bytes not copied.
 * For an exception on a load, we set the rest of the destination to 0.
 */
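
/*
 * Label numbering scheme: faulting loads are numbered from 20 up and
 * faulting stores from 60 up; adding 100 to an instruction's label
 * gives its exception handler (load 20 recovers at 120, store 60 at
 * 160, and likewise 4000 -> 4100 for the dcbz).
 */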

151:
150:
149:
148:
	add	r4,r7,r6
	b	1002f

123:
122:
121:
	add	r4,r7,r6
	add	r5,r8,r5	/* original size is r5 + r8, no need to go to stack */
	b	1001f

120:
	add	r5,r8,r5	/* original size is r5 + r8, no need to go to stack */
	b	1003f		/* we know we can't copy any more bytes so jump to clearing */

141:
140:
139:
138:
	addi	r6,r6,32
	addi	r4,r4,32
137:
136:
135:
134:
	addi	r6,r6,32
	addi	r4,r4,32
133:
132:
131:
130:
	addi	r6,r6,32
	addi	r4,r4,32
4100:
147:
146:
145:
144:
143:
142:
129:
128:
127:
126:
125:
124:
	addi	r6,r6,8
	addi	r4,r4,8

/*
 * we had a fault on a load
 * r6 - first unmodified byte of the destination
 * r3 - original destination
 * r4 - next byte we have to read for a load
 */

1002:	ld	r5,-8(r1)
1001:	subf	r3,r3,r6	/* number of bytes we did copy */
	subf	r5,r3,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
52:	lbz	r0,0(r4)
	addi	r4,r4,1
92:	stb	r0,0(r6)
	addi	r6,r6,1
	bdnz	52b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, need to clear ctr bytes starting at r6
 */
152:	mfctr	r5
1003:	li	r0,0
	mr	r4,r6
	mr	r3,r5		/* return the number of bytes not copied */
1:	andi.	r9,r4,7
	beq	3f
93:	stb	r0,0(r4)
	addic.	r5,r5,-1
	addi	r4,r4,1
	bne	1b
	blr
3:	cmpldi	cr1,r5,8
	srdi	r9,r5,3
	andi.	r5,r5,7
	blt	cr1,1000f
	mtctr	r9
94:	std	r0,0(r4)
	addi	r4,r4,8
	bdnz	94b
1000:	beqlr
	mtctr	r5	
95:	stb	r0,0(r4)
	addi	r4,r4,1
	bdnz	95b
	blr



/*
 * we had a fault on a store
 * r6 - byte we tried to store to
 * r3 - original destination
 */
181:
	addi	r6,r6,8
180:
	addi	r6,r6,8
179:
	addi	r6,r6,8
178:
	addi	r6,r6,8
177:
	addi	r6,r6,8
176:
	addi	r6,r6,8
175:
	addi	r6,r6,8
174:
	addi	r6,r6,8
173:
	addi	r6,r6,8
172:
	addi	r6,r6,8
171:
	addi	r6,r6,8
170:
	addi	r6,r6,8
185:
169:
	addi	r6,r6,8
184:
168:
	addi	r6,r6,8
187:
183:
167:
165:
	addi	r6,r6,8
186:
182:
166:
164:
	addi	r6,r6,8
191:
190:
189:
188:
163:
162:
161:
160:
	ld	r5,-8(r1)
	subf	r3,r3,r6	/* number of bytes we did copy */
	subf	r3,r3,r5
195:
194:
193:
	blr			/* #bytes not copied in r3 */

192:
	mfctr	r3
	blr


	.section __ex_table,"a"
	.align	3
	.llong	20b,120b
	.llong	60b,160b
	.llong	21b,121b
	.llong	61b,161b
	.llong	22b,122b
	.llong	62b,162b
	.llong	23b,123b
	.llong	63b,163b
	.llong	24b,124b
	.llong	25b,125b
	.llong	64b,164b
	.llong	65b,165b
	.llong	26b,126b
	.llong	27b,127b
	.llong	28b,128b
	.llong	29b,129b
	.llong	66b,166b
	.llong	67b,167b
	.llong	68b,168b
	.llong	69b,169b
	.llong	30b,130b
	.llong	31b,131b
	.llong	32b,132b
	.llong	33b,133b
	.llong	70b,170b
	.llong	71b,171b
	.llong	72b,172b
	.llong	73b,173b
	.llong	34b,134b
	.llong	35b,135b
	.llong	36b,136b
	.llong	37b,137b
	.llong	74b,174b
	.llong	75b,175b
	.llong	76b,176b
	.llong	77b,177b
	.llong	38b,138b
	.llong	39b,139b
	.llong	40b,140b
	.llong	41b,141b
	.llong	78b,178b
	.llong	79b,179b
	.llong	80b,180b
	.llong	81b,181b
	.llong	42b,142b
	.llong	43b,143b
	.llong	44b,144b
	.llong	45b,145b
	.llong	82b,182b
	.llong	83b,183b
	.llong	84b,184b
	.llong	85b,185b
	.llong	46b,146b
	.llong	47b,147b
	.llong	86b,186b
	.llong	87b,187b
	.llong	48b,148b
	.llong	88b,188b
	.llong	49b,149b
	.llong	89b,189b
	.llong	50b,150b
	.llong	90b,190b
	.llong	51b,151b
	.llong	91b,191b
	.llong	52b,152b
	.llong	92b,192b
	.llong	93b,193b
	.llong	94b,194b
	.llong	95b,195b
        .llong  4000b,4100b


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-19  7:53 [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell Mark Nelson
@ 2008-06-19 14:43 ` Arnd Bergmann
  2008-06-19 15:17   ` Gunnar von Boehn
  2008-06-20  1:55   ` Mark Nelson
  0 siblings, 2 replies; 19+ messages in thread
From: Arnd Bergmann @ 2008-06-19 14:43 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Mark Nelson, Gunnar von Boehn, cbe-oss-dev, Michael Ellerman

On Thursday 19 June 2008, Mark Nelson wrote:

>  * __copy_tofrom_user routine optimized for CELL-BE-PPC

A few things I noticed:

* You don't have a page-wise user copy, which the regular code
has. This is probably not so noticeable in iperf, but should
have a significant impact on lmbench and on a number of file
system tests that copy large amounts of data. Have you checked
that the loop around cache lines is just as fast?

* You don't align the source to word size, only the target.
Does this get handled correctly when the source is a noncacheable
mapping, e.g. an unaligned copy_from_user where the source points
to a physical local store mapping of an SPU? I don't think we
need to optimize this case for performance, but I'm not sure
if it would crash. AFAIR, unaligned loads from noncacheable storage
give you an alignment exception that you need to handle, right?

* The naming of the labels (with just numbers) is rather confusing,
it would be good to have something better, but I must admit that
I don't have a good idea either.

* The trick of using the condition code in cr7 for the last bytes
is really cute, but are the four branches actually better than a
single computed branch into the middle of 15 byte wise copies?

	Arnd <><


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-19 14:43 ` Arnd Bergmann
@ 2008-06-19 15:17   ` Gunnar von Boehn
  2008-06-19 16:13     ` Sanjay Patel
  2008-06-20  1:13     ` [Cbe-oss-dev] " Paul Mackerras
  2008-06-20  1:55   ` Mark Nelson
  1 sibling, 2 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-19 15:17 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev

Hi Arnd,

> You don't have a page wise user copy,
> which the regular code has.

The new code does not need two versions IMHO.
The "regular" code was much slower for the normal case and has a
special version for the 4K-optimized case.
The new code is equally good in both cases, so adding an extra 4K
routine will increase the code size for very minor gain. I'm not sure
if it's worth it.

Benchmark results on QS22 for a well-aligned copy:
Old code                 : 1300 MB/sec
Old code 4K special case : 2600 MB/sec
New code                 : 4000 MB/sec (always)


> You don't align the source to word size, only the target.
> Does this get handled correctly when the source
> is a noncacheable mapping, e.g.

The problem is that on CELL the shift instructions required for SRC
alignment are microcoded, in other words really slow.
You are right that the main copy2user loop requires that the SRC is
cacheable.
IMHO, because of the exception on the load, the routine should fall
back to the byte copy loop.

Arnd, could you verify that it works on localstore?


Cheers
Gunnar

* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-19 15:17   ` Gunnar von Boehn
@ 2008-06-19 16:13     ` Sanjay Patel
  2008-06-20 11:36       ` Gunnar von Boehn
  2008-06-20  1:13     ` [Cbe-oss-dev] " Paul Mackerras
  1 sibling, 1 reply; 19+ messages in thread
From: Sanjay Patel @ 2008-06-19 16:13 UTC (permalink / raw)
  To: Arnd Bergmann, Gunnar von Boehn
  Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev




--- On Thu, 6/19/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:

> You are right the main copy2user requires that the SRC is
> cacheable.
> IMHO because of the exception on load, the routine should
> fallback to the
> byte copy loop.
> 
> Arnd, could you verify that it works on localstore?

Since the main loops use 'dcbz', the destination must also be cacheable. IIRC, if the destination is write-through or cache-inhibited, the 'dcbz' will cause an alignment exception. I suppose it would still function correctly via the handler, but horribly slowly.

--Sanjay

* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-19 15:17   ` Gunnar von Boehn
  2008-06-19 16:13     ` Sanjay Patel
@ 2008-06-20  1:13     ` Paul Mackerras
  2008-06-20 16:47       ` Gunnar von Boehn
  2008-06-21  2:00       ` Arnd Bergmann
  1 sibling, 2 replies; 19+ messages in thread
From: Paul Mackerras @ 2008-06-20  1:13 UTC (permalink / raw)
  To: Gunnar von Boehn
  Cc: linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann

Gunnar von Boehn writes:

> The "regular" code was much slower for the normal case and has a special
> version for the 4K optimized case.

That's a slightly inaccurate view...

The reason for having the two cases is that when I profiled the
distribution of sizes and alignments of memory copies in the kernel,
the result was that almost all copies (something like 99%, IIRC) were
either 128 bytes or less, or else a whole page at a page-aligned
address.

Thus we get the best performance by having a simple copy routine with
minimal setup overhead for the small copy case, plus an aggressively
optimized page copy routine.  Spending time setting up for a
multi-cacheline copy that's not a whole page is just going to hurt the
small copy case without providing any real benefit.

Transferring data over loopback is possibly an exception to that.
However, it's very rare to transfer large amounts of data over
loopback, unless you're running a benchmark like iperf or netperf. :-/

Paul.


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-19 14:43 ` Arnd Bergmann
  2008-06-19 15:17   ` Gunnar von Boehn
@ 2008-06-20  1:55   ` Mark Nelson
  1 sibling, 0 replies; 19+ messages in thread
From: Mark Nelson @ 2008-06-20  1:55 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Gunnar von Boehn, cbe-oss-dev, Arnd Bergmann, Michael Ellerman

> * The naming of the labels (with just numbers) is rather confusing,
> it would be good to have something better, but I must admit that
> I don't have a good idea either.

I will admit that at first glance the label naming with numbers
does look confusing, but when you notice that all the loads start
at 20, all the stores start at 60, and that to get the exception
handler for those instructions you just add 100, I think it makes
sense - but that could be because I've been looking at it way too
long...

(I thought I had a comment in there to that effect but it must
have gotten lost along the way. I'll add a new comment
explaining the above; that should help.)

> 
> * The trick of using the condition code in cr7 for the last bytes
> is really cute, but are the four branches actually better than a
> single computed branch into the middle of 15 byte wise copies?

The original copy_tofrom_user does this also, which I guess is
carried over to this new version...

Gunnar did you have an old version that did something similar
to this?

Mark


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-19 16:13     ` Sanjay Patel
@ 2008-06-20 11:36       ` Gunnar von Boehn
  2008-06-20 17:46         ` Sanjay Patel
  0 siblings, 1 reply; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-20 11:36 UTC (permalink / raw)
  To: sanjay3000
  Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann

Hi Sanjay,

> I suppose it would still function correctly via the handler, but
> horribly slowly.

How important is best performance for unaligned copies to/from
uncacheable memory?
The challenge of the CELL chip is that the X-form of the shift
instructions is microcoded.
The shifts are needed to implement a copy that always reads and
writes aligned.
There is of course the option of not using the X-form of the shift,
but writing several copy routines using immediate shift instructions
and picking the matching copy routine.
That option would of course greatly increase the code size of the
memcopy routine.


Kind regards

Gunnar

* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-20  1:13     ` [Cbe-oss-dev] " Paul Mackerras
@ 2008-06-20 16:47       ` Gunnar von Boehn
  2008-06-21  2:00       ` Arnd Bergmann
  1 sibling, 0 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-20 16:47 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann

Hi Paul,

Of course, I can only speak for the test results that I got on our
platforms.
We tested on PS3, QS21 single/dual, QS22 single/dual, and JS21.

The performance of the old Linux routine and the new routine is about
the same for copies of less than 128 bytes.
At 512 bytes the new routine is about 100% faster than the old one
(on QS21).
At 1500 bytes, which is a typical ethernet frame size, the new
routine is over 3 times faster than the old one (on QS21).

We could NOT see a performance decrease for small copies.
We saw that for copies of 512 bytes and more the performance increase
is significant.


>However, it's very rare to transfer large amounts of data over
>loopback, unless you're running a benchmark like iperf or netperf.

Please mind that this test was done as it's a simple way to show how
much less work the CPU needs to do to handle network traffic.
All network traffic goes through copy2user - all network traffic can
now be handled with much less CPU power wasted on copying the data.

Don't you agree that network traffic, or IO in general, with packets
over 500 bytes is not a rare case?


Cheers
Gunnar


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-20 11:36       ` Gunnar von Boehn
@ 2008-06-20 17:46         ` Sanjay Patel
  2008-06-20 23:20           ` Benjamin Herrenschmidt
  2008-06-23  8:30           ` Gunnar von Boehn
  0 siblings, 2 replies; 19+ messages in thread
From: Sanjay Patel @ 2008-06-20 17:46 UTC (permalink / raw)
  To: Gunnar von Boehn
  Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann


--- On Fri, 6/20/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:
> How important is best performance for the unaligned copy
> to/from uncacheable memory?
> The challenge of the CELL chip is that X-form of the shift
> instructions are microcoded.
> The shifts are needed to implement a copy that reads and
> writes always aligned.

Hi Gunnar,

I have no idea how important unaligned or uncacheable copy perf is for Cell Linux. My experience is from Mac OS X for PPC, where we used dcbz in a general-purpose memcpy but were forced to pull that optimization because of the detrimental perf effect on important applications.

I may be missing something, but I don't see how Cell's microcoded shift is much of a factor here. The problem is that the dcbz will generate the alignment exception regardless of whether the data is actually unaligned or not. Once you're on that code path, performance can't be good, can it?

--Sanjay

* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-20 17:46         ` Sanjay Patel
@ 2008-06-20 23:20           ` Benjamin Herrenschmidt
  2008-06-20 23:44             ` Sanjay Patel
  2008-06-23  8:30           ` Gunnar von Boehn
  1 sibling, 1 reply; 19+ messages in thread
From: Benjamin Herrenschmidt @ 2008-06-20 23:20 UTC (permalink / raw)
  To: sanjay3000
  Cc: Mark Nelson, Gunnar von Boehn, Arnd Bergmann, linuxppc-dev,
	Michael Ellerman, cbe-oss-dev

On Fri, 2008-06-20 at 10:46 -0700, Sanjay Patel wrote:
> --- On Fri, 6/20/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:
> > How important is best performance for the unaligned copy
> > to/from uncacheable memory?
> > The challenge of the CELL chip is that X-form of the shift
> > instructions are microcoded.
> > The shifts are needed to implement a copy that reads and
> > writes always aligned.
> 
> Hi Gunnar,
> 
> I have no idea how important unaligned or uncacheable copy perf is for
> Cell Linux. My experience is from Mac OS X for PPC, where we used dcbz
> in a general-purpose memcpy but were forced to pull that optimization
> because of the detrimental perf effect on important applications.

I thought OS X had a trick with a CR bit that would disable the dcbz
optimization on the first alignment fault? Or did they totally remove
it?

> I may be missing something, but I don't see how Cell's microcoded
> shift is much of a factor here. The problem is that the dcbz will
> generate the alignment exception regardless of whether the data is
> actually unaligned or not. Once you're on that code path, performance
> can't be good, can it?

This is a concern. The problem is, do we want to lose all the benefit
of improved copy_to/from_user because of that? Passing local store
addresses to/from read/write syscalls is supported, so I suppose it's
a real issue for reads.

On the other hand, how performant do we expect those to be? That is,
we could have the alignment exception handler detect that the fault
happened during copy_to/from_user, and change the return address to a
non-optimized variant. Thus we would have at most one exception per
read syscall.
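
A minimal sketch of that idea (all symbol and function names here are
made up for illustration, this is not existing kernel code, and it
glosses over restarting the copy from the beginning):

#include <asm/ptrace.h>

extern char __copy_tofrom_user_dcbz[], __copy_tofrom_user_dcbz_end[];
extern char __copy_tofrom_user_nodcbz[];

/* hypothetical hook called from the alignment interrupt handler */
static int fixup_dcbz_copy_fault(struct pt_regs *regs)
{
	/* did the dcbz fault inside the optimized copy loop? */
	if (regs->nip < (unsigned long)__copy_tofrom_user_dcbz ||
	    regs->nip >= (unsigned long)__copy_tofrom_user_dcbz_end)
		return 0;	/* not ours, handle it normally */

	/*
	 * Redirect execution to a variant that never uses dcbz, so we
	 * take at most one alignment exception per syscall.
	 */
	regs->nip = (unsigned long)__copy_tofrom_user_nodcbz;
	return 1;
}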

Ben.


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-20 23:20           ` Benjamin Herrenschmidt
@ 2008-06-20 23:44             ` Sanjay Patel
  0 siblings, 0 replies; 19+ messages in thread
From: Sanjay Patel @ 2008-06-20 23:44 UTC (permalink / raw)
  To: benh
  Cc: Mark Nelson, Gunnar von Boehn, Arnd Bergmann, linuxppc-dev,
	Michael Ellerman, cbe-oss-dev




--- On Fri, 6/20/08, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> I though OS X had a trick with a CR bit that would disable
> the dcbz optimization on the first alignment fault ? Or did they
> totally remove it ?

Ah, it's coming back to me. :)

Apple added 'dcbz', removed it, and then there was the clever trick of optimizing the code path with a boot-time perf test and/or changing the code on the first fault... I'm not sure what's implemented in the recent builds.

If Linux can do something similar, that should allow good perf on cacheable and cache-inhibited space as well as different CPUs (eg, if 'dcba' is available, then you don't need the alignment fault hack).

--Sanjay


* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-20  1:13     ` [Cbe-oss-dev] " Paul Mackerras
  2008-06-20 16:47       ` Gunnar von Boehn
@ 2008-06-21  2:00       ` Arnd Bergmann
  2008-06-21  4:30         ` Paul Mackerras
  1 sibling, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2008-06-21  2:00 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: linuxppc-dev, Gunnar von Boehn, Michael Ellerman, cbe-oss-dev

On Friday 20 June 2008, Paul Mackerras wrote:

> Transferring data over loopback is possibly an exception to that.
> However, it's very rare to transfer large amounts of data over
> loopback, unless you're running a benchmark like iperf or netperf. :-/

Well, it is the exact case that came up in a real-world scenario for
cell: in a network-intensive application where the SPUs are supposed
to do all the work, we ended up not getting enough data in and out
through gbit ethernet because the PPU spent much of its time in
copy_to_user.

Going to 10gbit will make the problem even more apparent.

I understand that optimizing for this case will cost extra branches
for the other cases, but maybe we can find a better compromise than
before. Can you name a test case that you consider important to
optimize for, in what you consider real-life tests?

Doing some static compile-time analysis, I found that most of the
call sites (which are not necessarily most of the run-time calls)
pass either a small constant size of less than a few cache lines, or
have a variable size but are not at all performance critical.
Since the prefetching and cache line size awareness was most of the
improvement for cell (AFAIU), maybe we can annotate the few
interesting cases, say by introducing a new copy_from_user_large()
function that can be easily optimized for large transfers on a given
CPU, while the remaining code keeps optimizing for small transfers
and may even get rid of the full page copy optimization in order to
save a branch. A sketch of what I mean follows.
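
For the sake of discussion, the annotation could look something like
this - copy_from_user_large() and the Cell helper are hypothetical
names, not an existing API:

#include <linux/uaccess.h>

/* hypothetical cacheline-aware variant for big transfers */
unsigned long __copy_from_user_cell_large(void *to,
		const void __user *from, unsigned long n);

/* callers that know they move bulk data opt in explicitly */
static inline unsigned long
copy_from_user_large(void *to, const void __user *from, unsigned long n)
{
#ifdef CONFIG_PPC_CELL
	return __copy_from_user_cell_large(to, from, n);
#else
	return copy_from_user(to, from, n);	/* small-copy optimized */
#endif
}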

	Arnd <><


* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-21  2:00       ` Arnd Bergmann
@ 2008-06-21  4:30         ` Paul Mackerras
  2008-06-21  4:49           ` David Miller
  2008-06-21 21:06           ` Arnd Bergmann
  0 siblings, 2 replies; 19+ messages in thread
From: Paul Mackerras @ 2008-06-21  4:30 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, Gunnar von Boehn, Michael Ellerman, cbe-oss-dev

Arnd Bergmann writes:

> On Friday 20 June 2008, Paul Mackerras wrote:
> 
> > Transferring data over loopback is possibly an exception to that.
> > However, it's very rare to transfer large amounts of data over
> > loopback, unless you're running a benchmark like iperf or netperf. :-/
> 
> Well, it is the exact case that came up in a real world scenario
> for cell: On a network intensive application where the SPUs are
> supposed to do all the work, we ended up not getting enough
> data in and out through gbit ethernet because the PPU spent
			  ^^^^^^^^^^^^^
Which isn't loopback... :)

I have no objection to improving copy_tofrom_user, memcpy and
copy_page.  I just want to make sure that we don't make things worse
on some platform.

In fact, Mark and I dug up some experiments I had done 5 or 6 years
ago and just ran through all the copy loops I tried back then, on
QS22, POWER6, POWER5+, POWER5, POWER4, 970, and POWER3, and compared
them to the current kernel routines and the proposed new Cell
routines.  So far we have just looked at the copy_page case (i.e. 4kB
on a 4kB alignment) for cache-cold and cache-hot cases.
Interestingly, some of the routines I discarded back then turn out to
do really well on most of the modern platforms, and quite a lot better
on Cell than Gunnar's code does (~10GB/s vs. ~5.5GB/s in the hot-cache
case, IIRC).  Mark is going to summarise the results and also measure
the speed for smaller copies and misaligned copies.

As for the distribution of sizes, I think it would be worthwhile to
run a fresh set of tests.  As I said, my previous results showed most
copies to be either small (<= 128B) or a multiple of 4k, and I think
that was true for copy_tofrom_user as well as memcpy, but that was a
while ago.

> much of its time in copy_to_user.
> 
> Going to 10gbit will make the problem even more apparent.

Is this application really transferring bulk data and using buffers
that aren't a multiple of the page size?  Do you know whether the
copies ended up being misaligned?

Of course, if we really want the fastest copy possible, the thing to
do is to use VMX loads and stores on 970, POWER6 and Cell.  The
overhead of setting up to use VMX in the kernel would probably kill
any advantage, though -- at least, that's what I found when I tried
using VMX for copy_page in the kernel on 970 a few years ago.

> Doing some static compile-time analysis, I found that most
> of the call sites (which are not necessarily most of
> the run time calls) pass either a small constant size of
> less than a few cache lines, or have a variable size but are
> not at all performance critical.
> Since the prefetching and cache line size awareness was
> most of the improvement for cell (AFAIU), maybe we can
> annotate the few interesting cases, say by introducing a
> new copy_from_user_large() function that can be easily
> optimized for large transfers on a given CPU, while
> the remaining code keeps optmizing for small transfers
> and may even get rid of the full page copy optimization
> in order to save a branch.

Let's see what Mark comes up with.  We may be able to find a way to do
it that works well across all current CPUs and also is OK for small
copies.  If not we might need to do what you suggest.

Regards,
Paul.


* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-21  4:30         ` Paul Mackerras
@ 2008-06-21  4:49           ` David Miller
  2008-06-21 21:06           ` Arnd Bergmann
  1 sibling, 0 replies; 19+ messages in thread
From: David Miller @ 2008-06-21  4:49 UTC (permalink / raw)
  To: paulus; +Cc: linuxppc-dev, VONBOEHN, ellerman, arnd, cbe-oss-dev

From: Paul Mackerras <paulus@samba.org>
Date: Sat, 21 Jun 2008 14:30:02 +1000

> Is this application really transferring bulk data and using buffers
> that aren't a multiple of the page size?  Do you know whether the
> copies ended up being misaligned?

We used to cache align the sub-buffers carved out of the per-socket
anonymous buffer page that TCP, UDP, and other protocols use to
collect user write()/sendmsg() data when the outgoing interface
supports scatter-gather and checksumming (basically just about
any reasonable ethernet device these days).

But that alignment was removed from net/ipv4/tcp.c (I forget the exact
reasons, I think it was space wastage).

The net result is that you will therefore see a lot of misaligned
copies for networking sends these days.

In my opinion optimizing mempcy/user-copy in the most general way is
the best course of action.  Whatever histogram of sizes and alignments
you obtain with sampling today will change unpredictably in the
future.  I'm saying this as someone who fell into that trap on sparc64
several times in the past, and I have 5 memcpy/user-copy/memset
variants to maintain on that platform these days :-/


* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-21  4:30         ` Paul Mackerras
  2008-06-21  4:49           ` David Miller
@ 2008-06-21 21:06           ` Arnd Bergmann
  1 sibling, 0 replies; 19+ messages in thread
From: Arnd Bergmann @ 2008-06-21 21:06 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Gunnar von Boehn, Paul Mackerras, Michael Ellerman, cbe-oss-dev

On Saturday 21 June 2008, Paul Mackerras wrote:
> Is this application really transferring bulk data and using buffers
> that aren't a multiple of the page size?  Do you know whether the
> copies ended up being misaligned?

In the problem case that was reported to me, it was all bulk data,
and all the oprofile samples showed up in the unaligned code path
of the usercopy code, which does the microcoded (on cell) shift
operations.

> Of course, if we really want the fastest copy possible, the thing to
> do is to use VMX loads and stores on 970, POWER6 and Cell.  The
> overhead of setting up to use VMX in the kernel would probably kill
> any advantage, though -- at least, that's what I found when I tried
> using VMX for copy_page in the kernel on 970 a few years ago.

Right, that is understandable, we saw similar results when Sebastian
was working on VMX optimized AES code.

> Let's see what Mark comes up with.  We may be able to find a way to do
> it that works well across all current CPUs and also is OK for small
> copies. =A0If not we might need to do what you suggest.

ok.

	Arnd <><


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-20 17:46         ` Sanjay Patel
  2008-06-20 23:20           ` Benjamin Herrenschmidt
@ 2008-06-23  8:30           ` Gunnar von Boehn
  2008-06-23 12:07             ` Geert Uytterhoeven
  2008-06-23 23:49             ` Paul Mackerras
  1 sibling, 2 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-23  8:30 UTC (permalink / raw)
  To: sanjay3000
  Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann

Hi Sanjay,

> I have no idea how important unaligned or uncacheable
> copy perf is for Cell Linux. My experience is from Mac
> OS X for PPC, where we used dcbz in a general-purpose
> memcpy but were forced to pull that optimization because
> of the detrimental perf effect on important applications.

Interesting points.
Can you help me understand where the negative effect of DCBZ comes
from?


> I may be missing something, but I don't see how Cell's microcoded
> shift is much of a factor here.
> The problem is that the dcbz will generate the alignment exception
> regardless of whether the data is actually unaligned or not.
> Once you're on that code path, performance can't be good, can it?

In which case will DCBZ create an alignment exception?

If you want to see results on Cell, then here are the values you can
expect on 1 CPU:
On Cell the copy using the shift X-form achieves max 800 MB/sec.
The copy using a single byte loop achieves 800 MB/sec as well.

An unaligned copy using unrolled doublewords and cache prefetch
achieves about 2500 MB/sec.
The aligned case using unrolled doublewords and cache prefetch
achieves about 7000 MB/sec.


What hurts performance a lot on CELL (and on the XBOX 360) are two
things:
a) The first level cache latency, and the memory and 2nd level cache
latency.
Cell has a first level cache latency of 4.
Cell has a second level cache latency of 40.
Cell has a memory latency of 400.

To avoid the 1st level cache latency you need a distance of 4
instructions between a load and the use/store of its data.
Therefore a straight copy needs to be written like this:

.Lloop:
  ld      r9, 0x08(r4)
  ld      r7, 0x10(r4)
  ld      r8, 0x18(r4)
  ldu     r0, 0x20(r4)
  std     r9, 0x08(r6)  /* 4 instructions distance from the load */
  std     r7, 0x10(r6)
  std     r8, 0x18(r6)
  stdu    r0, 0x20(r6)
  bdnz    .Lloop



b) A major pain in the back is that the shift instruction is
microcoded.
While the shift X-form needs one clock on other PPC architectures, it
needs 11 clocks on CELL.
In addition to taking 11 clocks in the running thread, the microcoded
instruction will freeze the second thread.
Using microcoded instructions in an inner loop will really drain the
performance on CELL.
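
To illustrate, compare the two forms (operands arbitrary; the timing
is the CELL behaviour described above):

  sld   r0,r9,r10   /* X-form, shift amount in a register:
                       microcoded, roughly 11 clocks on CELL */
  sldi  r0,r9,8     /* immediate form: fast, but the shift amount is
                       fixed at build time, hence one copy routine
                       per alignment case */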


I think if you want to use the same copy for uncacheable memory, and
maybe for other PPC platforms, then a good compromise would be to use
the cache-prefetch version for the aligned case and the old SHIFT
code for the unaligned case.
This way you get maximum performance for aligned copies and good
results for the unaligned case.


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-23  8:30           ` Gunnar von Boehn
@ 2008-06-23 12:07             ` Geert Uytterhoeven
  2008-06-23 23:49             ` Paul Mackerras
  1 sibling, 0 replies; 19+ messages in thread
From: Geert Uytterhoeven @ 2008-06-23 12:07 UTC (permalink / raw)
  To: Gunnar von Boehn
  Cc: Mark Nelson, sanjay3000, Arnd Bergmann, linuxppc-dev,
	Michael Ellerman, cbe-oss-dev


On Mon, 23 Jun 2008, Gunnar von Boehn wrote:
> > The problem is that the dcbz will generate the alignment exception
> > regardless of whether the data is actually unaligned or not.
> > Once you're on that code path, performance can't be good, can it?
> 
> In which case will DCBZ create an aligned exception?

When using dcbz on uncached memory, IIRC.

With kind regards,

Geert Uytterhoeven
Software Architect

Sony Techsoft Centre
The Corporate Village · Da Vincilaan 7-D1 · B-1935 Zaventem · Belgium

Phone:    +32 (0)2 700 8453
Fax:      +32 (0)2 700 8622
E-mail:   Geert.Uytterhoeven@sonycom.com
Internet: http://www.sony-europe.com/

Sony Technology and Software Centre Europe
A division of Sony Service Centre (Europe) N.V.
Registered office: Technologielaan 7 · B-1840 Londerzeel · Belgium
VAT BE 0413.825.160 · RPR Brussels
Fortis 293-0376800-10 GEBA-BE-BB


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-23  8:30           ` Gunnar von Boehn
  2008-06-23 12:07             ` Geert Uytterhoeven
@ 2008-06-23 23:49             ` Paul Mackerras
  2008-06-27 13:30               ` Gunnar von Boehn
  1 sibling, 1 reply; 19+ messages in thread
From: Paul Mackerras @ 2008-06-23 23:49 UTC (permalink / raw)
  To: Gunnar von Boehn
  Cc: Mark Nelson, sanjay3000, Arnd Bergmann, linuxppc-dev,
	Michael Ellerman, cbe-oss-dev

Gunnar von Boehn writes:

> Interesting points.
> Can you help me to understand where the negative effect of DCBZ does come
> from?

In my experience, dcbz slows down the hot-cache case because it adds a
few cycles to the execution time of the inner loop, and on most 64-bit
PowerPC implementations, it doesn't actually help even in the
cold-cache case because the store queue does enough write combining
that the cache doesn't end up reading the line from memory.  I don't
know whether the Cell PPE can do that, but I could believe that it
can't.

Paul.


* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
  2008-06-23 23:49             ` Paul Mackerras
@ 2008-06-27 13:30               ` Gunnar von Boehn
  0 siblings, 0 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-27 13:30 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Mark Nelson, Arnd Bergmann, sanjay3000, linuxppc-dev,
	Michael Ellerman, cbe-oss-dev

Hi Paul,


> In my experience, dcbz slows down the hot-cache case because it adds a
> few cycles to the execution time of the inner loop, and on most 64-bit
> PowerPC implementations, it doesn't actually help even in the
> cold-cache case because the store queue does enough write combining

I agree with you that on POWER the dcbz is probably not helping.

On PowerPC my experience is different.
From what I have seen, DCBZ helps enormously on 970, PA-Semi and CELL.


Cheers
Gunnar

