* [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
@ 2008-06-19 7:53 Mark Nelson
2008-06-19 14:43 ` Arnd Bergmann
0 siblings, 1 reply; 19+ messages in thread
From: Mark Nelson @ 2008-06-19 7:53 UTC (permalink / raw)
To: linuxppc-dev, cbe-oss-dev; +Cc: Gunnar von Boehn, Michael Ellerman
/*
 * Copyright (C) 2008 Gunnar von Boehn, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 *
 * __copy_tofrom_user routine optimized for CELL-BE-PPC
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL: 1st level data cache = 32K - 2nd level data cache = 512K
 * - 3rd level data cache = 0K
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 *
 * The below code is loop unrolled for the CELL cache line of 128 bytes.
 *
 * Label numbering convention: numeric labels on loads start at 20,
 * numeric labels on stores start at 60, and the exception fixup entry
 * for a faulting instruction is its label plus 100 (see the __ex_table
 * entries at the end of this file).
 *
 * Register usage:
 *   r3 = original destination (preserved for fault accounting)
 *   r4 = source pointer
 *   r5 = remaining byte count
 *   r6 = current destination pointer
 *   r7 = (src - dst) delta in the head/tail paths, scratch elsewhere
 */

#include <asm/processor.h>
#include <asm/ppc_asm.h>

#define PREFETCH_AHEAD 6	/* # of SRC cachelines to prefetch ahead */
#define ZERO_AHEAD 4		/* # of DST cachelines ahead to dcbz */

	.align	7
_GLOBAL(__copy_tofrom_user)
	dcbt	0,r4		/* Prefetch ONE SRC cacheline */
	std	r5,-8(r1)	/* remember size */
	cmpldi	cr1,r5,16	/* is size < 16 ? */
	mr	r6,r3		/* r6 = running destination pointer */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	clrldi	r8,r8,64-4	/* align to 16 byte boundary */
	sub	r7,r4,r3	/* r7 = src - dst, for single-pointer indexed copies */
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	/* Copy 0-15 head bytes so the destination becomes 16-byte aligned.
	 * The bits of r8 moved into cr7 select which of the 1/2/4/8-byte
	 * copies below are needed. */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7 */
	subf	r5,r8,r5
	bf	cr7*4+3,1f
20:	lbzx	r0,r7,r6	/* copy 1 byte */
60:	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
21:	lhzx	r0,r7,r6	/* copy 2 byte */
61:	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
22:	lwzx	r0,r7,r6	/* copy 4 byte */
62:	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
23:	ldx	r0,r7,r6	/* copy 8 byte */
63:	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* recompute src: (src - dst) + new dst */

.Ldst_aligned:
	cmpdi	cr5,r5,128-1
	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu */
	addi	r4,r4,-8	/* prepare for ldu */
	clrldi	r7,r7,64-7	/* align to cacheline boundary */
	ble+	cr5,.Llessthancacheline
	cmpldi	cr6,r7,0
	subf	r5,r7,r5
	srdi	r7,r7,4		/* divide size by 16 */
	srdi	r10,r5,7	/* number of cache lines to copy */
	cmpldi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch */
	beq	.Lnocacheprefetch
	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance */
	ble	.Llessthanmaxprefetch
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD
.Llessthanmaxprefetch:
	mtctr	r10
.LprefetchSRC:			/* warm up: prefetch the first SRC cachelines */
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC
.Lnocacheprefetch:
	mtctr	r7
	cmpldi	cr1,r5,128
	clrldi	r5,r5,64-7
	beq	cr6,.Lcachelinealigned	/* dst already on a cacheline boundary */

.Laligntocacheline:		/* 16 bytes/iteration until dst is cacheline aligned */
24:	ld	r9,0x08(r4)
25:	ldu	r7,0x10(r4)
64:	std	r9,0x08(r6)
65:	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline

.Lcachelinealigned:		/* copy whole cache lines */
	blt-	cr1,.Llessthancacheline	/* size <128 */
.Louterloop:
	cmpdi	r11,0
	mtctr	r11
	beq-	.Lendloop
	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist */

	.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop:				/* Copy aligned body */
	dcbt	r12,r4		/* PREFETCH SOURCE cache lines ahead */
26:	ld	r9, 0x08(r4)
4000:	dcbz	r11,r6		/* zero DST cacheline ahead to avoid the read-for-ownership */
27:	ld	r7, 0x10(r4)	/* 4 register stride copy */
28:	ld	r8, 0x18(r4)	/* 4 are optimal to hide 1st level cache latency */
29:	ld	r0, 0x20(r4)
66:	std	r9, 0x08(r6)
67:	std	r7, 0x10(r6)
68:	std	r8, 0x18(r6)
69:	std	r0, 0x20(r6)
30:	ld	r9, 0x28(r4)
31:	ld	r7, 0x30(r4)
32:	ld	r8, 0x38(r4)
33:	ld	r0, 0x40(r4)
70:	std	r9, 0x28(r6)
71:	std	r7, 0x30(r6)
72:	std	r8, 0x38(r6)
73:	std	r0, 0x40(r6)
34:	ld	r9, 0x48(r4)
35:	ld	r7, 0x50(r4)
36:	ld	r8, 0x58(r4)
37:	ld	r0, 0x60(r4)
74:	std	r9, 0x48(r6)
75:	std	r7, 0x50(r6)
76:	std	r8, 0x58(r6)
77:	std	r0, 0x60(r6)
38:	ld	r9, 0x68(r4)
39:	ld	r7, 0x70(r4)
40:	ld	r8, 0x78(r4)
41:	ldu	r0, 0x80(r4)
78:	std	r9, 0x68(r6)
79:	std	r7, 0x70(r6)
80:	std	r8, 0x78(r6)
81:	stdu	r0, 0x80(r6)
	bdnz	.Lloop
.Lendloop:
	/* Remaining cachelines (no prefetch left to issue): 32-byte stride */
	cmpdi	r10,0
	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride */
	beq-	.Lendloop2
	mtctr	r10
.Lloop2:			/* Copy aligned body */
42:	ld	r9, 0x08(r4)
43:	ld	r7, 0x10(r4)
44:	ld	r8, 0x18(r4)
45:	ldu	r0, 0x20(r4)
82:	std	r9, 0x08(r6)
83:	std	r7, 0x10(r6)
84:	std	r8, 0x18(r6)
85:	stdu	r0, 0x20(r6)
	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than a cache line left to do? */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* divide size by 16 */
	blt-	.Ldo_lt16
	mtctr	r7
.Lcopy_remaining:		/* 16 bytes/iteration */
46:	ld	r8,0x08(r4)
47:	ldu	r7,0x10(r4)
86:	std	r8,0x08(r6)
87:	stdu	r7,0x10(r6)
	bdnz	.Lcopy_remaining
.Ldo_lt16:			/* less than 16 ? */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15) */
	beq	sp1		/* no rest to copy */
	addi	r4,r4,8
	addi	r6,r6,8

.Lshortcopy:			/* SIMPLE COPY to handle size =< 15 bytes */
	mtcrf	0x01,r5		/* cr7 bits select 8/4/2/1 byte copies below */
	sub	r7,r4,r6
	bf-	cr7*4+0,sp8
48:	ldx	r0,r7,r6	/* copy 8 byte */
88:	std	r0,0(r6)
	addi	r6,r6,8
sp8:
	bf	cr7*4+1,sp4
49:	lwzx	r0,r7,r6	/* copy 4 byte */
89:	stw	r0,0(r6)
	addi	r6,r6,4
sp4:
	bf	cr7*4+2,sp2
50:	lhzx	r0,r7,r6	/* copy 2 byte */
90:	sth	r0,0(r6)
	addi	r6,r6,2
sp2:
	bf	cr7*4+3,sp1
51:	lbzx	r0,r7,r6	/* copy 1 byte */
91:	stb	r0,0(r6)
sp1:
	li	r3,0		/* success: return 0 bytes not copied */
	blr

/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load, we set the rest of the destination to 0
 * (fixup label = faulting instruction's label + 100)
 */

/* faults in the unaligned-head stores (60-63): r4 still holds src-dst */
151:
150:
149:
148:
	add	r4,r7,r6
	b	1002f
/* faults in the unaligned-head loads (21-23) */
123:
122:
121:
	add	r4,r7,r6
	add	r5,r8,r5	/* original size is r5 + r8, no need to go to stack */
	b	1001f
120:
	add	r5,r8,r5	/* original size is r5 + r8, no need to go to stack */
	b	1003f		/* we know we can't copy any more bytes so jump to clring */
/* faults in the unrolled cacheline loop (loads 30-41): back out the
 * pointer advance for however many 32-byte groups already completed */
141:
140:
139:
138:
	addi	r6,r6,32
	addi	r4,r4,32
137:
136:
135:
134:
	addi	r6,r6,32
	addi	r4,r4,32
133:
132:
131:
130:
	addi	r6,r6,32
	addi	r4,r4,32
4100:
147:
146:
145:
144:
143:
142:
129:
128:
127:
126:
125:
124:
	addi	r6,r6,8
	addi	r4,r4,8
/*
 * we had a fault on a load
 * r6 - first unmodified byte of the destination
 * r3 - original destination
 * r4 - next byte we have to read for a load
 */
1002:	ld	r5,-8(r1)
1001:	subf	r3,r3,r6	/* number of bytes we did copy */
	subf	r5,r3,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 * (fall back to a byte-at-a-time copy until the faulting address is reached)
 */
	mtctr	r5
52:	lbz	r0,0(r4)
	addi	r4,r4,1
92:	stb	r0,0(r6)
	addi	r6,r6,1
	bdnz	52b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, need to clear ctr bytes starting at r6
 */
152:	mfctr	r5
1003:	li	r0,0
	mr	r4,r6
	mr	r3,r5		/* return the number of bytes not copied */
	/* zero the tail: bytes until 8-byte alignment, then doublewords,
	 * then the final 0-7 bytes */
1:	andi.	r9,r4,7
	beq	3f
93:	stb	r0,0(r4)
	addic.	r5,r5,-1
	addi	r4,r4,1
	bne	1b
	blr
3:	cmpldi	cr1,r5,8
	srdi	r9,r5,3
	andi.	r5,r5,7
	blt	cr1,1000f
	mtctr	r9
94:	std	r0,0(r4)
	addi	r4,r4,8
	bdnz	94b
1000:	beqlr
	mtctr	r5
95:	stb	r0,0(r4)
	addi	r4,r4,1
	bdnz	95b
	blr

/*
 * we had a fault on a store
 * r6 - byte we tried to store to
 * r3 - original destination
 * The cascade below rewinds r6 by 8 per completed store so it ends up
 * pointing at the first byte NOT successfully stored.
 */
181:
	addi	r6,r6,8
180:
	addi	r6,r6,8
179:
	addi	r6,r6,8
178:
	addi	r6,r6,8
177:
	addi	r6,r6,8
176:
	addi	r6,r6,8
175:
	addi	r6,r6,8
174:
	addi	r6,r6,8
173:
	addi	r6,r6,8
172:
	addi	r6,r6,8
171:
	addi	r6,r6,8
170:
	addi	r6,r6,8
185:
169:
	addi	r6,r6,8
184:
168:
	addi	r6,r6,8
187:
183:
167:
165:
	addi	r6,r6,8
186:
182:
166:
164:
	addi	r6,r6,8
191:
190:
189:
188:
163:
162:
161:
160:
	ld	r5,-8(r1)
	subf	r3,r3,r6	/* number of bytes we did copy */
	subf	r3,r3,r5
195:
194:
193:
	blr			/* #bytes not copied in r3 */
192:
	mfctr	r3		/* fault in the clearing loop: ctr = bytes left */
	blr

	/* Exception table: pairs of (faulting insn, fixup) addresses.
	 * Fixup address = insn label + 100, per the convention above. */
	.section __ex_table,"a"
	.align	3
	.llong	20b,120b
	.llong	60b,160b
	.llong	21b,121b
	.llong	61b,161b
	.llong	22b,122b
	.llong	62b,162b
	.llong	23b,123b
	.llong	63b,163b
	.llong	24b,124b
	.llong	25b,125b
	.llong	64b,164b
	.llong	65b,165b
	.llong	26b,126b
	.llong	27b,127b
	.llong	28b,128b
	.llong	29b,129b
	.llong	66b,166b
	.llong	67b,167b
	.llong	68b,168b
	.llong	69b,169b
	.llong	30b,130b
	.llong	31b,131b
	.llong	32b,132b
	.llong	33b,133b
	.llong	70b,170b
	.llong	71b,171b
	.llong	72b,172b
	.llong	73b,173b
	.llong	34b,134b
	.llong	35b,135b
	.llong	36b,136b
	.llong	37b,137b
	.llong	74b,174b
	.llong	75b,175b
	.llong	76b,176b
	.llong	77b,177b
	.llong	38b,138b
	.llong	39b,139b
	.llong	40b,140b
	.llong	41b,141b
	.llong	78b,178b
	.llong	79b,179b
	.llong	80b,180b
	.llong	81b,181b
	.llong	42b,142b
	.llong	43b,143b
	.llong	44b,144b
	.llong	45b,145b
	.llong	82b,182b
	.llong	83b,183b
	.llong	84b,184b
	.llong	85b,185b
	.llong	46b,146b
	.llong	47b,147b
	.llong	86b,186b
	.llong	87b,187b
	.llong	48b,148b
	.llong	88b,188b
	.llong	49b,149b
	.llong	89b,189b
	.llong	50b,150b
	.llong	90b,190b
	.llong	51b,151b
	.llong	91b,191b
	.llong	52b,152b
	.llong	92b,192b
	.llong	93b,193b
	.llong	94b,194b
	.llong	95b,195b
	.llong	4000b,4100b
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-19 7:53 [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell Mark Nelson
@ 2008-06-19 14:43 ` Arnd Bergmann
2008-06-19 15:17 ` Gunnar von Boehn
2008-06-20 1:55 ` Mark Nelson
0 siblings, 2 replies; 19+ messages in thread
From: Arnd Bergmann @ 2008-06-19 14:43 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Mark Nelson, Gunnar von Boehn, cbe-oss-dev, Michael Ellerman
On Thursday 19 June 2008, Mark Nelson wrote:
> * __copy_tofrom_user routine optimized for CELL-BE-PPC
A few things I noticed:
* You don't have a page wise user copy, which the regular code
has. This is probably not so noticable in iperf, but should
have a significant impact on lmbench and on a number of file
system tests that copy large amounts of data. Have you checked
that the loop around cache lines is just as fast?
* You don't align the source to word size, only the target.
Does this get handled correctly when the source is a noncacheable
mapping, e.g. an unaligned copy_from_user where the source points
to a physical local store mapping of an SPU? I don't think we
need to optimize this case for performance, but I'm not sure
if it would crash. AFAIR, unaligned loads from noncacheable storage
give you an alignment exception that you need to handle, right?
* The naming of the labels (with just numbers) is rather confusing,
it would be good to have something better, but I must admit that
I don't have a good idea either.
* The trick of using the condition code in cr7 for the last bytes
is really cute, but are the four branches actually better than a
single computed branch into the middle of 15 byte wise copies?
Arnd <><
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-19 14:43 ` Arnd Bergmann
@ 2008-06-19 15:17 ` Gunnar von Boehn
2008-06-19 16:13 ` Sanjay Patel
2008-06-20 1:13 ` [Cbe-oss-dev] " Paul Mackerras
2008-06-20 1:55 ` Mark Nelson
1 sibling, 2 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-19 15:17 UTC (permalink / raw)
To: Arnd Bergmann; +Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev
Hi Arnd,
> You don't have a page wise user copy,
> which the regular code has.
The new code does not need two versions, IMHO.
The "regular" code was much slower for the normal case and has a special
version for the 4K optimized case.
The new code is equally good in both cases, so adding an extra 4K routine
will only increase the code size for very minor gain. I'm not sure if it's
worth it.
Benchmark result on QS22 for good aligned copy
Old-code : 1300 MB/sec
Old-code 4k Special case: 2600 MB/sec
New code : 4000 MB/sec (always)
> You don't align the source to word size, only the target.
> Does this get handled correctly when the source
> is a noncacheable mapping, e.g.
The problem is that on CELL the required shift instructions
for SRC alignment are microcoded, in other words really slow.
You are right the main copy2user requires that the SRC is cacheable.
IMHO because of the exception on load, the routine should fallback to the
byte copy loop.
Arnd, could you verify that it works on localstore?
Cheers
Gunnar
Arnd Bergmann
<arnd@arndb.de>
To
19/06/2008 16:43 linuxppc-dev@ozlabs.org
cc
Mark Nelson <markn@au1.ibm.com>,
cbe-oss-dev@ozlabs.org, Gunnar von
Boehn/Germany/Contr/IBM@IBMDE,
Michael Ellerman
<ellerman@au1.ibm.com>
Subject
Re: [RFC 1/3] powerpc:
__copy_tofrom_user tweaked for Cell
On Thursday 19 June 2008, Mark Nelson wrote:
> * __copy_tofrom_user routine optimized for CELL-BE-PPC
A few things I noticed:
* You don't have a page wise user copy, which the regular code
has. This is probably not so noticable in iperf, but should
have a significant impact on lmbench and on a number of file
system tests that copy large amounts of data. Have you checked
that the loop around cache lines is just as fast?
* You don't align the source to word size, only the target.
Does this get handled correctly when the source is a noncacheable
mapping, e.g. an unaligned copy_from_user where the source points
to a physical local store mapping of an SPU? I don't think we
need to optimize this case for performance, but I'm not sure
if it would crash. AFAIR, unaligned loads from noncacheable storage
give you an alignment exception that you need to handle, right?
* The naming of the labels (with just numbers) is rather confusing,
it would be good to have something better, but I must admit that
I don't have a good idea either.
* The trick of using the condition code in cr7 for the last bytes
is really cute, but are the four branches actually better than a
single computed branch into the middle of 15 byte wise copies?
Arnd <><
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-19 15:17 ` Gunnar von Boehn
@ 2008-06-19 16:13 ` Sanjay Patel
2008-06-20 11:36 ` Gunnar von Boehn
2008-06-20 1:13 ` [Cbe-oss-dev] " Paul Mackerras
1 sibling, 1 reply; 19+ messages in thread
From: Sanjay Patel @ 2008-06-19 16:13 UTC (permalink / raw)
To: Arnd Bergmann, Gunnar von Boehn
Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev
--- On Thu, 6/19/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:
> You are right the main copy2user requires that the SRC is
> cacheable.
> IMHO because of the exception on load, the routine should
> fallback to the
> byte copy loop.
>
> Arnd, could you verify that it works on localstore?
Since the main loops use 'dcbz', the destination must also be cacheable. IIRC, if the destination is write-through or cache-inhibited, the 'dcbz' will cause an alignment exception. I suppose it would still function correctly via the handler, but horribly slowly.
--Sanjay
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-19 15:17 ` Gunnar von Boehn
2008-06-19 16:13 ` Sanjay Patel
@ 2008-06-20 1:13 ` Paul Mackerras
2008-06-20 16:47 ` Gunnar von Boehn
2008-06-21 2:00 ` Arnd Bergmann
1 sibling, 2 replies; 19+ messages in thread
From: Paul Mackerras @ 2008-06-20 1:13 UTC (permalink / raw)
To: Gunnar von Boehn
Cc: linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann
Gunnar von Boehn writes:
> The "regular" code was much slower for the normal case and has a special
> version for the 4K optimized case.
That's a slightly inaccurate view...
The reason for having the two cases is that when I profiled the
distribution of sizes and alignments of memory copies in the kernel,
the result was that almost all copies (something like 99%, IIRC) were
either 128 bytes or less, or else a whole page at a page-aligned
address.
Thus we get the best performance by having a simple copy routine with
minimal setup overhead for the small copy case, plus an aggressively
optimized page copy routine. Spending time setting up for a
multi-cacheline copy that's not a whole page is just going to hurt the
small copy case without providing any real benefit.
Transferring data over loopback is possibly an exception to that.
However, it's very rare to transfer large amounts of data over
loopback, unless you're running a benchmark like iperf or netperf. :-/
Paul.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-19 14:43 ` Arnd Bergmann
2008-06-19 15:17 ` Gunnar von Boehn
@ 2008-06-20 1:55 ` Mark Nelson
1 sibling, 0 replies; 19+ messages in thread
From: Mark Nelson @ 2008-06-20 1:55 UTC (permalink / raw)
To: linuxppc-dev
Cc: Gunnar von Boehn, cbe-oss-dev, Arnd Bergmann, Michael Ellerman
> * The naming of the labels (with just numbers) is rather confusing,
> it would be good to have something better, but I must admit that
> I don't have a good idea either.
I will admit that at first glance the label naming with numbers
does look confusing but when you notice that all the loads start
at 20 and all the stores start at 60 and that to get the exception
handler for those instructions you just add 100 I think it makes
sense, but that could be because I've been looking at it way too
long...
(I thought I had a comment in there to that effect but it must
have gotten lost along the way. I'll add a new comment
explaining the above, that should help)
>
> * The trick of using the condition code in cr7 for the last bytes
> is really cute, but are the four branches actually better than a
> single computed branch into the middle of 15 byte wise copies?
The original copy_tofrom_user does this also, which I guess is
carried over to this new version...
Gunnar did you have an old version that did something similar
to this?
Mark
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-19 16:13 ` Sanjay Patel
@ 2008-06-20 11:36 ` Gunnar von Boehn
2008-06-20 17:46 ` Sanjay Patel
0 siblings, 1 reply; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-20 11:36 UTC (permalink / raw)
To: sanjay3000
Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann
Hi Sanjay,
> I suppose it would still function correctly via the handler, but horribly
slowly.
How important is best performance for the unaligned copy to/from
uncacheable memory?
The challenge of the CELL chip is that X-form of the shift instructions are
microcoded.
The shifts are needed to implement a copy that reads and writes always
aligned.
There is of course the option to not use the X-form of the shift but to
write several copy routines
using immediate shift instructions and to pick the matching copy routine.
This option would of course highly increase the code size of the memcopy
routine.
Kind regards
Gunnar
Sanjay Patel
<sanjay3000@yahoo
.com> To
Arnd Bergmann <arnd@arndb.de>,
19/06/2008 18:13 Gunnar von
Boehn/Germany/Contr/IBM@IBMDE
cc
Please respond to Mark Nelson <markn@au1.ibm.com>,
sanjay3000@yahoo. linuxppc-dev@ozlabs.org, Michael
com Ellerman <ellerman@au1.ibm.com>,
cbe-oss-dev@ozlabs.org
Subject
Re: [RFC 1/3] powerpc:
__copy_tofrom_user tweaked for Cell
--- On Thu, 6/19/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:
> You are right the main copy2user requires that the SRC is
> cacheable.
> IMHO because of the exception on load, the routine should
> fallback to the
> byte copy loop.
>
> Arnd, could you verify that it works on localstore?
Since the main loops use 'dcbz', the destination must also be cacheable.
IIRC, if the destination is write-through or cache-inhibited, the 'dcbz'
will cause an alignment exception. I suppose it would still function
correctly via the handler, but horribly slowly.
--Sanjay
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-20 1:13 ` [Cbe-oss-dev] " Paul Mackerras
@ 2008-06-20 16:47 ` Gunnar von Boehn
2008-06-21 2:00 ` Arnd Bergmann
1 sibling, 0 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-20 16:47 UTC (permalink / raw)
To: Paul Mackerras; +Cc: linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann
Hi Paul,
Of course, I can only speak for the test result that I got on our
platforms.
We did test on PS3, QS21 single/dual, QS22 single/dual, and JS21
The performance of the old Linux routine and the new routine is about the
same for copies of less than 128 Bytes.
At 512 byte the new routine is about 100% faster than the old one. (on QS
21)
At 1500 Byte size, which is a typical ethernet frame size, the new routine
is over 3 times faster than the old one. (on QS21)
We could NOT see a performance decrease for small copies.
We saw that for copies of 512 byte and more the performance increase is
significant.
>However, it's very rare to transfer large amounts of data over
>loopback, unless you're running a benchmark like iperf or netperf.
Please mind that this test was done as its a simple way to show how much
less work the CPU needs to do to handle network traffic.
All network traffic goes to copy2user - all network traffic can now be done
with much less CPU power wasted for copying the data.
Don't you agree that network traffic or IO in general with packages over
500 Byte, is not a rare case?
Cheers
Gunnar
Paul Mackerras
<paulus@samba.org
> To
Gunnar von
20/06/2008 03:13 Boehn/Germany/Contr/IBM@IBMDE
cc
Arnd Bergmann <arnd@arndb.de>,
linuxppc-dev@ozlabs.org, Michael
Ellerman <ellerman@au1.ibm.com>,
cbe-oss-dev@ozlabs.org
Subject
Re: [Cbe-oss-dev] [RFC 1/3]
powerpc: __copy_tofrom_user tweaked
for Cell
Gunnar von Boehn writes:
> The "regular" code was much slower for the normal case and has a special
> version for the 4K optimized case.
That's a slightly inaccurate view...
The reason for having the two cases is that when I profiled the
distribution of sizes and alignments of memory copies in the kernel,
the result was that almost all copies (something like 99%, IIRC) were
either 128 bytes or less, or else a whole page at a page-aligned
address.
Thus we get the best performance by having a simple copy routine with
minimal setup overhead for the small copy case, plus an aggressively
optimized page copy routine. Spending time setting up for a
multi-cacheline copy that's not a whole page is just going to hurt the
small copy case without providing any real benefit.
Transferring data over loopback is possibly an exception to that.
However, it's very rare to transfer large amounts of data over
loopback, unless you're running a benchmark like iperf or netperf. :-/
Paul.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-20 11:36 ` Gunnar von Boehn
@ 2008-06-20 17:46 ` Sanjay Patel
2008-06-20 23:20 ` Benjamin Herrenschmidt
2008-06-23 8:30 ` Gunnar von Boehn
0 siblings, 2 replies; 19+ messages in thread
From: Sanjay Patel @ 2008-06-20 17:46 UTC (permalink / raw)
To: Gunnar von Boehn
Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann
--- On Fri, 6/20/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:
> How important is best performance for the unaligned copy
> to/from uncacheable memory?
> The challenge of the CELL chip is that X-form of the shift
> instructions are microcoded.
> The shifts are needed to implement a copy that reads and
> writes always aligned.
Hi Gunnar,
I have no idea how important unaligned or uncacheable copy perf is for Cell Linux. My experience is from Mac OS X for PPC, where we used dcbz in a general-purpose memcpy but were forced to pull that optimization because of the detrimental perf effect on important applications.
I may be missing something, but I don't see how Cell's microcoded shift is much of a factor here. The problem is that the dcbz will generate the alignment exception regardless of whether the data is actually unaligned or not. Once you're on that code path, performance can't be good, can it?
--Sanjay
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-20 17:46 ` Sanjay Patel
@ 2008-06-20 23:20 ` Benjamin Herrenschmidt
2008-06-20 23:44 ` Sanjay Patel
2008-06-23 8:30 ` Gunnar von Boehn
1 sibling, 1 reply; 19+ messages in thread
From: Benjamin Herrenschmidt @ 2008-06-20 23:20 UTC (permalink / raw)
To: sanjay3000
Cc: Mark Nelson, Gunnar von Boehn, Arnd Bergmann, linuxppc-dev,
Michael Ellerman, cbe-oss-dev
On Fri, 2008-06-20 at 10:46 -0700, Sanjay Patel wrote:
> --- On Fri, 6/20/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:
> > How important is best performance for the unaligned copy
> > to/from uncacheable memory?
> > The challenge of the CELL chip is that X-form of the shift
> > instructions are microcoded.
> > The shifts are needed to implement a copy that reads and
> > writes always aligned.
>
> Hi Gunnar,
>
> I have no idea how important unaligned or uncacheable copy perf is for
> Cell Linux. My experience is from Mac OS X for PPC, where we used dcbz
> in a general-purpose memcpy but were forced to pull that optimization
> because of the detrimental perf effect on important applications.
I though OS X had a trick with a CR bit that would disable the dcbz
optimization on the first alignment fault ? Or did they totally remove
it ?
> I may be missing something, but I don't see how Cell's microcoded
> shift is much of a factor here. The problem is that the dcbz will
> generate the alignment exception regardless of whether the data is
> actually unaligned or not. Once you're on that code path, performance
> can't be good, can it?
This is a concern. The problem is, do we want to lose all the benefit
of improved copy_to/from_user because of that ? Passing local store
addresses to/from read/write syscalls is supported, so I suppose it's a
real issue for reads.
On the other hand, how performant do we expect those to be ? That is, we
could have the alignment exception detect that it happened during
copy_to/from_user, and change the return address to a non-optimized
variant. Thus we would have at most one exception per read syscall.
Ben.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-20 23:20 ` Benjamin Herrenschmidt
@ 2008-06-20 23:44 ` Sanjay Patel
0 siblings, 0 replies; 19+ messages in thread
From: Sanjay Patel @ 2008-06-20 23:44 UTC (permalink / raw)
To: benh
Cc: Mark Nelson, Gunnar von Boehn, Arnd Bergmann, linuxppc-dev,
Michael Ellerman, cbe-oss-dev
--- On Fri, 6/20/08, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> I though OS X had a trick with a CR bit that would disable
> the dcbz optimization on the first alignment fault ? Or did they
> totally remove it ?
Ah, it's coming back to me. :)
Apple added 'dcbz', removed it, and then there was the clever trick of optimizing the code path with a boot-time perf test and/or changing the code on the first fault...I'm not sure what's implemented in the recent builds.
If Linux can do something similar, that should allow good perf on cacheable and cache-inhibited space as well as different CPUs (eg, if 'dcba' is available, then you don't need the alignment fault hack).
--Sanjay
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-20 1:13 ` [Cbe-oss-dev] " Paul Mackerras
2008-06-20 16:47 ` Gunnar von Boehn
@ 2008-06-21 2:00 ` Arnd Bergmann
2008-06-21 4:30 ` Paul Mackerras
1 sibling, 1 reply; 19+ messages in thread
From: Arnd Bergmann @ 2008-06-21 2:00 UTC (permalink / raw)
To: Paul Mackerras
Cc: linuxppc-dev, Gunnar von Boehn, Michael Ellerman, cbe-oss-dev
On Friday 20 June 2008, Paul Mackerras wrote:
> Transferring data over loopback is possibly an exception to that.
> However, it's very rare to transfer large amounts of data over
> loopback, unless you're running a benchmark like iperf or netperf. :-/
Well, it is the exact case that came up in a real world scenario
for cell: On a network intensive application where the SPUs are
supposed to do all the work, we ended up not getting enough
data in and out through gbit ethernet because the PPU spent
much of its time in copy_to_user.
Going to 10gbit will make the problem even more apparent.
I understand that optimizing for this case will cost extra
branches for the other cases, but maybe we can find a better
compromise than before. Can you name a test case that you
consider important to optimize for for what you consider
real-life tests?
Doing some static compile-time analysis, I found that most
of the call sites (which are not necessarily most of
the run time calls) pass either a small constant size of
less than a few cache lines, or have a variable size but are
not at all performance critical.
Since the prefetching and cache line size awareness was
most of the improvement for cell (AFAIU), maybe we can
annotate the few interesting cases, say by introducing a
new copy_from_user_large() function that can be easily
optimized for large transfers on a given CPU, while
the remaining code keeps optimizing for small transfers
and may even get rid of the full page copy optimization
in order to save a branch.
Arnd <><
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-21 2:00 ` Arnd Bergmann
@ 2008-06-21 4:30 ` Paul Mackerras
2008-06-21 4:49 ` David Miller
2008-06-21 21:06 ` Arnd Bergmann
0 siblings, 2 replies; 19+ messages in thread
From: Paul Mackerras @ 2008-06-21 4:30 UTC (permalink / raw)
To: Arnd Bergmann
Cc: linuxppc-dev, Gunnar von Boehn, Michael Ellerman, cbe-oss-dev
Arnd Bergmann writes:
> On Friday 20 June 2008, Paul Mackerras wrote:
>
> > Transferring data over loopback is possibly an exception to that.
> > However, it's very rare to transfer large amounts of data over
> > loopback, unless you're running a benchmark like iperf or netperf. :-/
>
> Well, it is the exact case that came up in a real world scenario
> for cell: On a network intensive application where the SPUs are
> supposed to do all the work, we ended up not getting enough
> data in and out through gbit ethernet because the PPU spent
^^^^^^^^^^^^^
Which isn't loopback... :)
I have no objection to improving copy_tofrom_user, memcpy and
copy_page. I just want to make sure that we don't make things worse
on some platform.
In fact, Mark and I dug up some experiments I had done 5 or 6 years
ago and just ran through all the copy loops I tried back then, on
QS22, POWER6, POWER5+, POWER5, POWER4, 970, and POWER3, and compared
them to the current kernel routines and the proposed new Cell
routines. So far we have just looked at the copy_page case (i.e. 4kB
on a 4kB alignment) for cache-cold and cache-hot cases.
Interestingly, some of the routines I discarded back then turn out to
do really well on most of the modern platforms, and quite a lot better
on Cell than Gunnar's code does (~10GB/s vs. ~5.5GB/s in the hot-cache
case, IIRC). Mark is going to summarise the results and also measure
the speed for smaller copies and misaligned copies.
As for the distribution of sizes, I think it would be worthwhile to
run a fresh set of tests. As I said, my previous results showed most
copies to be either small (<= 128B) or a multiple of 4k, and I think
that was true for copy_tofrom_user as well as memcpy, but that was a
while ago.
> much of its time in copy_to_user.
>
> Going to 10gbit will make the problem even more apparent.
Is this application really transferring bulk data and using buffers
that aren't a multiple of the page size? Do you know whether the
copies ended up being misaligned?
Of course, if we really want the fastest copy possible, the thing to
do is to use VMX loads and stores on 970, POWER6 and Cell. The
overhead of setting up to use VMX in the kernel would probably kill
any advantage, though -- at least, that's what I found when I tried
using VMX for copy_page in the kernel on 970 a few years ago.
> Doing some static compile-time analysis, I found that most
> of the call sites (which are not necessarily most of
> the run time calls) pass either a small constant size of
> less than a few cache lines, or have a variable size but are
> not at all performance critical.
> Since the prefetching and cache line size awareness was
> most of the improvement for cell (AFAIU), maybe we can
> annotate the few interesting cases, say by introducing a
> new copy_from_user_large() function that can be easily
> optimized for large transfers on a given CPU, while
> the remaining code keeps optimizing for small transfers
> and may even get rid of the full page copy optimization
> in order to save a branch.
Let's see what Mark comes up with. We may be able to find a way to do
it that works well across all current CPUs and also is OK for small
copies. If not we might need to do what you suggest.
Regards,
Paul.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-21 4:30 ` Paul Mackerras
@ 2008-06-21 4:49 ` David Miller
2008-06-21 21:06 ` Arnd Bergmann
1 sibling, 0 replies; 19+ messages in thread
From: David Miller @ 2008-06-21 4:49 UTC (permalink / raw)
To: paulus; +Cc: linuxppc-dev, VONBOEHN, ellerman, arnd, cbe-oss-dev
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 21 Jun 2008 14:30:02 +1000
> Is this application really transferring bulk data and using buffers
> that aren't a multiple of the page size? Do you know whether the
> copies ended up being misaligned?
We used to cache align the sub-buffers carved out of the per-socket
anonymous buffer page that TCP, UDP, and other protocols use to
collect user write()/sendmsg() data when the outgoing interface
supports scatter-gather and checksumming (basically just about
any reasonable ethernet device these days).
But that alignment was removed from net/ipv4/tcp.c (I forget the exact
reasons, I think it was space wastage).
The net result is that you will therefore see a lot of misaligned
copies for networking sends these days.
In my opinion optimizing memcpy/user-copy in the most general way is
the best course of action. Whatever histogram of sizes and alignments
you obtain with sampling today will change unpredictably in the
future. I'm saying this as someone who fell into that trap on sparc64
several times in the past, and I have 5 memcpy/user-copy/memset
variants to maintain on that platform these days :-/
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Cbe-oss-dev] [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-21 4:30 ` Paul Mackerras
2008-06-21 4:49 ` David Miller
@ 2008-06-21 21:06 ` Arnd Bergmann
1 sibling, 0 replies; 19+ messages in thread
From: Arnd Bergmann @ 2008-06-21 21:06 UTC (permalink / raw)
To: linuxppc-dev
Cc: Gunnar von Boehn, Paul Mackerras, Michael Ellerman, cbe-oss-dev
On Saturday 21 June 2008, Paul Mackerras wrote:
> Is this application really transferring bulk data and using buffers
> that aren't a multiple of the page size? Do you know whether the
> copies ended up being misaligned?
In the problem case that was reported to me, it was all bulk data,
and all the oprofile samples showed up in the unaligned code path
of the usercopy code, which does the microcoded (on cell) shift
operations.
> Of course, if we really want the fastest copy possible, the thing to
> do is to use VMX loads and stores on 970, POWER6 and Cell. The
> overhead of setting up to use VMX in the kernel would probably kill
> any advantage, though -- at least, that's what I found when I tried
> using VMX for copy_page in the kernel on 970 a few years ago.
Right, that is understandable, we saw similar results when Sebastian
was working on VMX optimized AES code.
> Let's see what Mark comes up with. We may be able to find a way to do
> it that works well across all current CPUs and also is OK for small
> copies. If not we might need to do what you suggest.
ok.
Arnd <><
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-20 17:46 ` Sanjay Patel
2008-06-20 23:20 ` Benjamin Herrenschmidt
@ 2008-06-23 8:30 ` Gunnar von Boehn
2008-06-23 12:07 ` Geert Uytterhoeven
2008-06-23 23:49 ` Paul Mackerras
1 sibling, 2 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-23 8:30 UTC (permalink / raw)
To: sanjay3000
Cc: Mark Nelson, linuxppc-dev, Michael Ellerman, cbe-oss-dev, Arnd Bergmann
Hi Sanjay,
> I have no idea how important unaligned or uncacheable
> copy perf is for Cell Linux. My experience is from Mac
> OS X for PPC, where we used dcbz in a general-purpose
> memcpy but were forced to pull that optimization because
> of the detrimental perf effect on important applications.
Interesting points.
Can you help me to understand where the negative effect of DCBZ does come
from?
> I may be missing something, but I don't see how Cell's microcoded shift
is much of a factor here.
> The problem is that the dcbz will generate the alignment exception
> regardless of whether the data is actually unaligned or not.
> Once you're on that code path, performance can't be good, can it?
In which case will DCBZ create an alignment exception?
If you want to see result on Cell then here are the values you can expect
on 1 CPU:
On Cell the copy using the Shift-xform achieves max 800 MB/sec.
The copy using a single byte loop achieves 800 MB/sec as well.
An unaligned copy using unrolled doublewords and cache prefetch achieves
about 2500 MB/sec.
The aligned case using unrolled doublewords and cache prefetch achieves
about 7000 MB/sec.
What hurts performance a lot on CELL (and on XBOX 360) are two things:
a) The first level cache latency, and the memory and 2nd level cache
latency.
Cell has a first level cache latency of 4.
Cell has a second level cache latency of 40.
Cell has a memory latency of 400.
To avoid the 1st level cache latency you need to have 4 instruction
distance between your load and usage/store of the data.
Therefore a straight copy needs to be written like this.
.Loop:
ld r9, 0x08(r4)
ld r7, 0x10(r4)
ld r8, 0x18(r4)
ldu r0, 0x20(r4)
std r9, 0x08(r6) // 4 instructions distance from load
std r7, 0x10(r6)
std r8, 0x18(r6)
stdu r0, 0x20(r6)
bdnz .Loop
b) A major pain in the back is the that the shift instruction is
microcoded.
While the SHIFT X-Form needs one clock on other PPC architectures, it needs
11 clocks on CELL.
In addition to taking 11 clocks in the thread running it, the microcoded
instruction will freeze the second thread.
Using microcoded instructions in a work loop will really drain the
performance on CELL.
I think if you want to use the same copy for uncacheable memory and maybe
for another PPC platform
then a good compromise will be to use the cache prefetch version for the
aligned case and to use an old SHIFT part for the unaligned case.
This way you will get max performance for aligned copies and good result
for the unaligned case.
Sanjay Patel
<sanjay3000@yahoo
.com> To
Gunnar von
20/06/2008 19:46 Boehn/Germany/Contr/IBM@IBMDE
cc
Arnd Bergmann <arnd@arndb.de>,
Please respond to cbe-oss-dev@ozlabs.org, Michael
sanjay3000@yahoo. Ellerman <ellerman@au1.ibm.com>,
com linuxppc-dev@ozlabs.org, Mark
Nelson <markn@au1.ibm.com>
Subject
Re: [RFC 1/3] powerpc:
__copy_tofrom_user tweaked for Cell
--- On Fri, 6/20/08, Gunnar von Boehn <VONBOEHN@de.ibm.com> wrote:
> How important is best performance for the unaligned copy
> to/from uncacheable memory?
> The challenge of the CELL chip is that X-form of the shift
> instructions are microcoded.
> The shifts are needed to implement a copy that reads and
> writes always aligned.
Hi Gunnar,
I have no idea how important unaligned or uncacheable copy perf is for Cell
Linux. My experience is from Mac OS X for PPC, where we used dcbz in a
general-purpose memcpy but were forced to pull that optimization because of
the detrimental perf effect on important applications.
I may be missing something, but I don't see how Cell's microcoded shift is
much of a factor here. The problem is that the dcbz will generate the
alignment exception regardless of whether the data is actually unaligned or
not. Once you're on that code path, performance can't be good, can it?
--Sanjay
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-23 8:30 ` Gunnar von Boehn
@ 2008-06-23 12:07 ` Geert Uytterhoeven
2008-06-23 23:49 ` Paul Mackerras
1 sibling, 0 replies; 19+ messages in thread
From: Geert Uytterhoeven @ 2008-06-23 12:07 UTC (permalink / raw)
To: Gunnar von Boehn
Cc: Mark Nelson, sanjay3000, Arnd Bergmann, linuxppc-dev,
Michael Ellerman, cbe-oss-dev
[-- Attachment #1: Type: TEXT/PLAIN, Size: 871 bytes --]
On Mon, 23 Jun 2008, Gunnar von Boehn wrote:
> > The problem is that the dcbz will generate the alignment exception
> > regardless of whether the data is actually unaligned or not.
> > Once you're on that code path, performance can't be good, can it?
>
> In which case will DCBZ create an alignment exception?
When using dcbz on uncached memory, IIRC.
With kind regards,
Geert Uytterhoeven
Software Architect
Sony Techsoft Centre
The Corporate Village · Da Vincilaan 7-D1 · B-1935 Zaventem · Belgium
Phone: +32 (0)2 700 8453
Fax: +32 (0)2 700 8622
E-mail: Geert.Uytterhoeven@sonycom.com
Internet: http://www.sony-europe.com/
Sony Technology and Software Centre Europe
A division of Sony Service Centre (Europe) N.V.
Registered office: Technologielaan 7 · B-1840 Londerzeel · Belgium
VAT BE 0413.825.160 · RPR Brussels
Fortis 293-0376800-10 GEBA-BE-BB
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-23 8:30 ` Gunnar von Boehn
2008-06-23 12:07 ` Geert Uytterhoeven
@ 2008-06-23 23:49 ` Paul Mackerras
2008-06-27 13:30 ` Gunnar von Boehn
1 sibling, 1 reply; 19+ messages in thread
From: Paul Mackerras @ 2008-06-23 23:49 UTC (permalink / raw)
To: Gunnar von Boehn
Cc: Mark Nelson, sanjay3000, Arnd Bergmann, linuxppc-dev,
Michael Ellerman, cbe-oss-dev
Gunnar von Boehn writes:
> Interesting points.
> Can you help me to understand where the negative effect of DCBZ does come
> from?
In my experience, dcbz slows down the hot-cache case because it adds a
few cycles to the execution time of the inner loop, and on most 64-bit
PowerPC implementations, it doesn't actually help even in the
cold-cache case because the store queue does enough write combining
that the cache doesn't end up reading the line from memory. I don't
know whether the Cell PPE can do that, but I could believe that it
can't.
Paul.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell
2008-06-23 23:49 ` Paul Mackerras
@ 2008-06-27 13:30 ` Gunnar von Boehn
0 siblings, 0 replies; 19+ messages in thread
From: Gunnar von Boehn @ 2008-06-27 13:30 UTC (permalink / raw)
To: Paul Mackerras
Cc: Mark Nelson, Arnd Bergmann, sanjay3000, linuxppc-dev,
Michael Ellerman, cbe-oss-dev
Hi Paul,
> In my experience, dcbz slows down the hot-cache case because it adds a
> few cycles to the execution time of the inner loop, and on most 64-bit
> PowerPC implementations, it doesn't actually help even in the
> cold-cache case because the store queue does enough write combining
I agree with you that on POWER the dcbz is probably not helping.
On PowerPC my experience is different.
From what I have seen DCBZ helps enormously on 970, PA-Semi and CELL.
Cheers
Gunnar
Paul Mackerras
<paulus@samba.org
> To
Gunnar von
24/06/2008 01:49 Boehn/Germany/Contr/IBM@IBMDE
cc
sanjay3000@yahoo.com, Mark Nelson
<markn@au1.ibm.com>,
linuxppc-dev@ozlabs.org, Michael
Ellerman <ellerman@au1.ibm.com>,
cbe-oss-dev@ozlabs.org, Arnd
Bergmann <arnd@arndb.de>
Subject
Re: [RFC 1/3] powerpc:
__copy_tofrom_user tweaked for Cell
Gunnar von Boehn writes:
> Interesting points.
> Can you help me to understand where the negative effect of DCBZ does come
> from?
In my experience, dcbz slows down the hot-cache case because it adds a
few cycles to the execution time of the inner loop, and on most 64-bit
PowerPC implementations, it doesn't actually help even in the
cold-cache case because the store queue does enough write combining
that the cache doesn't end up reading the line from memory. I don't
know whether the Cell PPE can do that, but I could believe that it
can't.
Paul.
^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2008-06-27 13:30 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-19 7:53 [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell Mark Nelson
2008-06-19 14:43 ` Arnd Bergmann
2008-06-19 15:17 ` Gunnar von Boehn
2008-06-19 16:13 ` Sanjay Patel
2008-06-20 11:36 ` Gunnar von Boehn
2008-06-20 17:46 ` Sanjay Patel
2008-06-20 23:20 ` Benjamin Herrenschmidt
2008-06-20 23:44 ` Sanjay Patel
2008-06-23 8:30 ` Gunnar von Boehn
2008-06-23 12:07 ` Geert Uytterhoeven
2008-06-23 23:49 ` Paul Mackerras
2008-06-27 13:30 ` Gunnar von Boehn
2008-06-20 1:13 ` [Cbe-oss-dev] " Paul Mackerras
2008-06-20 16:47 ` Gunnar von Boehn
2008-06-21 2:00 ` Arnd Bergmann
2008-06-21 4:30 ` Paul Mackerras
2008-06-21 4:49 ` David Miller
2008-06-21 21:06 ` Arnd Bergmann
2008-06-20 1:55 ` Mark Nelson
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.