* [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 @ 2016-04-19 6:29 Pan Xinhui 2016-04-19 9:18 ` Boqun Feng 2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui 0 siblings, 2 replies; 22+ messages in thread From: Pan Xinhui @ 2016-04-19 6:29 UTC (permalink / raw) To: linux-kernel, linuxppc-dev Cc: benh, paulus, mpe, boqun.feng, peterz, paulmck, tglx From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> Implement xchg{u8,u16}{local,relaxed}, and cmpxchg{u8,u16}{,local,acquire,relaxed}. It works on all ppc. Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> --- change from V1: rework totally. --- arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index 44efe73..79a1f45 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -7,6 +7,37 @@ #include <asm/asm-compat.h> #include <linux/bug.h> +#ifdef __BIG_ENDIAN +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) +#else +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE) +#endif + +static __always_inline unsigned long +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old, + unsigned long new); + +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v) \ +static __always_inline u32 \ +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \ +{ \ + int size = sizeof (type); \ + int off = (unsigned long)ptr % sizeof(u32); \ + volatile u32 *p = ptr - off; \ + int bitoff = BITOFF_CAL(size, off); \ + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \ + u32 oldv, newv; \ + u32 ret; \ + do { \ + oldv = READ_ONCE(*p); \ + ret = (oldv & bitmask) >> bitoff; \ + if (skip && ret != old) \ + break; \ + newv = (oldv & ~bitmask) | (new << bitoff); \ + } while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\ + return ret; \ +} + /* * Atomic exchange * @@ -14,6 +45,19 @@ * the previous value stored there. 
*/ +#define XCHG_GEN(type, sfx, v) \ + __XCHG_GEN(_, type, sfx, _local, 0, v) \ +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \ +{ \ + return ___xchg_##type##sfx(p, 0, n); \ +} + +XCHG_GEN(u8, _local, volatile); +XCHG_GEN(u8, _relaxed, ); +XCHG_GEN(u16, _local, volatile); +XCHG_GEN(u16, _relaxed, ); +#undef XCHG_GEN + static __always_inline unsigned long __xchg_u32_local(volatile void *p, unsigned long val) { @@ -88,6 +132,10 @@ static __always_inline unsigned long __xchg_local(volatile void *ptr, unsigned long x, unsigned int size) { switch (size) { + case 1: + return __xchg_u8_local(ptr, x); + case 2: + return __xchg_u16_local(ptr, x); case 4: return __xchg_u32_local(ptr, x); #ifdef CONFIG_PPC64 @@ -103,6 +151,10 @@ static __always_inline unsigned long __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) { switch (size) { + case 1: + return __xchg_u8_relaxed(ptr, x); + case 2: + return __xchg_u16_relaxed(ptr, x); case 4: return __xchg_u32_relaxed(ptr, x); #ifdef CONFIG_PPC64 @@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new) return prev; } + +#define CMPXCHG_GEN(type, sfx, v) \ + __XCHG_GEN(cmp, type, sfx, sfx, 1, v) + +CMPXCHG_GEN(u8, , volatile); +CMPXCHG_GEN(u8, _local, volatile); +CMPXCHG_GEN(u8, _relaxed, ); +CMPXCHG_GEN(u8, _acquire, ); +CMPXCHG_GEN(u16, , volatile); +CMPXCHG_GEN(u16, _local, volatile); +CMPXCHG_GEN(u16, _relaxed, ); +CMPXCHG_GEN(u16, _acquire, ); +#undef CMPXCHG_GEN +#undef __XCHG_GEN + #ifdef CONFIG_PPC64 static __always_inline unsigned long __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new) @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8(ptr, old, new); + case 2: + return __cmpxchg_u16(ptr, old, new); case 4: return __cmpxchg_u32(ptr, old, new); #ifdef CONFIG_PPC64 @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_local(ptr, old, new); + case 2: + return __cmpxchg_u16_local(ptr, old, new); case 4: return __cmpxchg_u32_local(ptr, old, new); #ifdef CONFIG_PPC64 @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_relaxed(ptr, old, new); + case 2: + return __cmpxchg_u16_relaxed(ptr, old, new); case 4: return __cmpxchg_u32_relaxed(ptr, old, new); #ifdef CONFIG_PPC64 @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_acquire(ptr, old, new); + case 2: + return __cmpxchg_u16_acquire(ptr, old, new); case 4: return __cmpxchg_u32_acquire(ptr, old, new); #ifdef CONFIG_PPC64 -- 2.4.3 ^ permalink raw reply related [flat|nested] 22+ messages in thread
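The trick the patch relies on, doing a sub-word cmpxchg by operating on the
aligned 32-bit word that contains it, can be illustrated with a minimal
user-space sketch. All names below are invented, and GCC's
__sync_val_compare_and_swap merely stands in for the kernel's __cmpxchg_u32:

#include <stdint.h>

/* Bit offset of the sub-word field inside its aligned 32-bit word,
 * mirroring the patch's BITOFF_CAL(): on big-endian, byte 0 lives in
 * the most-significant bits. */
#ifdef __BIG_ENDIAN__
#define BITOFF(size, off)	((sizeof(uint32_t) - (size) - (off)) * 8)
#else
#define BITOFF(size, off)	((off) * 8)
#endif

static inline uint8_t cmpxchg_u8_sketch(uint8_t *ptr, uint8_t old, uint8_t new)
{
	uintptr_t off = (uintptr_t)ptr % sizeof(uint32_t);
	uint32_t *p = (uint32_t *)((uintptr_t)ptr - off);
	unsigned int bitoff = BITOFF(sizeof(uint8_t), off);
	uint32_t mask = 0xffu << bitoff;
	uint32_t oldv, newv, ret;

	do {
		oldv = *(volatile uint32_t *)p;		/* READ_ONCE() stand-in */
		ret = (oldv & mask) >> bitoff;
		if (ret != old)				/* compare failed: no store */
			break;
		newv = (oldv & ~mask) | ((uint32_t)new << bitoff);
	} while (__sync_val_compare_and_swap(p, oldv, newv) != oldv);

	return (uint8_t)ret;
}

The word-sized CAS succeeds only if the neighbouring bytes were unchanged as
well, so a concurrent update to another byte of the same word simply costs
one more trip around the loop.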
* Re: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 2016-04-19 6:29 [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 Pan Xinhui @ 2016-04-19 9:18 ` Boqun Feng 2016-04-20 3:39 ` Pan Xinhui 2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui 1 sibling, 1 reply; 22+ messages in thread From: Boqun Feng @ 2016-04-19 9:18 UTC (permalink / raw) To: Pan Xinhui Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx Hi Xinhui, On Tue, Apr 19, 2016 at 02:29:34PM +0800, Pan Xinhui wrote: > From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> > > Implement xchg{u8,u16}{local,relaxed}, and > cmpxchg{u8,u16}{,local,acquire,relaxed}. > > It works on all ppc. > Nice work! AFAICT, your work doesn't depend on anything that ppc-specific, right? So maybe we can use it as a general approach for a fallback implementation on the archs without u8/u16 atomics. ;-) > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org> > Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> > --- > change from V1: > rework totally. > --- > arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++ > 1 file changed, 83 insertions(+) > > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h > index 44efe73..79a1f45 100644 > --- a/arch/powerpc/include/asm/cmpxchg.h > +++ b/arch/powerpc/include/asm/cmpxchg.h > @@ -7,6 +7,37 @@ > #include <asm/asm-compat.h> > #include <linux/bug.h> > > +#ifdef __BIG_ENDIAN > +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) > +#else > +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE) > +#endif > + > +static __always_inline unsigned long > +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old, > + unsigned long new); > + > +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v) \ > +static __always_inline u32 \ > +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \ > +{ \ > + int size = sizeof (type); \ > + int off = (unsigned long)ptr % sizeof(u32); \ > + volatile u32 *p = ptr - off; \ > + int bitoff = BITOFF_CAL(size, off); \ > + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \ > + u32 oldv, newv; \ > + u32 ret; \ > + do { \ > + oldv = READ_ONCE(*p); \ > + ret = (oldv & bitmask) >> bitoff; \ > + if (skip && ret != old) \ > + break; \ > + newv = (oldv & ~bitmask) | (new << bitoff); \ > + } while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\ Forgive me if this is too paranoid, but I think we can save the READ_ONCE() in the loop if we change the code into the following, because cmpxchg will return the "new" value, if the cmp part fails. newv = READ_ONCE(*p); do { oldv = newv; ret = (oldv & bitmask) >> bitoff; if (skip && ret != old) break; newv = (oldv & ~bitmask) | (new << bitoff); newv = __cmpxchg_u32##u32sfx((void *)p, oldv, newv); } while(newv != oldv); > + return ret; \ > +} > + > /* > * Atomic exchange > * > @@ -14,6 +45,19 @@ > * the previous value stored there. > */ > > +#define XCHG_GEN(type, sfx, v) \ > + __XCHG_GEN(_, type, sfx, _local, 0, v) \ ^^^^^^^ This should be sfx, right? Otherwise, all the newly added xchg will call __cmpxchg_u32_local, this will result in wrong ordering guarantees. > +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \ > +{ \ > + return ___xchg_##type##sfx(p, 0, n); \ > +} > + > +XCHG_GEN(u8, _local, volatile); I don't think we need the "volatile" modifier here, because READ_ONCE() and __cmpxchg_u32_* all have "volatile" semantics IIUC, so maybe we can save a paramter for the __XCHG_GEN macro. 
Regards, Boqun > +XCHG_GEN(u8, _relaxed, ); > +XCHG_GEN(u16, _local, volatile); > +XCHG_GEN(u16, _relaxed, ); > +#undef XCHG_GEN > + > static __always_inline unsigned long > __xchg_u32_local(volatile void *p, unsigned long val) > { > @@ -88,6 +132,10 @@ static __always_inline unsigned long > __xchg_local(volatile void *ptr, unsigned long x, unsigned int size) > { > switch (size) { > + case 1: > + return __xchg_u8_local(ptr, x); > + case 2: > + return __xchg_u16_local(ptr, x); > case 4: > return __xchg_u32_local(ptr, x); > #ifdef CONFIG_PPC64 > @@ -103,6 +151,10 @@ static __always_inline unsigned long > __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) > { > switch (size) { > + case 1: > + return __xchg_u8_relaxed(ptr, x); > + case 2: > + return __xchg_u16_relaxed(ptr, x); > case 4: > return __xchg_u32_relaxed(ptr, x); > #ifdef CONFIG_PPC64 > @@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new) > return prev; > } > > + > +#define CMPXCHG_GEN(type, sfx, v) \ > + __XCHG_GEN(cmp, type, sfx, sfx, 1, v) > + > +CMPXCHG_GEN(u8, , volatile); > +CMPXCHG_GEN(u8, _local, volatile); > +CMPXCHG_GEN(u8, _relaxed, ); > +CMPXCHG_GEN(u8, _acquire, ); > +CMPXCHG_GEN(u16, , volatile); > +CMPXCHG_GEN(u16, _local, volatile); > +CMPXCHG_GEN(u16, _relaxed, ); > +CMPXCHG_GEN(u16, _acquire, ); > +#undef CMPXCHG_GEN > +#undef __XCHG_GEN > + > #ifdef CONFIG_PPC64 > static __always_inline unsigned long > __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new) > @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8(ptr, old, new); > + case 2: > + return __cmpxchg_u16(ptr, old, new); > case 4: > return __cmpxchg_u32(ptr, old, new); > #ifdef CONFIG_PPC64 > @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8_local(ptr, old, new); > + case 2: > + return __cmpxchg_u16_local(ptr, old, new); > case 4: > return __cmpxchg_u32_local(ptr, old, new); > #ifdef CONFIG_PPC64 > @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8_relaxed(ptr, old, new); > + case 2: > + return __cmpxchg_u16_relaxed(ptr, old, new); > case 4: > return __cmpxchg_u32_relaxed(ptr, old, new); > #ifdef CONFIG_PPC64 > @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8_acquire(ptr, old, new); > + case 2: > + return __cmpxchg_u16_acquire(ptr, old, new); > case 4: > return __cmpxchg_u32_acquire(ptr, old, new); > #ifdef CONFIG_PPC64 > -- > 2.4.3 > ^ permalink raw reply [flat|nested] 22+ messages in thread
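In plain C, the restructured loop Boqun suggests drops into the same
user-space sketch as above like this (again only a sketch, with the same
invented BITOFF() helper and __sync_val_compare_and_swap stand-in):

static inline uint8_t cmpxchg_u8_oneload(uint8_t *ptr, uint8_t old, uint8_t new)
{
	uintptr_t off = (uintptr_t)ptr % sizeof(uint32_t);
	uint32_t *p = (uint32_t *)((uintptr_t)ptr - off);
	unsigned int bitoff = BITOFF(sizeof(uint8_t), off);
	uint32_t mask = 0xffu << bitoff;
	uint32_t oldv, newv, ret;

	newv = *(volatile uint32_t *)p;		/* the only explicit load */
	do {
		oldv = newv;
		ret = (oldv & mask) >> bitoff;
		if (ret != old)
			break;
		newv = (oldv & ~mask) | ((uint32_t)new << bitoff);
		/* a failed CAS hands back the current value; reuse it */
		newv = __sync_val_compare_and_swap(p, oldv, newv);
	} while (newv != oldv);

	return (uint8_t)ret;
}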
* Re: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-19  9:18 ` Boqun Feng
@ 2016-04-20  3:39   ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-20 3:39 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

Hello, Boqun,

On 2016-04-19 17:18, Boqun Feng wrote:
> Hi Xinhui,
>
> On Tue, Apr 19, 2016 at 02:29:34PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
>
> Nice work!
>
Thank you.

> AFAICT, your work doesn't depend on anything that ppc-specific, right?
> So maybe we can use it as a general approach for a fallback
> implementation on the archs without u8/u16 atomics. ;-)
>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>> ---
>> change from V1:
>> 	rework totally.
>> ---
>>  arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 83 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
>> index 44efe73..79a1f45 100644
>> --- a/arch/powerpc/include/asm/cmpxchg.h
>> +++ b/arch/powerpc/include/asm/cmpxchg.h
>> @@ -7,6 +7,37 @@
>>  #include <asm/asm-compat.h>
>>  #include <linux/bug.h>
>>
>> +#ifdef __BIG_ENDIAN
>> +#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
>> +#else
>> +#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
>> +#endif
>> +
>> +static __always_inline unsigned long
>> +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
>> +			unsigned long new);
>> +
>> +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v)		\
>> +static __always_inline u32					\
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)	\
>> +{								\
>> +	int size = sizeof (type);				\
>> +	int off = (unsigned long)ptr % sizeof(u32);		\
>> +	volatile u32 *p = ptr - off;				\
>> +	int bitoff = BITOFF_CAL(size, off);			\
>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>> +	u32 oldv, newv;						\
>> +	u32 ret;						\
>> +	do {							\
>> +		oldv = READ_ONCE(*p);				\
>> +		ret = (oldv & bitmask) >> bitoff;		\
>> +		if (skip && ret != old)				\
>> +			break;					\
>> +		newv = (oldv & ~bitmask) | (new << bitoff);	\
>> +	} while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\
>
> Forgive me if this is too paranoid, but I think we can save the
> READ_ONCE() in the loop if we change the code into the following,
> because cmpxchg will return the "new" value, if the cmp part fails.
>
> 	newv = READ_ONCE(*p);
>
> 	do {
> 		oldv = newv;
> 		ret = (oldv & bitmask) >> bitoff;
> 		if (skip && ret != old)
> 			break;
> 		newv = (oldv & ~bitmask) | (new << bitoff);
> 		newv = __cmpxchg_u32##u32sfx((void *)p, oldv, newv);
> 	} while (newv != oldv);
>
>> +	return ret;						\
>> +}

A little optimization. Patch V3 will include your code, thanks.

>> +
>>  /*
>>   * Atomic exchange
>>   *
>> @@ -14,6 +45,19 @@
>>   * the previous value stored there.
>>   */
>>
>> +#define XCHG_GEN(type, sfx, v)					\
>> +	__XCHG_GEN(_, type, sfx, _local, 0, v)			\
> 				   ^^^^^^^
>
> This should be sfx, right? Otherwise, all the newly added xchg will
> call __cmpxchg_u32_local, this will result in wrong ordering guarantees.
>
I meant that, but I will think about the ordering issue for a while. :)

>> +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)	\
>> +{								\
>> +	return ___xchg_##type##sfx(p, 0, n);			\
>> +}
>> +
>> +XCHG_GEN(u8, _local, volatile);
>
> I don't think we need the "volatile" modifier here, because READ_ONCE()
> and __cmpxchg_u32_* all have "volatile" semantics IIUC, so maybe we can
> save a paramter for the __XCHG_GEN macro.
>
Such cleanup work can be done in a separate patch. Here I just make the
compiler happy.

thanks
xinhui

> Regards,
> Boqun
>
>> +XCHG_GEN(u8, _relaxed, );
>> +XCHG_GEN(u16, _local, volatile);
>> +XCHG_GEN(u16, _relaxed, );
>> +#undef XCHG_GEN
>> +
>>  static __always_inline unsigned long
>>  __xchg_u32_local(volatile void *p, unsigned long val)
>>  {
>> @@ -88,6 +132,10 @@ static __always_inline unsigned long
>>  __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __xchg_u8_local(ptr, x);
>> +	case 2:
>> +		return __xchg_u16_local(ptr, x);
>>  	case 4:
>>  		return __xchg_u32_local(ptr, x);
>>  #ifdef CONFIG_PPC64
>> @@ -103,6 +151,10 @@ static __always_inline unsigned long
>>  __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __xchg_u8_relaxed(ptr, x);
>> +	case 2:
>> +		return __xchg_u16_relaxed(ptr, x);
>>  	case 4:
>>  		return __xchg_u32_relaxed(ptr, x);
>>  #ifdef CONFIG_PPC64
>> @@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
>>  	return prev;
>>  }
>>
>> +
>> +#define CMPXCHG_GEN(type, sfx, v)	\
>> +	__XCHG_GEN(cmp, type, sfx, sfx, 1, v)
>> +
>> +CMPXCHG_GEN(u8, , volatile);
>> +CMPXCHG_GEN(u8, _local, volatile);
>> +CMPXCHG_GEN(u8, _relaxed, );
>> +CMPXCHG_GEN(u8, _acquire, );
>> +CMPXCHG_GEN(u16, , volatile);
>> +CMPXCHG_GEN(u16, _local, volatile);
>> +CMPXCHG_GEN(u16, _relaxed, );
>> +CMPXCHG_GEN(u16, _acquire, );
>> +#undef CMPXCHG_GEN
>> +#undef __XCHG_GEN
>> +
>>  #ifdef CONFIG_PPC64
>>  static __always_inline unsigned long
>>  __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
>> @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
>>  	  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
>>  	  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8_local(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16_local(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32_local(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
>>  		  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8_relaxed(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16_relaxed(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32_relaxed(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
>>  		  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8_acquire(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16_acquire(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32_acquire(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> --
>> 2.4.3
>>

^ permalink raw reply	[flat|nested] 22+ messages in thread
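For reference, the ordering fix Boqun asks for amounts to forwarding sfx to
the 32-bit primitive instead of hard-coding _local; a sketch against the V2
macro (the forward declaration of the matching __cmpxchg_u32##sfx has to
follow along, which is what V3 below does):

#define XCHG_GEN(type, sfx, v)					\
	__XCHG_GEN(_, type, sfx, sfx, 0, v)			\
static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)	\
{								\
	return ___xchg_##type##sfx(p, 0, n);			\
}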
* [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16 2016-04-19 6:29 [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 Pan Xinhui 2016-04-19 9:18 ` Boqun Feng @ 2016-04-20 13:24 ` Pan Xinhui 2016-04-20 14:24 ` Peter Zijlstra 2016-04-27 9:16 ` [PATCH V4] " Pan Xinhui 1 sibling, 2 replies; 22+ messages in thread From: Pan Xinhui @ 2016-04-20 13:24 UTC (permalink / raw) To: linux-kernel, linuxppc-dev Cc: benh, paulus, mpe, boqun.feng, peterz, paulmck, tglx From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> Implement xchg{u8,u16}{local,relaxed}, and cmpxchg{u8,u16}{,local,acquire,relaxed}. It works on all ppc. The basic idea is from commit 3226aad81aa6 ("sh: support 1 and 2 byte xchg") Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> --- change from v2: in the do{}while(), we save one load and use corresponding cmpxchg suffix. Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN change from V1: rework totally. --- arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index 44efe73..2aec04e 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -7,6 +7,38 @@ #include <asm/asm-compat.h> #include <linux/bug.h> +#ifdef __BIG_ENDIAN +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) +#else +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE) +#endif + +#define __XCHG_GEN(cmp, type, sfx, skip, v) \ +static __always_inline unsigned long \ +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \ + unsigned long new); \ +static __always_inline u32 \ +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \ +{ \ + int size = sizeof (type); \ + int off = (unsigned long)ptr % sizeof(u32); \ + volatile u32 *p = ptr - off; \ + int bitoff = BITOFF_CAL(size, off); \ + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \ + u32 oldv, newv, tmp; \ + u32 ret; \ + oldv = READ_ONCE(*p); \ + do { \ + ret = (oldv & bitmask) >> bitoff; \ + if (skip && ret != old) \ + break; \ + newv = (oldv & ~bitmask) | (new << bitoff); \ + tmp = oldv; \ + oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \ + } while (tmp != oldv); \ + return ret; \ +} + /* * Atomic exchange * @@ -14,6 +46,19 @@ * the previous value stored there. 
*/ +#define XCHG_GEN(type, sfx, v) \ + __XCHG_GEN(_, type, sfx, 0, v) \ +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \ +{ \ + return ___xchg_##type##sfx(p, 0, n); \ +} + +XCHG_GEN(u8, _local, volatile); +XCHG_GEN(u8, _relaxed, ); +XCHG_GEN(u16, _local, volatile); +XCHG_GEN(u16, _relaxed, ); +#undef XCHG_GEN + static __always_inline unsigned long __xchg_u32_local(volatile void *p, unsigned long val) { @@ -88,6 +133,10 @@ static __always_inline unsigned long __xchg_local(volatile void *ptr, unsigned long x, unsigned int size) { switch (size) { + case 1: + return __xchg_u8_local(ptr, x); + case 2: + return __xchg_u16_local(ptr, x); case 4: return __xchg_u32_local(ptr, x); #ifdef CONFIG_PPC64 @@ -103,6 +152,10 @@ static __always_inline unsigned long __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) { switch (size) { + case 1: + return __xchg_u8_relaxed(ptr, x); + case 2: + return __xchg_u16_relaxed(ptr, x); case 4: return __xchg_u32_relaxed(ptr, x); #ifdef CONFIG_PPC64 @@ -131,6 +184,20 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) * and return the old value of *p. */ +#define CMPXCHG_GEN(type, sfx, v) \ + __XCHG_GEN(cmp, type, sfx, 1, v) + +CMPXCHG_GEN(u8, , volatile); +CMPXCHG_GEN(u8, _local, volatile); +CMPXCHG_GEN(u8, _relaxed, ); +CMPXCHG_GEN(u8, _acquire, ); +CMPXCHG_GEN(u16, , volatile); +CMPXCHG_GEN(u16, _local, volatile); +CMPXCHG_GEN(u16, _relaxed, ); +CMPXCHG_GEN(u16, _acquire, ); +#undef CMPXCHG_GEN +#undef __XCHG_GEN + static __always_inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) { @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8(ptr, old, new); + case 2: + return __cmpxchg_u16(ptr, old, new); case 4: return __cmpxchg_u32(ptr, old, new); #ifdef CONFIG_PPC64 @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_local(ptr, old, new); + case 2: + return __cmpxchg_u16_local(ptr, old, new); case 4: return __cmpxchg_u32_local(ptr, old, new); #ifdef CONFIG_PPC64 @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_relaxed(ptr, old, new); + case 2: + return __cmpxchg_u16_relaxed(ptr, old, new); case 4: return __cmpxchg_u32_relaxed(ptr, old, new); #ifdef CONFIG_PPC64 @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_acquire(ptr, old, new); + case 2: + return __cmpxchg_u16_acquire(ptr, old, new); case 4: return __cmpxchg_u32_acquire(ptr, old, new); #ifdef CONFIG_PPC64 -- 2.4.3 ^ permalink raw reply related [flat|nested] 22+ messages in thread
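To see what the generator produces, this is roughly what
CMPXCHG_GEN(u8, _acquire, ) above expands to, hand-expanded and reindented
for review (illustrative only):

static __always_inline unsigned long
__cmpxchg_u32_acquire(unsigned int *p, unsigned long old, unsigned long new);

static __always_inline u32
__cmpxchg_u8_acquire(void *ptr, u32 old, u32 new)
{
	int size = sizeof(u8);
	int off = (unsigned long)ptr % sizeof(u32);
	volatile u32 *p = ptr - off;
	int bitoff = BITOFF_CAL(size, off);
	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;
	u32 oldv, newv, tmp;
	u32 ret;

	oldv = READ_ONCE(*p);
	do {
		ret = (oldv & bitmask) >> bitoff;
		if (ret != old)		/* "skip" is 1 for the cmpxchg flavour */
			break;
		newv = (oldv & ~bitmask) | (new << bitoff);
		tmp = oldv;
		oldv = __cmpxchg_u32_acquire((u32 *)p, oldv, newv);
	} while (tmp != oldv);

	return ret;
}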
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16 2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui @ 2016-04-20 14:24 ` Peter Zijlstra 2016-04-21 15:35 ` Pan Xinhui 2016-04-27 9:16 ` [PATCH V4] " Pan Xinhui 1 sibling, 1 reply; 22+ messages in thread From: Peter Zijlstra @ 2016-04-20 14:24 UTC (permalink / raw) To: Pan Xinhui Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote: > +#define __XCHG_GEN(cmp, type, sfx, skip, v) \ > +static __always_inline unsigned long \ > +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \ > + unsigned long new); \ > +static __always_inline u32 \ > +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \ > +{ \ > + int size = sizeof (type); \ > + int off = (unsigned long)ptr % sizeof(u32); \ > + volatile u32 *p = ptr - off; \ > + int bitoff = BITOFF_CAL(size, off); \ > + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \ > + u32 oldv, newv, tmp; \ > + u32 ret; \ > + oldv = READ_ONCE(*p); \ > + do { \ > + ret = (oldv & bitmask) >> bitoff; \ > + if (skip && ret != old) \ > + break; \ > + newv = (oldv & ~bitmask) | (new << bitoff); \ > + tmp = oldv; \ > + oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \ > + } while (tmp != oldv); \ > + return ret; \ > +} So for an LL/SC based arch using cmpxchg() like that is sub-optimal. Why did you choose to write it entirely in C? ^ permalink raw reply [flat|nested] 22+ messages in thread
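For contrast, the LL/SC-native shape Peter is hinting at keeps the
mask-and-merge inside a single lwarx/stwcx. reservation window, roughly as
in this sketch (PPC405_ERR77 and barriers omitted; val and prev_mask are
assumed pre-shifted to the target field; V4 later in the thread adopts
essentially this form):

static inline unsigned int xchg_u32_masked(volatile unsigned int *p,
					   unsigned int val,
					   unsigned int prev_mask)
{
	unsigned int prev, tmp;

	__asm__ __volatile__(
"1:	lwarx	%0,0,%3\n"	/* load the word, take the reservation */
"	andc	%1,%0,%5\n"	/* clear the addressed byte/halfword */
"	or	%1,%1,%4\n"	/* merge in the new value */
"	stwcx.	%1,0,%3\n"	/* conditional store */
"	bne-	1b\n"		/* reservation lost: retry */
	: "=&r" (prev), "=&r" (tmp), "+m" (*p)
	: "r" (p), "r" (val), "r" (prev_mask)
	: "cc", "memory");

	return prev;
}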
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-20 14:24   ` Peter Zijlstra
@ 2016-04-21 15:35     ` Pan Xinhui
  2016-04-21 15:52       ` Boqun Feng
  2016-04-21 16:13       ` Peter Zijlstra
  0 siblings, 2 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-21 15:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On 2016-04-20 22:24, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
>
>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)			\
>> +static __always_inline unsigned long				\
>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,	\
>> +			unsigned long new);			\
>> +static __always_inline u32					\
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)	\
>> +{								\
>> +	int size = sizeof (type);				\
>> +	int off = (unsigned long)ptr % sizeof(u32);		\
>> +	volatile u32 *p = ptr - off;				\
>> +	int bitoff = BITOFF_CAL(size, off);			\
>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>> +	u32 oldv, newv, tmp;					\
>> +	u32 ret;						\
>> +	oldv = READ_ONCE(*p);					\
>> +	do {							\
>> +		ret = (oldv & bitmask) >> bitoff;		\
>> +		if (skip && ret != old)				\
>> +			break;					\
>> +		newv = (oldv & ~bitmask) | (new << bitoff);	\
>> +		tmp = oldv;					\
>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
>> +	} while (tmp != oldv);					\
>> +	return ret;						\
>> +}
>
> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
>
> Why did you choose to write it entirely in C?
>
Yes, you are right: more loads/stores will be done in C code.
However, such xchg_u8/u16 is just used by qspinlock now, and I did not see
any performance regression. So I just wrote it in C, for simplicity. :)

Of course I have done xchg tests.
We ran code just like xchg((u8*)&v, j++); in several threads, and the
result is:
[  768.374264] use time[1550072]ns in xchg_u8_asm
[  768.377102] use time[2826802]ns in xchg_u8_c

I think this is because there is one more load in C.
If possible, we can move such code into asm-generic/.

thanks
xinhui

^ permalink raw reply	[flat|nested] 22+ messages in thread
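That test is easy to approximate in user space. A purely hypothetical
reconstruction (thread count, iteration count and the __atomic_exchange_n
builtin are editorial choices, not taken from the original test):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS	4
#define ITERS		(1 << 22)

static unsigned int v;

static void *worker(void *arg)
{
	unsigned char j = 0;
	long i;

	for (i = 0; i < ITERS; i++)	/* xchg((u8*)&v, j++); */
		__atomic_exchange_n((unsigned char *)&v, j++, __ATOMIC_RELAXED);
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];
	struct timespec a, b;
	long ns;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &b);

	ns = (b.tv_sec - a.tv_sec) * 1000000000L + (b.tv_nsec - a.tv_nsec);
	printf("use time[%ld]ns\n", ns);
	return 0;
}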
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 15:35     ` Pan Xinhui
@ 2016-04-21 15:52       ` Boqun Feng
  2016-04-22  1:59         ` Pan Xinhui
  0 siblings, 1 reply; 22+ messages in thread
From: Boqun Feng @ 2016-04-21 15:52 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: Peter Zijlstra, linux-kernel, linuxppc-dev, benh, paulus, mpe, paulmck, tglx

[-- Attachment #1: Type: text/plain, Size: 2087 bytes --]

On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> On 2016-04-20 22:24, Peter Zijlstra wrote:
> > On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> >
> >> +#define __XCHG_GEN(cmp, type, sfx, skip, v)			\
> >> +static __always_inline unsigned long				\
> >> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,	\
> >> +			unsigned long new);			\
> >> +static __always_inline u32					\
> >> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)	\
> >> +{								\
> >> +	int size = sizeof (type);				\
> >> +	int off = (unsigned long)ptr % sizeof(u32);		\
> >> +	volatile u32 *p = ptr - off;				\
> >> +	int bitoff = BITOFF_CAL(size, off);			\
> >> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> >> +	u32 oldv, newv, tmp;					\
> >> +	u32 ret;						\
> >> +	oldv = READ_ONCE(*p);					\
> >> +	do {							\
> >> +		ret = (oldv & bitmask) >> bitoff;		\
> >> +		if (skip && ret != old)				\
> >> +			break;					\
> >> +		newv = (oldv & ~bitmask) | (new << bitoff);	\
> >> +		tmp = oldv;					\
> >> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> >> +	} while (tmp != oldv);					\
> >> +	return ret;						\
> >> +}
> >
> > So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> >
> > Why did you choose to write it entirely in C?
> >
> yes, you are right. more load/store will be done in C code.
> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> So just wrote in C, for simple. :)
>
> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [  768.374264] use time[1550072]ns in xchg_u8_asm

How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
loop with shifting and masking in it?

Regards,
Boqun

> [  768.377102] use time[2826802]ns in xchg_u8_c
>
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.
>
> thanks
> xinhui
>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 15:52       ` Boqun Feng
@ 2016-04-22  1:59         ` Pan Xinhui
  2016-04-22  3:16           ` Boqun Feng
  0 siblings, 1 reply; 22+ messages in thread
From: Pan Xinhui @ 2016-04-22 1:59 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Peter Zijlstra, linux-kernel, linuxppc-dev, benh, paulus, mpe, paulmck, tglx

On 2016-04-21 23:52, Boqun Feng wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> On 2016-04-20 22:24, Peter Zijlstra wrote:
>>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
>>>
>>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)			\
>>>> +static __always_inline unsigned long				\
>>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,	\
>>>> +			unsigned long new);			\
>>>> +static __always_inline u32					\
>>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)	\
>>>> +{								\
>>>> +	int size = sizeof (type);				\
>>>> +	int off = (unsigned long)ptr % sizeof(u32);		\
>>>> +	volatile u32 *p = ptr - off;				\
>>>> +	int bitoff = BITOFF_CAL(size, off);			\
>>>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>>>> +	u32 oldv, newv, tmp;					\
>>>> +	u32 ret;						\
>>>> +	oldv = READ_ONCE(*p);					\
>>>> +	do {							\
>>>> +		ret = (oldv & bitmask) >> bitoff;		\
>>>> +		if (skip && ret != old)				\
>>>> +			break;					\
>>>> +		newv = (oldv & ~bitmask) | (new << bitoff);	\
>>>> +		tmp = oldv;					\
>>>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
>>>> +	} while (tmp != oldv);					\
>>>> +	return ret;						\
>>>> +}
>>>
>>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
>>>
>>> Why did you choose to write it entirely in C?
>>>
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
>> So just wrote in C, for simple. :)
>>
>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
>
> How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> loop with shifting and masking in it?
>
Yes, using 32-bit ll/sc loops. It looks like:

	__asm__ __volatile__(
"1:	lwarx	%0,0,%3\n"
"	and	%1,%0,%5\n"
"	or	%1,%1,%4\n"
	PPC405_ERR77(0,%2)
"	stwcx.	%1,0,%3\n"
"	bne-	1b"
	: "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
	: "r" (_p), "r" (_newv), "r" (_oldv_mask)
	: "cc", "memory");

> Regards,
> Boqun
>
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
>>
>> thanks
>> xinhui
>>

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-22  1:59         ` Pan Xinhui
@ 2016-04-22  3:16           ` Boqun Feng
  0 siblings, 0 replies; 22+ messages in thread
From: Boqun Feng @ 2016-04-22 3:16 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: Peter Zijlstra, linux-kernel, linuxppc-dev, benh, paulus, mpe, paulmck, tglx

[-- Attachment #1: Type: text/plain, Size: 3194 bytes --]

On Fri, Apr 22, 2016 at 09:59:22AM +0800, Pan Xinhui wrote:
> On 2016-04-21 23:52, Boqun Feng wrote:
> > On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> >> On 2016-04-20 22:24, Peter Zijlstra wrote:
> >>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> >>>
> >>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)			\
> >>>> +static __always_inline unsigned long				\
> >>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,	\
> >>>> +			unsigned long new);			\
> >>>> +static __always_inline u32					\
> >>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)	\
> >>>> +{								\
> >>>> +	int size = sizeof (type);				\
> >>>> +	int off = (unsigned long)ptr % sizeof(u32);		\
> >>>> +	volatile u32 *p = ptr - off;				\
> >>>> +	int bitoff = BITOFF_CAL(size, off);			\
> >>>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> >>>> +	u32 oldv, newv, tmp;					\
> >>>> +	u32 ret;						\
> >>>> +	oldv = READ_ONCE(*p);					\
> >>>> +	do {							\
> >>>> +		ret = (oldv & bitmask) >> bitoff;		\
> >>>> +		if (skip && ret != old)				\
> >>>> +			break;					\
> >>>> +		newv = (oldv & ~bitmask) | (new << bitoff);	\
> >>>> +		tmp = oldv;					\
> >>>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> >>>> +	} while (tmp != oldv);					\
> >>>> +	return ret;						\
> >>>> +}
> >>>
> >>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> >>>
> >>> Why did you choose to write it entirely in C?
> >>>
> >> yes, you are right. more load/store will be done in C code.
> >> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> >> So just wrote in C, for simple. :)
> >>
> >> Of course I have done xchg tests.
> >> we run code just like xchg((u8*)&v, j++); in several threads.
> >> and the result is,
> >> [  768.374264] use time[1550072]ns in xchg_u8_asm
> >
> > How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> > loop with shifting and masking in it?
> >
> yes, using 32bit ll/sc loops.
>
> looks like:
> 	__asm__ __volatile__(
> "1:	lwarx	%0,0,%3\n"
> "	and	%1,%0,%5\n"
> "	or	%1,%1,%4\n"
> 	PPC405_ERR77(0,%2)
> "	stwcx.	%1,0,%3\n"
> "	bne-	1b"
> 	: "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
> 	: "r" (_p), "r" (_newv), "r" (_oldv_mask)
> 	: "cc", "memory");
>

Good, so this works for all ppc ISAs too. Given the performance
benefit (maybe caused by the reason Peter mentioned), I think we should
use this as the implementation of u8/u16 {cmp}xchg for now.

For Power7 and later, we can always switch to the lbarx/lharx version
if observable performance benefit can be achieved. But the choice is
left to you. After all, as you said, qspinlock is the only user ;-)

Regards,
Boqun

> > Regards,
> > Boqun
> >
> >> [  768.377102] use time[2826802]ns in xchg_u8_c
> >>
> >> I think this is because there is one more load in C.
> >> If possible, we can move such code in asm-generic/.
> >>
> >> thanks
> >> xinhui
> >>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread
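The Power7-and-later alternative Boqun mentions would use the ISA 2.06
byte-granule reservation instructions directly. A sketch, assuming
lbarx/stbcx. are available (not part of the posted patch; lharx/sthcx.
would give the u16 analogue):

static inline unsigned char xchg_u8_lbarx(volatile unsigned char *p,
					  unsigned char val)
{
	unsigned char prev;

	__asm__ __volatile__(
"1:	lbarx	%0,0,%2\n"	/* byte-granule load-and-reserve */
"	stbcx.	%1,0,%2\n"	/* byte-granule conditional store */
"	bne-	1b"
	: "=&r" (prev)
	: "r" (val), "r" (p)
	: "cc", "memory");

	return prev;
}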
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 15:35     ` Pan Xinhui
  2016-04-21 15:52       ` Boqun Feng
@ 2016-04-21 16:13       ` Peter Zijlstra
  2016-04-25 10:10         ` Pan Xinhui
  1 sibling, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2016-04-21 16:13 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> yes, you are right. more load/store will be done in C code.
> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> So just wrote in C, for simple. :)

Which is fine; but worthy of a note in your Changelog.

> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [  768.374264] use time[1550072]ns in xchg_u8_asm
> [  768.377102] use time[2826802]ns in xchg_u8_c
>
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.

So I'm not actually _that_ familiar with the PPC LL/SC implementation;
but there are things a CPU can do to optimize these loops.

For example, a CPU might choose to not release the exclusive hold of the
line for a number of cycles, except when it passes SC or an interrupt
happens. This way there's a smaller chance the SC fails and inhibits
forward progress.

By doing the modification outside of the LL/SC you lose such
advantages.

And yes, doing a !exclusive load prior to the exclusive load leads to an
even bigger window where the data can get changed out from under you.

^ permalink raw reply	[flat|nested] 22+ messages in thread
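On an implementation that does need explicit backoff, such as the ARC
example Peter cites below, the retry loop conceptually grows a delay on
each failure. A purely illustrative sketch; the delay policy and bound
are invented:

static inline unsigned int xchg_with_backoff(unsigned int *p, unsigned int new)
{
	unsigned int old, delay = 1, i;

	for (;;) {
		old = *(volatile unsigned int *)p;
		if (__sync_bool_compare_and_swap(p, old, new))
			return old;
		/* lost the race: back off before retrying */
		for (i = 0; i < delay; i++)
			__asm__ __volatile__("" ::: "memory");	/* stand-in delay */
		if (delay < 1024)
			delay <<= 1;
	}
}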
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 16:13       ` Peter Zijlstra
@ 2016-04-25 10:10         ` Pan Xinhui
  2016-04-25 15:37           ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Pan Xinhui @ 2016-04-25 10:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On 2016-04-22 00:13, Peter Zijlstra wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
>> So just wrote in C, for simple. :)
>
> Which is fine; but worthy of a note in your Changelog.
>
Will do that.

>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
>
> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> but there are things a CPU can do to optimize these loops.
>
> For example, a CPU might choose to not release the exclusive hold of the
> line for a number of cycles, except when it passes SC or an interrupt
> happens. This way there's a smaller chance the SC fails and inhibits
> forward progress.
>
I am not sure if there is such hardware optimization.

> By doing the modification outside of the LL/SC you lose such
> advantages.
>
> And yes, doing a !exclusive load prior to the exclusive load leads to an
> even bigger window where the data can get changed out from under you.
>
You are right. We have observed such data changes between the two
different loads.

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-25 10:10         ` Pan Xinhui
@ 2016-04-25 15:37           ` Peter Zijlstra
  2016-04-26 11:35             ` Pan Xinhui
  0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2016-04-25 15:37 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
> > So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> > but there are things a CPU can do to optimize these loops.
> >
> > For example, a CPU might choose to not release the exclusive hold of the
> > line for a number of cycles, except when it passes SC or an interrupt
> > happens. This way there's a smaller chance the SC fails and inhibits
> > forward progress.

> I am not sure if there is such hardware optimization.

So I think the hardware must do _something_, otherwise competing cores
doing load-exclusive could live-lock a system, each one endlessly
breaking the exclusive ownership of the other and the store-conditional
always failing.

Of course, there are such implementations, and they tend to have to put
in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
ARC for an example that needs to do this.)

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-25 15:37           ` Peter Zijlstra
@ 2016-04-26 11:35             ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-26 11:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On 2016-04-25 23:37, Peter Zijlstra wrote:
> On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
>>> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
>>> but there are things a CPU can do to optimize these loops.
>>>
>>> For example, a CPU might choose to not release the exclusive hold of the
>>> line for a number of cycles, except when it passes SC or an interrupt
>>> happens. This way there's a smaller chance the SC fails and inhibits
>>> forward progress.
>
>> I am not sure if there is such hardware optimization.
>
> So I think the hardware must do _something_, otherwise competing cores
> doing load-exclusive could live-lock a system, each one endlessly
> breaking the exclusive ownership of the other and the store-conditional
> always failing.
>
It seems there is no such optimization. We have observed SC failing almost
all the time in a contention test, and then getting stuck in the loop. :(
One thread modified val with LL/SC, and other threads just modified val
without any respect to LL/SC. So in the end, I chose to rewrite this patch
in asm. :)

> Of course, there are such implementations, and they tend to have to put
> in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
> ARC for an example that needs to do this.)
>

^ permalink raw reply	[flat|nested] 22+ messages in thread
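A hypothetical reconstruction of that contention test (thread bodies and
values are invented; the word-sized CAS stands in for the 32-bit
lwarx/stwcx. sequence, and the plain stores keep killing its reservation):

static unsigned int word;

/* emulated byte update via a 32-bit CAS loop (lwarx/stwcx. on ppc) */
static void *llsc_updater(void *arg)
{
	for (;;) {
		unsigned int old = *(volatile unsigned int *)&word;
		unsigned int new = (old & ~0xffu) | 0x5a;

		__sync_val_compare_and_swap(&word, old, new);
	}
	return NULL;
}

/* plain stores to a neighbouring byte of the same word, with no respect
 * for the reservation the other thread holds */
static void *plain_writer(void *arg)
{
	volatile unsigned char *b = (volatile unsigned char *)&word + 1;

	for (;;)
		(*b)++;
	return NULL;
}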
* [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16 2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui 2016-04-20 14:24 ` Peter Zijlstra @ 2016-04-27 9:16 ` Pan Xinhui 2016-04-27 13:58 ` Boqun Feng ` (2 more replies) 1 sibling, 3 replies; 22+ messages in thread From: Pan Xinhui @ 2016-04-27 9:16 UTC (permalink / raw) To: linux-kernel, linuxppc-dev Cc: benh, paulus, mpe, boqun.feng, peterz, paulmck, tglx From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> Implement xchg{u8,u16}{local,relaxed}, and cmpxchg{u8,u16}{,local,acquire,relaxed}. It works on all ppc. remove volatile of first parameter in __cmpxchg_local and __cmpxchg Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> --- change from v3: rewrite in asm for the LL/SC. remove volatile in __cmpxchg_local and __cmpxchg. change from v2: in the do{}while(), we save one load and use corresponding cmpxchg suffix. Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN change from V1: rework totally. --- arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index 44efe73..8a3735f 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -7,6 +7,71 @@ #include <asm/asm-compat.h> #include <linux/bug.h> +#ifdef __BIG_ENDIAN +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) +#else +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE) +#endif + +#define XCHG_GEN(type, sfx, cl) \ +static inline u32 __xchg_##type##sfx(void *p, u32 val) \ +{ \ + unsigned int prev, prev_mask, tmp, bitoff, off; \ + \ + off = (unsigned long)p % sizeof(u32); \ + bitoff = BITOFF_CAL(sizeof(type), off); \ + p -= off; \ + val <<= bitoff; \ + prev_mask = (u32)(type)-1 << bitoff; \ + \ + __asm__ __volatile__( \ +"1: lwarx %0,0,%3\n" \ +" andc %1,%0,%5\n" \ +" or %1,%1,%4\n" \ + PPC405_ERR77(0,%3) \ +" stwcx. %1,0,%3\n" \ +" bne- 1b\n" \ + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \ + : "r" (p), "r" (val), "r" (prev_mask) \ + : "cc", cl); \ + \ + return prev >> bitoff; \ +} + +#define CMPXCHG_GEN(type, sfx, br, br2, cl) \ +static inline \ +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new) \ +{ \ + unsigned int prev, prev_mask, tmp, bitoff, off; \ + \ + off = (unsigned long)p % sizeof(u32); \ + bitoff = BITOFF_CAL(sizeof(type), off); \ + p -= off; \ + old <<= bitoff; \ + new <<= bitoff; \ + prev_mask = (u32)(type)-1 << bitoff; \ + \ + __asm__ __volatile__( \ + br \ +"1: lwarx %0,0,%3\n" \ +" and %1,%0,%6\n" \ +" cmpw 0,%1,%4\n" \ +" bne- 2f\n" \ +" andc %1,%0,%6\n" \ +" or %1,%1,%5\n" \ + PPC405_ERR77(0,%3) \ +" stwcx. %1,0,%3\n" \ +" bne- 1b\n" \ + br2 \ + "\n" \ +"2:" \ + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \ + : "r" (p), "r" (old), "r" (new), "r" (prev_mask) \ + : "cc", cl); \ + \ + return prev >> bitoff; \ +} + /* * Atomic exchange * @@ -14,6 +79,11 @@ * the previous value stored there. 
*/ +XCHG_GEN(u8, _local, "memory"); +XCHG_GEN(u8, _relaxed, "cc"); +XCHG_GEN(u16, _local, "memory"); +XCHG_GEN(u16, _relaxed, "cc"); + static __always_inline unsigned long __xchg_u32_local(volatile void *p, unsigned long val) { @@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val) #endif static __always_inline unsigned long -__xchg_local(volatile void *ptr, unsigned long x, unsigned int size) +__xchg_local(void *ptr, unsigned long x, unsigned int size) { switch (size) { + case 1: + return __xchg_u8_local(ptr, x); + case 2: + return __xchg_u16_local(ptr, x); case 4: return __xchg_u32_local(ptr, x); #ifdef CONFIG_PPC64 @@ -103,6 +177,10 @@ static __always_inline unsigned long __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) { switch (size) { + case 1: + return __xchg_u8_relaxed(ptr, x); + case 2: + return __xchg_u16_relaxed(ptr, x); case 4: return __xchg_u32_relaxed(ptr, x); #ifdef CONFIG_PPC64 @@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) * and return the old value of *p. */ +CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); +CMPXCHG_GEN(u8, _local, , , "memory"); +CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); +CMPXCHG_GEN(u8, _relaxed, , , "cc"); +CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); +CMPXCHG_GEN(u16, _local, , , "memory"); +CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); +CMPXCHG_GEN(u16, _relaxed, , , "cc"); + static __always_inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) { @@ -312,10 +399,14 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new) #endif static __always_inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, +__cmpxchg(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8(ptr, old, new); + case 2: + return __cmpxchg_u16(ptr, old, new); case 4: return __cmpxchg_u32(ptr, old, new); #ifdef CONFIG_PPC64 @@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, } static __always_inline unsigned long -__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new, +__cmpxchg_local(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_local(ptr, old, new); + case 2: + return __cmpxchg_u16_local(ptr, old, new); case 4: return __cmpxchg_u32_local(ptr, old, new); #ifdef CONFIG_PPC64 @@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_relaxed(ptr, old, new); + case 2: + return __cmpxchg_u16_relaxed(ptr, old, new); case 4: return __cmpxchg_u32_relaxed(ptr, old, new); #ifdef CONFIG_PPC64 @@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { + case 1: + return __cmpxchg_u8_acquire(ptr, old, new); + case 2: + return __cmpxchg_u16_acquire(ptr, old, new); case 4: return __cmpxchg_u32_acquire(ptr, old, new); #ifdef CONFIG_PPC64 -- 2.4.3 ^ permalink raw reply related [flat|nested] 22+ messages in thread
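With V4 applied, callers can hand sub-word types straight to the generic
macros. An illustrative fragment (not from the patch) of the
qspinlock-style usage this enables:

struct byte_lock {
	u8 locked;	/* 0: free, 1: held */
};

static inline int byte_trylock(struct byte_lock *l)
{
	/* size 1 now dispatches __cmpxchg() -> __cmpxchg_u8() */
	return cmpxchg(&l->locked, 0, 1) == 0;
}

static inline u8 byte_reset(struct byte_lock *l)
{
	/* size 1 now dispatches __xchg_relaxed() -> __xchg_u8_relaxed() */
	return xchg_relaxed(&l->locked, 0);
}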
* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16 2016-04-27 9:16 ` [PATCH V4] " Pan Xinhui @ 2016-04-27 13:58 ` Boqun Feng 2016-04-27 14:16 ` Boqun Feng 2016-04-27 14:50 ` Boqun Feng 2016-04-28 7:59 ` Peter Zijlstra 2016-11-25 0:04 ` [V4] " Michael Ellerman 2 siblings, 2 replies; 22+ messages in thread From: Boqun Feng @ 2016-04-27 13:58 UTC (permalink / raw) To: Pan Xinhui Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx [-- Attachment #1: Type: text/plain, Size: 7814 bytes --] On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote: > From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> > > Implement xchg{u8,u16}{local,relaxed}, and > cmpxchg{u8,u16}{,local,acquire,relaxed}. > > It works on all ppc. > > remove volatile of first parameter in __cmpxchg_local and __cmpxchg > > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org> > Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> > --- > change from v3: > rewrite in asm for the LL/SC. > remove volatile in __cmpxchg_local and __cmpxchg. > change from v2: > in the do{}while(), we save one load and use corresponding cmpxchg suffix. > Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN > change from V1: > rework totally. > --- > arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++- > 1 file changed, 106 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h > index 44efe73..8a3735f 100644 > --- a/arch/powerpc/include/asm/cmpxchg.h > +++ b/arch/powerpc/include/asm/cmpxchg.h > @@ -7,6 +7,71 @@ > #include <asm/asm-compat.h> > #include <linux/bug.h> > > +#ifdef __BIG_ENDIAN > +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) > +#else > +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE) > +#endif > + > +#define XCHG_GEN(type, sfx, cl) \ > +static inline u32 __xchg_##type##sfx(void *p, u32 val) \ > +{ \ > + unsigned int prev, prev_mask, tmp, bitoff, off; \ > + \ > + off = (unsigned long)p % sizeof(u32); \ > + bitoff = BITOFF_CAL(sizeof(type), off); \ > + p -= off; \ > + val <<= bitoff; \ > + prev_mask = (u32)(type)-1 << bitoff; \ > + \ > + __asm__ __volatile__( \ > +"1: lwarx %0,0,%3\n" \ > +" andc %1,%0,%5\n" \ > +" or %1,%1,%4\n" \ > + PPC405_ERR77(0,%3) \ > +" stwcx. %1,0,%3\n" \ > +" bne- 1b\n" \ > + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \ I think we can save the "tmp" here by: __asm__ volatile__( "1: lwarx %0,0,%2\n" " andc %0,%0,%4\n" " or %0,%0,%3\n" PPC405_ERR77(0,%2) " stwcx. %0,0,%2\n" " bne- 1b\n" : "=&r" (prev), "+m" (*(u32*)p) : "r" (p), "r" (val), "r" (prev_mask) : "cc", cl); right? > + : "r" (p), "r" (val), "r" (prev_mask) \ > + : "cc", cl); \ > + \ > + return prev >> bitoff; \ > +} > + > +#define CMPXCHG_GEN(type, sfx, br, br2, cl) \ > +static inline \ > +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new) \ > +{ \ > + unsigned int prev, prev_mask, tmp, bitoff, off; \ > + \ > + off = (unsigned long)p % sizeof(u32); \ > + bitoff = BITOFF_CAL(sizeof(type), off); \ > + p -= off; \ > + old <<= bitoff; \ > + new <<= bitoff; \ > + prev_mask = (u32)(type)-1 << bitoff; \ > + \ > + __asm__ __volatile__( \ > + br \ > +"1: lwarx %0,0,%3\n" \ > +" and %1,%0,%6\n" \ > +" cmpw 0,%1,%4\n" \ > +" bne- 2f\n" \ > +" andc %1,%0,%6\n" \ > +" or %1,%1,%5\n" \ > + PPC405_ERR77(0,%3) \ > +" stwcx. 
%1,0,%3\n" \ > +" bne- 1b\n" \ > + br2 \ > + "\n" \ > +"2:" \ > + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \ And "tmp" here could also be saved by: "1: lwarx %0,0,%2\n" \ " xor %3,%0,%3\n" \ " and. %3,%3,%5\n" \ " bne- 2f\n" \ " andc %0,%0,%5\n" \ " or %0,%0,%4\n" \ PPC405_ERR77(0,%2) \ " stwcx. %0,0,%2\n" \ " bne- 1b\n" \ br2 \ "\n" \ "2:" \ : "=&r" (prev), "+m" (*(u32*)p) \ : "r" (p), "r" (old), "r" (new), "r" (prev_mask) \ : "cc", cl); \ right? IIUC, saving the local variable "tmp" will result in saving a general register for the compilers to use for other variables. So thoughts? Regards, Boqun > + : "r" (p), "r" (old), "r" (new), "r" (prev_mask) \ > + : "cc", cl); \ > + \ > + return prev >> bitoff; \ > +} > + > /* > * Atomic exchange > * > @@ -14,6 +79,11 @@ > * the previous value stored there. > */ > > +XCHG_GEN(u8, _local, "memory"); > +XCHG_GEN(u8, _relaxed, "cc"); > +XCHG_GEN(u16, _local, "memory"); > +XCHG_GEN(u16, _relaxed, "cc"); > + > static __always_inline unsigned long > __xchg_u32_local(volatile void *p, unsigned long val) > { > @@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val) > #endif > > static __always_inline unsigned long > -__xchg_local(volatile void *ptr, unsigned long x, unsigned int size) > +__xchg_local(void *ptr, unsigned long x, unsigned int size) > { > switch (size) { > + case 1: > + return __xchg_u8_local(ptr, x); > + case 2: > + return __xchg_u16_local(ptr, x); > case 4: > return __xchg_u32_local(ptr, x); > #ifdef CONFIG_PPC64 > @@ -103,6 +177,10 @@ static __always_inline unsigned long > __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) > { > switch (size) { > + case 1: > + return __xchg_u8_relaxed(ptr, x); > + case 2: > + return __xchg_u16_relaxed(ptr, x); > case 4: > return __xchg_u32_relaxed(ptr, x); > #ifdef CONFIG_PPC64 > @@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) > * and return the old value of *p. 
> */ > > +CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); > +CMPXCHG_GEN(u8, _local, , , "memory"); > +CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); > +CMPXCHG_GEN(u8, _relaxed, , , "cc"); > +CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); > +CMPXCHG_GEN(u16, _local, , , "memory"); > +CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); > +CMPXCHG_GEN(u16, _relaxed, , , "cc"); > + > static __always_inline unsigned long > __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) > { > @@ -312,10 +399,14 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new) > #endif > > static __always_inline unsigned long > -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, > +__cmpxchg(void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8(ptr, old, new); > + case 2: > + return __cmpxchg_u16(ptr, old, new); > case 4: > return __cmpxchg_u32(ptr, old, new); > #ifdef CONFIG_PPC64 > @@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, > } > > static __always_inline unsigned long > -__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new, > +__cmpxchg_local(void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8_local(ptr, old, new); > + case 2: > + return __cmpxchg_u16_local(ptr, old, new); > case 4: > return __cmpxchg_u32_local(ptr, old, new); > #ifdef CONFIG_PPC64 > @@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8_relaxed(ptr, old, new); > + case 2: > + return __cmpxchg_u16_relaxed(ptr, old, new); > case 4: > return __cmpxchg_u32_relaxed(ptr, old, new); > #ifdef CONFIG_PPC64 > @@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, > unsigned int size) > { > switch (size) { > + case 1: > + return __cmpxchg_u8_acquire(ptr, old, new); > + case 2: > + return __cmpxchg_u16_acquire(ptr, old, new); > case 4: > return __cmpxchg_u32_acquire(ptr, old, new); > #ifdef CONFIG_PPC64 > -- > 2.4.3 > [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 473 bytes --] ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16 2016-04-27 13:58 ` Boqun Feng @ 2016-04-27 14:16 ` Boqun Feng 2016-04-27 14:50 ` Boqun Feng 1 sibling, 0 replies; 22+ messages in thread From: Boqun Feng @ 2016-04-27 14:16 UTC (permalink / raw) To: Pan Xinhui Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx [-- Attachment #1: Type: text/plain, Size: 4517 bytes --] On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote: > On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote: > > From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> > > > > Implement xchg{u8,u16}{local,relaxed}, and > > cmpxchg{u8,u16}{,local,acquire,relaxed}. > > > > It works on all ppc. > > > > remove volatile of first parameter in __cmpxchg_local and __cmpxchg > > > > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org> > > Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com> > > --- > > change from v3: > > rewrite in asm for the LL/SC. > > remove volatile in __cmpxchg_local and __cmpxchg. > > change from v2: > > in the do{}while(), we save one load and use corresponding cmpxchg suffix. > > Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN > > change from V1: > > rework totally. > > --- > > arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++- > > 1 file changed, 106 insertions(+), 3 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h > > index 44efe73..8a3735f 100644 > > --- a/arch/powerpc/include/asm/cmpxchg.h > > +++ b/arch/powerpc/include/asm/cmpxchg.h > > @@ -7,6 +7,71 @@ > > #include <asm/asm-compat.h> > > #include <linux/bug.h> > > > > +#ifdef __BIG_ENDIAN > > +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) > > +#else > > +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE) > > +#endif > > + > > +#define XCHG_GEN(type, sfx, cl) \ > > +static inline u32 __xchg_##type##sfx(void *p, u32 val) \ > > +{ \ > > + unsigned int prev, prev_mask, tmp, bitoff, off; \ > > + \ > > + off = (unsigned long)p % sizeof(u32); \ > > + bitoff = BITOFF_CAL(sizeof(type), off); \ > > + p -= off; \ > > + val <<= bitoff; \ > > + prev_mask = (u32)(type)-1 << bitoff; \ > > + \ > > + __asm__ __volatile__( \ > > +"1: lwarx %0,0,%3\n" \ > > +" andc %1,%0,%5\n" \ > > +" or %1,%1,%4\n" \ > > + PPC405_ERR77(0,%3) \ > > +" stwcx. %1,0,%3\n" \ > > +" bne- 1b\n" \ > > + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \ > > I think we can save the "tmp" here by: > > __asm__ volatile__( > "1: lwarx %0,0,%2\n" > " andc %0,%0,%4\n" > " or %0,%0,%3\n" > PPC405_ERR77(0,%2) > " stwcx. %0,0,%2\n" > " bne- 1b\n" > : "=&r" (prev), "+m" (*(u32*)p) > : "r" (p), "r" (val), "r" (prev_mask) > : "cc", cl); > > right? > > > + : "r" (p), "r" (val), "r" (prev_mask) \ > > + : "cc", cl); \ > > + \ > > + return prev >> bitoff; \ > > +} > > + > > +#define CMPXCHG_GEN(type, sfx, br, br2, cl) \ > > +static inline \ > > +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new) \ > > +{ \ > > + unsigned int prev, prev_mask, tmp, bitoff, off; \ > > + \ > > + off = (unsigned long)p % sizeof(u32); \ > > + bitoff = BITOFF_CAL(sizeof(type), off); \ > > + p -= off; \ > > + old <<= bitoff; \ > > + new <<= bitoff; \ > > + prev_mask = (u32)(type)-1 << bitoff; \ > > + \ > > + __asm__ __volatile__( \ > > + br \ > > +"1: lwarx %0,0,%3\n" \ > > +" and %1,%0,%6\n" \ > > +" cmpw 0,%1,%4\n" \ > > +" bne- 2f\n" \ > > +" andc %1,%0,%6\n" \ > > +" or %1,%1,%5\n" \ > > + PPC405_ERR77(0,%3) \ > > +" stwcx. 
> > +"	bne-	1b\n"					\
> > +	br2						\
> > +	"\n"						\
> > +"2:"							\
> > +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)	\
>
> And "tmp" here could also be saved by:
>
> "1:	lwarx	%0,0,%2\n"				\
> "	xor	%3,%0,%3\n"				\
> "	and.	%3,%3,%5\n"				\
> "	bne-	2f\n"					\
> "	andc	%0,%0,%5\n"				\
> "	or	%0,%0,%4\n"				\
> 	PPC405_ERR77(0,%2)				\
> "	stwcx.	%0,0,%2\n"				\
> "	bne-	1b\n"					\
> 	br2						\
> 	"\n"						\
> "2:"							\
> 	: "=&r" (prev), "+m" (*(u32*)p)			\
> 	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
> 	: "cc", cl);					\
>

Oops, this should be:

"1:	lwarx	%0,0,%3\n"				\
"	xor	%2,%0,%2\n"				\
"	and.	%2,%2,%5\n"				\
"	bne-	2f\n"					\
"	andc	%0,%0,%5\n"				\
"	or	%0,%0,%4\n"				\
	PPC405_ERR77(0,%3)				\
"	stwcx.	%0,0,%3\n"				\
"	bne-	1b\n"					\
	br2						\
	"\n"						\
"2:"							\
	: "=&r" (prev), "+m" (*(u32*)p), "+&r" (old)	\
	: "r" (p), "r" (new), "r" (prev_mask)		\
	: "cc", cl);					\

Regards,
Boqun

^ permalink raw reply	[flat|nested] 22+ messages in thread
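In C terms, the xor/and. comparison in the corrected sequence above amounts to
the following (a sketch of the trick only, not kernel code; the function name
is invented):

#include <stdint.h>
#include <stdbool.h>

/* The xor/and. pair replaces "and + cmpw": xor the loaded word with the
 * already-shifted expected value, then test whether any bit inside the
 * lane survives; zero under prev_mask means the lane matched.
 */
static bool lane_matches(uint32_t prev, uint32_t old, uint32_t prev_mask)
{
	return ((prev ^ old) & prev_mask) == 0;	/* xor ; and. ; bne- */
}

Note that the asm performs the xor in place, writing prev^old over the
register holding old ("+&r" (old)) -- that in-place update is what the next
two messages trip over.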
* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27 13:58 ` Boqun Feng
  2016-04-27 14:16   ` Boqun Feng
@ 2016-04-27 14:50   ` Boqun Feng
  2016-04-27 14:59     ` Boqun Feng
  1 sibling, 1 reply; 22+ messages in thread
From: Boqun Feng @ 2016-04-27 14:50 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> > [... full V4 patch quote trimmed; identical to the previous message ...]
>
> I think we can save the "tmp" here by:
> [...]
>
> And "tmp" here could also be saved by:
> [...]
>
> right?
>
Sorry, my bad, we can't implement cmpxchg like this.. please ignore
this, I should really go to bed soon...

But still, we can save the "tmp" for xchg() I think.

Regards,
Boqun

> IIUC, saving the local variable "tmp" will result in saving a general
> register for the compilers to use for other variables.
>
> So thoughts?
>
> Regards,
> Boqun

^ permalink raw reply	[flat|nested] 22+ messages in thread
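Why the single-register cmpxchg cannot work is not spelled out above;
presumably it is the retry path: the in-place xor destroys "old", so if
stwcx. loses the reservation and the loop branches back to 1:, the second
comparison is made against a mangled value. A toy reproduction of that state
corruption (my reconstruction in plain C, names invented):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t old = 0x34, prev_mask = 0xff;	/* expect 0x34 in the low lane */
	uint32_t prev = 0x1234;			/* 1: lwarx                    */

	old ^= prev;				/* xor %2,%0,%2 clobbers old   */
	printf("pass 1: lane %s\n",
	       (old & prev_mask) ? "mismatch" : "match");  /* match       */

	/* pretend stwcx. failed, bne- 1b: reload the unchanged word */
	prev = 0x1234;				/* 1: lwarx again              */
	old ^= prev;				/* register now holds raw 0x34,
						   not the xor-difference      */
	printf("pass 2: lane %s\n",
	       (old & prev_mask) ? "mismatch" : "match");  /* spurious mismatch */
	return 0;
}

Worse, the resulting branch to 2: returns the lane of prev, which equals the
expected old value, so (if I read the asm right) the caller would conclude
the exchange succeeded even though nothing was stored.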
* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27 14:50 ` Boqun Feng
@ 2016-04-27 14:59   ` Boqun Feng
  2016-04-28 10:21     ` Pan Xinhui
  0 siblings, 1 reply; 22+ messages in thread
From: Boqun Feng @ 2016-04-27 14:59 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
>
> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
> this, I should really go to bed soon...
>
> But still, we can save the "tmp" for xchg() I think.
>
No.. we can't. Sorry for all the noise.

This patch looks good to me.

FWIW, you can add

Acked-by: Boqun Feng <boqun.feng@gmail.com>

Regards,
Boqun

^ permalink raw reply	[flat|nested] 22+ messages in thread
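The xchg() case fails for a more basic reason (again a reconstruction, not
stated in the thread): with "tmp" folded away, the andc/or sequence
overwrites the register that held the loaded word, so the function no longer
has the previous value to return. A C model of the broken idea (names
invented, illustration only):

#include <stdint.h>

static uint32_t model_xchg_lane(uint32_t *word, uint32_t val,
				uint32_t prev_mask, unsigned int bitoff)
{
	uint32_t prev = *word;			/* 1: lwarx %0          */
	prev = (prev & ~prev_mask) | val;	/* andc/or clobbers %0  */
	*word = prev;				/* stwcx. %0            */
	return prev >> bitoff;			/* returns the NEW lane,
						   not the previous one */
}

Hence the patch's separate "tmp" operand: the new word is built in %1 while
%0 keeps the lwarx result for the return value.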
* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27 14:59 ` Boqun Feng
@ 2016-04-28 10:21   ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-28 10:21 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

On 2016/04/27 22:59, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
>>
>> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
>> this, I should really go to bed soon...
>>
>> But still, we can save the "tmp" for xchg() I think.
>>
> No.. we can't. Sorry for all the noise.
>
> This patch looks good to me.
>
> FWIW, you can add
>
> Acked-by: Boqun Feng <boqun.feng@gmail.com>
>
thanks!

> Regards,
> Boqun
>

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27  9:16 ` [PATCH V4] " Pan Xinhui
  2016-04-27 13:58   ` Boqun Feng
@ 2016-04-28  7:59   ` Peter Zijlstra
  2016-04-28 10:21     ` Pan Xinhui
  2016-11-25  0:04     ` [V4] " Michael Ellerman
  2 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2016-04-28 7:59 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>
> It works on all ppc.
>
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Generally has the right shape; and I trust others to double check the
ppc-asm minutia.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-28  7:59 ` Peter Zijlstra
@ 2016-04-28 10:21   ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-28 10:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On 2016/04/28 15:59, Peter Zijlstra wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
>> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>
> Generally has the right shape; and I trust others to double check the
> ppc-asm minutia.
>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

thanks!

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27  9:16 ` [PATCH V4] " Pan Xinhui
  2016-04-27 13:58   ` Boqun Feng
  2016-04-28  7:59   ` Peter Zijlstra
@ 2016-11-25  0:04   ` Michael Ellerman
  2 siblings, 0 replies; 22+ messages in thread
From: Michael Ellerman @ 2016-11-25 0:04 UTC (permalink / raw)
  To: xinhui, linux-kernel, linuxppc-dev
  Cc: peterz, boqun.feng, paulus, tglx, paulmck

On Wed, 2016-04-27 at 09:16:45 UTC, xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>
> It works on all ppc.
>
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> Acked-by: Boqun Feng <boqun.feng@gmail.com>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/d0563a1297e234ed37f6b51c2e9321

cheers

^ permalink raw reply	[flat|nested] 22+ messages in thread
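For context on what the applied commit enables (an invented caller, not code
from this thread): generic kernel code can now use cmpxchg() directly on 1-
and 2-byte fields on powerpc, where sizes 1 and 2 previously fell through the
size switch to the error path. A hedged kernel-style sketch ("struct door"
and its field are made up):

/* Invented example; dispatches to __cmpxchg_u8() via the size switch. */
struct door {
	u8	locked;
};

static bool try_lock_door(struct door *d)
{
	return cmpxchg(&d->locked, 0, 1) == 0;
}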
end of thread, other threads:[~2016-11-25  0:04 UTC | newest]

Thread overview: 22+ messages
2016-04-19  6:29 [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 Pan Xinhui
2016-04-19  9:18 ` Boqun Feng
2016-04-20  3:39   ` Pan Xinhui
2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui
2016-04-20 14:24   ` Peter Zijlstra
2016-04-21 15:35     ` Pan Xinhui
2016-04-21 15:52       ` Boqun Feng
2016-04-22  1:59         ` Pan Xinhui
2016-04-22  3:16           ` Boqun Feng
2016-04-21 16:13       ` Peter Zijlstra
2016-04-25 10:10         ` Pan Xinhui
2016-04-25 15:37           ` Peter Zijlstra
2016-04-26 11:35             ` Pan Xinhui
2016-04-27  9:16 ` [PATCH V4] " Pan Xinhui
2016-04-27 13:58   ` Boqun Feng
2016-04-27 14:16     ` Boqun Feng
2016-04-27 14:50     ` Boqun Feng
2016-04-27 14:59       ` Boqun Feng
2016-04-28 10:21         ` Pan Xinhui
2016-04-28  7:59   ` Peter Zijlstra
2016-04-28 10:21     ` Pan Xinhui
2016-11-25  0:04   ` [V4] " Michael Ellerman