Hi Ingo, On Sat, May 05, 2018 at 12:00:55PM +0200, Ingo Molnar wrote: > > * Ingo Molnar wrote: > > > > So there's no loss in arch flexibility. > > > > BTW., PowerPC for example is already in such a situation, it does not define > > atomic_cmpxchg_release(), only the other APIs: > > > > #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) > > #define atomic_cmpxchg_relaxed(v, o, n) \ > > cmpxchg_relaxed(&((v)->counter), (o), (n)) > > #define atomic_cmpxchg_acquire(v, o, n) \ > > cmpxchg_acquire(&((v)->counter), (o), (n)) > > > > Was it really the intention on the PowerPC side that the generic code falls back > > to cmpxchg(), i.e.: > > > > # define atomic_cmpxchg_release(...) __atomic_op_release(atomic_cmpxchg, __VA_ARGS__) > > > > Which after macro expansion becomes: > > > > smp_mb__before_atomic(); > > atomic_cmpxchg_relaxed(v, o, n); > > > > smp_mb__before_atomic() on PowerPC falls back to the generic __smp_mb(), which > > falls back to mb(), which on PowerPC is the 'sync' instruction. > > > > Isn't this a inefficiency bug? > > > > While I'm pretty clueless about PowerPC low level cmpxchg atomics, they appear to > > have the following basic structure: > > > > full cmpxchg(): > > > > PPC_ATOMIC_ENTRY_BARRIER # sync > > ldarx + stdcx > > PPC_ATOMIC_EXIT_BARRIER # sync > > > > cmpxchg_relaxed(): > > > > ldarx + stdcx > > > > cmpxchg_acquire(): > > > > ldarx + stdcx > > PPC_ACQUIRE_BARRIER # lwsync > > > > The logical extension for cmpxchg_release() would be: > > > > cmpxchg_release(): > > > > PPC_RELEASE_BARRIER # lwsync > > ldarx + stdcx > > > > But instead we silently get the generic fallback, which does: > > > > smp_mb__before_atomic(); > > atomic_cmpxchg_relaxed(v, o, n); > > > > Which maps to: > > > > sync > > ldarx + stdcx > > > > Note that it uses a full barrier instead of lwsync (does that stand for > > 'lightweight sync'?). 
> > > > Even if it turns out we need the full barrier, with the overly finegrained > > structure of the atomics this detail is totally undocumented and non-obvious. > > The patch below fills in those bits and implements the optimized cmpxchg_release() > family of APIs. The end effect should be that cmpxchg_release() will now use > 'lwsync' instead of 'sync' on PowerPC, for the following APIs: > > cmpxchg_release() > cmpxchg64_release() > atomic_cmpxchg_release() > atomic64_cmpxchg_release() > > I based this choice of the release barrier on an existing bitops low level PowerPC > method: > > DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER) > > This clearly suggests that PPC_RELEASE_BARRIER is in active use and 'lwsync' is > the 'release barrier' instruction, if I interpreted that right. > Thanks for looking into this, but as I said in another email: https://marc.info/?l=linux-kernel&m=152551511324210&w=2 , we actually generate lightweight barriers for the cmpxchg_release() family. The reason for the asymmetry between cmpxchg_acquire() and cmpxchg_release() is that we want to save a barrier for cmpxchg_acquire() if the cmp fails, but doing the same for cmpxchg_release() will introduce a scenario that puts a barrier in a ll/sc loop, which may be a bad idea. > But I know very little about PowerPC so this might be spectacularly wrong. It's > totally untested as well. I also pretty sick today so my mental capabilities are > significantly reduced ... > Sorry to hear that, hope you get well soon! Please let me know if you think I should provide more documentation to make this more informative. Regards, Boqun > So not signed off and such. 
> > Thanks, > > Ingo > > --- > arch/powerpc/include/asm/atomic.h | 4 ++ > arch/powerpc/include/asm/cmpxchg.h | 81 ++++++++++++++++++++++++++++++++++++++ > 2 files changed, 85 insertions(+) > > diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h > index 682b3e6a1e21..f7a6f29acb12 100644 > --- a/arch/powerpc/include/asm/atomic.h > +++ b/arch/powerpc/include/asm/atomic.h > @@ -213,6 +213,8 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v) > cmpxchg_relaxed(&((v)->counter), (o), (n)) > #define atomic_cmpxchg_acquire(v, o, n) \ > cmpxchg_acquire(&((v)->counter), (o), (n)) > +#define atomic_cmpxchg_release(v, o, n) \ > + cmpxchg_release(&((v)->counter), (o), (n)) > > #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) > #define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new)) > @@ -519,6 +521,8 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v) > cmpxchg_relaxed(&((v)->counter), (o), (n)) > #define atomic64_cmpxchg_acquire(v, o, n) \ > cmpxchg_acquire(&((v)->counter), (o), (n)) > +#define atomic64_cmpxchg_release(v, o, n) \ > + cmpxchg_release(&((v)->counter), (o), (n)) > > #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) > #define atomic64_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new)) > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h > index 9b001f1f6b32..6e46310b1833 100644 > --- a/arch/powerpc/include/asm/cmpxchg.h > +++ b/arch/powerpc/include/asm/cmpxchg.h > @@ -213,10 +213,12 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) > CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); > CMPXCHG_GEN(u8, _local, , , "memory"); > CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); > +CMPXCHG_GEN(u8, _release, PPC_RELEASE_BARRIER, , "memory"); > CMPXCHG_GEN(u8, _relaxed, , , "cc"); > CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); > CMPXCHG_GEN(u16, _local, 
, , "memory"); > CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); > +CMPXCHG_GEN(u16, _release, PPC_RELEASE_BARRIER, , "memory"); > CMPXCHG_GEN(u16, _relaxed, , , "cc"); > > static __always_inline unsigned long > @@ -314,6 +316,29 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new) > return prev; > } > > +static __always_inline unsigned long > +__cmpxchg_u32_release(u32 *p, unsigned long old, unsigned long new) > +{ > + unsigned long prev; > + > + __asm__ __volatile__ ( > + PPC_RELEASE_BARRIER > +"1: lwarx %0,0,%2 # __cmpxchg_u32_release\n" > +" cmpw 0,%0,%3\n" > +" bne- 2f\n" > + PPC405_ERR77(0, %2) > +" stwcx. %4,0,%2\n" > +" bne- 1b\n" > + "\n" > +"2:" > + : "=&r" (prev), "+m" (*p) > + : "r" (p), "r" (old), "r" (new) > + : "cc", "memory"); > + > + return prev; > +} > + > + > #ifdef CONFIG_PPC64 > static __always_inline unsigned long > __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new) > @@ -397,6 +422,27 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new) > > return prev; > } > + > +static __always_inline unsigned long > +__cmpxchg_u64_release(u64 *p, unsigned long old, unsigned long new) > +{ > + unsigned long prev; > + > + __asm__ __volatile__ ( > + PPC_RELEASE_BARRIER > +"1: ldarx %0,0,%2 # __cmpxchg_u64_release\n" > +" cmpd 0,%0,%3\n" > +" bne- 2f\n" > +" stdcx. 
%4,0,%2\n" > +" bne- 1b\n" > + "\n" > +"2:" > + : "=&r" (prev), "+m" (*p) > + : "r" (p), "r" (old), "r" (new) > + : "cc", "memory"); > + > + return prev; > +} > #endif > > static __always_inline unsigned long > @@ -478,6 +524,27 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, > BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg_acquire"); > return old; > } > + > +static __always_inline unsigned long > +__cmpxchg_release(void *ptr, unsigned long old, unsigned long new, > + unsigned int size) > +{ > + switch (size) { > + case 1: > + return __cmpxchg_u8_release(ptr, old, new); > + case 2: > + return __cmpxchg_u16_release(ptr, old, new); > + case 4: > + return __cmpxchg_u32_release(ptr, old, new); > +#ifdef CONFIG_PPC64 > + case 8: > + return __cmpxchg_u64_release(ptr, old, new); > +#endif > + } > + BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg_release"); > + return old; > +} > + > #define cmpxchg(ptr, o, n) \ > ({ \ > __typeof__(*(ptr)) _o_ = (o); \ > @@ -512,6 +579,15 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, > (unsigned long)_o_, (unsigned long)_n_, \ > sizeof(*(ptr))); \ > }) > + > +#define cmpxchg_release(ptr, o, n) \ > +({ \ > + __typeof__(*(ptr)) _o_ = (o); \ > + __typeof__(*(ptr)) _n_ = (n); \ > + (__typeof__(*(ptr))) __cmpxchg_release((ptr), \ > + (unsigned long)_o_, (unsigned long)_n_, \ > + sizeof(*(ptr))); \ > +}) > #ifdef CONFIG_PPC64 > #define cmpxchg64(ptr, o, n) \ > ({ \ > @@ -533,6 +609,11 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new, > BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ > cmpxchg_acquire((ptr), (o), (n)); \ > }) > +#define cmpxchg64_release(ptr, o, n) \ > +({ \ > + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ > + cmpxchg_release((ptr), (o), (n)); \ > +}) > #else > #include > #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))