linux-kernel.vger.kernel.org archive mirror
* [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16
@ 2016-04-19  6:29 Pan Xinhui
  2016-04-19  9:18 ` Boqun Feng
  2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui
  0 siblings, 2 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-19  6:29 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: benh, paulus, mpe, boqun.feng, peterz, paulmck, tglx

From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
change from V1:
	rework totally.
---
 arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..79a1f45 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,37 @@
 #include <asm/asm-compat.h>
 #include <linux/bug.h>
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
+#endif
+
+static __always_inline unsigned long
+__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
+			unsigned long new);
+
+#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v)			\
+static __always_inline u32						\
+__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
+{									\
+	int size = sizeof (type);					\
+	int off = (unsigned long)ptr % sizeof(u32);			\
+	volatile u32 *p = ptr - off;					\
+	int bitoff = BITOFF_CAL(size, off);				\
+	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
+	u32 oldv, newv;							\
+	u32 ret;							\
+	do {								\
+		oldv = READ_ONCE(*p);					\
+		ret = (oldv & bitmask) >> bitoff;			\
+		if (skip && ret != old)					\
+			break;						\
+		newv = (oldv & ~bitmask) | (new << bitoff);		\
+	} while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\
+	return ret;							\
+}
+
 /*
  * Atomic exchange
  *
@@ -14,6 +45,19 @@
  * the previous value stored there.
  */
 
+#define XCHG_GEN(type, sfx, v)						\
+		__XCHG_GEN(_, type, sfx, _local, 0, v)			\
+static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)	\
+{									\
+	return ___xchg_##type##sfx(p, 0, n);				\
+}
+
+XCHG_GEN(u8, _local, volatile);
+XCHG_GEN(u8, _relaxed, );
+XCHG_GEN(u16, _local, volatile);
+XCHG_GEN(u16, _relaxed, );
+#undef XCHG_GEN
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -88,6 +132,10 @@ static __always_inline unsigned long
 __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_local(ptr, x);
+	case 2:
+		return __xchg_u16_local(ptr, x);
 	case 4:
 		return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +151,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_relaxed(ptr, x);
+	case 2:
+		return __xchg_u16_relaxed(ptr, x);
 	case 4:
 		return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
 	return prev;
 }
 
+
+#define CMPXCHG_GEN(type, sfx, v)				\
+	__XCHG_GEN(cmp, type, sfx, sfx, 1, v)
+
+CMPXCHG_GEN(u8, , volatile);
+CMPXCHG_GEN(u8, _local, volatile);
+CMPXCHG_GEN(u8, _relaxed, );
+CMPXCHG_GEN(u8, _acquire, );
+CMPXCHG_GEN(u16, , volatile);
+CMPXCHG_GEN(u16, _local, volatile);
+CMPXCHG_GEN(u16, _relaxed, );
+CMPXCHG_GEN(u16, _acquire, );
+#undef CMPXCHG_GEN
+#undef __XCHG_GEN
+
 #ifdef CONFIG_PPC64
 static __always_inline unsigned long
 __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
@@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_local(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_local(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_local(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_relaxed(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_relaxed(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_relaxed(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_acquire(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_acquire(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_acquire(ptr, old, new);
 #ifdef CONFIG_PPC64
-- 
2.4.3
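
For reference, the helpers generated by __XCHG_GEN above boil down to the shape below. This is a hand-expanded sketch, not part of the patch: the function name is made up, the little-endian branch of BITOFF_CAL() is assumed (on big endian a byte at offset 0 lands at bit offset (4 - 1 - 0) * 8 = 24), and it builds on the existing 32-bit primitive __cmpxchg_u32_local(), emulating a byte-wide cmpxchg by rewriting only the relevant byte of the containing aligned word.

static __always_inline u32
sketch_cmpxchg_u8_local(u8 *ptr, u32 old, u32 new)
{
	unsigned long off = (unsigned long)ptr % sizeof(u32);	/* byte offset inside the u32 */
	volatile u32 *p = (volatile u32 *)((char *)ptr - off);	/* aligned word containing the byte */
	int bitoff = off * BITS_PER_BYTE;			/* little-endian BITOFF_CAL() */
	u32 bitmask = 0xffU << bitoff;
	u32 oldv, newv, ret;

	do {
		oldv = READ_ONCE(*p);
		ret = (oldv & bitmask) >> bitoff;
		if (ret != old)		/* the "skip" path: compare failed, return what we saw */
			break;
		newv = (oldv & ~bitmask) | (new << bitoff);
	} while (__cmpxchg_u32_local(p, oldv, newv) != oldv);

	return ret;
}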


* Re: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-19  6:29 [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 Pan Xinhui
@ 2016-04-19  9:18 ` Boqun Feng
  2016-04-20  3:39   ` Pan Xinhui
  2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui
  1 sibling, 1 reply; 22+ messages in thread
From: Boqun Feng @ 2016-04-19  9:18 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

Hi Xinhui,

On Tue, Apr 19, 2016 at 02:29:34PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> It works on all ppc.
> 

Nice work!

AFAICT, your work doesn't depend on anything that's ppc-specific, right?
So maybe we can use it as a general approach for a fallback
implementation on the archs without u8/u16 atomics. ;-)

> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> ---
> change from V1:
> 	rework totally.
> ---
>  arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 83 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> index 44efe73..79a1f45 100644
> --- a/arch/powerpc/include/asm/cmpxchg.h
> +++ b/arch/powerpc/include/asm/cmpxchg.h
> @@ -7,6 +7,37 @@
>  #include <asm/asm-compat.h>
>  #include <linux/bug.h>
>  
> +#ifdef __BIG_ENDIAN
> +#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
> +#else
> +#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
> +#endif
> +
> +static __always_inline unsigned long
> +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
> +			unsigned long new);
> +
> +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v)			\
> +static __always_inline u32						\
> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
> +{									\
> +	int size = sizeof (type);					\
> +	int off = (unsigned long)ptr % sizeof(u32);			\
> +	volatile u32 *p = ptr - off;					\
> +	int bitoff = BITOFF_CAL(size, off);				\
> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> +	u32 oldv, newv;							\
> +	u32 ret;							\
> +	do {								\
> +		oldv = READ_ONCE(*p);					\
> +		ret = (oldv & bitmask) >> bitoff;			\
> +		if (skip && ret != old)					\
> +			break;						\
> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
> +	} while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\

Forgive me if this is too paranoid, but I think we can save the
READ_ONCE() in the loop if we change the code into the following,
because cmpxchg will return the "new" value, if the cmp part fails.

	newv = READ_ONCE(*p);

	do {
		oldv = newv;
		ret = (oldv & bitmask) >> bitoff;
		if (skip && ret != old)
			break;
		newv = (oldv & ~bitmask) | (new << bitoff);
		newv = __cmpxchg_u32##u32sfx((void *)p, oldv, newv);
	} while(newv != oldv);

> +	return ret;							\
> +}
> +
>  /*
>   * Atomic exchange
>   *
> @@ -14,6 +45,19 @@
>   * the previous value stored there.
>   */
>  
> +#define XCHG_GEN(type, sfx, v)						\
> +		__XCHG_GEN(_, type, sfx, _local, 0, v)			\
                                         ^^^^^^^

This should be sfx, right? Otherwise, all the newly added xchg will
call __cmpxchg_u32_local, this will result in wrong ordering guarantees.

> +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)	\
> +{									\
> +	return ___xchg_##type##sfx(p, 0, n);				\
> +}
> +
> +XCHG_GEN(u8, _local, volatile);

I don't think we need the "volatile" modifier here, because READ_ONCE()
and __cmpxchg_u32_* all have "volatile" semantics IIUC, so maybe we can
save a paramter for the __XCHG_GEN macro.

Regards,
Boqun

> +XCHG_GEN(u8, _relaxed, );
> +XCHG_GEN(u16, _local, volatile);
> +XCHG_GEN(u16, _relaxed, );
> +#undef XCHG_GEN
> +
>  static __always_inline unsigned long
>  __xchg_u32_local(volatile void *p, unsigned long val)
>  {
> @@ -88,6 +132,10 @@ static __always_inline unsigned long
>  __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __xchg_u8_local(ptr, x);
> +	case 2:
> +		return __xchg_u16_local(ptr, x);
>  	case 4:
>  		return __xchg_u32_local(ptr, x);
>  #ifdef CONFIG_PPC64
> @@ -103,6 +151,10 @@ static __always_inline unsigned long
>  __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __xchg_u8_relaxed(ptr, x);
> +	case 2:
> +		return __xchg_u16_relaxed(ptr, x);
>  	case 4:
>  		return __xchg_u32_relaxed(ptr, x);
>  #ifdef CONFIG_PPC64
> @@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
>  	return prev;
>  }
>  
> +
> +#define CMPXCHG_GEN(type, sfx, v)				\
> +	__XCHG_GEN(cmp, type, sfx, sfx, 1, v)
> +
> +CMPXCHG_GEN(u8, , volatile);
> +CMPXCHG_GEN(u8, _local, volatile);
> +CMPXCHG_GEN(u8, _relaxed, );
> +CMPXCHG_GEN(u8, _acquire, );
> +CMPXCHG_GEN(u16, , volatile);
> +CMPXCHG_GEN(u16, _local, volatile);
> +CMPXCHG_GEN(u16, _relaxed, );
> +CMPXCHG_GEN(u16, _acquire, );
> +#undef CMPXCHG_GEN
> +#undef __XCHG_GEN
> +
>  #ifdef CONFIG_PPC64
>  static __always_inline unsigned long
>  __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
> @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
>  	  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
>  	  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_local(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_local(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_local(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
>  		  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_relaxed(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_relaxed(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_relaxed(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
>  		  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_acquire(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_acquire(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_acquire(ptr, old, new);
>  #ifdef CONFIG_PPC64
> -- 
> 2.4.3
> 


* Re: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-19  9:18 ` Boqun Feng
@ 2016-04-20  3:39   ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-20  3:39 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx


Hello, boqun

On 2016-04-19 17:18, Boqun Feng wrote:
> Hi Xinhui,
> 
> On Tue, Apr 19, 2016 at 02:29:34PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
> 
> Nice work!
> 
thank you.

> AFAICT, your work doesn't depend on anything that's ppc-specific, right?
> So maybe we can use it as a general approach for a fallback
> implementation on the archs without u8/u16 atomics. ;-)
> 
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>> ---
>> change from V1:
>> 	rework totally.
>> ---
>>  arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 83 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
>> index 44efe73..79a1f45 100644
>> --- a/arch/powerpc/include/asm/cmpxchg.h
>> +++ b/arch/powerpc/include/asm/cmpxchg.h
>> @@ -7,6 +7,37 @@
>>  #include <asm/asm-compat.h>
>>  #include <linux/bug.h>
>>  
>> +#ifdef __BIG_ENDIAN
>> +#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
>> +#else
>> +#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
>> +#endif
>> +
>> +static __always_inline unsigned long
>> +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
>> +			unsigned long new);
>> +
>> +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v)			\
>> +static __always_inline u32						\
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
>> +{									\
>> +	int size = sizeof (type);					\
>> +	int off = (unsigned long)ptr % sizeof(u32);			\
>> +	volatile u32 *p = ptr - off;					\
>> +	int bitoff = BITOFF_CAL(size, off);				\
>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>> +	u32 oldv, newv;							\
>> +	u32 ret;							\
>> +	do {								\
>> +		oldv = READ_ONCE(*p);					\
>> +		ret = (oldv & bitmask) >> bitoff;			\
>> +		if (skip && ret != old)					\
>> +			break;						\
>> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
>> +	} while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\
> 
> Forgive me if this is too paranoid, but I think we can save the
> READ_ONCE() in the loop if we change the code into the following,
> because cmpxchg will return the "new" value, if the cmp part fails.
> 
> 	newv = READ_ONCE(*p);
> 
> 	do {
> 		oldv = newv;
> 		ret = (oldv & bitmask) >> bitoff;
> 		if (skip && ret != old)
> 			break;
> 		newv = (oldv & ~bitmask) | (new << bitoff);
> 		newv = __cmpxchg_u32##u32sfx((void *)p, oldv, newv);
> 	} while(newv != oldv);
> 
>> +	return ret;							\
>> +}
a little optimization. Patch V3 will include your code, thanks.

>> +
>>  /*
>>   * Atomic exchange
>>   *
>> @@ -14,6 +45,19 @@
>>   * the previous value stored there.
>>   */
>>  
>> +#define XCHG_GEN(type, sfx, v)						\
>> +		__XCHG_GEN(_, type, sfx, _local, 0, v)			\
>                                          ^^^^^^^
> 
> This should be sfx, right? Otherwise, all the newly added xchg will
> call __cmpxchg_u32_local, this will result in wrong ordering guarantees.
> 
I mean that. But I will think about the ordering issue for a while. :)

>> +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)	\
>> +{									\
>> +	return ___xchg_##type##sfx(p, 0, n);				\
>> +}
>> +
>> +XCHG_GEN(u8, _local, volatile);
> 
> I don't think we need the "volatile" modifier here, because READ_ONCE()
> and __cmpxchg_u32_* all have "volatile" semantics IIUC, so maybe we can
> save a paramter for the __XCHG_GEN macro.
> 
Such cleanup work can be done in a separate patch. Here I just make the compiler happy.

thanks
xinhui
> Regards,
> Boqun
> 
>> +XCHG_GEN(u8, _relaxed, );
>> +XCHG_GEN(u16, _local, volatile);
>> +XCHG_GEN(u16, _relaxed, );
>> +#undef XCHG_GEN
>> +
>>  static __always_inline unsigned long
>>  __xchg_u32_local(volatile void *p, unsigned long val)
>>  {
>> @@ -88,6 +132,10 @@ static __always_inline unsigned long
>>  __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __xchg_u8_local(ptr, x);
>> +	case 2:
>> +		return __xchg_u16_local(ptr, x);
>>  	case 4:
>>  		return __xchg_u32_local(ptr, x);
>>  #ifdef CONFIG_PPC64
>> @@ -103,6 +151,10 @@ static __always_inline unsigned long
>>  __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __xchg_u8_relaxed(ptr, x);
>> +	case 2:
>> +		return __xchg_u16_relaxed(ptr, x);
>>  	case 4:
>>  		return __xchg_u32_relaxed(ptr, x);
>>  #ifdef CONFIG_PPC64
>> @@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
>>  	return prev;
>>  }
>>  
>> +
>> +#define CMPXCHG_GEN(type, sfx, v)				\
>> +	__XCHG_GEN(cmp, type, sfx, sfx, 1, v)
>> +
>> +CMPXCHG_GEN(u8, , volatile);
>> +CMPXCHG_GEN(u8, _local, volatile);
>> +CMPXCHG_GEN(u8, _relaxed, );
>> +CMPXCHG_GEN(u8, _acquire, );
>> +CMPXCHG_GEN(u16, , volatile);
>> +CMPXCHG_GEN(u16, _local, volatile);
>> +CMPXCHG_GEN(u16, _relaxed, );
>> +CMPXCHG_GEN(u16, _acquire, );
>> +#undef CMPXCHG_GEN
>> +#undef __XCHG_GEN
>> +
>>  #ifdef CONFIG_PPC64
>>  static __always_inline unsigned long
>>  __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
>> @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
>>  	  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
>>  	  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8_local(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16_local(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32_local(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
>>  		  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8_relaxed(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16_relaxed(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32_relaxed(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
>>  		  unsigned int size)
>>  {
>>  	switch (size) {
>> +	case 1:
>> +		return __cmpxchg_u8_acquire(ptr, old, new);
>> +	case 2:
>> +		return __cmpxchg_u16_acquire(ptr, old, new);
>>  	case 4:
>>  		return __cmpxchg_u32_acquire(ptr, old, new);
>>  #ifdef CONFIG_PPC64
>> -- 
>> 2.4.3
>>
> 


* [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-19  6:29 [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 Pan Xinhui
  2016-04-19  9:18 ` Boqun Feng
@ 2016-04-20 13:24 ` Pan Xinhui
  2016-04-20 14:24   ` Peter Zijlstra
  2016-04-27  9:16   ` [PATCH V4] " Pan Xinhui
  1 sibling, 2 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-20 13:24 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: benh, paulus, mpe, boqun.feng, peterz, paulmck, tglx

From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.
The basic idea is from commit 3226aad81aa6 ("sh: support 1 and 2 byte xchg")

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
change from v2:
	in the do{}while(), we save one load and use corresponding cmpxchg suffix.
	Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN 
change from V1:
	rework totally.
---
 arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..2aec04e 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,38 @@
 #include <asm/asm-compat.h>
 #include <linux/bug.h>
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
+#endif
+
+#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
+static __always_inline unsigned long					\
+__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
+			 unsigned long new);				\
+static __always_inline u32						\
+__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
+{									\
+	int size = sizeof (type);					\
+	int off = (unsigned long)ptr % sizeof(u32);			\
+	volatile u32 *p = ptr - off;					\
+	int bitoff = BITOFF_CAL(size, off);				\
+	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
+	u32 oldv, newv, tmp;						\
+	u32 ret;							\
+	oldv = READ_ONCE(*p);						\
+	do {								\
+		ret = (oldv & bitmask) >> bitoff;			\
+		if (skip && ret != old)					\
+			break;						\
+		newv = (oldv & ~bitmask) | (new << bitoff);		\
+		tmp = oldv;						\
+		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
+	} while (tmp != oldv);						\
+	return ret;							\
+}
+
 /*
  * Atomic exchange
  *
@@ -14,6 +46,19 @@
  * the previous value stored there.
  */
 
+#define XCHG_GEN(type, sfx, v)						\
+		__XCHG_GEN(_, type, sfx, 0, v)				\
+static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)		\
+{									\
+	return ___xchg_##type##sfx(p, 0, n);				\
+}
+
+XCHG_GEN(u8, _local, volatile);
+XCHG_GEN(u8, _relaxed, );
+XCHG_GEN(u16, _local, volatile);
+XCHG_GEN(u16, _relaxed, );
+#undef XCHG_GEN
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -88,6 +133,10 @@ static __always_inline unsigned long
 __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_local(ptr, x);
+	case 2:
+		return __xchg_u16_local(ptr, x);
 	case 4:
 		return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +152,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_relaxed(ptr, x);
+	case 2:
+		return __xchg_u16_relaxed(ptr, x);
 	case 4:
 		return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -131,6 +184,20 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  * and return the old value of *p.
  */
 
+#define CMPXCHG_GEN(type, sfx, v)				\
+	__XCHG_GEN(cmp, type, sfx, 1, v)
+
+CMPXCHG_GEN(u8, , volatile);
+CMPXCHG_GEN(u8, _local,	volatile);
+CMPXCHG_GEN(u8, _relaxed, );
+CMPXCHG_GEN(u8, _acquire, );
+CMPXCHG_GEN(u16, , volatile);
+CMPXCHG_GEN(u16, _local, volatile);
+CMPXCHG_GEN(u16, _relaxed, );
+CMPXCHG_GEN(u16, _acquire, );
+#undef CMPXCHG_GEN
+#undef __XCHG_GEN
+
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 {
@@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_local(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_local(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_local(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_relaxed(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_relaxed(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_relaxed(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_acquire(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_acquire(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_acquire(ptr, old, new);
 #ifdef CONFIG_PPC64
-- 
2.4.3


* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui
@ 2016-04-20 14:24   ` Peter Zijlstra
  2016-04-21 15:35     ` Pan Xinhui
  2016-04-27  9:16   ` [PATCH V4] " Pan Xinhui
  1 sibling, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2016-04-20 14:24 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:

> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
> +static __always_inline unsigned long					\
> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
> +			 unsigned long new);				\
> +static __always_inline u32						\
> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
> +{									\
> +	int size = sizeof (type);					\
> +	int off = (unsigned long)ptr % sizeof(u32);			\
> +	volatile u32 *p = ptr - off;					\
> +	int bitoff = BITOFF_CAL(size, off);				\
> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> +	u32 oldv, newv, tmp;						\
> +	u32 ret;							\
> +	oldv = READ_ONCE(*p);						\
> +	do {								\
> +		ret = (oldv & bitmask) >> bitoff;			\
> +		if (skip && ret != old)					\
> +			break;						\
> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
> +		tmp = oldv;						\
> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> +	} while (tmp != oldv);						\
> +	return ret;							\
> +}

So for an LL/SC based arch using cmpxchg() like that is sub-optimal.

Why did you choose to write it entirely in C?


* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-20 14:24   ` Peter Zijlstra
@ 2016-04-21 15:35     ` Pan Xinhui
  2016-04-21 15:52       ` Boqun Feng
  2016-04-21 16:13       ` Peter Zijlstra
  0 siblings, 2 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-21 15:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On 2016-04-20 22:24, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> 
>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
>> +static __always_inline unsigned long					\
>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
>> +			 unsigned long new);				\
>> +static __always_inline u32						\
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
>> +{									\
>> +	int size = sizeof (type);					\
>> +	int off = (unsigned long)ptr % sizeof(u32);			\
>> +	volatile u32 *p = ptr - off;					\
>> +	int bitoff = BITOFF_CAL(size, off);				\
>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>> +	u32 oldv, newv, tmp;						\
>> +	u32 ret;							\
>> +	oldv = READ_ONCE(*p);						\
>> +	do {								\
>> +		ret = (oldv & bitmask) >> bitoff;			\
>> +		if (skip && ret != old)					\
>> +			break;						\
>> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
>> +		tmp = oldv;						\
>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
>> +	} while (tmp != oldv);						\
>> +	return ret;							\
>> +}
> 
> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> 
> Why did you choose to write it entirely in C?
> 
Yes, you are right; more loads/stores will be done in the C code.
However, xchg_u8/u16 is only used by qspinlock right now, and I did not see any performance regression.
So I just wrote it in C, to keep it simple. :)

Of course I have done xchg tests.
we run code just like xchg((u8*)&v, j++); in several threads.
and the result is,
[  768.374264] use time[1550072]ns in xchg_u8_asm
[  768.377102] use time[2826802]ns in xchg_u8_c

I think this is because there is one more load in C.
If possible, we can move such code in asm-generic/.

thanks
xinhui
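
The test code itself is not included in the thread; a minimal sketch of the kind of timing loop described above might look like the following. Everything here is an assumption made for illustration (function and variable names, the iteration count, the use of ktime_get_ns() and pr_info()); it only shows the shape of "xchg((u8 *)&v, j++) in a loop, timed", which in practice would be run concurrently from several kernel threads.

#include <linux/atomic.h>
#include <linux/ktime.h>
#include <linux/printk.h>

static u8 bench_v;

static void xchg_u8_c_bench(void)
{
	u64 start = ktime_get_ns();
	int j;

	/* hammer the byte-sized xchg(), which now dispatches to the __xchg_u8_*() helpers */
	for (j = 0; j < 1000000; j++)
		xchg(&bench_v, (u8)j);

	pr_info("use time[%llu]ns in xchg_u8_c\n", ktime_get_ns() - start);
}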


* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 15:35     ` Pan Xinhui
@ 2016-04-21 15:52       ` Boqun Feng
  2016-04-22  1:59         ` Pan Xinhui
  2016-04-21 16:13       ` Peter Zijlstra
  1 sibling, 1 reply; 22+ messages in thread
From: Boqun Feng @ 2016-04-21 15:52 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: Peter Zijlstra, linux-kernel, linuxppc-dev, benh, paulus, mpe,
	paulmck, tglx


On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> On 2016-04-20 22:24, Peter Zijlstra wrote:
> > On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> > 
> >> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
> >> +static __always_inline unsigned long					\
> >> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
> >> +			 unsigned long new);				\
> >> +static __always_inline u32						\
> >> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
> >> +{									\
> >> +	int size = sizeof (type);					\
> >> +	int off = (unsigned long)ptr % sizeof(u32);			\
> >> +	volatile u32 *p = ptr - off;					\
> >> +	int bitoff = BITOFF_CAL(size, off);				\
> >> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> >> +	u32 oldv, newv, tmp;						\
> >> +	u32 ret;							\
> >> +	oldv = READ_ONCE(*p);						\
> >> +	do {								\
> >> +		ret = (oldv & bitmask) >> bitoff;			\
> >> +		if (skip && ret != old)					\
> >> +			break;						\
> >> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
> >> +		tmp = oldv;						\
> >> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> >> +	} while (tmp != oldv);						\
> >> +	return ret;							\
> >> +}
> > 
> > So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> > 
> > Why did you choose to write it entirely in C?
> > 
> Yes, you are right; more loads/stores will be done in the C code.
> However, xchg_u8/u16 is only used by qspinlock right now, and I did not see any performance regression.
> So I just wrote it in C, to keep it simple. :)
> 
> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [  768.374264] use time[1550072]ns in xchg_u8_asm

How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
loop with shifting and masking in it?

Regards,
Boqun

> [  768.377102] use time[2826802]ns in xchg_u8_c
> 
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.
> 
> thanks
> xinhui
> 



* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 15:35     ` Pan Xinhui
  2016-04-21 15:52       ` Boqun Feng
@ 2016-04-21 16:13       ` Peter Zijlstra
  2016-04-25 10:10         ` Pan Xinhui
  1 sibling, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2016-04-21 16:13 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> Yes, you are right; more loads/stores will be done in the C code.
> However, xchg_u8/u16 is only used by qspinlock right now, and I did not see any performance regression.
> So I just wrote it in C, to keep it simple. :)

Which is fine; but worthy of a note in your Changelog.

> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [  768.374264] use time[1550072]ns in xchg_u8_asm
> [  768.377102] use time[2826802]ns in xchg_u8_c
> 
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.

So I'm not actually _that_ familiar with the PPC LL/SC implementation;
but there are things a CPU can do to optimize these loops.

For example, a CPU might choose to not release the exclusive hold of the
line for a number of cycles, except when it passes SC or an interrupt
happens. This way there's a smaller chance the SC fails and inhibits
forward progress.

By doing the modification outside of the LL/SC you lose such
advantages.

And yes, doing a !exclusive load prior to the exclusive load leads to an
even bigger window where the data can get changed out from under you.
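
For comparison, newer Power ISAs (2.06 and later, i.e. POWER7 and up) also provide byte- and halfword-granular reservations via lbarx/stbcx. and lharx/sthcx., which keep the whole modification inside the LL/SC section in the way described above. A hedged sketch of such a variant follows; it is illustrative only, the function name is made up, it has relaxed/local ordering since no barriers are emitted, and it is not usable on "all ppc", which is why the patch sticks to lwarx/stwcx.

static inline u8 xchg_u8_lbarx(volatile u8 *p, u8 val)
{
	u8 prev;

	__asm__ __volatile__(
"1:	lbarx	%0,0,%2\n"	/* load the byte and take a reservation on it */
"	stbcx.	%3,0,%2\n"	/* store the new byte if the reservation still holds */
"	bne-	1b"		/* otherwise retry */
	: "=&r" (prev), "+m" (*p)
	: "r" (p), "r" (val)
	: "cc", "memory");

	return prev;
}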


* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 15:52       ` Boqun Feng
@ 2016-04-22  1:59         ` Pan Xinhui
  2016-04-22  3:16           ` Boqun Feng
  0 siblings, 1 reply; 22+ messages in thread
From: Pan Xinhui @ 2016-04-22  1:59 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Peter Zijlstra, linux-kernel, linuxppc-dev, benh, paulus, mpe,
	paulmck, tglx

On 2016-04-21 23:52, Boqun Feng wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> On 2016-04-20 22:24, Peter Zijlstra wrote:
>>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
>>>
>>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
>>>> +static __always_inline unsigned long					\
>>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
>>>> +			 unsigned long new);				\
>>>> +static __always_inline u32						\
>>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
>>>> +{									\
>>>> +	int size = sizeof (type);					\
>>>> +	int off = (unsigned long)ptr % sizeof(u32);			\
>>>> +	volatile u32 *p = ptr - off;					\
>>>> +	int bitoff = BITOFF_CAL(size, off);				\
>>>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
>>>> +	u32 oldv, newv, tmp;						\
>>>> +	u32 ret;							\
>>>> +	oldv = READ_ONCE(*p);						\
>>>> +	do {								\
>>>> +		ret = (oldv & bitmask) >> bitoff;			\
>>>> +		if (skip && ret != old)					\
>>>> +			break;						\
>>>> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
>>>> +		tmp = oldv;						\
>>>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
>>>> +	} while (tmp != oldv);						\
>>>> +	return ret;							\
>>>> +}
>>>
>>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
>>>
>>> Why did you choose to write it entirely in C?
>>>
>> Yes, you are right; more loads/stores will be done in the C code.
>> However, xchg_u8/u16 is only used by qspinlock right now, and I did not see any performance regression.
>> So I just wrote it in C, to keep it simple. :)
>>
>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
> 
> How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> loop with shifting and masking in it?
> 
yes, using 32bit ll/sc loops.

looks like:
        __asm__ __volatile__(
"1:     lwarx   %0,0,%3\n"
"       and %1,%0,%5\n"
"       or %1,%1,%4\n"
       PPC405_ERR77(0,%2)
"       stwcx.  %1,0,%3\n"
"       bne-    1b"
        : "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
        : "r" (_p), "r" (_newv), "r" (_oldv_mask)
        : "cc", "memory");


> Regards,
> Boqun
> 
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
>>
>> thanks
>> xinhui
>>


* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-22  1:59         ` Pan Xinhui
@ 2016-04-22  3:16           ` Boqun Feng
  0 siblings, 0 replies; 22+ messages in thread
From: Boqun Feng @ 2016-04-22  3:16 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: Peter Zijlstra, linux-kernel, linuxppc-dev, benh, paulus, mpe,
	paulmck, tglx


On Fri, Apr 22, 2016 at 09:59:22AM +0800, Pan Xinhui wrote:
> On 2016-04-21 23:52, Boqun Feng wrote:
> > On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> >> On 2016-04-20 22:24, Peter Zijlstra wrote:
> >>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> >>>
> >>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)				\
> >>>> +static __always_inline unsigned long					\
> >>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,		\
> >>>> +			 unsigned long new);				\
> >>>> +static __always_inline u32						\
> >>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)		\
> >>>> +{									\
> >>>> +	int size = sizeof (type);					\
> >>>> +	int off = (unsigned long)ptr % sizeof(u32);			\
> >>>> +	volatile u32 *p = ptr - off;					\
> >>>> +	int bitoff = BITOFF_CAL(size, off);				\
> >>>> +	u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;	\
> >>>> +	u32 oldv, newv, tmp;						\
> >>>> +	u32 ret;							\
> >>>> +	oldv = READ_ONCE(*p);						\
> >>>> +	do {								\
> >>>> +		ret = (oldv & bitmask) >> bitoff;			\
> >>>> +		if (skip && ret != old)					\
> >>>> +			break;						\
> >>>> +		newv = (oldv & ~bitmask) | (new << bitoff);		\
> >>>> +		tmp = oldv;						\
> >>>> +		oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);	\
> >>>> +	} while (tmp != oldv);						\
> >>>> +	return ret;							\
> >>>> +}
> >>>
> >>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> >>>
> >>> Why did you choose to write it entirely in C?
> >>>
> >> Yes, you are right; more loads/stores will be done in the C code.
> >> However, xchg_u8/u16 is only used by qspinlock right now, and I did not see any performance regression.
> >> So I just wrote it in C, to keep it simple. :)
> >>
> >> Of course I have done xchg tests.
> >> we run code just like xchg((u8*)&v, j++); in several threads.
> >> and the result is,
> >> [  768.374264] use time[1550072]ns in xchg_u8_asm
> > 
> > How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> > loop with shifting and masking in it?
> > 
> yes, using 32bit ll/sc loops.
> 
> looks like:
>         __asm__ __volatile__(
> "1:     lwarx   %0,0,%3\n"
> "       and %1,%0,%5\n"
> "       or %1,%1,%4\n"
>        PPC405_ERR77(0,%2)
> "       stwcx.  %1,0,%3\n"
> "       bne-    1b"
>         : "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
>         : "r" (_p), "r" (_newv), "r" (_oldv_mask)
>         : "cc", "memory");
> 

Good, so this works for all ppc ISAs too.

Given the performance benefit (maybe caused by the reason Peter
mentioned), I think we should use this as the implementation of u8/u16
{cmp}xchg for now. For Power7 and later, we can always switch to the
lbarx/lharx version if observable performance benefit can be achieved.

But the choice is left to you. After all, as you said, qspinlock is the
only user ;-)

Regards,
Boqun

> 
> > Regards,
> > Boqun
> > 
> >> [  768.377102] use time[2826802]ns in xchg_u8_c
> >>
> >> I think this is because there is one more load in C.
> >> If possible, we can move such code in asm-generic/.
> >>
> >> thanks
> >> xinhui
> >>
> 



* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-21 16:13       ` Peter Zijlstra
@ 2016-04-25 10:10         ` Pan Xinhui
  2016-04-25 15:37           ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Pan Xinhui @ 2016-04-25 10:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx


On 2016-04-22 00:13, Peter Zijlstra wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> Yes, you are right; more loads/stores will be done in the C code.
>> However, xchg_u8/u16 is only used by qspinlock right now, and I did not see any performance regression.
>> So I just wrote it in C, to keep it simple. :)
> 
> Which is fine; but worthy of a note in your Changelog.
> 
will do that.

>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
> 
> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> but there are things a CPU can do to optimize these loops.
> 
> For example, a CPU might choose to not release the exclusive hold of the
> line for a number of cycles, except when it passes SC or an interrupt
> happens. This way there's a smaller chance the SC fails and inhibits
> forward progress.
I am not sure if there is such hardware optimization.

> 
> By doing the modification outside of the LL/SC you lose such
> advantages.
> 
> And yes, doing a !exclusive load prior to the exclusive load leads to an
> even bigger window where the data can get changed out from under you.
> 
You are right.
We have observed such data changes between the two loads.


* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-25 10:10         ` Pan Xinhui
@ 2016-04-25 15:37           ` Peter Zijlstra
  2016-04-26 11:35             ` Pan Xinhui
  0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2016-04-25 15:37 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
> > So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> > but there are things a CPU can do to optimize these loops.
> > 
> > For example, a CPU might choose to not release the exclusive hold of the
> > line for a number of cycles, except when it passes SC or an interrupt
> > happens. This way there's a smaller chance the SC fails and inhibits
> > forward progress.

> I am not sure if there is such hardware optimization.

So I think the hardware must do _something_, otherwise competing cores
doing load-exclusive could live-lock a system, each one endlessly
breaking the exclusive ownership of the other and the store-conditional
always failing.

Of course, there are such implementations, and they tend to have to put
in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
ARC for an example that needs to do this.)


* Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-25 15:37           ` Peter Zijlstra
@ 2016-04-26 11:35             ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-26 11:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx


On 2016-04-25 23:37, Peter Zijlstra wrote:
> On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
>>> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
>>> but there are things a CPU can do to optimize these loops.
>>>
>>> For example, a CPU might choose to not release the exclusive hold of the
>>> line for a number of cycles, except when it passes SC or an interrupt
>>> happens. This way there's a smaller chance the SC fails and inhibits
>>> forward progress.
> 
>> I am not sure if there is such hardware optimization.
> 
> So I think the hardware must do _something_, otherwise competing cores
> doing load-exclusive could live-lock a system, each one endlessly
> breaking the exclusive ownership of the other and the store-conditional
> always failing.
> 
It seems there is no such optimization.

We have observed the SC failing almost all the time in a contention test, and then getting stuck in the loop. :(
One thread modifies val with LL/SC, and the other threads just modify val without any regard for LL/SC.

So in the end, I chose to rewrite this patch in asm. :)

> Of course, there are such implementations, and they tend to have to put
> in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
> ARC for an example that needs to do this.)
> 


* [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui
  2016-04-20 14:24   ` Peter Zijlstra
@ 2016-04-27  9:16   ` Pan Xinhui
  2016-04-27 13:58     ` Boqun Feng
                       ` (2 more replies)
  1 sibling, 3 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-27  9:16 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: benh, paulus, mpe, boqun.feng, peterz, paulmck, tglx

From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.

remove volatile from the first parameter of __cmpxchg_local and __cmpxchg

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
change from v3:
	rewrite in asm for the LL/SC.
	remove volatile in __cmpxchg_local and __cmpxchg.
change from v2:
	in the do{}while(), we save one load and use corresponding cmpxchg suffix.
	Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN 
change from V1:
	rework totally.
---
 arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 106 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..8a3735f 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,71 @@
 #include <asm/asm-compat.h>
 #include <linux/bug.h>
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
+#endif
+
+#define XCHG_GEN(type, sfx, cl)				\
+static inline u32 __xchg_##type##sfx(void *p, u32 val)		\
+{								\
+	unsigned int prev, prev_mask, tmp, bitoff, off;		\
+								\
+	off = (unsigned long)p % sizeof(u32);			\
+	bitoff = BITOFF_CAL(sizeof(type), off);			\
+	p -= off;						\
+	val <<= bitoff;						\
+	prev_mask = (u32)(type)-1 << bitoff;			\
+								\
+	__asm__ __volatile__(					\
+"1:	lwarx   %0,0,%3\n"					\
+"	andc	%1,%0,%5\n"					\
+"	or	%1,%1,%4\n"					\
+	PPC405_ERR77(0,%3)					\
+"	stwcx.	%1,0,%3\n"					\
+"	bne-	1b\n"						\
+	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
+	: "r" (p), "r" (val), "r" (prev_mask)			\
+	: "cc", cl);						\
+								\
+	return prev >> bitoff;					\
+}
+
+#define CMPXCHG_GEN(type, sfx, br, br2, cl)			\
+static inline							\
+u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new)		\
+{								\
+	unsigned int prev, prev_mask, tmp, bitoff, off;		\
+								\
+	off = (unsigned long)p % sizeof(u32);			\
+	bitoff = BITOFF_CAL(sizeof(type), off);			\
+	p -= off;						\
+	old <<= bitoff;						\
+	new <<= bitoff;						\
+	prev_mask = (u32)(type)-1 << bitoff;			\
+								\
+	__asm__ __volatile__(					\
+	br							\
+"1:	lwarx   %0,0,%3\n"					\
+"	and	%1,%0,%6\n"					\
+"	cmpw	0,%1,%4\n"					\
+"	bne-	2f\n"						\
+"	andc	%1,%0,%6\n"					\
+"	or	%1,%1,%5\n"					\
+	PPC405_ERR77(0,%3)					\
+"	stwcx.  %1,0,%3\n"					\
+"	bne-    1b\n"						\
+	br2							\
+	"\n"							\
+"2:"								\
+	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
+	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
+	: "cc", cl);						\
+								\
+	return prev >> bitoff;					\
+}
+
 /*
  * Atomic exchange
  *
@@ -14,6 +79,11 @@
  * the previous value stored there.
  */
 
+XCHG_GEN(u8, _local, "memory");
+XCHG_GEN(u8, _relaxed, "cc");
+XCHG_GEN(u16, _local, "memory");
+XCHG_GEN(u16, _relaxed, "cc");
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val)
 #endif
 
 static __always_inline unsigned long
-__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
+__xchg_local(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_local(ptr, x);
+	case 2:
+		return __xchg_u16_local(ptr, x);
 	case 4:
 		return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +177,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_relaxed(ptr, x);
+	case 2:
+		return __xchg_u16_relaxed(ptr, x);
 	case 4:
 		return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  * and return the old value of *p.
  */
 
+CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u8, _local, , , "memory");
+CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u8, _relaxed, , , "cc");
+CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u16, _local, , , "memory");
+CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u16, _relaxed, , , "cc");
+
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 {
@@ -312,10 +399,14 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
 #endif
 
 static __always_inline unsigned long
-__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
+__cmpxchg(void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 }
 
 static __always_inline unsigned long
-__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
+__cmpxchg_local(void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_local(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_local(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_local(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_relaxed(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_relaxed(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_relaxed(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_acquire(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_acquire(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_acquire(ptr, old, new);
 #ifdef CONFIG_PPC64
-- 
2.4.3
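
With these helpers in place, generic code can apply the usual xchg()/cmpxchg() family to u8 and u16 fields directly. A hedged usage sketch follows: the struct and function names below are illustrative only, loosely modelled on the way qspinlock touches its byte-sized fields, and are not actual qspinlock code.

struct example_lock {
	u8	locked;
	u8	pending;
	u16	tail;
};

static inline bool example_trylock(struct example_lock *l)
{
	/* dispatches to __cmpxchg_u8() through the size switch in __cmpxchg() */
	return cmpxchg(&l->locked, 0, 1) == 0;
}

static inline u8 example_clear_pending(struct example_lock *l)
{
	/* dispatches to __xchg_u8_relaxed() through __xchg_relaxed() */
	return xchg_relaxed(&l->pending, 0);
}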


* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27  9:16   ` [PATCH V4] " Pan Xinhui
@ 2016-04-27 13:58     ` Boqun Feng
  2016-04-27 14:16       ` Boqun Feng
  2016-04-27 14:50       ` Boqun Feng
  2016-04-28  7:59     ` Peter Zijlstra
  2016-11-25  0:04     ` [V4] " Michael Ellerman
  2 siblings, 2 replies; 22+ messages in thread
From: Boqun Feng @ 2016-04-27 13:58 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx


On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> It works on all ppc.
> 
> remove volatile from the first parameter of __cmpxchg_local and __cmpxchg
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> ---
> change from v3:
> 	rewrite in asm for the LL/SC.
> 	remove volatile in __cmpxchg_local and __cmpxchg.
> change from v2:
> 	in the do{}while(), we save one load and use corresponding cmpxchg suffix.
> 	Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN 
> change from V1:
> 	rework totally.
> ---
>  arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
>  1 file changed, 106 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> index 44efe73..8a3735f 100644
> --- a/arch/powerpc/include/asm/cmpxchg.h
> +++ b/arch/powerpc/include/asm/cmpxchg.h
> @@ -7,6 +7,71 @@
>  #include <asm/asm-compat.h>
>  #include <linux/bug.h>
>  
> +#ifdef __BIG_ENDIAN
> +#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
> +#else
> +#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
> +#endif
> +
> +#define XCHG_GEN(type, sfx, cl)				\
> +static inline u32 __xchg_##type##sfx(void *p, u32 val)		\
> +{								\
> +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> +								\
> +	off = (unsigned long)p % sizeof(u32);			\
> +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> +	p -= off;						\
> +	val <<= bitoff;						\
> +	prev_mask = (u32)(type)-1 << bitoff;			\
> +								\
> +	__asm__ __volatile__(					\
> +"1:	lwarx   %0,0,%3\n"					\
> +"	andc	%1,%0,%5\n"					\
> +"	or	%1,%1,%4\n"					\
> +	PPC405_ERR77(0,%3)					\
> +"	stwcx.	%1,0,%3\n"					\
> +"	bne-	1b\n"						\
> +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\

I think we can save the "tmp" here by:

	__asm__ volatile__(
"1:	lwarx	%0,0,%2\n"
"	andc	%0,%0,%4\n"
"	or	%0,%0,%3\n"
	PPC405_ERR77(0,%2)
"	stwcx.	%0,0,%2\n"
"	bne-	1b\n"
	: "=&r" (prev), "+m" (*(u32*)p)
	: "r" (p), "r" (val), "r" (prev_mask)
	: "cc", cl);

right?

> +	: "r" (p), "r" (val), "r" (prev_mask)			\
> +	: "cc", cl);						\
> +								\
> +	return prev >> bitoff;					\
> +}
> +
> +#define CMPXCHG_GEN(type, sfx, br, br2, cl)			\
> +static inline							\
> +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new)		\
> +{								\
> +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> +								\
> +	off = (unsigned long)p % sizeof(u32);			\
> +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> +	p -= off;						\
> +	old <<= bitoff;						\
> +	new <<= bitoff;						\
> +	prev_mask = (u32)(type)-1 << bitoff;			\
> +								\
> +	__asm__ __volatile__(					\
> +	br							\
> +"1:	lwarx   %0,0,%3\n"					\
> +"	and	%1,%0,%6\n"					\
> +"	cmpw	0,%1,%4\n"					\
> +"	bne-	2f\n"						\
> +"	andc	%1,%0,%6\n"					\
> +"	or	%1,%1,%5\n"					\
> +	PPC405_ERR77(0,%3)					\
> +"	stwcx.  %1,0,%3\n"					\
> +"	bne-    1b\n"						\
> +	br2							\
> +	"\n"							\
> +"2:"								\
> +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\

And "tmp" here could also be saved by:

"1:	lwarx   %0,0,%2\n"					\
"	xor	%3,%0,%3\n"					\
"	and.	%3,%3,%5\n"					\
"	bne-	2f\n"						\
"	andc	%0,%0,%5\n"					\
"	or	%0,%0,%4\n"					\
	PPC405_ERR77(0,%2)					\
"	stwcx.  %0,0,%2\n"					\
"	bne-    1b\n"						\
	br2							\
	"\n"							\
"2:"								\
	: "=&r" (prev), "+m" (*(u32*)p)		\
	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
	: "cc", cl);						\

right?

IIUC, eliminating the local variable "tmp" frees up a general register
for the compiler to use for other variables.

So thoughts?

Regards,
Boqun

> +	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
> +	: "cc", cl);						\
> +								\
> +	return prev >> bitoff;					\
> +}
> +
>  /*
>   * Atomic exchange
>   *
> @@ -14,6 +79,11 @@
>   * the previous value stored there.
>   */
>  
> +XCHG_GEN(u8, _local, "memory");
> +XCHG_GEN(u8, _relaxed, "cc");
> +XCHG_GEN(u16, _local, "memory");
> +XCHG_GEN(u16, _relaxed, "cc");
> +
>  static __always_inline unsigned long
>  __xchg_u32_local(volatile void *p, unsigned long val)
>  {
> @@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val)
>  #endif
>  
>  static __always_inline unsigned long
> -__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
> +__xchg_local(void *ptr, unsigned long x, unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __xchg_u8_local(ptr, x);
> +	case 2:
> +		return __xchg_u16_local(ptr, x);
>  	case 4:
>  		return __xchg_u32_local(ptr, x);
>  #ifdef CONFIG_PPC64
> @@ -103,6 +177,10 @@ static __always_inline unsigned long
>  __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __xchg_u8_relaxed(ptr, x);
> +	case 2:
> +		return __xchg_u16_relaxed(ptr, x);
>  	case 4:
>  		return __xchg_u32_relaxed(ptr, x);
>  #ifdef CONFIG_PPC64
> @@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>   * and return the old value of *p.
>   */
>  
> +CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
> +CMPXCHG_GEN(u8, _local, , , "memory");
> +CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
> +CMPXCHG_GEN(u8, _relaxed, , , "cc");
> +CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
> +CMPXCHG_GEN(u16, _local, , , "memory");
> +CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
> +CMPXCHG_GEN(u16, _relaxed, , , "cc");
> +
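
As a usage illustration (hypothetical, not from the patch): with the u8/u16
variants in place, byte-sized fields can go through the generic cmpxchg()
wrappers directly, e.g. a minimal trylock on a one-byte flag (kernel context
assumed):

struct byte_lock {
	u8 locked;	/* 0 = free, 1 = held */
};

static inline bool byte_trylock(struct byte_lock *l)
{
	/* acquire ordering on success, via the new u8 cmpxchg_acquire */
	return cmpxchg_acquire(&l->locked, 0, 1) == 0;
}

static inline void byte_unlock(struct byte_lock *l)
{
	smp_store_release(&l->locked, 0);
}

Sub-word flags and locks like this are the kind of user these helpers enable.
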
>  static __always_inline unsigned long
>  __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
>  {
> @@ -312,10 +399,14 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
>  #endif
>  
>  static __always_inline unsigned long
> -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
> +__cmpxchg(void *ptr, unsigned long old, unsigned long new,
>  	  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
>  }
>  
>  static __always_inline unsigned long
> -__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
> +__cmpxchg_local(void *ptr, unsigned long old, unsigned long new,
>  	  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_local(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_local(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_local(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
>  		  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_relaxed(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_relaxed(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_relaxed(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
>  		  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_acquire(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_acquire(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_acquire(ptr, old, new);
>  #ifdef CONFIG_PPC64
> -- 
> 2.4.3
> 


* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27 13:58     ` Boqun Feng
@ 2016-04-27 14:16       ` Boqun Feng
  2016-04-27 14:50       ` Boqun Feng
  1 sibling, 0 replies; 22+ messages in thread
From: Boqun Feng @ 2016-04-27 14:16 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> > From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> > 
> > Implement xchg{u8,u16}{local,relaxed}, and
> > cmpxchg{u8,u16}{,local,acquire,relaxed}.
> > 
> > It works on all ppc.
> > 
> > remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> > 
> > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> > ---
> > change from v3:
> > 	rewrite in asm for the LL/SC.
> > 	remove volatile in __cmpxchg_local and __cmpxchg.
> > change from v2:
> > 	in the do{}while(), we save one load and use corresponding cmpxchg suffix.
> > 	Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN 
> > change from V1:
> > 	rework totally.
> > ---
> >  arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 106 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> > index 44efe73..8a3735f 100644
> > --- a/arch/powerpc/include/asm/cmpxchg.h
> > +++ b/arch/powerpc/include/asm/cmpxchg.h
> > @@ -7,6 +7,71 @@
> >  #include <asm/asm-compat.h>
> >  #include <linux/bug.h>
> >  
> > +#ifdef __BIG_ENDIAN
> > +#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
> > +#else
> > +#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
> > +#endif
> > +
> > +#define XCHG_GEN(type, sfx, cl)				\
> > +static inline u32 __xchg_##type##sfx(void *p, u32 val)		\
> > +{								\
> > +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> > +								\
> > +	off = (unsigned long)p % sizeof(u32);			\
> > +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> > +	p -= off;						\
> > +	val <<= bitoff;						\
> > +	prev_mask = (u32)(type)-1 << bitoff;			\
> > +								\
> > +	__asm__ __volatile__(					\
> > +"1:	lwarx   %0,0,%3\n"					\
> > +"	andc	%1,%0,%5\n"					\
> > +"	or	%1,%1,%4\n"					\
> > +	PPC405_ERR77(0,%3)					\
> > +"	stwcx.	%1,0,%3\n"					\
> > +"	bne-	1b\n"						\
> > +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
> 
> I think we can save the "tmp" here by:
> 
> 	__asm__ __volatile__(
> "1:	lwarx	%0,0,%2\n"
> "	andc	%0,%0,%4\n"
> "	or	%0,%0,%3\n"
> 	PPC405_ERR77(0,%2)
> "	stwcx.	%0,0,%2\n"
> "	bne-	1b\n"
> 	: "=&r" (prev), "+m" (*(u32*)p)
> 	: "r" (p), "r" (val), "r" (prev_mask)
> 	: "cc", cl);
> 
> right?
> 
> > +	: "r" (p), "r" (val), "r" (prev_mask)			\
> > +	: "cc", cl);						\
> > +								\
> > +	return prev >> bitoff;					\
> > +}
> > +
> > +#define CMPXCHG_GEN(type, sfx, br, br2, cl)			\
> > +static inline							\
> > +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new)		\
> > +{								\
> > +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> > +								\
> > +	off = (unsigned long)p % sizeof(u32);			\
> > +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> > +	p -= off;						\
> > +	old <<= bitoff;						\
> > +	new <<= bitoff;						\
> > +	prev_mask = (u32)(type)-1 << bitoff;			\
> > +								\
> > +	__asm__ __volatile__(					\
> > +	br							\
> > +"1:	lwarx   %0,0,%3\n"					\
> > +"	and	%1,%0,%6\n"					\
> > +"	cmpw	0,%1,%4\n"					\
> > +"	bne-	2f\n"						\
> > +"	andc	%1,%0,%6\n"					\
> > +"	or	%1,%1,%5\n"					\
> > +	PPC405_ERR77(0,%3)					\
> > +"	stwcx.  %1,0,%3\n"					\
> > +"	bne-    1b\n"						\
> > +	br2							\
> > +	"\n"							\
> > +"2:"								\
> > +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
> 
> And "tmp" here could also be saved by:
> 
> "1:	lwarx   %0,0,%2\n"					\
> "	xor	%3,%0,%3\n"					\
> "	and.	%3,%3,%5\n"					\
> "	bne-	2f\n"						\
> "	andc	%0,%0,%5\n"					\
> "	or	%0,%0,%4\n"					\
> 	PPC405_ERR77(0,%2)					\
> "	stwcx.  %0,0,%2\n"					\
> "	bne-    1b\n"						\
> 	br2							\
> 	"\n"							\
> "2:"								\
> 	: "=&r" (prev), "+m" (*(u32*)p)		\
> 	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
> 	: "cc", cl);						\
> 

Oops, this should be:

"1:	lwarx   %0,0,%3\n"					\
"	xor	%2,%0,%2\n"					\
"	and.	%2,%2,%5\n"					\
"	bne-	2f\n"						\
"	andc	%0,%0,%5\n"					\
"	or	%0,%0,%4\n"					\
	PPC405_ERR77(0,%3)					\
"	stwcx.  %0,0,%3\n"					\
"	bne-    1b\n"						\
	br2							\
	"\n"							\
"2:"								\
	: "=&r" (prev), "+m" (*(u32*)p), "+&r" (old)		\
	: "r" (p), "r" (new), "r" (prev_mask)			\
	: "cc", cl);						\

Regards,
Boqun


* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27 13:58     ` Boqun Feng
  2016-04-27 14:16       ` Boqun Feng
@ 2016-04-27 14:50       ` Boqun Feng
  2016-04-27 14:59         ` Boqun Feng
  1 sibling, 1 reply; 22+ messages in thread
From: Boqun Feng @ 2016-04-27 14:50 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> > From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> > 
> > Implement xchg{u8,u16}{local,relaxed}, and
> > cmpxchg{u8,u16}{,local,acquire,relaxed}.
> > 
> > It works on all ppc.
> > 
> > remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> > 
> > Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> > ---
> > change from v3:
> > 	rewrite in asm for the LL/SC.
> > 	remove volatile in __cmpxchg_local and __cmpxchg.
> > change from v2:
> > 	in the do{}while(), we save one load and use corresponding cmpxchg suffix.
> > 	Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN 
> > change from V1:
> > 	rework totally.
> > ---
> >  arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 106 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> > index 44efe73..8a3735f 100644
> > --- a/arch/powerpc/include/asm/cmpxchg.h
> > +++ b/arch/powerpc/include/asm/cmpxchg.h
> > @@ -7,6 +7,71 @@
> >  #include <asm/asm-compat.h>
> >  #include <linux/bug.h>
> >  
> > +#ifdef __BIG_ENDIAN
> > +#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
> > +#else
> > +#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
> > +#endif
> > +
> > +#define XCHG_GEN(type, sfx, cl)				\
> > +static inline u32 __xchg_##type##sfx(void *p, u32 val)		\
> > +{								\
> > +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> > +								\
> > +	off = (unsigned long)p % sizeof(u32);			\
> > +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> > +	p -= off;						\
> > +	val <<= bitoff;						\
> > +	prev_mask = (u32)(type)-1 << bitoff;			\
> > +								\
> > +	__asm__ __volatile__(					\
> > +"1:	lwarx   %0,0,%3\n"					\
> > +"	andc	%1,%0,%5\n"					\
> > +"	or	%1,%1,%4\n"					\
> > +	PPC405_ERR77(0,%3)					\
> > +"	stwcx.	%1,0,%3\n"					\
> > +"	bne-	1b\n"						\
> > +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
> 
> I think we can save the "tmp" here by:
> 
> 	__asm__ __volatile__(
> "1:	lwarx	%0,0,%2\n"
> "	andc	%0,%0,%4\n"
> "	or	%0,%0,%3\n"
> 	PPC405_ERR77(0,%2)
> "	stwcx.	%0,0,%2\n"
> "	bne-	1b\n"
> 	: "=&r" (prev), "+m" (*(u32*)p)
> 	: "r" (p), "r" (val), "r" (prev_mask)
> 	: "cc", cl);
> 
> right?
> 
> > +	: "r" (p), "r" (val), "r" (prev_mask)			\
> > +	: "cc", cl);						\
> > +								\
> > +	return prev >> bitoff;					\
> > +}
> > +
> > +#define CMPXCHG_GEN(type, sfx, br, br2, cl)			\
> > +static inline							\
> > +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new)		\
> > +{								\
> > +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> > +								\
> > +	off = (unsigned long)p % sizeof(u32);			\
> > +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> > +	p -= off;						\
> > +	old <<= bitoff;						\
> > +	new <<= bitoff;						\
> > +	prev_mask = (u32)(type)-1 << bitoff;			\
> > +								\
> > +	__asm__ __volatile__(					\
> > +	br							\
> > +"1:	lwarx   %0,0,%3\n"					\
> > +"	and	%1,%0,%6\n"					\
> > +"	cmpw	0,%1,%4\n"					\
> > +"	bne-	2f\n"						\
> > +"	andc	%1,%0,%6\n"					\
> > +"	or	%1,%1,%5\n"					\
> > +	PPC405_ERR77(0,%3)					\
> > +"	stwcx.  %1,0,%3\n"					\
> > +"	bne-    1b\n"						\
> > +	br2							\
> > +	"\n"							\
> > +"2:"								\
> > +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
> 
> And "tmp" here could also be saved by:
> 
> "1:	lwarx   %0,0,%2\n"					\
> "	xor	%3,%0,%3\n"					\
> "	and.	%3,%3,%5\n"					\
> "	bne-	2f\n"						\
> "	andc	%0,%0,%5\n"					\
> "	or	%0,%0,%4\n"					\
> 	PPC405_ERR77(0,%2)					\
> "	stwcx.  %0,0,%2\n"					\
> "	bne-    1b\n"						\
> 	br2							\
> 	"\n"							\
> "2:"								\
> 	: "=&r" (prev), "+m" (*(u32*)p)		\
> 	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
> 	: "cc", cl);						\
> 
> right?
> 

Sorry, my bad, we can't implement cmpxchg like this.. please ignore
this, I should really go to bed soon...

But still, we can save the "tmp" for xchg() I think.

Regards,
Boqun

> IIUC, dropping the local variable "tmp" frees up a general-purpose register
> for the compiler to use for other variables.
> 
> So thoughts?
> 
> Regards,
> Boqun
> 


* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27 14:50       ` Boqun Feng
@ 2016-04-27 14:59         ` Boqun Feng
  2016-04-28 10:21           ` Pan Xinhui
  0 siblings, 1 reply; 22+ messages in thread
From: Boqun Feng @ 2016-04-27 14:59 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx

On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
> 
> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
> this, I should really go to bed soon...
> 
> But still, we can save the "tmp" for xchg() I think.
> 

No.. we can't: without "tmp", %0 gets overwritten while building the value
to store, so "prev >> bitoff" no longer returns the old value. Sorry for all
the noise.

This patch looks good to me.

FWIW, you can add

Acked-by: Boqun Feng <boqun.feng@gmail.com>

Regards,
Boqun


* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27  9:16   ` [PATCH V4] " Pan Xinhui
  2016-04-27 13:58     ` Boqun Feng
@ 2016-04-28  7:59     ` Peter Zijlstra
  2016-04-28 10:21       ` Pan Xinhui
  2016-11-25  0:04     ` [V4] " Michael Ellerman
  2 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2016-04-28  7:59 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx

On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> It works on all ppc.
> 
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Generally has the right shape; and I trust others to double check the
ppc-asm minutiae.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>


* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27 14:59         ` Boqun Feng
@ 2016-04-28 10:21           ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-28 10:21 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, peterz, paulmck, tglx



On 2016-04-27 22:59, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
>>
>> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
>> this, I should really go to bed soon...
>>
>> But still, we can save the "tmp" for xchg() I think.
>>
> 
> No.. we can't. Sorry for all the noise.
> 
> This patch looks good to me.
> 
> FWIW, you can add
> 
> Acked-by: Boqun Feng <boqun.feng@gmail.com>
> 
thanks!

> Regards,
> Boqun
> 


* Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-28  7:59     ` Peter Zijlstra
@ 2016-04-28 10:21       ` Pan Xinhui
  0 siblings, 0 replies; 22+ messages in thread
From: Pan Xinhui @ 2016-04-28 10:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, benh, paulus, mpe, boqun.feng, paulmck, tglx



On 2016-04-28 15:59, Peter Zijlstra wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
>> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Generally has the right shape; and I trust others to double check the
> ppc-asm minutia.
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> 
thanks!


* Re: [V4] powerpc: Implement {cmp}xchg for u8 and u16
  2016-04-27  9:16   ` [PATCH V4] " Pan Xinhui
  2016-04-27 13:58     ` Boqun Feng
  2016-04-28  7:59     ` Peter Zijlstra
@ 2016-11-25  0:04     ` Michael Ellerman
  2 siblings, 0 replies; 22+ messages in thread
From: Michael Ellerman @ 2016-11-25  0:04 UTC (permalink / raw)
  To: xinhui, linux-kernel, linuxppc-dev
  Cc: peterz, boqun.feng, paulus, tglx, paulmck

On Wed, 2016-04-27 at 09:16:45 UTC, xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> It works on all ppc.
> 
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> Acked-by: Boqun Feng <boqun.feng@gmail.com>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/d0563a1297e234ed37f6b51c2e9321

cheers


end of thread, other threads:[~2016-11-25  0:04 UTC | newest]

Thread overview: 22+ messages
2016-04-19  6:29 [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16 Pan Xinhui
2016-04-19  9:18 ` Boqun Feng
2016-04-20  3:39   ` Pan Xinhui
2016-04-20 13:24 ` [PATCH V3] " Pan Xinhui
2016-04-20 14:24   ` Peter Zijlstra
2016-04-21 15:35     ` Pan Xinhui
2016-04-21 15:52       ` Boqun Feng
2016-04-22  1:59         ` Pan Xinhui
2016-04-22  3:16           ` Boqun Feng
2016-04-21 16:13       ` Peter Zijlstra
2016-04-25 10:10         ` Pan Xinhui
2016-04-25 15:37           ` Peter Zijlstra
2016-04-26 11:35             ` Pan Xinhui
2016-04-27  9:16   ` [PATCH V4] " Pan Xinhui
2016-04-27 13:58     ` Boqun Feng
2016-04-27 14:16       ` Boqun Feng
2016-04-27 14:50       ` Boqun Feng
2016-04-27 14:59         ` Boqun Feng
2016-04-28 10:21           ` Pan Xinhui
2016-04-28  7:59     ` Peter Zijlstra
2016-04-28 10:21       ` Pan Xinhui
2016-11-25  0:04     ` [V4] " Michael Ellerman
