* [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
@ 2016-04-08  6:41 Pan Xinhui
  2016-04-08  7:47 ` Peter Zijlstra
  0 siblings, 1 reply; 9+ messages in thread
From: Pan Xinhui @ 2016-04-08  6:41 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Boqun Feng, "Peter Zijlstra (Intel)",
	Thomas Gleixner

From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

Atomic operations on 8-bit and 16-bit data types are supported from POWER7.

Signed-off-by: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/cmpxchg.h | 265 +++++++++++++++++++++++++++++++++++++
 1 file changed, 265 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..928ad89 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -15,6 +15,74 @@
  */
 
 static __always_inline unsigned long
+__xchg_u8_local(volatile void *p, unsigned long val)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__(
+"1:	lbarx	%0,0,%2\n"
+	PPC405_ERR77(0,%2)
+"	stbcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*(volatile unsigned char *)p)
+	: "r" (p), "r" (val)
+	: "cc", "memory");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u8_relaxed(u8 *p, unsigned long val)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__(
+"1:	lbarx	%0,0,%2\n"
+	PPC405_ERR77(0,%2)
+"	stbcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (val)
+	: "cc");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u16_local(volatile void *p, unsigned long val)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__(
+"1:	lharx	%0,0,%2\n"
+	PPC405_ERR77(0,%2)
+"	sthcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*(volatile unsigned short *)p)
+	: "r" (p), "r" (val)
+	: "cc", "memory");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u16_relaxed(u16 *p, unsigned long val)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__(
+"1:	lharx	%0,0,%2\n"
+	PPC405_ERR77(0,%2)
+"	sthcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (val)
+	: "cc");
+
+	return prev;
+}
+
+static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
 	unsigned long prev;
@@ -88,6 +156,10 @@ static __always_inline unsigned long
 __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_local(ptr, x);
+	case 2:
+		return __xchg_u16_local(ptr, x);
 	case 4:
 		return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +175,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_relaxed(ptr, x);
+	case 2:
+		return __xchg_u16_relaxed(ptr, x);
 	case 4:
 		return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -132,6 +208,179 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  */
 
 static __always_inline unsigned long
+__cmpxchg_u8(volatile unsigned char *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+	PPC_ATOMIC_ENTRY_BARRIER
+"1:	lbarx	%0,0,%2		# __cmpxchg_u8\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	stbcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ATOMIC_EXIT_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u8_local(volatile unsigned char *p, unsigned long old,
+			unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+"1:	lbarx	%0,0,%2		# __cmpxchg_u8_local\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	stbcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u8_relaxed(u8 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+"1:	lbarx	%0,0,%2		# __cmpxchg_u8_relaxed\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	stbcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u8_acquire(u8 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+"1:	lbarx	%0,0,%2		# __cmpxchg_u8_acquire\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	stbcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ACQUIRE_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u16(volatile unsigned short *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+	PPC_ATOMIC_ENTRY_BARRIER
+"1:	lharx	%0,0,%2		# __cmpxchg_u16\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	sthcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ATOMIC_EXIT_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u16_local(volatile unsigned short *p, unsigned long old,
+			unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+"1:	lharx	%0,0,%2		# __cmpxchg_u16_local\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	sthcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u16_relaxed(u16 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+"1:	lharx	%0,0,%2		# __cmpxchg_u16_relaxed\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	sthcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u16_acquire(u16 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev = 0;
+
+	__asm__ __volatile__ (
+"1:	lharx	%0,0,%2		# __cmpxchg_u16_acquire\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0,%2)
+"	sthcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ACQUIRE_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
+
+static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 {
 	unsigned int prev;
@@ -316,6 +565,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -332,6 +585,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_local(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_local(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_local(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -348,6 +605,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_relaxed(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_relaxed(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_relaxed(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -364,6 +625,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_acquire(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_acquire(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_acquire(ptr, old, new);
 #ifdef CONFIG_PPC64
-- 
1.9.1


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-08  6:41 [PATCH] powerpc: introduce {cmp}xchg for u8 and u16 Pan Xinhui
@ 2016-04-08  7:47 ` Peter Zijlstra
  2016-04-10 14:17   ` Pan Xinhui
  0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2016-04-08  7:47 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linuxppc-dev, linux-kernel, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Boqun Feng, Thomas Gleixner

On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
> From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> Atomic operations on 8-bit and 16-bit data types are supported from POWER7.

And yes I see nothing P7 specific here, this implementation is for
everything PPC64 afaict, no?

Also, note that you don't need explicit 8/16 bit atomics to implement
these. It's fine to use 32-bit atomics and only modify half the word.

Also, you might want to invest in some CPP to reduce the endless
repetition.
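
Something like this completely untested sketch, say (XCHG_GEN is a
made-up name, and the cmpxchg variants would want extra parameters for
the entry/exit barriers):

#define XCHG_GEN(type, sfx, ld, st, cl...)			\
static __always_inline unsigned long				\
__xchg_##type##sfx(volatile void *p, unsigned long val)		\
{								\
	unsigned long prev;					\
								\
	__asm__ __volatile__(					\
"1:	" ld "	%0,0,%2\n"					\
	PPC405_ERR77(0,%2)					\
"	" st "	%3,0,%2\n"					\
"	bne-	1b"						\
	: "=&r" (prev), "+m" (*(volatile type *)p)		\
	: "r" (p), "r" (val)					\
	: cl);							\
								\
	return prev;						\
}

XCHG_GEN(u8,  _local,   "lbarx", "stbcx.", "cc", "memory")
XCHG_GEN(u16, _local,   "lharx", "sthcx.", "cc", "memory")
XCHG_GEN(u8,  _relaxed, "lbarx", "stbcx.", "cc")
XCHG_GEN(u16, _relaxed, "lharx", "sthcx.", "cc")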

Other than that, no objections :-)


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-08  7:47 ` Peter Zijlstra
@ 2016-04-10 14:17   ` Pan Xinhui
  2016-04-12 14:30     ` Peter Zijlstra
  0 siblings, 1 reply; 9+ messages in thread
From: Pan Xinhui @ 2016-04-10 14:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linuxppc-dev, linux-kernel, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Boqun Feng, Thomas Gleixner


On 2016-04-08 15:47, Peter Zijlstra wrote:
> On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
>> From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> Atomic operations on 8-bit and 16-bit data types are supported from POWER7.
> 
> And yes I see nothing P7 specific here, this implementation is for
> everything PPC64 afaict, no?
> 
Hello Peter,
	No, it's not for every ppc. So yes, I need to add an #ifdef here. Thanks for pointing it out.
We might need a new config option and let it depend on POWER7/POWER8_CPU or even POWER9...

> Also, note that you don't need explicit 8/16 bit atomics to implement
> these. It's fine to use 32-bit atomics and only modify half the word.
> 
That is true. But I am a little worried about the performance. It will forbid any other task from touching the other half of the word during the load/reserve, right?
I am working on the qspinlock implementation on PPC.
Your and Waiman's patches are so nice. :)

> Also, you might want to invest in some CPP to reduce the endless
> repetition.
> 
Will do that. Thanks for your tips.

thanks
xinhui
> Other than that, no objections :-)
> 


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-10 14:17   ` Pan Xinhui
@ 2016-04-12 14:30     ` Peter Zijlstra
  2016-04-13 11:15       ` Pan Xinhui
  0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2016-04-12 14:30 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: linuxppc-dev, linux-kernel, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Boqun Feng, Thomas Gleixner

On Sun, Apr 10, 2016 at 10:17:28PM +0800, Pan Xinhui wrote:
> 
> On 2016-04-08 15:47, Peter Zijlstra wrote:
> > On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
> >> From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
> >>
> >> Implement xchg{u8,u16}{local,relaxed}, and
> >> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> >>
> >> Atomic operations on 8-bit and 16-bit data types are supported from POWER7.
> > 
> > And yes I see nothing P7 specific here, this implementation is for
> > everything PPC64 afaict, no?
> > 
> Hello Peter,
> 	No, it's not for every ppc. So yes, I need to add an #ifdef here. Thanks for pointing it out.
> We might need a new config option and let it depend on POWER7/POWER8_CPU or even POWER9...

Right, I'm not sure if PPC has alternatives, but you could of course
runtime patch the code from emulated with 32bit ll/sc to native 8/16bit
ll/sc if present on the current CPU if you have infrastructure for these
things.

> > Also, note that you don't need explicit 8/16 bit atomics to implement
> > these. It's fine to use 32-bit atomics and only modify half the word.
> > 
> That is true. But I am a little worried about the performance. It will
> forbid any other task from touching the other half of the word during
> the load/reserve, right?

Well, not forbid, it would just make the LL/SC fail and try again. Other
archs already implement them this way. See commit 3226aad81aa6 ("sh:
support 1 and 2 byte xchg") for example.
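
In C, that word-based emulation looks roughly like this (a sketch built
on a cmpxchg() loop rather than raw ll/sc; __xchg_u8_emulated is an
illustrative name, not an existing kernel function):

static inline u8 __xchg_u8_emulated(u8 *p, u8 new)
{
	/* The aligned 32-bit word that contains our byte. */
	u32 *wp = (u32 *)((unsigned long)p & ~3UL);
	int shift = ((unsigned long)p & 3UL) * 8;
	u32 mask, old, word;

#ifdef __BIG_ENDIAN__
	shift = 24 - shift;	/* byte significance is reversed on BE */
#endif
	mask = 0xffU << shift;

	do {
		old = READ_ONCE(*wp);
		/* Replace only our byte; a racing store to the other
		 * bytes makes the cmpxchg fail and we simply retry. */
		word = (old & ~mask) | ((u32)new << shift);
	} while (cmpxchg_relaxed(wp, old, word) != old);

	return (old & mask) >> shift;
}

A fully ordered xchg() built this way would use cmpxchg() instead of
cmpxchg_relaxed(), or add the usual entry/exit barriers around the loop.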

> I am working on the qspinlock implementation on PPC.
> Your and Waiman's patches are so nice. :)

Thanks!, last time I looked at PPC spinlocks they could not use things
like ticket locks because PPC might be a guest and fairness blows etc..

You're making the qspinlock-paravirt thing work on PPC, or doing
qspinlock only for bare-metal PPC?


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-12 14:30     ` Peter Zijlstra
@ 2016-04-13 11:15       ` Pan Xinhui
  2016-04-13 15:53         ` Waiman Long
  2016-04-16 19:43         ` Arnd Bergmann
  0 siblings, 2 replies; 9+ messages in thread
From: Pan Xinhui @ 2016-04-13 11:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linuxppc-dev, linux-kernel, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Boqun Feng, Thomas Gleixner

Hello Peter,

On 2016-04-12 22:30, Peter Zijlstra wrote:
> On Sun, Apr 10, 2016 at 10:17:28PM +0800, Pan Xinhui wrote:
>>
>> On 2016-04-08 15:47, Peter Zijlstra wrote:
>>> On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
>>>> From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
>>>>
>>>> Implement xchg{u8,u16}{local,relaxed}, and
>>>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>>>
>>>> Atomic operations on 8-bit and 16-bit data types are supported from POWER7.
>>>
>>> And yes I see nothing P7 specific here, this implementation is for
>>> everything PPC64 afaict, no?
>>>
>> Hello Peter,
>> 	No, it's not for every ppc. So yes, I need to add an #ifdef here. Thanks for pointing it out.
>> We might need a new config option and let it depend on POWER7/POWER8_CPU or even POWER9...
> 
> Right, I'm not sure if PPC has alternatives, but you could of course
> runtime patch the code from emulated with 32bit ll/sc to native 8/16bit
> ll/sc if present on the current CPU if you have infrastructure for these
> things.
> 
Seems interesting. I have no idea how to runtime patch the code; I will try to learn that.
If so, do we need to change {cmp}xchg into out-of-line functions?

>>> Also, note that you don't need explicit 8/16 bit atomics to implement
>>> these. It's fine to use 32-bit atomics and only modify half the word.
>>>
>> That is true. But I am a little worried about the performance. It will
>> forbid any other task from touching the other half of the word during
>> the load/reserve, right?
> 
> Well, not forbid, it would just make the LL/SC fail and try again. Other
> archs already implement them this way. See commit 3226aad81aa6 ("sh:
> support 1 and 2 byte xchg") for example.
> 
Thanks for your explanation. :)

I wrote a similar patch, as you suggested.

I paste the new __xchg_u8's alpha implementation here; it needs a rewrite to be more readable.
It does work, but some performance tests are needed later.

static __always_inline unsigned long
__xchg_u8_local(volatile void *p, unsigned char val)
{
        unsigned int prev, prev_mask, tmp, offset, _val, *_p;

        /* The aligned 32-bit word containing the byte, and the byte's
         * bit offset within that word. */
        _p = (unsigned int *)round_down((unsigned long)p, sizeof(int));
        _val = val;
        offset = 8 * ((unsigned long)p - (unsigned long)_p);
#ifndef CONFIG_CPU_LITTLE_ENDIAN
        /* Byte significance within the word is reversed on big-endian. */
        offset = 8 * (sizeof(int) - sizeof(__typeof__(val))) - offset;
#endif
        _val <<= offset;
        /* Mask that keeps every byte of the word except ours. */
        prev_mask = ~((unsigned int)(__typeof__(val))-1 << offset);

        __asm__ __volatile__(
"1:     lwarx   %0,0,%3\n"
"       and     %1,%0,%5\n"
"       or      %1,%1,%4\n"
        PPC405_ERR77(0,%2)
"       stwcx.  %1,0,%3\n"
"       bne-    1b"
        : "=&r" (prev), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
        : "r" (_p), "r" (_val), "r" (prev_mask)
        : "cc", "memory");

        return prev >> offset;
}

>> I am working on the qspinlock implementation on PPC.
>> Your and Waiman's patches are so nice. :)
> 
> Thanks!, last time I looked at PPC spinlocks they could not use things
> like ticket locks because PPC might be a guest and fairness blows etc..
> 
> You're making the qspinlock-paravirt thing work on PPC, or doing
> qspinlock only for bare-metal PPC?
> 
I am making both work. :)
qspinlock works on PPC now. I am preparing the patches and will send them out in the next weeks. :)

The paravirt work is a little hard.
Currently, there are pv_wait() and pv_kick(), but only pv_kick() takes the parameter cpu (who will hold the lock as soon as the lock is unlocked).
We need a cpu parameter (who holds the lock now) in pv_wait(), too.
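
That is, roughly (the third parameter is the hypothetical addition; it
does not exist in today's hooks):

	/* today's hooks */
	void pv_wait(u8 *ptr, u8 val);
	void pv_kick(int cpu);

	/* what PPC would want (hypothetical) */
	void pv_wait(u8 *ptr, u8 val, int lock_holder_cpu);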

thanks
xinhui


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-13 11:15       ` Pan Xinhui
@ 2016-04-13 15:53         ` Waiman Long
  2016-04-14  8:31           ` Pan Xinhui
  2016-04-16 19:43         ` Arnd Bergmann
  1 sibling, 1 reply; 9+ messages in thread
From: Waiman Long @ 2016-04-13 15:53 UTC (permalink / raw)
  To: Pan Xinhui
  Cc: Peter Zijlstra, linuxppc-dev, linux-kernel,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Boqun Feng, Thomas Gleixner

On 04/13/2016 07:15 AM, Pan Xinhui wrote:
> Hello Peter,
>
> On 2016-04-12 22:30, Peter Zijlstra wrote:
>
>>> I am working on the qspinlock implementation on PPC.
>>> Your and Waiman's patches are so nice. :)
>> Thanks!, last time I looked at PPC spinlocks they could not use things
>> like ticket locks because PPC might be a guest and fairness blows etc..
>>
>> You're making the qspinlock-paravirt thing work on PPC, or doing
>> qspinlock only for bare-metal PPC?
>>
> I am making both work. :)
> qspinlock works on PPC now. I am preparing the patches and will send them out in the next weeks. :)

What kind of performance improvement are you seeing on PPC?

> The paravirt work is a little hard.
> Currently, there are pv_wait() and pv_kick(), but only pv_kick() takes the parameter cpu (who will hold the lock as soon as the lock is unlocked).
> We need a cpu parameter (who holds the lock now) in pv_wait(), too.

That can be done to a certain extent. However, if the current lock
holder acquired the lock via the fastpath, the CPU information is not
logged anywhere. For a contended lock, the information should be there.

Cheers,
Longman


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-13 15:53         ` Waiman Long
@ 2016-04-14  8:31           ` Pan Xinhui
  0 siblings, 0 replies; 9+ messages in thread
From: Pan Xinhui @ 2016-04-14  8:31 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, linuxppc-dev, linux-kernel,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Boqun Feng, Thomas Gleixner

Hello, Waiman

On 2016-04-13 23:53, Waiman Long wrote:
> On 04/13/2016 07:15 AM, Pan Xinhui wrote:
>> Hello Peter,
>>
>> On 2016-04-12 22:30, Peter Zijlstra wrote:
>>
>>>> I am working on the qspinlock implementation on PPC.
>>>> Your and Waiman's patches are so nice. :)
>>> Thanks!, last time I looked at PPC spinlocks they could not use things
>>> like ticket locks because PPC might be a guest and fairness blows etc..
>>>
>>> You're making the qspinlock-paravirt thing work on PPC, or doing
>>> qspinlock only for bare-metal PPC?
>>>
>> I am making both work. :)
>> qspinlock works on PPC now. I am preparing the patches and will send them out in the next weeks. :)
> 
> What kind of performance improvement are you seeing on PPC?
> 
Well, not good. I wrote a small benchmark which just increments an integer with the spinlock held.
The overhead of the lock itself is high, but the fairness is good.
I only ran the tests in a guest OS, where qspinlock does not make use of paravirt but spinlock does, so the performance gap is a little big for now.
Looks like I need to change the kernel config and re-test.

I have not measured the real-world system impact yet. Let's see the kernel build times with the two different locks.
If possible, could you share how you do the performance tests?

>> The paravirt work is a little hard.
>> Currently, there are pv_wait() and pv_kick(), but only pv_kick() takes the parameter cpu (who will hold the lock as soon as the lock is unlocked).
>> We need a cpu parameter (who holds the lock now) in pv_wait(), too.
> 
> That can be done to a certain extent. However, if the current lock holder acquired the lock via the fastpath, the CPU information is not logged anywhere. For a contended lock, the information should be there.
> 
Yes. Maybe we could use a hashtable. We could put the lock and the lock holder in pv_node, too. :)
Just my thoughts.
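
Something like this rough sketch, perhaps (holder_cpu is the
hypothetical addition; the other fields only follow the general shape
of the upstream pv hash entry):

struct pv_hash_entry {
	struct qspinlock	*lock;
	struct pv_node		*node;
	int			holder_cpu;	/* set by the holder at acquire time */
};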

thanks
xinhui

> Cheers, 
> Longman
> 


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-13 11:15       ` Pan Xinhui
  2016-04-13 15:53         ` Waiman Long
@ 2016-04-16 19:43         ` Arnd Bergmann
  2016-04-18 10:19           ` Pan Xinhui
  1 sibling, 1 reply; 9+ messages in thread
From: Arnd Bergmann @ 2016-04-16 19:43 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Pan Xinhui, Peter Zijlstra, Boqun Feng, linux-kernel,
	Paul Mackerras, Thomas Gleixner

On Wednesday 13 April 2016 19:15:17 Pan Xinhui wrote:
> Hello Peter,
> 
> On 2016-04-12 22:30, Peter Zijlstra wrote:
> > On Sun, Apr 10, 2016 at 10:17:28PM +0800, Pan Xinhui wrote:
> >>
> >> On 2016-04-08 15:47, Peter Zijlstra wrote:
> >>> On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
> >>>> From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
> >>>>
> >>>> Implement xchg{u8,u16}{local,relaxed}, and
> >>>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> >>>>
> >>>> Atomic operations on 8-bit and 16-bit data types are supported from POWER7.
> >>>
> >>> And yes I see nothing P7 specific here, this implementation is for
> >>> everything PPC64 afaict, no?
> >>>
> >> Hello Peter,
> >> 	No, it's not for every ppc. So yes, I need to add an #ifdef here. Thanks for pointing it out.
> >> We might need a new config option and let it depend on POWER7/POWER8_CPU or even POWER9...
> > 
> > Right, I'm not sure if PPC has alternatives, but you could of course
> > runtime patch the code from emulated with 32bit ll/sc to native 8/16bit
> > ll/sc if present on the current CPU if you have infrastructure for these
> > things.
> > 
> Seems interesting. I have no idea how to runtime patch the code; I will try to learn that.
> If so, do we need to change {cmp}xchg into out-of-line functions?

I think you don't need to; see do_feature_fixups()/patch_feature_section().

Note that an #ifdef by itself has to worry about any combination of
architectures, so in a kernel that has both POWER6 and POWER7 enabled,
you cannot call the POWER7-only function.
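
At the C level the same runtime selection can be written without any
asm patching; a sketch, assuming CPU_FTR_ARCH_206 is the right gate for
lbarx/lharx, with __xchg_u8_emulated as a placeholder for a 32-bit
lwarx/stwcx. (or cmpxchg-based) fallback like the one sketched earlier:

static __always_inline unsigned long
__xchg_u8_local_any(volatile void *p, unsigned long val)
{
	/* Decided per CPU type at runtime rather than by #ifdef. */
	if (cpu_has_feature(CPU_FTR_ARCH_206))		/* POWER7 and later */
		return __xchg_u8_local(p, val);		/* native lbarx/stbcx. */
	return __xchg_u8_emulated(p, val);
}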

	Arnd


* Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16
  2016-04-16 19:43         ` Arnd Bergmann
@ 2016-04-18 10:19           ` Pan Xinhui
  0 siblings, 0 replies; 9+ messages in thread
From: Pan Xinhui @ 2016-04-18 10:19 UTC (permalink / raw)
  To: Arnd Bergmann, linuxppc-dev
  Cc: Peter Zijlstra, Boqun Feng, linux-kernel, Paul Mackerras,
	Thomas Gleixner



On 2016-04-17 03:43, Arnd Bergmann wrote:
> On Wednesday 13 April 2016 19:15:17 Pan Xinhui wrote:
>> Hello Peter,
>>
>> On 2016-04-12 22:30, Peter Zijlstra wrote:
>>> On Sun, Apr 10, 2016 at 10:17:28PM +0800, Pan Xinhui wrote:
>>>>
>>>> On 2016-04-08 15:47, Peter Zijlstra wrote:
>>>>> On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
>>>>>> From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
>>>>>>
>>>>>> Implement xchg{u8,u16}{local,relaxed}, and
>>>>>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>>>>>
>>>>>> Atomic operations on 8-bit and 16-bit data types are supported from POWER7.
>>>>>
>>>>> And yes I see nothing P7 specific here, this implementation is for
>>>>> everything PPC64 afaict, no?
>>>>>
>>>> Hello Peter,
>>>> 	No, it's not for every ppc. So yes, I need to add an #ifdef here. Thanks for pointing it out.
>>>> We might need a new config option and let it depend on POWER7/POWER8_CPU or even POWER9...
>>>
>>> Right, I'm not sure if PPC has alternatives, but you could of course
>>> runtime patch the code from emulated with 32bit ll/sc to native 8/16bit
>>> ll/sc if present on the current CPU if you have infrastructure for these
>>> things.
>>>
>> Seems interesting. I have no idea how to runtime patch the code; I will try to learn that.
>> If so, do we need to change {cmp}xchg into out-of-line functions?
> 
> I think you don't need to; see do_feature_fixups()/patch_feature_section().
> 
Hello, Arnd
	Thanks for your tips. :) I will take a look at them.
This time I will make generic functions for all ppc. But in the future, we can runtime patch the code.

> Note that an #ifdef by itself has to worry about any combination of
> architectures, so in a kernel that has both POWER6 and POWER7 enabled,
> you cannot call the POWER7-only function.
> 
Seems you are right.
Though I think it's not a good idea to enable several CPU types; just select the minimum supported CPU in the real world. :)

thanks
xinhui
> 	Arnd
> 

