* [RFC 0/5] atomics: powerpc: implement relaxed/acquire/release variants of some atomics
@ 2015-08-28  2:48 Boqun Feng
  2015-08-28  2:48 ` [RFC 1/5] atomics: add test for atomic operations with _relaxed variants Boqun Feng
                   ` (4 more replies)
  0 siblings, 5 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28  2:48 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: Peter Zijlstra, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long, Boqun Feng

Relaxed/acquire/release variants of atomic operations {add,sub}_return and
{cmp,}xchg are introduced by commit:

"atomics: add acquire/release/relaxed variants of some atomic operations"

which is now in the locking/core branch of the tip tree.

By default, the generic code implements a relaxed variant as the fully
ordered atomic operation, and the release/acquire variants as the relaxed
variant with the necessary general barrier placed before or after it.
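
For reference, the generic fallbacks in include/linux/atomic.h (shown in
context in patch 2) build the acquire/release variants from the relaxed
one roughly as follows:

/* acquire: relaxed op followed by a general barrier */
#define __atomic_op_acquire(op, args...)				\
({									\
	typeof(op##_relaxed(args)) __ret = op##_relaxed(args);		\
	smp_mb__after_atomic();						\
	__ret;								\
})

/* release: general barrier followed by the relaxed op */
#define __atomic_op_release(op, args...)				\
({									\
	smp_mb__before_atomic();					\
	op##_relaxed(args);						\
})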

On powerpc, which has a weak memory model, a relaxed variant can be
implemented more cheaply than a fully ordered one. Furthermore, the
release and acquire variants can be implemented with arch-specific
lightweight barriers.

Therefore this patchset implements the relaxed/acquire/release variants
based on the powerpc memory model and its specific barriers. A trivial
test for these new variants is also included in this series: some of
these variants are not used in the kernel yet, and I would like their
code to be generated somewhere at least.

The patchset consists of 5 parts:

1.	add trivial tests for the new variants in lib/atomic64_test.c

2.	introduce arch_atomic_op_*() helpers as the arch-specific helpers
	to build other variants based on relaxed.

3.	implement atomic{,64}_{add,sub}_return_* variants

4.	implement xchg_* and atomic{,64}_xchg_* variants

5.	implement cmpxchg{,64}_* and atomic{,64}_cmpxchg_* variants

This patchset is based on the locking/core branch of the tip tree; all
patches are build- and boot-tested on LE pseries.

Regards,
Boqun

-- 
2.5.0


^ permalink raw reply	[flat|nested] 32+ messages in thread

* [RFC 1/5] atomics: add test for atomic operations with _relaxed variants
  2015-08-28  2:48 [RFC 0/5] atomics: powerpc: implement relaxed/acquire/release variants of some atomics Boqun Feng
@ 2015-08-28  2:48 ` Boqun Feng
  2015-08-28  2:48   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire, release, fence} helpers Boqun Feng
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28  2:48 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: Peter Zijlstra, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long, Boqun Feng

Some atomic operations now have _{relaxed, acquire, release} variants.
This patch adds some trivial tests for two purposes:

1.	test the behavior of these new operations in a single-CPU
	environment.
2.	have their code generated before we actually use them anywhere,
	so that we can examine their assembly code (an example expansion
	is sketched below).
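
For illustration, and ignoring the do/while wrappers, a single
TEST_RETURN(, add_return, +=, onestwos) from the family below expands
roughly to:

	atomic_set(&v, v0);
	r = v0;
	r += onestwos;
	BUG_ON(atomic_add_return(onestwos, &v) != r);
	BUG_ON(atomic_read(&v) != r);

TEST_RETURN_FAMILY() then repeats this for atomic_add_return_acquire(),
atomic_add_return_release() and atomic_add_return_relaxed().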

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 lib/atomic64_test.c | 91 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 59 insertions(+), 32 deletions(-)

diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 83c33a5b..0484437 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -27,6 +27,50 @@ do {								\
 		(unsigned long long)r);				\
 } while (0)
 
+#define TEST_RETURN(bit, op, c_op, val)				\
+do {								\
+	atomic##bit##_set(&v, v0);				\
+	r = v0;							\
+	r c_op val;						\
+	BUG_ON(atomic##bit##_##op(val, &v) != r);		\
+	BUG_ON(atomic##bit##_read(&v) != r);			\
+} while (0)
+
+#define TEST_RETURN_FAMILY(bit, op, c_op, val)			\
+do {								\
+	TEST_RETURN(bit, op, c_op, val);			\
+	TEST_RETURN(bit, op##_acquire, c_op, val);		\
+	TEST_RETURN(bit, op##_release, c_op, val);		\
+	TEST_RETURN(bit, op##_relaxed, c_op, val);		\
+} while (0)
+
+#define TEST_ARGS(bit, op, init, ret, expect, args...)		\
+do {								\
+	atomic##bit##_set(&v, init);				\
+	BUG_ON(atomic##bit##_##op(&v, args) != ret);		\
+	BUG_ON(atomic##bit##_read(&v) != expect);		\
+} while (0)
+
+#define TEST_XCHG_FAMILY(bit, init, new)			\
+do {								\
+	TEST_ARGS(bit, xchg, init, init, new, new);		\
+	TEST_ARGS(bit, xchg_acquire, init, init, new, new);	\
+	TEST_ARGS(bit, xchg_release, init, init, new, new);	\
+	TEST_ARGS(bit, xchg_relaxed, init, init, new, new);	\
+} while (0)
+
+#define TEST_CMPXCHG_FAMILY(bit, init, new, wrong)			\
+do {									\
+	TEST_ARGS(bit, cmpxchg, init, init, new, init, new);		\
+	TEST_ARGS(bit, cmpxchg, init, init, init, wrong, new);		\
+	TEST_ARGS(bit, cmpxchg_acquire, init, init, new, init, new);	\
+	TEST_ARGS(bit, cmpxchg_acquire, init, init, init, wrong, new);	\
+	TEST_ARGS(bit, cmpxchg_release, init, init, new, init, new);	\
+	TEST_ARGS(bit, cmpxchg_release, init, init, init, wrong, new);	\
+	TEST_ARGS(bit, cmpxchg_relaxed, init, init, new, init, new);	\
+	TEST_ARGS(bit, cmpxchg_relaxed, init, init, init, wrong, new);	\
+} while (0)
+
 static __init void test_atomic(void)
 {
 	int v0 = 0xaaa31337;
@@ -45,6 +89,15 @@ static __init void test_atomic(void)
 	TEST(, and, &=, v1);
 	TEST(, xor, ^=, v1);
 	TEST(, andnot, &= ~, v1);
+
+	TEST_RETURN_FAMILY(, add_return, +=, onestwos);
+	TEST_RETURN_FAMILY(, add_return, +=, -one);
+	TEST_RETURN_FAMILY(, sub_return, -=, onestwos);
+	TEST_RETURN_FAMILY(, sub_return, -=, -one);
+
+	TEST_XCHG_FAMILY(, v0, v1);
+	TEST_CMPXCHG_FAMILY(, v0, v1, onestwos);
+
 }
 
 #define INIT(c) do { atomic64_set(&v, c); r = c; } while (0)
@@ -74,25 +127,10 @@ static __init void test_atomic64(void)
 	TEST(64, xor, ^=, v1);
 	TEST(64, andnot, &= ~, v1);
 
-	INIT(v0);
-	r += onestwos;
-	BUG_ON(atomic64_add_return(onestwos, &v) != r);
-	BUG_ON(v.counter != r);
-
-	INIT(v0);
-	r += -one;
-	BUG_ON(atomic64_add_return(-one, &v) != r);
-	BUG_ON(v.counter != r);
-
-	INIT(v0);
-	r -= onestwos;
-	BUG_ON(atomic64_sub_return(onestwos, &v) != r);
-	BUG_ON(v.counter != r);
-
-	INIT(v0);
-	r -= -one;
-	BUG_ON(atomic64_sub_return(-one, &v) != r);
-	BUG_ON(v.counter != r);
+	TEST_RETURN_FAMILY(64, add_return, +=, onestwos);
+	TEST_RETURN_FAMILY(64, add_return, +=, -one);
+	TEST_RETURN_FAMILY(64, sub_return, -=, onestwos);
+	TEST_RETURN_FAMILY(64, sub_return, -=, -one);
 
 	INIT(v0);
 	atomic64_inc(&v);
@@ -114,19 +152,8 @@ static __init void test_atomic64(void)
 	BUG_ON(atomic64_dec_return(&v) != r);
 	BUG_ON(v.counter != r);
 
-	INIT(v0);
-	BUG_ON(atomic64_xchg(&v, v1) != v0);
-	r = v1;
-	BUG_ON(v.counter != r);
-
-	INIT(v0);
-	BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0);
-	r = v1;
-	BUG_ON(v.counter != r);
-
-	INIT(v0);
-	BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0);
-	BUG_ON(v.counter != r);
+	TEST_XCHG_FAMILY(64, v0, v1);
+	TEST_CMPXCHG_FAMILY(64, v0, v1, v2);
 
 	INIT(v0);
 	BUG_ON(atomic64_add_unless(&v, one, v0));
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC 2/5] atomics: introduce arch_atomic_op_{acquire,release,fence} helpers
  2015-08-28  2:48 [RFC 0/5] atomics: powerpc: implement relaxed/acquire/release variants of some atomics Boqun Feng
@ 2015-08-28  2:48   ` Boqun Feng
  2015-08-28  2:48   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire, release, fence} helpers Boqun Feng
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28  2:48 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: Peter Zijlstra, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long, Boqun Feng

Some architectures have their own special barriers for acquire, release
and fence semantics, for which the general memory barriers
(smp_mb__*_atomic()) in __atomic_op_*() may be too strong. Introduce
arch_atomic_op_*() helpers so that such architectures can provide their
own versions and use the same framework to build the different variants
on top of the _relaxed variants.
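
As a sketch of how an architecture would plug into this (the actual
powerpc version follows in a later patch of this series), the arch would
provide something like the following in its asm/atomic.h;
arch_acquire_barrier()/arch_release_barrier() are made-up names standing
for whatever lightweight barriers the architecture has:

#define arch_atomic_op_acquire(op, args...)				\
({									\
	typeof(op##_relaxed(args)) __ret = op##_relaxed(args);		\
	arch_acquire_barrier();	/* lighter than smp_mb__after_atomic() */ \
	__ret;								\
})

#define arch_atomic_op_release(op, args...)				\
({									\
	arch_release_barrier();	/* lighter than smp_mb__before_atomic() */ \
	op##_relaxed(args);						\
})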

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 include/linux/atomic.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 00a5763..622255b 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -34,20 +34,33 @@
  * The idea here is to build acquire/release variants by adding explicit
  * barriers on top of the relaxed variant. In the case where the relaxed
  * variant is already fully ordered, no additional barriers are needed.
+ *
+ * Besides, if an arch has a special barrier for acquire/release, it could
+ * implement its own arch_atomic_op_* and use the same framework for building
+ * variants
  */
+#ifndef arch_atomic_op_acquire
 #define __atomic_op_acquire(op, args...)				\
 ({									\
 	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
 	smp_mb__after_atomic();						\
 	__ret;								\
 })
+#else
+#define __atomic_op_acquire arch_atomic_op_acquire
+#endif
 
+#ifndef arch_atomic_op_release
 #define __atomic_op_release(op, args...)				\
 ({									\
 	smp_mb__before_atomic();					\
 	op##_relaxed(args);						\
 })
+#else
+#define __atomic_op_release arch_atomic_op_release
+#endif
 
+#ifndef arch_atomic_op_fence
 #define __atomic_op_fence(op, args...)					\
 ({									\
 	typeof(op##_relaxed(args)) __ret;				\
@@ -56,6 +69,9 @@
 	smp_mb__after_atomic();						\
 	__ret;								\
 })
+#else
+#define __atomic_op_fence arch_atomic_op_fence
+#endif
 
 /* atomic_add_return_relaxed */
 #ifndef atomic_add_return_relaxed
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-08-28  2:48 [RFC 0/5] atomics: powerpc: implement relaxed/acquire/release variants of some atomics Boqun Feng
@ 2015-08-28  2:48   ` Boqun Feng
  2015-08-28  2:48   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire, release, fence} helpers Boqun Feng
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28  2:48 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: Peter Zijlstra, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long, Boqun Feng

On powerpc, we don't need a general memory barrier to achieve acquire and
release semantics, so arch_atomic_op_{acquire,release} can be implemented
using "lwsync" and "isync".

For release semantics, we only need to ensure that all memory accesses
issued before the atomic take effect before the -store- part of the
atomic, so "lwsync" is all we need. On platforms without "lwsync",
"sync" should be used instead; therefore smp_lwsync() is used here.

For acquire semantics, "lwsync" is likewise all we need. However, on
platforms without "lwsync" we can use "isync" rather than "sync" as the
acquire barrier, so a new barrier, smp_acquire_barrier__after_atomic(),
is introduced: it is barrier() on UP, "lwsync" if available and "isync"
otherwise.

For fully ordered semantics, as in the original operations, smp_lwsync()
is placed before the relaxed variant and smp_mb__after_atomic() after it.
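
Put together, the acquire variant generated for powerpc by the
__atomic_op_acquire() wrapper behaves roughly like the sketch below (the
real code is produced by the macros rather than written out this way):

	static inline int atomic_add_return_acquire(int a, atomic_t *v)
	{
		/* lwarx/add/stwcx./bne- loop, no entry/exit barriers */
		int t = atomic_add_return_relaxed(a, v);

		/* lwsync, or isync without lwsync; barrier() on UP */
		smp_acquire_barrier__after_atomic();
		return t;
	}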

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 arch/powerpc/include/asm/atomic.h | 88 ++++++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 55f106e..806ce50 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -12,6 +12,39 @@
 
 #define ATOMIC_INIT(i)		{ (i) }
 
+/*
+ * Since {add,sub}_return_relaxed and xchg_relaxed are implemented with
+ * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
+ * on the platform without lwsync.
+ */
+#ifdef CONFIG_SMP
+#define smp_acquire_barrier__after_atomic() \
+	__asm__ __volatile__(PPC_ACQUIRE_BARRIER : : : "memory")
+#else
+#define smp_acquire_barrier__after_atomic() barrier()
+#endif
+#define arch_atomic_op_acquire(op, args...)				\
+({									\
+	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
+	smp_acquire_barrier__after_atomic();				\
+	__ret;								\
+})
+
+#define arch_atomic_op_release(op, args...)				\
+({									\
+	smp_lwsync();							\
+	op##_relaxed(args);						\
+})
+
+#define arch_atomic_op_fence(op, args...)				\
+({									\
+	typeof(op##_relaxed(args)) __ret;				\
+	smp_lwsync();							\
+	__ret = op##_relaxed(args);					\
+	smp_mb__after_atomic();						\
+	__ret;								\
+})
+
 static __inline__ int atomic_read(const atomic_t *v)
 {
 	int t;
@@ -42,27 +75,27 @@ static __inline__ void atomic_##op(int a, atomic_t *v)			\
 	: "cc");							\
 }									\
 
-#define ATOMIC_OP_RETURN(op, asm_op)					\
-static __inline__ int atomic_##op##_return(int a, atomic_t *v)		\
+#define ATOMIC_OP_RETURN_RELAXED(op, asm_op)				\
+static inline int atomic_##op##_return_relaxed(int a, atomic_t *v)	\
 {									\
 	int t;								\
 									\
 	__asm__ __volatile__(						\
-	PPC_ATOMIC_ENTRY_BARRIER					\
-"1:	lwarx	%0,0,%2		# atomic_" #op "_return\n"		\
-	#asm_op " %0,%1,%0\n"						\
-	PPC405_ERR77(0,%2)						\
-"	stwcx.	%0,0,%2 \n"						\
+"1:	lwarx	%0,0,%3		# atomic_" #op "_return_relaxed\n"	\
+	#asm_op " %0,%2,%0\n"						\
+	PPC405_ERR77(0, %3)						\
+"	stwcx.	%0,0,%3\n"						\
 "	bne-	1b\n"							\
-	PPC_ATOMIC_EXIT_BARRIER						\
-	: "=&r" (t)							\
+	: "=&r" (t), "+m" (v->counter)					\
 	: "r" (a), "r" (&v->counter)					\
-	: "cc", "memory");						\
+	: "cc");							\
 									\
 	return t;							\
 }
 
-#define ATOMIC_OPS(op, asm_op) ATOMIC_OP(op, asm_op) ATOMIC_OP_RETURN(op, asm_op)
+#define ATOMIC_OPS(op, asm_op)						\
+	ATOMIC_OP(op, asm_op)						\
+	ATOMIC_OP_RETURN_RELAXED(op, asm_op)
 
 ATOMIC_OPS(add, add)
 ATOMIC_OPS(sub, subf)
@@ -71,8 +104,11 @@ ATOMIC_OP(and, and)
 ATOMIC_OP(or, or)
 ATOMIC_OP(xor, xor)
 
+#define atomic_add_return_relaxed atomic_add_return_relaxed
+#define atomic_sub_return_relaxed atomic_sub_return_relaxed
+
 #undef ATOMIC_OPS
-#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP_RETURN_RELAXED
 #undef ATOMIC_OP
 
 #define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
@@ -285,26 +321,27 @@ static __inline__ void atomic64_##op(long a, atomic64_t *v)		\
 	: "cc");							\
 }
 
-#define ATOMIC64_OP_RETURN(op, asm_op)					\
-static __inline__ long atomic64_##op##_return(long a, atomic64_t *v)	\
+#define ATOMIC64_OP_RETURN_RELAXED(op, asm_op)				\
+static inline long							\
+atomic64_##op##_return_relaxed(long a, atomic64_t *v)			\
 {									\
 	long t;								\
 									\
 	__asm__ __volatile__(						\
-	PPC_ATOMIC_ENTRY_BARRIER					\
-"1:	ldarx	%0,0,%2		# atomic64_" #op "_return\n"		\
-	#asm_op " %0,%1,%0\n"						\
-"	stdcx.	%0,0,%2 \n"						\
+"1:	ldarx	%0,0,%3		# atomic64_" #op "_return_relaxed\n"	\
+	#asm_op " %0,%2,%0\n"						\
+"	stdcx.	%0,0,%3\n"						\
 "	bne-	1b\n"							\
-	PPC_ATOMIC_EXIT_BARRIER						\
-	: "=&r" (t)							\
+	: "=&r" (t), "+m" (v->counter)					\
 	: "r" (a), "r" (&v->counter)					\
-	: "cc", "memory");						\
+	: "cc");							\
 									\
 	return t;							\
 }
 
-#define ATOMIC64_OPS(op, asm_op) ATOMIC64_OP(op, asm_op) ATOMIC64_OP_RETURN(op, asm_op)
+#define ATOMIC64_OPS(op, asm_op)					\
+	ATOMIC64_OP(op, asm_op)						\
+	ATOMIC64_OP_RETURN_RELAXED(op, asm_op)
 
 ATOMIC64_OPS(add, add)
 ATOMIC64_OPS(sub, subf)
@@ -312,8 +349,11 @@ ATOMIC64_OP(and, and)
 ATOMIC64_OP(or, or)
 ATOMIC64_OP(xor, xor)
 
-#undef ATOMIC64_OPS
-#undef ATOMIC64_OP_RETURN
+#define atomic64_add_return_relaxed atomic64_add_return_relaxed
+#define atomic64_sub_return_relaxed atomic64_sub_return_relaxed
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN_RELAXED
 #undef ATOMIC64_OP
 
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC 4/5] powerpc: atomic: implement xchg_* and atomic{,64}_xchg_* variants
  2015-08-28  2:48 [RFC 0/5] atomics: powerpc: implement relaxed/acquire/release variants of some atomics Boqun Feng
@ 2015-08-28  2:48   ` Boqun Feng
  2015-08-28  2:48   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire, release, fence} helpers Boqun Feng
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28  2:48 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: Peter Zijlstra, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long, Boqun Feng

Implement xchg_relaxed and define atomic{,64}_xchg_relaxed in terms of
it; the release/acquire variants can then be built on top of these
_relaxed variants.

Note that xchg_relaxed and atomic{,64}_xchg_relaxed are not compiler
barriers.
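
The acquire/release variants are then built by the generic layer on top
of the relaxed implementation, roughly along these lines (mirroring the
__atomic_op_*() wrappers from patch 2):

	#define xchg_acquire(ptr, x)	__atomic_op_acquire(xchg, (ptr), (x))
	#define xchg_release(ptr, x)	__atomic_op_release(xchg, (ptr), (x))

so on powerpc xchg_acquire() ends up as the relaxed ll/sc loop followed
by the acquire barrier, and xchg_release() as smp_lwsync() followed by
the loop.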

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 arch/powerpc/include/asm/atomic.h  |  2 ++
 arch/powerpc/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 806ce50..4965dcf 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -193,6 +193,7 @@ static __inline__ int atomic_dec_return(atomic_t *v)
 
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+#define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
 /**
  * __atomic_add_unless - add unless the number is a given value
@@ -461,6 +462,7 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 
 #define atomic64_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
+#define atomic64_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
 /**
  * atomic64_add_unless - add unless the number is a given value
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index ad6263c..66374f4 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -54,6 +54,32 @@ __xchg_u32_local(volatile void *p, unsigned long val)
 	return prev;
 }
 
+/*
+ * Atomic exchange relaxed
+ *
+ * Changes the memory location '*p' to be val and returns
+ * the previous value stored there.
+ *
+ * Note that this is not a compiler barrier, there is no order
+ * guarantee around.
+ */
+static __always_inline unsigned long
+__xchg_u32_relaxed(u32 *p, unsigned long val)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%2\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (val)
+	: "cc");
+
+	return prev;
+}
+
 #ifdef CONFIG_PPC64
 static __always_inline unsigned long
 __xchg_u64(volatile void *p, unsigned long val)
@@ -90,6 +116,23 @@ __xchg_u64_local(volatile void *p, unsigned long val)
 
 	return prev;
 }
+
+static __always_inline unsigned long
+__xchg_u64_relaxed(u64 *p, unsigned long val)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__(
+"1:	ldarx	%0,0,%2\n"
+	PPC405_ERR77(0, %2)
+"	stdcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (val)
+	: "cc");
+
+	return prev;
+}
 #endif
 
 /*
@@ -127,6 +170,21 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 	__xchg_called_with_bad_pointer();
 	return x;
 }
+
+static __always_inline unsigned long
+__xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
+{
+	switch (size) {
+	case 4:
+		return __xchg_u32_relaxed(ptr, x);
+#ifdef CONFIG_PPC64
+	case 8:
+		return __xchg_u64_relaxed(ptr, x);
+#endif
+	}
+	__xchg_called_with_bad_pointer();
+	return x;
+}
 #define xchg(ptr,x)							     \
   ({									     \
      __typeof__(*(ptr)) _x_ = (x);					     \
@@ -140,6 +198,12 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
      		(unsigned long)_x_, sizeof(*(ptr))); 			     \
   })
 
+#define xchg_relaxed(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_relaxed((ptr),			\
+			(unsigned long)_x_, sizeof(*(ptr)));		\
+})
 /*
  * Compare and exchange - if *p == old, set it to new,
  * and return the old value of *p.
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC 5/5] powerpc: atomic: implement cmpxchg{,64}_* and atomic{,64}_cmpxchg_* variants
  2015-08-28  2:48 [RFC 0/5] atomics: powerpc: implement relaxed/acquire/release variants of some atomics Boqun Feng
@ 2015-08-28  2:48   ` Boqun Feng
  2015-08-28  2:48   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire, release, fence} helpers Boqun Feng
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28  2:48 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev
  Cc: Peter Zijlstra, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long, Boqun Feng

Unlike the other atomic operation variants, cmpxchg{,64}_acquire and
atomic{,64}_cmpxchg_acquire do not provide acquire semantics when the
cmp part fails, so we need to implement these in assembly.

Note that cmpxchg{,64}_relaxed and atomic{,64}_cmpxchg_relaxed are not
compiler barriers.
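
For comparison, a generically built cmpxchg_acquire() would be the
relaxed cmpxchg followed by an unconditional barrier, roughly as in the
hypothetical sketch below; the assembly versions in this patch instead
branch past PPC_ACQUIRE_BARRIER when the compare fails, so the failing
path pays no barrier:

/* Hypothetical sketch of the generic construction, for comparison only */
#define generic_cmpxchg_acquire(ptr, o, n)				\
({									\
	typeof(*(ptr)) ___prev = cmpxchg_relaxed((ptr), (o), (n));	\
	smp_mb__after_atomic();	/* runs even if the compare failed */	\
	___prev;							\
})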

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 arch/powerpc/include/asm/atomic.h  |  10 +++
 arch/powerpc/include/asm/cmpxchg.h | 141 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 4965dcf..ef8d062 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -192,6 +192,11 @@ static __inline__ int atomic_dec_return(atomic_t *v)
 }
 
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_cmpxchg_relaxed(v, o, n) \
+	cmpxchg_relaxed(&((v)->counter), (o), (n))
+#define atomic_cmpxchg_acquire(v, o, n) \
+	cmpxchg_acquire(&((v)->counter), (o), (n))
+
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 #define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
@@ -461,6 +466,11 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #define atomic64_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+#define atomic64_cmpxchg_relaxed(v, o, n) \
+	cmpxchg_relaxed(&((v)->counter), (o), (n))
+#define atomic64_cmpxchg_acquire(v, o, n) \
+	cmpxchg_acquire(&((v)->counter), (o), (n))
+
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 #define atomic64_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 66374f4..f40f295 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -254,6 +254,48 @@ __cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
 	return prev;
 }
 
+static __always_inline unsigned long
+__cmpxchg_u32_relaxed(u32 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%2		# __cmpxchg_u32_relaxed\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%2		# __cmpxchg_u32_acquire\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ACQUIRE_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
 #ifdef CONFIG_PPC64
 static __always_inline unsigned long
 __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
@@ -297,6 +339,46 @@ __cmpxchg_u64_local(volatile unsigned long *p, unsigned long old,
 
 	return prev;
 }
+
+static __always_inline unsigned long
+__cmpxchg_u64_relaxed(u64 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	ldarx	%0,0,%2		# __cmpxchg_u64_relaxed\n"
+"	cmpd	0,%0,%3\n"
+"	bne-	2f\n"
+"	stdcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	ldarx	%0,0,%2		# __cmpxchg_u64_acquire\n"
+"	cmpd	0,%0,%3\n"
+"	bne-	2f\n"
+"	stdcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ACQUIRE_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
 #endif
 
 /* This function doesn't exist, so you'll get a linker error
@@ -335,6 +417,37 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	return old;
 }
 
+static __always_inline unsigned long
+__cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
+		  unsigned int size)
+{
+	switch (size) {
+	case 4:
+		return __cmpxchg_u32_relaxed(ptr, old, new);
+#ifdef CONFIG_PPC64
+	case 8:
+		return __cmpxchg_u64_relaxed(ptr, old, new);
+#endif
+	}
+	__cmpxchg_called_with_bad_pointer();
+	return old;
+}
+
+static __always_inline unsigned long
+__cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
+		  unsigned int size)
+{
+	switch (size) {
+	case 4:
+		return __cmpxchg_u32_acquire(ptr, old, new);
+#ifdef CONFIG_PPC64
+	case 8:
+		return __cmpxchg_u64_acquire(ptr, old, new);
+#endif
+	}
+	__cmpxchg_called_with_bad_pointer();
+	return old;
+}
 #define cmpxchg(ptr, o, n)						 \
   ({									 \
      __typeof__(*(ptr)) _o_ = (o);					 \
@@ -352,6 +465,23 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 				    (unsigned long)_n_, sizeof(*(ptr))); \
   })
 
+#define cmpxchg_relaxed(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),			\
+			(unsigned long)_o_, (unsigned long)_n_,		\
+			sizeof(*(ptr)));				\
+})
+
+#define cmpxchg_acquire(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_acquire((ptr),			\
+			(unsigned long)_o_, (unsigned long)_n_,		\
+			sizeof(*(ptr)));				\
+})
 #ifdef CONFIG_PPC64
 #define cmpxchg64(ptr, o, n)						\
   ({									\
@@ -363,7 +493,16 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
 	cmpxchg_local((ptr), (o), (n));					\
   })
-#define cmpxchg64_relaxed	cmpxchg64_local
+#define cmpxchg64_relaxed(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_relaxed((ptr), (o), (n));				\
+})
+#define cmpxchg64_acquire(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_acquire((ptr), (o), (n));				\
+})
 #else
 #include <asm-generic/cmpxchg-local.h>
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-08-28  2:48   ` [RFC 3/5] powerpc: atomic: implement atomic{, 64}_{add, sub}_return_* variants Boqun Feng
  (?)
@ 2015-08-28 10:48   ` Peter Zijlstra
  2015-08-28 12:06     ` Boqun Feng
  -1 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-28 10:48 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, linuxppc-dev, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long

On Fri, Aug 28, 2015 at 10:48:17AM +0800, Boqun Feng wrote:
> +/*
> + * Since {add,sub}_return_relaxed and xchg_relaxed are implemented with
> + * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
> + * on the platform without lwsync.
> + */
> +#ifdef CONFIG_SMP
> +#define smp_acquire_barrier__after_atomic() \
> +	__asm__ __volatile__(PPC_ACQUIRE_BARRIER : : : "memory")
> +#else
> +#define smp_acquire_barrier__after_atomic() barrier()
> +#endif
> +#define arch_atomic_op_acquire(op, args...)				\
> +({									\
> +	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
> +	smp_acquire_barrier__after_atomic();				\
> +	__ret;								\
> +})
> +
> +#define arch_atomic_op_release(op, args...)				\
> +({									\
> +	smp_lwsync();							\
> +	op##_relaxed(args);						\
> +})

Urgh, so this is RCpc. We were trying to get rid of that if possible.
Let's wait until that's settled before introducing more of it.

lkml.kernel.org/r/20150820155604.GB24100@arm.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 2/5] atomics: introduce arch_atomic_op_{acquire,release,fence} helpers
  2015-08-28  2:48   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire, release, fence} helpers Boqun Feng
  (?)
@ 2015-08-28 11:36   ` Peter Zijlstra
  2015-08-28 11:50     ` Boqun Feng
  -1 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-28 11:36 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, linuxppc-dev, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long

On Fri, Aug 28, 2015 at 10:48:16AM +0800, Boqun Feng wrote:
> Some architectures have their own special barriers for acquire, release
> and fence semantics, for which the general memory barriers
> (smp_mb__*_atomic()) in __atomic_op_*() may be too strong. Introduce
> arch_atomic_op_*() helpers so that such architectures can provide their
> own versions and use the same framework to build the different variants
> on top of the _relaxed variants.
> 
> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> ---
>  include/linux/atomic.h | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
> 
> diff --git a/include/linux/atomic.h b/include/linux/atomic.h
> index 00a5763..622255b 100644
> --- a/include/linux/atomic.h
> +++ b/include/linux/atomic.h
> @@ -34,20 +34,33 @@
>   * The idea here is to build acquire/release variants by adding explicit
>   * barriers on top of the relaxed variant. In the case where the relaxed
>   * variant is already fully ordered, no additional barriers are needed.
> + *
> + * Besides, if an arch has a special barrier for acquire/release, it could
> + * implement its own arch_atomic_op_* and use the same framework for building
> + * variants
>   */
> +#ifndef arch_atomic_op_acquire
>  #define __atomic_op_acquire(op, args...)				\
>  ({									\
>  	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
>  	smp_mb__after_atomic();						\
>  	__ret;								\
>  })
> +#else
> +#define __atomic_op_acquire arch_atomic_op_acquire
> +#endif

Not really a fan of this, it's not consistent with the existing #ifndef
guard style.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 2/5] atomics: introduce arch_atomic_op_{acquire,release,fence} helpers
  2015-08-28 11:36   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire,release,fence} helpers Peter Zijlstra
@ 2015-08-28 11:50     ` Boqun Feng
  0 siblings, 0 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28 11:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long

Hi Peter,

On Fri, Aug 28, 2015 at 01:36:14PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 28, 2015 at 10:48:16AM +0800, Boqun Feng wrote:
> > Some architectures have their own special barriers for acquire, release
> > and fence semantics, for which the general memory barriers
> > (smp_mb__*_atomic()) in __atomic_op_*() may be too strong. Introduce
> > arch_atomic_op_*() helpers so that such architectures can provide their
> > own versions and use the same framework to build the different variants
> > on top of the _relaxed variants.
> > 
> > Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> > ---
> >  include/linux/atomic.h | 16 ++++++++++++++++
> >  1 file changed, 16 insertions(+)
> > 
> > diff --git a/include/linux/atomic.h b/include/linux/atomic.h
> > index 00a5763..622255b 100644
> > --- a/include/linux/atomic.h
> > +++ b/include/linux/atomic.h
> > @@ -34,20 +34,33 @@
> >   * The idea here is to build acquire/release variants by adding explicit
> >   * barriers on top of the relaxed variant. In the case where the relaxed
> >   * variant is already fully ordered, no additional barriers are needed.
> > + *
> > + * Besides, if an arch has a special barrier for acquire/release, it could
> > + * implement its own arch_atomic_op_* and use the same framework for building
> > + * variants
> >   */
> > +#ifndef arch_atomic_op_acquire
> >  #define __atomic_op_acquire(op, args...)				\
> >  ({									\
> >  	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
> >  	smp_mb__after_atomic();						\
> >  	__ret;								\
> >  })
> > +#else
> > +#define __atomic_op_acquire arch_atomic_op_acquire
> > +#endif
> 
> Not really a fan of this, it's not consistent with the existing #ifndef
> guard style.

Your suggestion is that I should:

#ifndef __atomic_op_acquire 
#define __atomic_op_acquire(op, args...)
({
	...
})
#endif

... and define a powerpc-specific __atomic_op_acquire in asm/atomic.h?

Regards,
Boqun

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-08-28 10:48   ` [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants Peter Zijlstra
@ 2015-08-28 12:06     ` Boqun Feng
  2015-08-28 14:16       ` Boqun Feng
  0 siblings, 1 reply; 32+ messages in thread
From: Boqun Feng @ 2015-08-28 12:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long

Hi Peter,

On Fri, Aug 28, 2015 at 12:48:54PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 28, 2015 at 10:48:17AM +0800, Boqun Feng wrote:
> > +/*
> > + * Since {add,sub}_return_relaxed and xchg_relaxed are implemented with
> > + * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
> > + * on the platform without lwsync.
> > + */
> > +#ifdef CONFIG_SMP
> > +#define smp_acquire_barrier__after_atomic() \
> > +	__asm__ __volatile__(PPC_ACQUIRE_BARRIER : : : "memory")
> > +#else
> > +#define smp_acquire_barrier__after_atomic() barrier()
> > +#endif
> > +#define arch_atomic_op_acquire(op, args...)				\
> > +({									\
> > +	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
> > +	smp_acquire_barrier__after_atomic();				\
> > +	__ret;								\
> > +})
> > +
> > +#define arch_atomic_op_release(op, args...)				\
> > +({									\
> > +	smp_lwsync();							\
> > +	op##_relaxed(args);						\
> > +})
> 
> Urgh, so this is RCpc. We were trying to get rid of that if possible.
> Let's wait until that's settled before introducing more of it.
> 
> lkml.kernel.org/r/20150820155604.GB24100@arm.com

OK, got it. Thanks.

So I'm not going to introduce these arch specific macros, I think what I
need to implement are just _relaxed variants and cmpxchg_acquire.

Regards,
Boqun


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-08-28 12:06     ` Boqun Feng
@ 2015-08-28 14:16       ` Boqun Feng
  2015-08-28 15:39         ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Boqun Feng @ 2015-08-28 14:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long

On Fri, Aug 28, 2015 at 08:06:14PM +0800, Boqun Feng wrote:
> Hi Peter,
> 
> On Fri, Aug 28, 2015 at 12:48:54PM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 28, 2015 at 10:48:17AM +0800, Boqun Feng wrote:
> > > +/*
> > > + * Since {add,sub}_return_relaxed and xchg_relaxed are implemented with
> > > + * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
> > > + * on the platform without lwsync.
> > > + */
> > > +#ifdef CONFIG_SMP
> > > +#define smp_acquire_barrier__after_atomic() \
> > > +	__asm__ __volatile__(PPC_ACQUIRE_BARRIER : : : "memory")
> > > +#else
> > > +#define smp_acquire_barrier__after_atomic() barrier()
> > > +#endif
> > > +#define arch_atomic_op_acquire(op, args...)				\
> > > +({									\
> > > +	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
> > > +	smp_acquire_barrier__after_atomic();				\
> > > +	__ret;								\
> > > +})
> > > +
> > > +#define arch_atomic_op_release(op, args...)				\
> > > +({									\
> > > +	smp_lwsync();							\
> > > +	op##_relaxed(args);						\
> > > +})
> > 
> > Urgh, so this is RCpc. We were trying to get rid of that if possible.
> > Lets wait until that's settled before introducing more of it.
> > 
> > lkml.kernel.org/r/20150820155604.GB24100@arm.com
> 
> OK, get it. Thanks.
> 
> So I'm not going to introduce these arch specific macros, I think what I
> need to implement are just _relaxed variants and cmpxchg_acquire.

Ah.. I just read through the thread you mentioned; I might have
misunderstood you, probably because I didn't understand RCpc well..

You are saying that in a RELEASE we -might- switch from smp_lwsync() to
smp_mb() semantically, right? I guess this means we -might- switch from
RCpc to RCsc, right?

If so, I think I'd better wait until we have a conclusion on this.

Thank you for your comments!

Regards,
Boqun


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-08-28 14:16       ` Boqun Feng
@ 2015-08-28 15:39         ` Peter Zijlstra
  2015-08-28 16:59           ` Boqun Feng
  2015-09-01 19:00           ` Will Deacon
  0 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2015-08-28 15:39 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, linuxppc-dev, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long

On Fri, Aug 28, 2015 at 10:16:02PM +0800, Boqun Feng wrote:
> On Fri, Aug 28, 2015 at 08:06:14PM +0800, Boqun Feng wrote:
> > Hi Peter,
> > 
> > On Fri, Aug 28, 2015 at 12:48:54PM +0200, Peter Zijlstra wrote:
> > > On Fri, Aug 28, 2015 at 10:48:17AM +0800, Boqun Feng wrote:
> > > > +/*
> > > > + * Since {add,sub}_return_relaxed and xchg_relaxed are implemented with
> > > > + * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
> > > > + * on the platform without lwsync.
> > > > + */
> > > > +#ifdef CONFIG_SMP
> > > > +#define smp_acquire_barrier__after_atomic() \
> > > > +	__asm__ __volatile__(PPC_ACQUIRE_BARRIER : : : "memory")
> > > > +#else
> > > > +#define smp_acquire_barrier__after_atomic() barrier()
> > > > +#endif
> > > > +#define arch_atomic_op_acquire(op, args...)				\
> > > > +({									\
> > > > +	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
> > > > +	smp_acquire_barrier__after_atomic();				\
> > > > +	__ret;								\
> > > > +})
> > > > +
> > > > +#define arch_atomic_op_release(op, args...)				\
> > > > +({									\
> > > > +	smp_lwsync();							\
> > > > +	op##_relaxed(args);						\
> > > > +})
> > > 
> > > Urgh, so this is RCpc. We were trying to get rid of that if possible.
> > > Lets wait until that's settled before introducing more of it.
> > > 
> > > lkml.kernel.org/r/20150820155604.GB24100@arm.com
> > 
> > OK, get it. Thanks.
> > 
> > So I'm not going to introduce these arch specific macros, I think what I
> > need to implement are just _relaxed variants and cmpxchg_acquire.
> 
> Ah.. just read through the thread you mentioned, I might misunderstand
> you, probably because I didn't understand RCpc well..
> 
> You are saying that in a RELEASE we -might- switch from smp_lwsync() to
> smp_mb() semantically, right? I guess this means we -might- switch from
> RCpc to RCsc, right?
> 
> If so, I think I'd better to wait until we have a conclusion for this.

Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
not.

Currently PowerPC is the only arch that (can, and) does RCpc and gives a
weaker RELEASE + ACQUIRE. Only the CPU who did the ACQUIRE is guaranteed
to see the stores of the CPU which did the RELEASE in order.

As it stands, RCU is the only _known_ codebase where this matters, but
we did in fact write code for a fair number of years 'assuming' RELEASE
+ ACQUIRE was a full barrier, so who knows what else is out there.


RCsc - release consistency sequential consistency
RCpc - release consistency processor consistency

https://en.wikipedia.org/wiki/Processor_consistency (where they have
s/sequential/causal/)

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-08-28 15:39         ` Peter Zijlstra
@ 2015-08-28 16:59           ` Boqun Feng
  2015-09-01 19:00           ` Will Deacon
  1 sibling, 0 replies; 32+ messages in thread
From: Boqun Feng @ 2015-08-28 16:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, linuxppc-dev, Ingo Molnar, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Thomas Gleixner, Will Deacon,
	Paul E. McKenney, Waiman Long

[-- Attachment #1: Type: text/plain, Size: 1448 bytes --]

On Fri, Aug 28, 2015 at 05:39:21PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 28, 2015 at 10:16:02PM +0800, Boqun Feng wrote:
<snip>
> > 
> > Ah.. just read through the thread you mentioned, I might misunderstand
> > you, probably because I didn't understand RCpc well..
> > 
> > You are saying that in a RELEASE we -might- switch from smp_lwsync() to
> > smp_mb() semantically, right? I guess this means we -might- switch from
> > RCpc to RCsc, right?
> > 
> > If so, I think I'd better to wait until we have a conclusion for this.
> 
> Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
> ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
> not.
> 
> Currently PowerPC is the only arch that (can, and) does RCpc and gives a
> weaker RELEASE + ACQUIRE. Only the CPU who did the ACQUIRE is guaranteed
> to see the stores of the CPU which did the RELEASE in order.
> 
> As it stands, RCU is the only _known_ codebase where this matters, but
> we did in fact write code for a fair number of years 'assuming' RELEASE
> + ACQUIRE was a full barrier, so who knows what else is out there.
> 
> 
> RCsc - release consistency sequential consistency
> RCpc - release consistency processor consistency
> 
> https://en.wikipedia.org/wiki/Processor_consistency (where they have
> s/sequential/causal/)

Thank you for your detailed explanation! Much clearer now ;-)

Regards,
Boqun

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-08-28 15:39         ` Peter Zijlstra
  2015-08-28 16:59           ` Boqun Feng
@ 2015-09-01 19:00           ` Will Deacon
  2015-09-01 21:45             ` Paul E. McKenney
  1 sibling, 1 reply; 32+ messages in thread
From: Will Deacon @ 2015-09-01 19:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, linux-kernel, linuxppc-dev, Ingo Molnar,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Thomas Gleixner, Paul E. McKenney, Waiman Long

On Fri, Aug 28, 2015 at 04:39:21PM +0100, Peter Zijlstra wrote:
> On Fri, Aug 28, 2015 at 10:16:02PM +0800, Boqun Feng wrote:
> > Ah.. just read through the thread you mentioned, I might misunderstand
> > you, probably because I didn't understand RCpc well..
> > 
> > You are saying that in a RELEASE we -might- switch from smp_lwsync() to
> > smp_mb() semantically, right? I guess this means we -might- switch from
> > RCpc to RCsc, right?
> > 
> > If so, I think I'd better to wait until we have a conclusion for this.
> 
> Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
> ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
> not.

We've discussed this before, but for the sake of completeness, I don't
think we're fully RCsc either because we don't order the actual RELEASE
operation against a subsequent ACQUIRE operation:


P0
smp_store_release(&x, 1);
foo = smp_load_acquire(&y);

P1
smp_store_release(&y, 1);
bar = smp_load_acquire(&x);

We allow foo == bar == 0, which is prohibited by SC.


However, we *do* enforce ordering on any prior or subsequent accesses
for the code snippet above (the release and acquire combine to give a
full barrier), which makes these primitives well suited to things like
message passing.
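
A minimal message-passing sketch of what I mean (the variable and
function names here are made up for illustration):

int data, flag;

void producer(void)
{
	WRITE_ONCE(data, 42);		/* payload first...		*/
	smp_store_release(&flag, 1);	/* ...then publish the flag	*/
}

void consumer(void)
{
	if (smp_load_acquire(&flag))		/* if we see the flag...   */
		BUG_ON(READ_ONCE(data) != 42);	/* ...we must see the data */
}

The acquire on the flag guarantees that a consumer observing flag == 1
also observes the producer's earlier store to data.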

Will

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-01 19:00           ` Will Deacon
@ 2015-09-01 21:45             ` Paul E. McKenney
  2015-09-02  9:59               ` Will Deacon
  0 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2015-09-01 21:45 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

On Tue, Sep 01, 2015 at 08:00:27PM +0100, Will Deacon wrote:
> On Fri, Aug 28, 2015 at 04:39:21PM +0100, Peter Zijlstra wrote:
> > On Fri, Aug 28, 2015 at 10:16:02PM +0800, Boqun Feng wrote:
> > > Ah.. just read through the thread you mentioned, I might misunderstand
> > > you, probably because I didn't understand RCpc well..
> > > 
> > > You are saying that in a RELEASE we -might- switch from smp_lwsync() to
> > > smp_mb() semantically, right? I guess this means we -might- switch from
> > > RCpc to RCsc, right?
> > > 
> > > If so, I think I'd better to wait until we have a conclusion for this.
> > 
> > Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
> > ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
> > not.
> 
> We've discussed this before, but for the sake of completeness, I don't
> think we're fully RCsc either because we don't order the actual RELEASE
> operation again a subsequent ACQUIRE operation:
> 
> P0
> smp_store_release(&x, 1);
> foo = smp_load_acquire(&y);
> 
> P1
> smp_store_release(&y, 1);
> bar = smp_load_acquire(&x);
> 
> We allow foo == bar == 0, which is prohibited by SC.

I certainly hope that no one expects foo == bar == 0 to be prohibited!!!

On the other hand, in this case, foo == bar == 1 will be prohibited:

P0
foo = smp_load_acquire(&y);
smp_store_release(&x, 1);

P1
bar = smp_load_acquire(&x);
smp_store_release(&y, 1);

> However, we *do* enforce ordering on any prior or subsequent accesses
> for the code snippet above (the release and acquire combine to give a
> full barrier), which makes these primitives well suited to things like
> message passing.

If I understand your example correctly, neither x86 nor Power implement
a full barrier in this case.  For example:

	P0
	WRITE_ONCE(a, 1);
	smp_store_release(b, 1);
	r1 = smp_load_acquire(c);
	r2 = READ_ONCE(d);

	P1
	WRITE_ONCE(d, 1);
	smp_mb();
	r3 = READ_ONCE(a);

Both x86 and Power can reorder P0 as follows:

	P0
	r1 = smp_load_acquire(c);
	r2 = READ_ONCE(d);
	WRITE_ONCE(a, 1);
	smp_store_release(b, 1);

Which clearly shows that the non-SC outcome r2 == 0 && r3 == 0 is allowed.

Or am I missing your point here?

							Thanx, Paul


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-01 21:45             ` Paul E. McKenney
@ 2015-09-02  9:59               ` Will Deacon
  2015-09-02 10:49                 ` Paul E. McKenney
                                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Will Deacon @ 2015-09-02  9:59 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

Hi Paul,

On Tue, Sep 01, 2015 at 10:45:40PM +0100, Paul E. McKenney wrote:
> On Tue, Sep 01, 2015 at 08:00:27PM +0100, Will Deacon wrote:
> > On Fri, Aug 28, 2015 at 04:39:21PM +0100, Peter Zijlstra wrote:
> > > Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
> > > ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
> > > not.
> > 
> > We've discussed this before, but for the sake of completeness, I don't
> > think we're fully RCsc either because we don't order the actual RELEASE
> > operation again a subsequent ACQUIRE operation:
> > 
> > P0
> > smp_store_release(&x, 1);
> > foo = smp_load_acquire(&y);
> > 
> > P1
> > smp_store_release(&y, 1);
> > bar = smp_load_acquire(&x);
> > 
> > We allow foo == bar == 0, which is prohibited by SC.
> 
> I certainly hope that no one expects foo == bar == 0 to be prohibited!!!

I just thought it was worth making this point, because it is prohibited
in SC and I don't want people to think that our RELEASE/ACQUIRE operations
are SC (even though they happen to be on arm64).

> On the other hand, in this case, foo == bar == 1 will be prohibited:
> 
> P0
> foo = smp_load_acquire(&y);
> smp_store_release(&x, 1);
> 
> P1
> bar = smp_load_acquire(&x);
> smp_store_release(&y, 1);

Agreed.

> > However, we *do* enforce ordering on any prior or subsequent accesses
> > for the code snippet above (the release and acquire combine to give a
> > full barrier), which makes these primitives well suited to things like
> > message passing.
> 
> If I understand your example correctly, neither x86 nor Power implement
> a full barrier in this case.  For example:
> 
> 	P0
> 	WRITE_ONCE(a, 1);
> 	smp_store_release(b, 1);
> 	r1 = smp_load_acquire(c);
> 	r2 = READ_ONCE(d);
> 
> 	P1
> 	WRITE_ONCE(d, 1);
> 	smp_mb();
> 	r3 = READ_ONCE(a);
> 
> Both x86 and Power can reorder P0 as follows:
> 
> 	P0
> 	r1 = smp_load_acquire(c);
> 	r2 = READ_ONCE(d);
> 	WRITE_ONCE(a, 1);
> 	smp_store_release(b, 1);
> 
> Which clearly shows that the non-SC outcome r2 == 0 && r3 == 0 is allowed.
> 
> Or am I missing your point here?

I think this example is slightly different. Having the RELEASE/ACQUIRE
operations being reordered with respect to each other is one thing, but
I thought we were heading in a direction where they combined to give a
full barrier with respect to other accesses. In that case, the reordering
above would be forbidden.

Peter -- if the above reordering can happen on x86, then moving away
from RCpc is going to be less popular than I hoped...

Will

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-02  9:59               ` Will Deacon
@ 2015-09-02 10:49                 ` Paul E. McKenney
  2015-09-02 15:23                 ` Pranith Kumar
  2015-09-11 12:45                 ` Will Deacon
  2 siblings, 0 replies; 32+ messages in thread
From: Paul E. McKenney @ 2015-09-02 10:49 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

On Wed, Sep 02, 2015 at 10:59:06AM +0100, Will Deacon wrote:
> Hi Paul,
> 
> On Tue, Sep 01, 2015 at 10:45:40PM +0100, Paul E. McKenney wrote:
> > On Tue, Sep 01, 2015 at 08:00:27PM +0100, Will Deacon wrote:
> > > On Fri, Aug 28, 2015 at 04:39:21PM +0100, Peter Zijlstra wrote:
> > > > Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
> > > > ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
> > > > not.
> > > 
> > > We've discussed this before, but for the sake of completeness, I don't
> > > think we're fully RCsc either because we don't order the actual RELEASE
> > > operation again a subsequent ACQUIRE operation:
> > > 
> > > P0
> > > smp_store_release(&x, 1);
> > > foo = smp_load_acquire(&y);
> > > 
> > > P1
> > > smp_store_release(&y, 1);
> > > bar = smp_load_acquire(&x);
> > > 
> > > We allow foo == bar == 0, which is prohibited by SC.
> > 
> > I certainly hope that no one expects foo == bar == 0 to be prohibited!!!
> 
> I just thought it was worth making this point, because it is prohibited
> in SC and I don't want people to think that our RELEASE/ACQUIRE operations
> are SC (even though they happen to be on arm64).

OK, good.

> > On the other hand, in this case, foo == bar == 1 will be prohibited:
> > 
> > P0
> > foo = smp_load_acquire(&y);
> > smp_store_release(&x, 1);
> > 
> > P1
> > bar = smp_load_acquire(&x);
> > smp_store_release(&y, 1);
> 
> Agreed.

Good as well.

> > > However, we *do* enforce ordering on any prior or subsequent accesses
> > > for the code snippet above (the release and acquire combine to give a
> > > full barrier), which makes these primitives well suited to things like
> > > message passing.
> > 
> > If I understand your example correctly, neither x86 nor Power implement
> > a full barrier in this case.  For example:
> > 
> > 	P0
> > 	WRITE_ONCE(a, 1);
> > 	smp_store_release(b, 1);
> > 	r1 = smp_load_acquire(c);
> > 	r2 = READ_ONCE(d);
> > 
> > 	P1
> > 	WRITE_ONCE(d, 1);
> > 	smp_mb();
> > 	r3 = READ_ONCE(a);
> > 
> > Both x86 and Power can reorder P0 as follows:
> > 
> > 	P0
> > 	r1 = smp_load_acquire(c);
> > 	r2 = READ_ONCE(d);
> > 	WRITE_ONCE(a, 1);
> > 	smp_store_release(b, 1);
> > 
> > Which clearly shows that the non-SC outcome r2 == 0 && r3 == 0 is allowed.
> > 
> > Or am I missing your point here?
> 
> I think this example is slightly different. Having the RELEASE/ACQUIRE
> operations being reordered with respect to each other is one thing, but
> I thought we were heading in a direction where they combined to give a
> full barrier with respect to other accesses. In that case, the reordering
> above would be forbidden.

It is certainly less added overhead to make unlock-lock a full barrier
than it is to make smp_store_release()-smp_load_acquire() a full barrier.
I am not fully convinced on either, aside from needing some way to make
unlock-lock a full barrier within the RCU implementation, for which the
now-privatized smp_mb__after_unlock_lock() suffices.
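
For concreteness, the pattern I mean is roughly the following sketch
(illustrative only; in the real tree the macro is private to the RCU
implementation, and the lock here is just a stand-in):

static void example_acquire_with_full_ordering(raw_spinlock_t *lock)
{
	unsigned long flags;

	raw_spin_lock_irqsave(lock, flags);
	/*
	 * Upgrade the prior-unlock/this-lock pair to a full barrier.
	 * This is smp_mb() on powerpc and a no-op elsewhere.
	 */
	smp_mb__after_unlock_lock();

	/* ... critical section ... */

	raw_spin_unlock_irqrestore(lock, flags);
}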

> Peter -- if the above reordering can happen on x86, then moving away
> from RCpc is going to be less popular than I hoped...

;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-02  9:59               ` Will Deacon
  2015-09-02 10:49                 ` Paul E. McKenney
@ 2015-09-02 15:23                 ` Pranith Kumar
  2015-09-02 15:36                     ` [RFC 3/5] powerpc: atomic: implement atomic{, 64}_{add, sub}_return_* variants Pranith Kumar
  2015-09-11 12:45                 ` Will Deacon
  2 siblings, 1 reply; 32+ messages in thread
From: Pranith Kumar @ 2015-09-02 15:23 UTC (permalink / raw)
  To: Will Deacon, Paul E. McKenney
  Cc: Waiman Long, Peter Zijlstra, Boqun Feng, linux-kernel,
	Paul Mackerras, Thomas Gleixner, linuxppc-dev, Ingo Molnar

Hi Will,

On 09/02/2015 05:59 AM, Will Deacon wrote:
> I just thought it was worth making this point, because it is prohibited
> in SC and I don't want people to think that our RELEASE/ACQUIRE operations
> are SC (even though they happen to be on arm64).

This is interesting information. Does that mean that the following patch
should work? (I am not proposing to use it, just trying to understand if 
REL+ACQ will act as a full barrier on ARM64, which you say it does).

Thanks,
Pranith.

diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index d8c25b7..14a1b35 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -68,8 +68,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
                BUILD_BUG();
        }
 
-       smp_mb();
-       return ret;
+       return smp_load_acquire(ret);
 }
 
 #define xchg(ptr,x) \


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-02 15:23                 ` Pranith Kumar
@ 2015-09-02 15:36                     ` Pranith Kumar
  0 siblings, 0 replies; 32+ messages in thread
From: Pranith Kumar @ 2015-09-02 15:36 UTC (permalink / raw)
  To: Will Deacon, Paul E. McKenney
  Cc: Waiman Long, Peter Zijlstra, Boqun Feng, linux-kernel,
	Paul Mackerras, Thomas Gleixner, linuxppc-dev, Ingo Molnar

On Wed, Sep 2, 2015 at 11:23 AM, Pranith Kumar <bobby.prani@gmail.com> wrote:
> Hi Will,
>
> On 09/02/2015 05:59 AM, Will Deacon wrote:
>> I just thought it was worth making this point, because it is prohibited
>> in SC and I don't want people to think that our RELEASE/ACQUIRE operations
>> are SC (even though they happen to be on arm64).
>
> This is interesting information. Does that mean that the following patch
> should work? (I am not proposing to use it, just trying to understand if
> REL+ACQ will act as a full barrier on ARM64, which you say it does).
>
> Thanks,
> Pranith.
>
> diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
> index d8c25b7..14a1b35 100644
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -68,8 +68,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
>                 BUILD_BUG();
>         }
>
> -       smp_mb();
> -       return ret;
> +       return smp_load_acquire(ret);

I meant 'smp_load_acquire(&ret);'

-- 
Pranith

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{, 64}_{add, sub}_return_* variants
@ 2015-09-02 15:36                     ` Pranith Kumar
  0 siblings, 0 replies; 32+ messages in thread
From: Pranith Kumar @ 2015-09-02 15:36 UTC (permalink / raw)
  To: Will Deacon, Paul E. McKenney
  Cc: Waiman Long, Peter Zijlstra, Boqun Feng, linux-kernel,
	Paul Mackerras, Thomas Gleixner, linuxppc-dev, Ingo Molnar

On Wed, Sep 2, 2015 at 11:23 AM, Pranith Kumar <bobby.prani@gmail.com> wrote:
> Hi Will,
>
> On 09/02/2015 05:59 AM, Will Deacon wrote:
>> I just thought it was worth making this point, because it is prohibited
>> in SC and I don't want people to think that our RELEASE/ACQUIRE operations
>> are SC (even though they happen to be on arm64).
>
> This is interesting information. Does that mean that the following patch
> should work? (I am not proposing to use it, just trying to understand if
> REL+ACQ will act as a full barrier on ARM64, which you say it does).
>
> Thanks,
> Pranith.
>
> diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
> index d8c25b7..14a1b35 100644
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -68,8 +68,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
>                 BUILD_BUG();
>         }
>
> -       smp_mb();
> -       return ret;
> +       return smp_load_acquire(ret);

I meant 'smp_load_acquire(&ret);'

-- 
Pranith

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-02 15:36                     ` [RFC 3/5] powerpc: atomic: implement atomic{, 64}_{add, sub}_return_* variants Pranith Kumar
  (?)
@ 2015-09-03 10:31                     ` Will Deacon
  -1 siblings, 0 replies; 32+ messages in thread
From: Will Deacon @ 2015-09-03 10:31 UTC (permalink / raw)
  To: Pranith Kumar
  Cc: Paul E. McKenney, Waiman Long, Peter Zijlstra, Boqun Feng,
	linux-kernel, Paul Mackerras, Thomas Gleixner, linuxppc-dev,
	Ingo Molnar

On Wed, Sep 02, 2015 at 04:36:09PM +0100, Pranith Kumar wrote:
> On Wed, Sep 2, 2015 at 11:23 AM, Pranith Kumar <bobby.prani@gmail.com> wrote:
> > On 09/02/2015 05:59 AM, Will Deacon wrote:
> >> I just thought it was worth making this point, because it is prohibited
> >> in SC and I don't want people to think that our RELEASE/ACQUIRE operations
> >> are SC (even though they happen to be on arm64).
> >
> > This is interesting information. Does that mean that the following patch
> > should work? (I am not proposing to use it, just trying to understand if
> > REL+ACQ will act as a full barrier on ARM64, which you say it does).
> >
> > Thanks,
> > Pranith.
> >
> > diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
> > index d8c25b7..14a1b35 100644
> > --- a/arch/arm64/include/asm/cmpxchg.h
> > +++ b/arch/arm64/include/asm/cmpxchg.h
> > @@ -68,8 +68,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
> >                 BUILD_BUG();
> >         }
> >
> > -       smp_mb();
> > -       return ret;
> > +       return smp_load_acquire(ret);
> 
> I meant 'smp_load_acquire(&ret);'

Yes, I think that would work on arm64, but it's not portable between
architectures.

Will

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-02  9:59               ` Will Deacon
  2015-09-02 10:49                 ` Paul E. McKenney
  2015-09-02 15:23                 ` Pranith Kumar
@ 2015-09-11 12:45                 ` Will Deacon
  2015-09-11 17:09                   ` Paul E. McKenney
  2015-09-14 11:35                   ` Peter Zijlstra
  2 siblings, 2 replies; 32+ messages in thread
From: Will Deacon @ 2015-09-11 12:45 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

[left the context in the hope that we can make some progress]

On Wed, Sep 02, 2015 at 10:59:06AM +0100, Will Deacon wrote:
> On Tue, Sep 01, 2015 at 10:45:40PM +0100, Paul E. McKenney wrote:
> > On Tue, Sep 01, 2015 at 08:00:27PM +0100, Will Deacon wrote:
> > > On Fri, Aug 28, 2015 at 04:39:21PM +0100, Peter Zijlstra wrote:
> > > > Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
> > > > ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
> > > > not.
> > > 
> > > We've discussed this before, but for the sake of completeness, I don't
> > > think we're fully RCsc either because we don't order the actual RELEASE
> > > operation again a subsequent ACQUIRE operation:
> > > 
> > > P0
> > > smp_store_release(&x, 1);
> > > foo = smp_load_acquire(&y);
> > > 
> > > P1
> > > smp_store_release(&y, 1);
> > > bar = smp_load_acquire(&x);
> > > 
> > > We allow foo == bar == 0, which is prohibited by SC.
> > 
> > I certainly hope that no one expects foo == bar == 0 to be prohibited!!!
> 
> I just thought it was worth making this point, because it is prohibited
> in SC and I don't want people to think that our RELEASE/ACQUIRE operations
> are SC (even though they happen to be on arm64).
> 
> > On the other hand, in this case, foo == bar == 1 will be prohibited:
> > 
> > P0
> > foo = smp_load_acquire(&y);
> > smp_store_release(&x, 1);
> > 
> > P1
> > bar = smp_load_acquire(&x);
> > smp_store_release(&y, 1);
> 
> Agreed.
> 
> > > However, we *do* enforce ordering on any prior or subsequent accesses
> > > for the code snippet above (the release and acquire combine to give a
> > > full barrier), which makes these primitives well suited to things like
> > > message passing.
> > 
> > If I understand your example correctly, neither x86 nor Power implement
> > a full barrier in this case.  For example:
> > 
> > 	P0
> > 	WRITE_ONCE(a, 1);
> > 	smp_store_release(b, 1);
> > 	r1 = smp_load_acquire(c);
> > 	r2 = READ_ONCE(d);
> > 
> > 	P1
> > 	WRITE_ONCE(d, 1);
> > 	smp_mb();
> > 	r3 = READ_ONCE(a);
> > 
> > Both x86 and Power can reorder P0 as follows:
> > 
> > 	P0
> > 	r1 = smp_load_acquire(c);
> > 	r2 = READ_ONCE(d);
> > 	WRITE_ONCE(a, 1);
> > 	smp_store_release(b, 1);
> > 
> > Which clearly shows that the non-SC outcome r2 == 0 && r3 == 0 is allowed.
> > 
> > Or am I missing your point here?
> 
> I think this example is slightly different. Having the RELEASE/ACQUIRE
> operations being reordered with respect to each other is one thing, but
> I thought we were heading in a direction where they combined to give a
> full barrier with respect to other accesses. In that case, the reordering
> above would be forbidden.
> 
> Peter -- if the above reordering can happen on x86, then moving away
> from RCpc is going to be less popular than I hoped...

Peter, any thoughts? I'm not au fait with the x86 memory model, but what
Paul's saying is worrying.

Will

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-11 12:45                 ` Will Deacon
@ 2015-09-11 17:09                   ` Paul E. McKenney
  2015-09-14 11:35                   ` Peter Zijlstra
  1 sibling, 0 replies; 32+ messages in thread
From: Paul E. McKenney @ 2015-09-11 17:09 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

On Fri, Sep 11, 2015 at 01:45:07PM +0100, Will Deacon wrote:
> [left the context in the hope that we can make some progress]
> 
> On Wed, Sep 02, 2015 at 10:59:06AM +0100, Will Deacon wrote:
> > On Tue, Sep 01, 2015 at 10:45:40PM +0100, Paul E. McKenney wrote:
> > > On Tue, Sep 01, 2015 at 08:00:27PM +0100, Will Deacon wrote:
> > > > On Fri, Aug 28, 2015 at 04:39:21PM +0100, Peter Zijlstra wrote:
> > > > > Yes, the difference between RCpc and RCsc is in the meaning of RELEASE +
> > > > > ACQUIRE. With RCsc that implies a full memory barrier, with RCpc it does
> > > > > not.
> > > > 
> > > > We've discussed this before, but for the sake of completeness, I don't
> > > > think we're fully RCsc either because we don't order the actual RELEASE
> > > > operation again a subsequent ACQUIRE operation:
> > > > 
> > > > P0
> > > > smp_store_release(&x, 1);
> > > > foo = smp_load_acquire(&y);
> > > > 
> > > > P1
> > > > smp_store_release(&y, 1);
> > > > bar = smp_load_acquire(&x);
> > > > 
> > > > We allow foo == bar == 0, which is prohibited by SC.
> > > 
> > > I certainly hope that no one expects foo == bar == 0 to be prohibited!!!
> > 
> > I just thought it was worth making this point, because it is prohibited
> > in SC and I don't want people to think that our RELEASE/ACQUIRE operations
> > are SC (even though they happen to be on arm64).
> > 
> > > On the other hand, in this case, foo == bar == 1 will be prohibited:
> > > 
> > > P0
> > > foo = smp_load_acquire(&y);
> > > smp_store_release(&x, 1);
> > > 
> > > P1
> > > bar = smp_load_acquire(&x);
> > > smp_store_release(&y, 1);
> > 
> > Agreed.
> > 
> > > > However, we *do* enforce ordering on any prior or subsequent accesses
> > > > for the code snippet above (the release and acquire combine to give a
> > > > full barrier), which makes these primitives well suited to things like
> > > > message passing.
> > > 
> > > If I understand your example correctly, neither x86 nor Power implement
> > > a full barrier in this case.  For example:
> > > 
> > > 	P0
> > > 	WRITE_ONCE(a, 1);
> > > 	smp_store_release(b, 1);
> > > 	r1 = smp_load_acquire(c);
> > > 	r2 = READ_ONCE(d);
> > > 
> > > 	P1
> > > 	WRITE_ONCE(d, 1);
> > > 	smp_mb();
> > > 	r3 = READ_ONCE(a);
> > > 
> > > Both x86 and Power can reorder P0 as follows:
> > > 
> > > 	P0
> > > 	r1 = smp_load_acquire(c);
> > > 	r2 = READ_ONCE(d);
> > > 	WRITE_ONCE(a, 1);
> > > 	smp_store_release(b, 1);
> > > 
> > > Which clearly shows that the non-SC outcome r2 == 0 && r3 == 0 is allowed.
> > > 
> > > Or am I missing your point here?
> > 
> > I think this example is slightly different. Having the RELEASE/ACQUIRE
> > operations being reordered with respect to each other is one thing, but
> > I thought we were heading in a direction where they combined to give a
> > full barrier with respect to other accesses. In that case, the reordering
> > above would be forbidden.
> > 
> > Peter -- if the above reordering can happen on x86, then moving away
> > from RCpc is going to be less popular than I hoped...
> 
> Peter, any thoughts? I'm not au fait with the x86 memory model, but what
> Paul's saying is worrying.

The herd tool has an x86 mode, which will allow you to double-check
my scenario.  This tool is described in "Herding Cats: Modelling,
Simulation, Testing, and Data-mining for Weak Memory" by Alglave,
Maranget, and Tautschnig.  The herd tool is available at this git
repository: https://github.com/herd/herdtools.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-11 12:45                 ` Will Deacon
  2015-09-11 17:09                   ` Paul E. McKenney
@ 2015-09-14 11:35                   ` Peter Zijlstra
  2015-09-14 12:01                     ` Peter Zijlstra
  1 sibling, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-09-14 11:35 UTC (permalink / raw)
  To: Will Deacon
  Cc: Paul E. McKenney, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long


Sorry for being tardy, I had a wee spell of feeling horrible and then I
procrastinated longer than I should have.

On Fri, Sep 11, 2015 at 01:45:07PM +0100, Will Deacon wrote:

> Peter, any thoughts? I'm not au fait with the x86 memory model, but what
> Paul's saying is worrying.

Right, so Paul is right -- and I completely forgot (I used to know about
that).

So all the TSO archs (SPARC-TSO, x86 (!OOSTORE) and s390) can do
smp_load_acquire()/smp_store_release() with just barrier(), and while:

	smp_store_release(&x);
	smp_load_acquire(&x);

will provide full order by means of the address dependency,

	smp_store_release(&x);
	smp_load_acquire(&y);

will not. Because the one reorder TSO allows is exactly that one.
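
That is, on a TSO arch these can be implemented with compiler barriers
only, along the lines of this sketch (the real x86 versions also carry
a compile-time type check, omitted here):

#define smp_store_release(p, v)						\
do {									\
	barrier();	/* no CPU barrier; TSO keeps stores in order */	\
	WRITE_ONCE(*(p), (v));						\
} while (0)

#define smp_load_acquire(p)						\
({									\
	typeof(*(p)) ___p1 = READ_ONCE(*(p));				\
	barrier();	/* no CPU barrier; TSO keeps loads in order */	\
	___p1;								\
})

With nothing but barrier() in there, the store->load reordering in the
second pattern above is exactly what the hardware is still free to do.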

> Peter -- if the above reordering can happen on x86, then moving away
> from RCpc is going to be less popular than I hoped...

Sadly yes.. We could of course try and split LOCK from ACQUIRE again,
but I'm not sure that's going to help anything except confusion.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-14 11:35                   ` Peter Zijlstra
@ 2015-09-14 12:01                     ` Peter Zijlstra
  2015-09-14 12:11                       ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-09-14 12:01 UTC (permalink / raw)
  To: Will Deacon
  Cc: Paul E. McKenney, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

On Mon, Sep 14, 2015 at 01:35:20PM +0200, Peter Zijlstra wrote:
> 
> Sorry for being tardy, I had a wee spell of feeling horrible and then I
> procrastinated longer than I should have.
> 
> On Fri, Sep 11, 2015 at 01:45:07PM +0100, Will Deacon wrote:
> 
> > Peter, any thoughts? I'm not au fait with the x86 memory model, but what
> > Paul's saying is worrying.
> 
> Right, so Paul is right -- and I completely forgot (I used to know about
> that).
> 
> So all the TSO archs (SPARC-TSO, x86 (!OOSTORE) and s390) can do
> smp_load_acquire()/smp_store_release() with just barrier(), and while:
> 
> 	smp_store_release(&x);
> 	smp_load_acquire(&x);
> 
> will provide full order by means of the address dependency,
> 
> 	smp_store_release(&x);
> 	smp_load_acquire(&y);
> 
> will not. Because the one reorder TSO allows is exactly that one.
> 
> > Peter -- if the above reordering can happen on x86, then moving away
> > from RCpc is going to be less popular than I hoped...
> 
> Sadly yes.. We could of course try and split LOCK from ACQUIRE again,
> but I'm not sure that's going to help anything except confusion.

This of course also means we need something like:

	smp_mb__release_acquire()

which cannot be a no-op for TSO archs. And it might even mean it needs
to be the same as smp_mb__unlock_lock(), but I need to think more on
this.
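
Strawman sketch, purely hypothetical (no such macro exists today):

/*
 * Hypothetical: order a preceding RELEASE against a subsequent ACQUIRE.
 * Must be at least a StoreLoad barrier on TSO archs; an arch whose
 * RELEASE+ACQUIRE already implies a full barrier could override this
 * with barrier().
 */
#ifndef smp_mb__release_acquire
#define smp_mb__release_acquire()	smp_mb()
#endif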

The scenario is:

	CPU0			CPU1

				unlock(x)
				  smp_store_release(&x->lock, 0);

	unlock(y)
	  smp_store_release(&next->lock, 1); /* next == &y */

				lock(y)
				  while (!(smp_load_acquire(&y->lock))
					cpu_relax();


Where the lock does _NOT_ issue a store to acquire the lock at all. Now
I don't think any of our current primitives manage this, so we should be
good, but it might just be possible.


And at the same time; having both:

	smp_mb__release_acquire()
	smp_mb__unlock_lock()

is quite horrible, for it clearly shows a LOCK isn't quite the same as
ACQUIRE :/



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-14 12:01                     ` Peter Zijlstra
@ 2015-09-14 12:11                       ` Peter Zijlstra
  2015-09-14 15:38                         ` Will Deacon
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2015-09-14 12:11 UTC (permalink / raw)
  To: Will Deacon
  Cc: Paul E. McKenney, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

On Mon, Sep 14, 2015 at 02:01:53PM +0200, Peter Zijlstra wrote:
> The scenario is:
> 
> 	CPU0			CPU1
> 
> 				unlock(x)
> 				  smp_store_release(&x->lock, 0);
> 
> 	unlock(y)
> 	  smp_store_release(&next->lock, 1); /* next == &y */
> 
> 				lock(y)
> 				  while (!(smp_load_acquire(&y->lock))
> 					cpu_relax();
> 
> 
> Where the lock does _NOT_ issue a store to acquire the lock at all. Now
> I don't think any of our current primitives manage this, so we should be
> good, but it might just be possible.

So with a bit more thought this seems fundamentally impossible; you
always need some stores in a lock() implementation, the above for
instance needs to queue itself, otherwise CPU0 will not be able to find
it etc..
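
Even a minimal test-and-set lock (sketch below, purely illustrative)
has to store to the lock word in order to acquire it:

struct tas_lock {
	int locked;
};

static inline void tas_lock(struct tas_lock *l)
{
	while (xchg(&l->locked, 1))	/* RmW: unavoidably a store */
		cpu_relax();
}

static inline void tas_unlock(struct tas_lock *l)
{
	smp_store_release(&l->locked, 0);
}

and anything fancier (ticket, MCS/qspinlock) needs stores to queue
itself, as above.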

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-14 12:11                       ` Peter Zijlstra
@ 2015-09-14 15:38                         ` Will Deacon
  2015-09-14 16:26                           ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Will Deacon @ 2015-09-14 15:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

On Mon, Sep 14, 2015 at 01:11:56PM +0100, Peter Zijlstra wrote:
> On Mon, Sep 14, 2015 at 02:01:53PM +0200, Peter Zijlstra wrote:
> > The scenario is:
> > 
> > 	CPU0			CPU1
> > 
> > 				unlock(x)
> > 				  smp_store_release(&x->lock, 0);
> > 
> > 	unlock(y)
> > 	  smp_store_release(&next->lock, 1); /* next == &y */
> > 
> > 				lock(y)
> > 				  while (!(smp_load_acquire(&y->lock))
> > 					cpu_relax();
> > 
> > 
> > Where the lock does _NOT_ issue a store to acquire the lock at all. Now
> > I don't think any of our current primitives manage this, so we should be
> > good, but it might just be possible.
> 
> So with a bit more through this seems fundamentally impossible, you
> always needs some stores in a lock() implementation, the above for
> instance needs to queue itself, otherwise CPU0 will not be able to find
> it etc..

Which brings us back round to separating LOCK/UNLOCK from ACQUIRE/RELEASE.

If we say that UNLOCK(foo) -> LOCK(bar) is ordered but RELEASE(baz) ->
ACQUIRE(boz) is only ordered by smp_mb__release_acquire(), then I think
we're in a position where we can at least build arbitrary locks portably
out of ACQUIRE/RELEASE operations, even though I don't see any users of
that macro in the imminent future.

I'll have a crack at some documentation.

Will

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants
  2015-09-14 15:38                         ` Will Deacon
@ 2015-09-14 16:26                           ` Paul E. McKenney
  0 siblings, 0 replies; 32+ messages in thread
From: Paul E. McKenney @ 2015-09-14 16:26 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, Boqun Feng, linux-kernel, linuxppc-dev,
	Ingo Molnar, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Thomas Gleixner, Waiman Long

On Mon, Sep 14, 2015 at 04:38:48PM +0100, Will Deacon wrote:
> On Mon, Sep 14, 2015 at 01:11:56PM +0100, Peter Zijlstra wrote:
> > On Mon, Sep 14, 2015 at 02:01:53PM +0200, Peter Zijlstra wrote:
> > > The scenario is:
> > > 
> > > 	CPU0			CPU1
> > > 
> > > 				unlock(x)
> > > 				  smp_store_release(&x->lock, 0);
> > > 
> > > 	unlock(y)
> > > 	  smp_store_release(&next->lock, 1); /* next == &y */
> > > 
> > > 				lock(y)
> > > 				  while (!(smp_load_acquire(&y->lock))
> > > 					cpu_relax();
> > > 
> > > 
> > > Where the lock does _NOT_ issue a store to acquire the lock at all. Now
> > > I don't think any of our current primitives manage this, so we should be
> > > good, but it might just be possible.
> > 
> > So with a bit more through this seems fundamentally impossible, you
> > always needs some stores in a lock() implementation, the above for
> > instance needs to queue itself, otherwise CPU0 will not be able to find
> > it etc..
> 
> Which brings us back round to separating LOCK/UNLOCK from ACQUIRE/RELEASE.

I believe that we do need to do this, unless we decide to have unlock-lock
continue to imply only acquire and release, rather than full ordering.
I believe that Mike Ellerman is working up additional benchmarking
on this.

							Thanx, Paul

> If we say that UNLOCK(foo) -> LOCK(bar) is ordered but RELEASE(baz) ->
> ACQUIRE(boz) is only ordered by smp_mb__release_acquire(), then I think
> we're in a position where we can at least build arbitrary locks portably
> out of ACQUIRE/RELEASE operations, even though I don't see any users of
> that macro in the imminent future.
> 
> I'll have a crack at some documentation.
> 
> Will
> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2015-09-14 16:46 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-28  2:48 [RFC 0/5] atomics: powerpc: implement relaxed/acquire/release variants of some atomics Boqun Feng
2015-08-28  2:48 ` [RFC 1/5] atomics: add test for atomic operations with _relaxed variants Boqun Feng
2015-08-28  2:48 ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire,release,fence} helpers Boqun Feng
2015-08-28  2:48   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire, release, fence} helpers Boqun Feng
2015-08-28 11:36   ` [RFC 2/5] atomics: introduce arch_atomic_op_{acquire,release,fence} helpers Peter Zijlstra
2015-08-28 11:50     ` Boqun Feng
2015-08-28  2:48 ` [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants Boqun Feng
2015-08-28  2:48   ` [RFC 3/5] powerpc: atomic: implement atomic{, 64}_{add, sub}_return_* variants Boqun Feng
2015-08-28 10:48   ` [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants Peter Zijlstra
2015-08-28 12:06     ` Boqun Feng
2015-08-28 14:16       ` Boqun Feng
2015-08-28 15:39         ` Peter Zijlstra
2015-08-28 16:59           ` Boqun Feng
2015-09-01 19:00           ` Will Deacon
2015-09-01 21:45             ` Paul E. McKenney
2015-09-02  9:59               ` Will Deacon
2015-09-02 10:49                 ` Paul E. McKenney
2015-09-02 15:23                 ` Pranith Kumar
2015-09-02 15:36                   ` Pranith Kumar
2015-09-02 15:36                     ` [RFC 3/5] powerpc: atomic: implement atomic{, 64}_{add, sub}_return_* variants Pranith Kumar
2015-09-03 10:31                     ` [RFC 3/5] powerpc: atomic: implement atomic{,64}_{add,sub}_return_* variants Will Deacon
2015-09-11 12:45                 ` Will Deacon
2015-09-11 17:09                   ` Paul E. McKenney
2015-09-14 11:35                   ` Peter Zijlstra
2015-09-14 12:01                     ` Peter Zijlstra
2015-09-14 12:11                       ` Peter Zijlstra
2015-09-14 15:38                         ` Will Deacon
2015-09-14 16:26                           ` Paul E. McKenney
2015-08-28  2:48 ` [RFC 4/5] powerpc: atomic: implement xchg_* and atomic{,64}_xchg_* variants Boqun Feng
2015-08-28  2:48   ` [RFC 4/5] powerpc: atomic: implement xchg_* and atomic{, 64}_xchg_* variants Boqun Feng
2015-08-28  2:48 ` [RFC 5/5] powerpc: atomic: implement cmpxchg{,64}_* and atomic{,64}_cmpxchg_* variants Boqun Feng
2015-08-28  2:48   ` [RFC 5/5] powerpc: atomic: implement cmpxchg{, 64}_* and atomic{, 64}_cmpxchg_* variants Boqun Feng
