linuxppc-dev.lists.ozlabs.org archive mirror
* [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
@ 2021-09-21 15:09 Christophe Leroy
  2021-09-21 15:09 ` [PATCH v5 2/3] powerpc/atomics: " Christophe Leroy
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Christophe Leroy @ 2021-09-21 15:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Today we get the following code generation for bitops that
set or clear a bit:

	c0009fe0:	39 40 08 00 	li      r10,2048
	c0009fe4:	7c e0 40 28 	lwarx   r7,0,r8
	c0009fe8:	7c e7 53 78 	or      r7,r7,r10
	c0009fec:	7c e0 41 2d 	stwcx.  r7,0,r8

	c000d568:	39 00 18 00 	li      r8,6144
	c000d56c:	7c c0 38 28 	lwarx   r6,0,r7
	c000d570:	7c c6 40 78 	andc    r6,r6,r8
	c000d574:	7c c0 39 2d 	stwcx.  r6,0,r7

Most masks passed to these operations are constants with all their
bits in the lower 16 bits, so the operation can easily be replaced by
its "immediate" form. Allow GCC to choose between the normal and the
immediate form.

For clearing bits, on 32 bits 'rlwinm' can be used instead of 'andc'
when all the bits to be cleared are consecutive.

On 64 bits there is no equivalent single operation for clearing single
bits or a few bits: we would need two 'rldicl', so it is not worth it;
the li/andc sequence does the job just as well.
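
As an illustration (not part of the patch), here is a minimal
stand-alone sketch of the mask check introduced below: it accepts
exactly the masks that form a single run of consecutive 1-bits,
including runs that wrap around bit 0, which is what 'rlwinm' can
express. The uint32_t narrowing and the test values are assumptions
made for the demo:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Same logic as is_rlwinm_mask_valid() from the patch, narrowed
	 * to 32 bits since rlwinm operates on 32-bit values. */
	static bool is_rlwinm_mask_valid(uint32_t x)
	{
		if (!x)
			return false;
		if (x & 1)
			x = ~x;		/* make the mask non-wrapping */
		x += x & -x;		/* a single run carries out, leaving at most one bit set */
		return !(x & (x - 1));
	}

	int main(void)
	{
		printf("%d\n", is_rlwinm_mask_valid(0x000000f0));  /* 1: one run */
		printf("%d\n", is_rlwinm_mask_valid(0xf000000f));  /* 1: wrapping run */
		printf("%d\n", is_rlwinm_mask_valid(0x00000505));  /* 0: two runs */
		return 0;
	}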

With this patch we get:

	c0009fe0:	7d 00 50 28 	lwarx   r8,0,r10
	c0009fe4:	61 08 08 00 	ori     r8,r8,2048
	c0009fe8:	7d 00 51 2d 	stwcx.  r8,0,r10

	c000d558:	7c e0 40 28 	lwarx   r7,0,r8
	c000d55c:	54 e7 05 64 	rlwinm  r7,r7,0,21,18
	c000d560:	7c e0 41 2d 	stwcx.  r7,0,r8

On pmac32_defconfig, it reduces the text by approx 10 kbytes.
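
For a concrete (hypothetical) caller: with a compile-time-constant bit
number the mask is constant, satisfies the "K" constraint, and the
template assembles straight to 'ori'. example_set_flag() below is made
up for illustration and is not part of the kernel:

	/* BIT_MASK(11) == 0x800 == 2048, matching the
	 * 'ori r8,r8,2048' shown above. */
	static void example_set_flag(volatile unsigned long *flags)
	{
		arch_set_bit(11, flags);
	}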

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>
---
v5: Fixed the argument of is_rlwinm_mask_valid() in test_and_clear_bits()

v4: Rebased

v3:
- Using the mask validation proposed by Segher

v2:
- Use "n" instead of "i" as constraint for the rlwinm mask
- Improve mask verification to handle more than single bit masks

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/bitops.h | 89 ++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 11847b6a244e..a05d8c62cbea 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -71,19 +71,61 @@ static inline void fn(unsigned long mask,	\
 	__asm__ __volatile__ (			\
 	prefix					\
 "1:"	PPC_LLARX "%0,0,%3,0\n"			\
-	stringify_in_c(op) "%0,%0,%2\n"		\
+	#op "%I2 %0,%0,%2\n"			\
 	PPC_STLCX "%0,0,%3\n"			\
 	"bne- 1b\n"				\
 	: "=&r" (old), "+m" (*p)		\
-	: "r" (mask), "r" (p)			\
+	: "rK" (mask), "r" (p)			\
 	: "cc", "memory");			\
 }
 
 DEFINE_BITOP(set_bits, or, "")
-DEFINE_BITOP(clear_bits, andc, "")
-DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER)
 DEFINE_BITOP(change_bits, xor, "")
 
+static __always_inline bool is_rlwinm_mask_valid(unsigned long x)
+{
+	if (!x)
+		return false;
+	if (x & 1)
+		x = ~x;	// make the mask non-wrapping
+	x += x & -x;	// adding the low set bit results in at most one bit set
+
+	return !(x & (x - 1));
+}
+
+#define DEFINE_CLROP(fn, prefix)					\
+static inline void fn(unsigned long mask, volatile unsigned long *_p)	\
+{									\
+	unsigned long old;						\
+	unsigned long *p = (unsigned long *)_p;				\
+									\
+	if (IS_ENABLED(CONFIG_PPC32) &&					\
+	    __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\
+		asm volatile (						\
+			prefix						\
+		"1:"	"lwarx	%0,0,%3\n"				\
+			"rlwinm	%0,%0,0,%2\n"				\
+			"stwcx.	%0,0,%3\n"				\
+			"bne- 1b\n"					\
+			: "=&r" (old), "+m" (*p)			\
+			: "n" (~mask), "r" (p)				\
+			: "cc", "memory");				\
+	} else {							\
+		asm volatile (						\
+			prefix						\
+		"1:"	PPC_LLARX "%0,0,%3,0\n"				\
+			"andc %0,%0,%2\n"				\
+			PPC_STLCX "%0,0,%3\n"				\
+			"bne- 1b\n"					\
+			: "=&r" (old), "+m" (*p)			\
+			: "r" (mask), "r" (p)				\
+			: "cc", "memory");				\
+	}								\
+}
+
+DEFINE_CLROP(clear_bits, "")
+DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER)
+
 static inline void arch_set_bit(int nr, volatile unsigned long *addr)
 {
 	set_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
@@ -116,12 +158,12 @@ static inline unsigned long fn(			\
 	__asm__ __volatile__ (				\
 	prefix						\
 "1:"	PPC_LLARX "%0,0,%3,%4\n"			\
-	stringify_in_c(op) "%1,%0,%2\n"			\
+	#op "%I2 %1,%0,%2\n"				\
 	PPC_STLCX "%1,0,%3\n"				\
 	"bne- 1b\n"					\
 	postfix						\
 	: "=&r" (old), "=&r" (t)			\
-	: "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0)	\
+	: "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0)	\
 	: "cc", "memory");				\
 	return (old & mask);				\
 }
@@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER,
 	      PPC_ATOMIC_EXIT_BARRIER, 0)
 DEFINE_TESTOP(test_and_set_bits_lock, or, "",
 	      PPC_ACQUIRE_BARRIER, 1)
-DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER,
-	      PPC_ATOMIC_EXIT_BARRIER, 0)
 DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER,
 	      PPC_ATOMIC_EXIT_BARRIER, 0)
 
+static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p)
+{
+	unsigned long old, t;
+	unsigned long *p = (unsigned long *)_p;
+
+	if (IS_ENABLED(CONFIG_PPC32) &&
+	    __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {
+		asm volatile (
+			PPC_ATOMIC_ENTRY_BARRIER
+		"1:"	"lwarx %0,0,%3\n"
+			"rlwinm	%1,%0,0,%2\n"
+			"stwcx. %1,0,%3\n"
+			"bne- 1b\n"
+			PPC_ATOMIC_EXIT_BARRIER
+			: "=&r" (old), "=&r" (t)
+			: "n" (~mask), "r" (p)
+			: "cc", "memory");
+	} else {
+		asm volatile (
+			PPC_ATOMIC_ENTRY_BARRIER
+		"1:"	PPC_LLARX "%0,0,%3,0\n"
+			"andc	%1,%0,%2\n"
+			PPC_STLCX "%1,0,%3\n"
+			"bne- 1b\n"
+			PPC_ATOMIC_EXIT_BARRIER
+			: "=&r" (old), "=&r" (t)
+			: "r" (mask), "r" (p)
+			: "cc", "memory");
+	}
+
+	return (old & mask);
+}
+
 static inline int arch_test_and_set_bit(unsigned long nr,
 					volatile unsigned long *addr)
 {
-- 
2.31.1



* [PATCH v5 2/3] powerpc/atomics: Use immediate operand when possible
  2021-09-21 15:09 [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible Christophe Leroy
@ 2021-09-21 15:09 ` Christophe Leroy
  2021-09-21 15:09 ` [PATCH v5 3/3] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends Christophe Leroy
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Christophe Leroy @ 2021-09-21 15:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Today we get the following code generation for atomic operations:

	c001bb2c:	39 20 00 01 	li      r9,1
	c001bb30:	7d 40 18 28 	lwarx   r10,0,r3
	c001bb34:	7d 09 50 50 	subf    r8,r9,r10
	c001bb38:	7d 00 19 2d 	stwcx.  r8,0,r3

	c001c7a8:	39 40 00 01 	li      r10,1
	c001c7ac:	7d 00 18 28 	lwarx   r8,0,r3
	c001c7b0:	7c ea 42 14 	add     r7,r10,r8
	c001c7b4:	7c e0 19 2d 	stwcx.  r7,0,r3

By allowing GCC to choose between the immediate and the register form
of the operation, we get:

	c001bb2c:	7d 20 18 28 	lwarx   r9,0,r3
	c001bb30:	39 49 ff ff 	addi    r10,r9,-1
	c001bb34:	7d 40 19 2d 	stwcx.  r10,0,r3
	--
	c001c7a4:	7d 40 18 28 	lwarx   r10,0,r3
	c001c7a8:	39 0a 00 01 	addi    r8,r10,1
	c001c7ac:	7d 00 19 2d 	stwcx.  r8,0,r3

For "and", the dot form has to be used because "andi" doesn't exist.

For logical operations we use an unsigned 16-bit immediate (the "K"
constraint); for arithmetic operations, a signed 16-bit immediate (the
"I" constraint).
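
The mechanism combines two GCC features: a multi-alternative
constraint such as "rI" lets GCC place the operand either in a
register or as a signed 16-bit immediate, and the PowerPC "%I<n>"
output modifier prints an "i" when operand <n> ended up constant. A
minimal user-space sketch (assuming a powerpc GCC target;
add_imm_or_reg() is a made-up name):

	static inline int add_imm_or_reg(int x, int a)
	{
		/* Assembles as "add %0,%0,%1" when 'a' is in a register,
		 * and as "addi %0,%0,<const>" when GCC chose the
		 * immediate alternative. */
		asm("add%I1 %0,%0,%1" : "+r" (x) : "rI" (a));
		return x;
	}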

On pmac32_defconfig, it reduces the text by approx another 8 kbytes.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Segher Boessenkool <segher@kernel.crashing.org>
---
v4: Rebased

v2: Use "addc/addic"
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/atomic.h | 56 +++++++++++++++----------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 6a53ef178bfd..ce0d5a013c58 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -37,62 +37,62 @@ static __inline__ void arch_atomic_set(atomic_t *v, int i)
 	__asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"UPD_CONSTR(v->counter) : "r"(i));
 }
 
-#define ATOMIC_OP(op, asm_op)						\
+#define ATOMIC_OP(op, asm_op, suffix, sign, ...)			\
 static __inline__ void arch_atomic_##op(int a, atomic_t *v)		\
 {									\
 	int t;								\
 									\
 	__asm__ __volatile__(						\
 "1:	lwarx	%0,0,%3		# atomic_" #op "\n"			\
-	#asm_op " %0,%2,%0\n"						\
+	#asm_op "%I2" suffix " %0,%0,%2\n"				\
 "	stwcx.	%0,0,%3 \n"						\
 "	bne-	1b\n"							\
 	: "=&r" (t), "+m" (v->counter)					\
-	: "r" (a), "r" (&v->counter)					\
-	: "cc");							\
+	: "r"#sign (a), "r" (&v->counter)				\
+	: "cc", ##__VA_ARGS__);						\
 }									\
 
-#define ATOMIC_OP_RETURN_RELAXED(op, asm_op)				\
+#define ATOMIC_OP_RETURN_RELAXED(op, asm_op, suffix, sign, ...)		\
 static inline int arch_atomic_##op##_return_relaxed(int a, atomic_t *v)	\
 {									\
 	int t;								\
 									\
 	__asm__ __volatile__(						\
 "1:	lwarx	%0,0,%3		# atomic_" #op "_return_relaxed\n"	\
-	#asm_op " %0,%2,%0\n"						\
+	#asm_op "%I2" suffix " %0,%0,%2\n"				\
 "	stwcx.	%0,0,%3\n"						\
 "	bne-	1b\n"							\
 	: "=&r" (t), "+m" (v->counter)					\
-	: "r" (a), "r" (&v->counter)					\
-	: "cc");							\
+	: "r"#sign (a), "r" (&v->counter)				\
+	: "cc", ##__VA_ARGS__);						\
 									\
 	return t;							\
 }
 
-#define ATOMIC_FETCH_OP_RELAXED(op, asm_op)				\
+#define ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign, ...)		\
 static inline int arch_atomic_fetch_##op##_relaxed(int a, atomic_t *v)	\
 {									\
 	int res, t;							\
 									\
 	__asm__ __volatile__(						\
 "1:	lwarx	%0,0,%4		# atomic_fetch_" #op "_relaxed\n"	\
-	#asm_op " %1,%3,%0\n"						\
+	#asm_op "%I3" suffix " %1,%0,%3\n"				\
 "	stwcx.	%1,0,%4\n"						\
 "	bne-	1b\n"							\
 	: "=&r" (res), "=&r" (t), "+m" (v->counter)			\
-	: "r" (a), "r" (&v->counter)					\
-	: "cc");							\
+	: "r"#sign (a), "r" (&v->counter)				\
+	: "cc", ##__VA_ARGS__);						\
 									\
 	return res;							\
 }
 
-#define ATOMIC_OPS(op, asm_op)						\
-	ATOMIC_OP(op, asm_op)						\
-	ATOMIC_OP_RETURN_RELAXED(op, asm_op)				\
-	ATOMIC_FETCH_OP_RELAXED(op, asm_op)
+#define ATOMIC_OPS(op, asm_op, suffix, sign, ...)			\
+	ATOMIC_OP(op, asm_op, suffix, sign, ##__VA_ARGS__)		\
+	ATOMIC_OP_RETURN_RELAXED(op, asm_op, suffix, sign, ##__VA_ARGS__)\
+	ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign, ##__VA_ARGS__)
 
-ATOMIC_OPS(add, add)
-ATOMIC_OPS(sub, subf)
+ATOMIC_OPS(add, add, "c", I, "xer")
+ATOMIC_OPS(sub, sub, "c", I, "xer")
 
 #define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
 #define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
@@ -101,13 +101,13 @@ ATOMIC_OPS(sub, subf)
 #define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed
 
 #undef ATOMIC_OPS
-#define ATOMIC_OPS(op, asm_op)						\
-	ATOMIC_OP(op, asm_op)						\
-	ATOMIC_FETCH_OP_RELAXED(op, asm_op)
+#define ATOMIC_OPS(op, asm_op, suffix, sign)				\
+	ATOMIC_OP(op, asm_op, suffix, sign)				\
+	ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign)
 
-ATOMIC_OPS(and, and)
-ATOMIC_OPS(or, or)
-ATOMIC_OPS(xor, xor)
+ATOMIC_OPS(and, and, ".", K)
+ATOMIC_OPS(or, or, "", K)
+ATOMIC_OPS(xor, xor, "", K)
 
 #define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed
 #define arch_atomic_fetch_or_relaxed  arch_atomic_fetch_or_relaxed
@@ -241,15 +241,15 @@ static __inline__ int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
 "1:	lwarx	%0,0,%1		# atomic_fetch_add_unless\n\
 	cmpw	0,%0,%3 \n\
 	beq	2f \n\
-	add	%0,%2,%0 \n"
+	add%I2c	%0,%0,%2 \n"
 "	stwcx.	%0,0,%1 \n\
 	bne-	1b \n"
 	PPC_ATOMIC_EXIT_BARRIER
-"	subf	%0,%2,%0 \n\
+"	sub%I2c	%0,%0,%2 \n\
 2:"
 	: "=&r" (t)
-	: "r" (&v->counter), "r" (a), "r" (u)
-	: "cc", "memory");
+	: "r" (&v->counter), "rI" (a), "r" (u)
+	: "cc", "memory", "xer");
 
 	return t;
 }
-- 
2.31.1



* [PATCH v5 3/3] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends
  2021-09-21 15:09 [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible Christophe Leroy
  2021-09-21 15:09 ` [PATCH v5 2/3] powerpc/atomics: " Christophe Leroy
@ 2021-09-21 15:09 ` Christophe Leroy
  2021-11-26 17:27 ` [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible LEROY Christophe
  2021-12-07 13:27 ` Michael Ellerman
  3 siblings, 0 replies; 6+ messages in thread
From: Christophe Leroy @ 2021-09-21 15:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Now that atomic_add() and atomic_sub() handle immediate operands,
atomic_inc() and atomic_dec() have no added value compared to the
generic fallback which calls atomic_add(1) and atomic_sub(1).

Also remove atomic_inc_not_zero(), which falls back to
atomic_add_unless(), which itself falls back to
atomic_fetch_add_unless(), which now handles immediate operands.
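
For reference, the generic fallbacks these removals rely on look
roughly like this (paraphrased from
include/linux/atomic/atomic-arch-fallback.h; the exact wrappers may
differ between kernel versions):

	#ifndef arch_atomic_inc
	static __always_inline void arch_atomic_inc(atomic_t *v)
	{
		arch_atomic_add(1, v);
	}
	#define arch_atomic_inc arch_atomic_inc
	#endif

With patch 2 applied, arch_atomic_add(1, v) already generates the same
'addic' loop the hand-written arch_atomic_inc() produced, so the
bespoke versions were pure duplication.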

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v4: rebased

v2: New
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/atomic.h | 95 -------------------------------
 1 file changed, 95 deletions(-)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index ce0d5a013c58..395d1feb5790 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -118,71 +118,6 @@ ATOMIC_OPS(xor, xor, "", K)
 #undef ATOMIC_OP_RETURN_RELAXED
 #undef ATOMIC_OP
 
-static __inline__ void arch_atomic_inc(atomic_t *v)
-{
-	int t;
-
-	__asm__ __volatile__(
-"1:	lwarx	%0,0,%2		# atomic_inc\n\
-	addic	%0,%0,1\n"
-"	stwcx.	%0,0,%2 \n\
-	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (&v->counter)
-	: "cc", "xer");
-}
-#define arch_atomic_inc arch_atomic_inc
-
-static __inline__ int arch_atomic_inc_return_relaxed(atomic_t *v)
-{
-	int t;
-
-	__asm__ __volatile__(
-"1:	lwarx	%0,0,%2		# atomic_inc_return_relaxed\n"
-"	addic	%0,%0,1\n"
-"	stwcx.	%0,0,%2\n"
-"	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (&v->counter)
-	: "cc", "xer");
-
-	return t;
-}
-
-static __inline__ void arch_atomic_dec(atomic_t *v)
-{
-	int t;
-
-	__asm__ __volatile__(
-"1:	lwarx	%0,0,%2		# atomic_dec\n\
-	addic	%0,%0,-1\n"
-"	stwcx.	%0,0,%2\n\
-	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (&v->counter)
-	: "cc", "xer");
-}
-#define arch_atomic_dec arch_atomic_dec
-
-static __inline__ int arch_atomic_dec_return_relaxed(atomic_t *v)
-{
-	int t;
-
-	__asm__ __volatile__(
-"1:	lwarx	%0,0,%2		# atomic_dec_return_relaxed\n"
-"	addic	%0,%0,-1\n"
-"	stwcx.	%0,0,%2\n"
-"	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (&v->counter)
-	: "cc", "xer");
-
-	return t;
-}
-
-#define arch_atomic_inc_return_relaxed arch_atomic_inc_return_relaxed
-#define arch_atomic_dec_return_relaxed arch_atomic_dec_return_relaxed
-
 #define arch_atomic_cmpxchg(v, o, n) \
 	(arch_cmpxchg(&((v)->counter), (o), (n)))
 #define arch_atomic_cmpxchg_relaxed(v, o, n) \
@@ -255,36 +190,6 @@ static __inline__ int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
 }
 #define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless
 
-/**
- * atomic_inc_not_zero - increment unless the number is zero
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1, so long as @v is non-zero.
- * Returns non-zero if @v was non-zero, and zero otherwise.
- */
-static __inline__ int arch_atomic_inc_not_zero(atomic_t *v)
-{
-	int t1, t2;
-
-	__asm__ __volatile__ (
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	lwarx	%0,0,%2		# atomic_inc_not_zero\n\
-	cmpwi	0,%0,0\n\
-	beq-	2f\n\
-	addic	%1,%0,1\n"
-"	stwcx.	%1,0,%2\n\
-	bne-	1b\n"
-	PPC_ATOMIC_EXIT_BARRIER
-	"\n\
-2:"
-	: "=&r" (t1), "=&r" (t2)
-	: "r" (&v->counter)
-	: "cc", "xer", "memory");
-
-	return t1;
-}
-#define arch_atomic_inc_not_zero(v) arch_atomic_inc_not_zero((v))
-
 /*
  * Atomically test *v and decrement if it is greater than 0.
  * The function returns the old value of *v minus 1, even if
-- 
2.31.1



* Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
  2021-09-21 15:09 [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible Christophe Leroy
  2021-09-21 15:09 ` [PATCH v5 2/3] powerpc/atomics: " Christophe Leroy
  2021-09-21 15:09 ` [PATCH v5 3/3] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends Christophe Leroy
@ 2021-11-26 17:27 ` LEROY Christophe
  2021-11-30 11:40   ` Michael Ellerman
  2021-12-07 13:27 ` Michael Ellerman
  3 siblings, 1 reply; 6+ messages in thread
From: LEROY Christophe @ 2021-11-26 17:27 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Paul Mackerras, linuxppc-dev, linux-kernel

Hi Michael,

Any chance to get this series merged this cycle?

Thanks
Christophe

On 21/09/2021 at 17:09, Christophe Leroy wrote:
> Today we get the following code generation for bitops that
> set or clear a bit:
> 
> [...]


* Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
  2021-11-26 17:27 ` [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible LEROY Christophe
@ 2021-11-30 11:40   ` Michael Ellerman
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Ellerman @ 2021-11-30 11:40 UTC (permalink / raw)
  To: LEROY Christophe; +Cc: Paul Mackerras, linuxppc-dev, linux-kernel

LEROY Christophe <christophe.leroy@csgroup.eu> writes:
> Hi Michael,
>
> Any chance to get this series merged this cycle?

Yeah. I hesitated a bit at the changes which make the atomic asm even
more complicated, but I guess it's worth it.

cheers

> On 21/09/2021 at 17:09, Christophe Leroy wrote:
>> Today we get the following code generation for bitops that
>> set or clear a bit:
>> 
>> [...]


* Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible
  2021-09-21 15:09 [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible Christophe Leroy
                   ` (2 preceding siblings ...)
  2021-11-26 17:27 ` [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible LEROY Christophe
@ 2021-12-07 13:27 ` Michael Ellerman
  3 siblings, 0 replies; 6+ messages in thread
From: Michael Ellerman @ 2021-12-07 13:27 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Christophe Leroy, Paul Mackerras,
	Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

On Tue, 21 Sep 2021 17:09:47 +0200, Christophe Leroy wrote:
> Today we get the following code generation for bitops like
> set or clear bit:
> 
> 	c0009fe0:	39 40 08 00 	li      r10,2048
> 	c0009fe4:	7c e0 40 28 	lwarx   r7,0,r8
> 	c0009fe8:	7c e7 53 78 	or      r7,r7,r10
> 	c0009fec:	7c e0 41 2d 	stwcx.  r7,0,r8
> 
> [...]

Applied to powerpc/next.

[1/3] powerpc/bitops: Use immediate operand when possible
      https://git.kernel.org/powerpc/c/fb350784d8d17952afa93383bb47aaa6b715c459
[2/3] powerpc/atomics: Use immediate operand when possible
      https://git.kernel.org/powerpc/c/41d65207de9fbff58acd8937a7c3f8940c186a87
[3/3] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends
      https://git.kernel.org/powerpc/c/f05cab0034babaa9b3dfaf6003ee6493496a8180

cheers

