* [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double()
@ 2022-12-19 15:35 Peter Zijlstra
  2022-12-19 15:35 ` [RFC][PATCH 01/12] crypto: Remove u128 usage Peter Zijlstra
                   ` (12 more replies)
  0 siblings, 13 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Hi,

Since Linus hated on cmpxchg_double(), here are a few patches to get rid of it, as proposed here:

  https://lkml.kernel.org/r/Y2U3WdU61FvYlpUh@hirez.programming.kicks-ass.net

The series is based on tip/master, because Linus' tree is moving a wee bit fast at the moment.

The 0day robot is all green for building; testing on arm64/s390 was very
limited for obvious reasons -- I tried to get the asm right, but please
double-check.
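
To illustrate the interface change, a conversion goes roughly like this
(a sketch only; the struct and variable names are hypothetical, not code
from the series):

	struct demo {
		union {
			struct { u64 w1, w2; };
			u128 pair;	/* naturally aligned 128-bit word */
		};
	};

	/* old: two pointers plus four scalar arguments */
	ok = cmpxchg_double(&p->w1, &p->w2, o1, o2, n1, n2);

	/* new: one pointer, one expected value, one new value;
	 * 'old' is reloaded from memory on failure */
	ok = try_cmpxchg128(&p->pair, &old, new);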



* [RFC][PATCH 01/12] crypto: Remove u128 usage
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-19 15:56   ` Jason A. Donenfeld
  2022-12-19 15:35 ` [RFC][PATCH 02/12] crypto/ghash-clmulni: Use (struct) be128 Peter Zijlstra
                   ` (11 subsequent siblings)
  12 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

As seems to be the common (majority) usage in crypto, use __uint128_t
instead of u128.

This frees up u128 for definition in linux/types.h.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 lib/crypto/curve25519-hacl64.c |  142 ++++++++++++++++++++---------------------
 lib/crypto/poly1305-donna64.c  |   22 ++----
 2 files changed, 80 insertions(+), 84 deletions(-)

--- a/lib/crypto/curve25519-hacl64.c
+++ b/lib/crypto/curve25519-hacl64.c
@@ -14,8 +14,6 @@
 #include <crypto/curve25519.h>
 #include <linux/string.h>
 
-typedef __uint128_t u128;
-
 static __always_inline u64 u64_eq_mask(u64 a, u64 b)
 {
 	u64 x = a ^ b;
@@ -50,77 +48,77 @@ static __always_inline void modulo_carry
 	b[0] = b0_;
 }
 
-static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
+static __always_inline void fproduct_copy_from_wide_(u64 *output, __uint128_t *input)
 {
 	{
-		u128 xi = input[0];
+		__uint128_t xi = input[0];
 		output[0] = ((u64)(xi));
 	}
 	{
-		u128 xi = input[1];
+		__uint128_t xi = input[1];
 		output[1] = ((u64)(xi));
 	}
 	{
-		u128 xi = input[2];
+		__uint128_t xi = input[2];
 		output[2] = ((u64)(xi));
 	}
 	{
-		u128 xi = input[3];
+		__uint128_t xi = input[3];
 		output[3] = ((u64)(xi));
 	}
 	{
-		u128 xi = input[4];
+		__uint128_t xi = input[4];
 		output[4] = ((u64)(xi));
 	}
 }
 
 static __always_inline void
-fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
+fproduct_sum_scalar_multiplication_(__uint128_t *output, u64 *input, u64 s)
 {
-	output[0] += (u128)input[0] * s;
-	output[1] += (u128)input[1] * s;
-	output[2] += (u128)input[2] * s;
-	output[3] += (u128)input[3] * s;
-	output[4] += (u128)input[4] * s;
+	output[0] += (__uint128_t)input[0] * s;
+	output[1] += (__uint128_t)input[1] * s;
+	output[2] += (__uint128_t)input[2] * s;
+	output[3] += (__uint128_t)input[3] * s;
+	output[4] += (__uint128_t)input[4] * s;
 }
 
-static __always_inline void fproduct_carry_wide_(u128 *tmp)
+static __always_inline void fproduct_carry_wide_(__uint128_t *tmp)
 {
 	{
 		u32 ctr = 0;
-		u128 tctr = tmp[ctr];
-		u128 tctrp1 = tmp[ctr + 1];
+		__uint128_t tctr = tmp[ctr];
+		__uint128_t tctrp1 = tmp[ctr + 1];
 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
-		u128 c = ((tctr) >> (51));
-		tmp[ctr] = ((u128)(r0));
+		__uint128_t c = ((tctr) >> (51));
+		tmp[ctr] = ((__uint128_t)(r0));
 		tmp[ctr + 1] = ((tctrp1) + (c));
 	}
 	{
 		u32 ctr = 1;
-		u128 tctr = tmp[ctr];
-		u128 tctrp1 = tmp[ctr + 1];
+		__uint128_t tctr = tmp[ctr];
+		__uint128_t tctrp1 = tmp[ctr + 1];
 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
-		u128 c = ((tctr) >> (51));
-		tmp[ctr] = ((u128)(r0));
+		__uint128_t c = ((tctr) >> (51));
+		tmp[ctr] = ((__uint128_t)(r0));
 		tmp[ctr + 1] = ((tctrp1) + (c));
 	}
 
 	{
 		u32 ctr = 2;
-		u128 tctr = tmp[ctr];
-		u128 tctrp1 = tmp[ctr + 1];
+		__uint128_t tctr = tmp[ctr];
+		__uint128_t tctrp1 = tmp[ctr + 1];
 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
-		u128 c = ((tctr) >> (51));
-		tmp[ctr] = ((u128)(r0));
+		__uint128_t c = ((tctr) >> (51));
+		tmp[ctr] = ((__uint128_t)(r0));
 		tmp[ctr + 1] = ((tctrp1) + (c));
 	}
 	{
 		u32 ctr = 3;
-		u128 tctr = tmp[ctr];
-		u128 tctrp1 = tmp[ctr + 1];
+		__uint128_t tctr = tmp[ctr];
+		__uint128_t tctrp1 = tmp[ctr + 1];
 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
-		u128 c = ((tctr) >> (51));
-		tmp[ctr] = ((u128)(r0));
+		__uint128_t c = ((tctr) >> (51));
+		tmp[ctr] = ((__uint128_t)(r0));
 		tmp[ctr + 1] = ((tctrp1) + (c));
 	}
 }
@@ -154,7 +152,7 @@ static __always_inline void fmul_shift_r
 	output[0] = 19 * b0;
 }
 
-static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
+static __always_inline void fmul_mul_shift_reduce_(__uint128_t *output, u64 *input,
 						   u64 *input21)
 {
 	u32 i;
@@ -188,21 +186,21 @@ static __always_inline void fmul_fmul(u6
 {
 	u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
 	{
-		u128 b4;
-		u128 b0;
-		u128 b4_;
-		u128 b0_;
+		__uint128_t b4;
+		__uint128_t b0;
+		__uint128_t b4_;
+		__uint128_t b0_;
 		u64 i0;
 		u64 i1;
 		u64 i0_;
 		u64 i1_;
-		u128 t[5] = { 0 };
+		__uint128_t t[5] = { 0 };
 		fmul_mul_shift_reduce_(t, tmp, input21);
 		fproduct_carry_wide_(t);
 		b4 = t[4];
 		b0 = t[0];
-		b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
-		b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+		b4_ = ((b4) & (((__uint128_t)(0x7ffffffffffffLLU))));
+		b0_ = ((b0) + (((__uint128_t)(19) * (((u64)(((b4) >> (51))))))));
 		t[4] = b4_;
 		t[0] = b0_;
 		fproduct_copy_from_wide_(output, t);
@@ -215,7 +213,7 @@ static __always_inline void fmul_fmul(u6
 	}
 }
 
-static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
+static __always_inline void fsquare_fsquare__(__uint128_t *tmp, u64 *output)
 {
 	u64 r0 = output[0];
 	u64 r1 = output[1];
@@ -227,16 +225,16 @@ static __always_inline void fsquare_fsqu
 	u64 d2 = r2 * 2 * 19;
 	u64 d419 = r4 * 19;
 	u64 d4 = d419 * 2;
-	u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
-		   (((u128)(d2) * (r3))));
-	u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
-		   (((u128)(r3 * 19) * (r3))));
-	u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
-		   (((u128)(d4) * (r3))));
-	u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
-		   (((u128)(r4) * (d419))));
-	u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
-		   (((u128)(r2) * (r2))));
+	__uint128_t s0 = ((((((__uint128_t)(r0) * (r0))) + (((__uint128_t)(d4) * (r1))))) +
+		   (((__uint128_t)(d2) * (r3))));
+	__uint128_t s1 = ((((((__uint128_t)(d0) * (r1))) + (((__uint128_t)(d4) * (r2))))) +
+		   (((__uint128_t)(r3 * 19) * (r3))));
+	__uint128_t s2 = ((((((__uint128_t)(d0) * (r2))) + (((__uint128_t)(r1) * (r1))))) +
+		   (((__uint128_t)(d4) * (r3))));
+	__uint128_t s3 = ((((((__uint128_t)(d0) * (r3))) + (((__uint128_t)(d1) * (r2))))) +
+		   (((__uint128_t)(r4) * (d419))));
+	__uint128_t s4 = ((((((__uint128_t)(d0) * (r4))) + (((__uint128_t)(d1) * (r3))))) +
+		   (((__uint128_t)(r2) * (r2))));
 	tmp[0] = s0;
 	tmp[1] = s1;
 	tmp[2] = s2;
@@ -244,12 +242,12 @@ static __always_inline void fsquare_fsqu
 	tmp[4] = s4;
 }
 
-static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
+static __always_inline void fsquare_fsquare_(__uint128_t *tmp, u64 *output)
 {
-	u128 b4;
-	u128 b0;
-	u128 b4_;
-	u128 b0_;
+	__uint128_t b4;
+	__uint128_t b0;
+	__uint128_t b4_;
+	__uint128_t b0_;
 	u64 i0;
 	u64 i1;
 	u64 i0_;
@@ -258,8 +256,8 @@ static __always_inline void fsquare_fsqu
 	fproduct_carry_wide_(tmp);
 	b4 = tmp[4];
 	b0 = tmp[0];
-	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
-	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+	b4_ = ((b4) & (((__uint128_t)(0x7ffffffffffffLLU))));
+	b0_ = ((b0) + (((__uint128_t)(19) * (((u64)(((b4) >> (51))))))));
 	tmp[4] = b4_;
 	tmp[0] = b0_;
 	fproduct_copy_from_wide_(output, tmp);
@@ -271,7 +269,7 @@ static __always_inline void fsquare_fsqu
 	output[1] = i1_;
 }
 
-static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
+static __always_inline void fsquare_fsquare_times_(u64 *output, __uint128_t *tmp,
 						   u32 count1)
 {
 	u32 i;
@@ -283,7 +281,7 @@ static __always_inline void fsquare_fsqu
 static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
 						  u32 count1)
 {
-	u128 t[5];
+	__uint128_t t[5];
 	memcpy(output, input, 5 * sizeof(*input));
 	fsquare_fsquare_times_(output, t, count1);
 }
@@ -291,7 +289,7 @@ static __always_inline void fsquare_fsqu
 static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
 							  u32 count1)
 {
-	u128 t[5];
+	__uint128_t t[5];
 	fsquare_fsquare_times_(output, t, count1);
 }
 
@@ -396,36 +394,36 @@ static __always_inline void fdifference(
 
 static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
 {
-	u128 tmp[5];
-	u128 b4;
-	u128 b0;
-	u128 b4_;
-	u128 b0_;
+	__uint128_t tmp[5];
+	__uint128_t b4;
+	__uint128_t b0;
+	__uint128_t b4_;
+	__uint128_t b0_;
 	{
 		u64 xi = b[0];
-		tmp[0] = ((u128)(xi) * (s));
+		tmp[0] = ((__uint128_t)(xi) * (s));
 	}
 	{
 		u64 xi = b[1];
-		tmp[1] = ((u128)(xi) * (s));
+		tmp[1] = ((__uint128_t)(xi) * (s));
 	}
 	{
 		u64 xi = b[2];
-		tmp[2] = ((u128)(xi) * (s));
+		tmp[2] = ((__uint128_t)(xi) * (s));
 	}
 	{
 		u64 xi = b[3];
-		tmp[3] = ((u128)(xi) * (s));
+		tmp[3] = ((__uint128_t)(xi) * (s));
 	}
 	{
 		u64 xi = b[4];
-		tmp[4] = ((u128)(xi) * (s));
+		tmp[4] = ((__uint128_t)(xi) * (s));
 	}
 	fproduct_carry_wide_(tmp);
 	b4 = tmp[4];
 	b0 = tmp[0];
-	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
-	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+	b4_ = ((b4) & (((__uint128_t)(0x7ffffffffffffLLU))));
+	b0_ = ((b0) + (((__uint128_t)(19) * (((u64)(((b4) >> (51))))))));
 	tmp[4] = b4_;
 	tmp[0] = b0_;
 	fproduct_copy_from_wide_(output, tmp);
--- a/lib/crypto/poly1305-donna64.c
+++ b/lib/crypto/poly1305-donna64.c
@@ -10,8 +10,6 @@
 #include <asm/unaligned.h>
 #include <crypto/internal/poly1305.h>
 
-typedef __uint128_t u128;
-
 void poly1305_core_setkey(struct poly1305_core_key *key,
 			  const u8 raw_key[POLY1305_BLOCK_SIZE])
 {
@@ -41,7 +39,7 @@ void poly1305_core_blocks(struct poly130
 	u64 s1, s2;
 	u64 h0, h1, h2;
 	u64 c;
-	u128 d0, d1, d2, d;
+	__uint128_t d0, d1, d2, d;
 
 	if (!nblocks)
 		return;
@@ -71,20 +69,20 @@ void poly1305_core_blocks(struct poly130
 		h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64;
 
 		/* h *= r */
-		d0 = (u128)h0 * r0;
-		d = (u128)h1 * s2;
+		d0 = (__uint128_t)h0 * r0;
+		d = (__uint128_t)h1 * s2;
 		d0 += d;
-		d = (u128)h2 * s1;
+		d = (__uint128_t)h2 * s1;
 		d0 += d;
-		d1 = (u128)h0 * r1;
-		d = (u128)h1 * r0;
+		d1 = (__uint128_t)h0 * r1;
+		d = (__uint128_t)h1 * r0;
 		d1 += d;
-		d = (u128)h2 * s2;
+		d = (__uint128_t)h2 * s2;
 		d1 += d;
-		d2 = (u128)h0 * r2;
-		d = (u128)h1 * r1;
+		d2 = (__uint128_t)h0 * r2;
+		d = (__uint128_t)h1 * r1;
 		d2 += d;
-		d = (u128)h2 * r0;
+		d = (__uint128_t)h2 * r0;
 		d2 += d;
 
 		/* (partial) h %= p */




* [RFC][PATCH 02/12] crypto/ghash-clmulni: Use (struct) be128
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
  2022-12-19 15:35 ` [RFC][PATCH 01/12] crypto: Remove u128 usage Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-20  5:45   ` Eric Biggers
  2022-12-19 15:35 ` [RFC][PATCH 03/12] crypto/b128ops: Remove struct u128 Peter Zijlstra
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Even though x86 is firmly little-endian, use be128, because le128 is in
fact the wrong way around :/ The actual code already uses be128 in
ghash_setkey(), so this shouldn't make things more confusing.

This frees up the u128 name for a real u128 type.
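
For reference, the pre-existing definitions in include/crypto/b128ops.h
(see patch 03) -- note that le128 keeps its halves in reversed member
order, which is what makes it "the wrong way around" for this use:

	typedef struct {
		__be64 a, b;
	} be128;

	typedef struct {
		__le64 b, a;
	} le128;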

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/crypto/ghash-clmulni-intel_asm.S  |    4 ++--
 arch/x86/crypto/ghash-clmulni-intel_glue.c |    6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -88,7 +88,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_bl
 	RET
 SYM_FUNC_END(__clmul_gf128mul_ble)
 
-/* void clmul_ghash_mul(char *dst, const u128 *shash) */
+/* void clmul_ghash_mul(char *dst, const le128 *shash) */
 SYM_FUNC_START(clmul_ghash_mul)
 	FRAME_BEGIN
 	movups (%rdi), DATA
@@ -104,7 +104,7 @@ SYM_FUNC_END(clmul_ghash_mul)
 
 /*
  * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- *			   const u128 *shash);
+ *			   const le128 *shash);
  */
 SYM_FUNC_START(clmul_ghash_update)
 	FRAME_BEGIN
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -23,17 +23,17 @@
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
 
-void clmul_ghash_mul(char *dst, const u128 *shash);
+void clmul_ghash_mul(char *dst, const be128 *shash);
 
 void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
-			const u128 *shash);
+			const be128 *shash);
 
 struct ghash_async_ctx {
 	struct cryptd_ahash *cryptd_tfm;
 };
 
 struct ghash_ctx {
-	u128 shash;
+	be128 shash;
 };
 
 struct ghash_desc_ctx {




* [RFC][PATCH 03/12] crypto/b128ops: Remove struct u128
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
  2022-12-19 15:35 ` [RFC][PATCH 01/12] crypto: Remove u128 usage Peter Zijlstra
  2022-12-19 15:35 ` [RFC][PATCH 02/12] crypto/ghash-clmulni: Use (struct) be128 Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-20  5:52   ` Eric Biggers
  2022-12-19 15:35 ` [RFC][PATCH 04/12] types: Introduce [us]128 Peter Zijlstra
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Per git-grep, u128_xor() and its related struct u128 are unused except
to implement {be,le}128_xor(). Remove them to free up the namespace.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/crypto/b128ops.h |   14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

--- a/include/crypto/b128ops.h
+++ b/include/crypto/b128ops.h
@@ -50,10 +50,6 @@
 #include <linux/types.h>
 
 typedef struct {
-	u64 a, b;
-} u128;
-
-typedef struct {
 	__be64 a, b;
 } be128;
 
@@ -61,20 +57,16 @@ typedef struct {
 	__le64 b, a;
 } le128;
 
-static inline void u128_xor(u128 *r, const u128 *p, const u128 *q)
+static inline void be128_xor(be128 *r, const be128 *p, const be128 *q)
 {
 	r->a = p->a ^ q->a;
 	r->b = p->b ^ q->b;
 }
 
-static inline void be128_xor(be128 *r, const be128 *p, const be128 *q)
-{
-	u128_xor((u128 *)r, (u128 *)p, (u128 *)q);
-}
-
 static inline void le128_xor(le128 *r, const le128 *p, const le128 *q)
 {
-	u128_xor((u128 *)r, (u128 *)p, (u128 *)q);
+	r->a = p->a ^ q->a;
+	r->b = p->b ^ q->b;
 }
 
 #endif /* _CRYPTO_B128OPS_H */




* [RFC][PATCH 04/12] types: Introduce [us]128
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (2 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 03/12] crypto/b128ops: Remove struct u128 Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-29  8:30   ` Pavel Machek
  2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}cmpxchg128{,_local}() Peter Zijlstra
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Introduce [us]128 (when available). Unlike [us]64, ensure they are
always naturally aligned.

This also enables 128-bit wide atomics (which require natural
alignment) such as cmpxchg128().
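
Concretely, both of the following must hold for the 128-bit atomics to
be usable (an illustrative assertion, not part of the patch):

	#include <linux/build_bug.h>
	#include <linux/types.h>

	static_assert(sizeof(u128) == 16);
	static_assert(__alignof__(u128) == 16);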

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/types.h      |    5 +++++
 include/uapi/linux/types.h |    4 ++++
 2 files changed, 9 insertions(+)

--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -10,6 +10,11 @@
 #define DECLARE_BITMAP(name,bits) \
 	unsigned long name[BITS_TO_LONGS(bits)]
 
+#ifdef __SIZEOF_INT128__
+typedef __s128 s128;
+typedef __u128 u128;
+#endif
+
 typedef u32 __kernel_dev_t;
 
 typedef __kernel_fd_set		fd_set;
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -13,6 +13,10 @@
 
 #include <linux/posix_types.h>
 
+#ifdef __SIZEOF_INT128__
+typedef __signed__ __int128 __s128 __attribute__((aligned(16)));
+typedef unsigned __int128 __u128 __attribute__((aligned(16)));
+#endif
 
 /*
  * Below are truly Linux-specific types that should never collide with




* [RFC][PATCH 05/12] arch: Introduce arch_{,try_}cmpxchg128{,_local}()
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (3 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 04/12] types: Introduce [us]128 Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-19 20:07   ` Boqun Feng
                     ` (3 more replies)
  2022-12-19 15:35 ` [RFC][PATCH 06/12] instrumentation: Wire up cmpxchg128() Peter Zijlstra
                   ` (7 subsequent siblings)
  12 siblings, 4 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

For all architectures that currently support cmpxchg_double(),
implement the cmpxchg128() family of functions, which is basically the
same but with a saner interface.
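
A typical update loop under the new interface looks like this (a
sketch; compute_next() is a hypothetical helper):

	u128 old = *ptr, new;

	do {
		new = compute_next(old);
	} while (!arch_try_cmpxchg128(ptr, &old, new));

On failure, arch_try_cmpxchg128() refreshes 'old' with the value found
in memory, so the loop does not need to re-read *ptr.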

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
 arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
 arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
 arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
 arch/x86/include/asm/cmpxchg_32.h     |    3 +
 arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
 6 files changed, 185 insertions(+), 3 deletions(-)

--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
 __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
 
 #undef __CMPXCHG_DBL
+
+union __u128_halves {
+	u128 full;
+	struct {
+		u64 low, high;
+	};
+};
+
+#define __CMPXCHG128(name, mb, rel, cl)					\
+static __always_inline u128						\
+__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
+{									\
+	union __u128_halves r, o = { .full = (old) },			\
+			       n = { .full = (new) };			\
+									\
+	asm volatile("// __cmpxchg128" #name "\n"			\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ldxp	%0, %1, %2\n"					\
+	"	eor	%3, %0, %3\n"					\
+	"	eor	%4, %1, %4\n"					\
+	"	orr	%3, %4, %3\n"					\
+	"	cbnz	%3, 2f\n"					\
+	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
+	"	cbnz	%w3, 1b\n"					\
+	"	" #mb "\n"						\
+	"2:"								\
+	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(u128 *)ptr)		\
+	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
+	: cl);								\
+									\
+	return r.full;							\
+}
+
+__CMPXCHG128(   ,        ,  ,         )
+__CMPXCHG128(_mb, dmb ish, l, "memory")
+
+#undef __CMPXCHG128
+
 #undef K
 
 #endif	/* __ASM_ATOMIC_LL_SC_H */
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a
 	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
 	: [v] "+Q" (v->counter),					\
 	  [old] "=r" (old)						\
-	: [i] "r" (i) 							\
+	: [i] "r" (i)							\
 	: cl);								\
 									\
 	return old;							\
@@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory")
 
 #undef __CMPXCHG_DBL
 
+#define __CMPXCHG128(name, mb, cl...)					\
+static __always_inline u128						\
+__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
+{									\
+	union __u128_halves r, o = { .full = (old) },			\
+			       n = { .full = (new) };			\
+	register unsigned long x0 asm ("x0") = o.low;			\
+	register unsigned long x1 asm ("x1") = o.high;			\
+	register unsigned long x2 asm ("x2") = n.low;			\
+	register unsigned long x3 asm ("x3") = n.high;			\
+	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
+									\
+	asm volatile(							\
+	__LSE_PREAMBLE							\
+	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
+	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
+	  [v] "+Q" (*(u128 *)ptr)					\
+	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4)		\
+	: cl);								\
+									\
+	r.low = x0; r.high = x1;					\
+									\
+	return r.full;							\
+}
+
+__CMPXCHG128(   ,   )
+__CMPXCHG128(_mb, al, "memory")
+
+#undef __CMPXCHG128
+
 #endif	/* __ASM_ATOMIC_LSE_H */
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb)
 
 #undef __CMPXCHG_DBL
 
+#define __CMPXCHG128(name)						\
+static inline u128 __cmpxchg128##name(volatile u128 *ptr,		\
+				      u128 old, u128 new)		\
+{									\
+	return __lse_ll_sc_body(_cmpxchg128##name,			\
+				ptr, old, new);				\
+}
+
+__CMPXCHG128(   )
+__CMPXCHG128(_mb)
+
+#undef __CMPXCHG128
+
 #define __CMPXCHG_GEN(sfx)						\
 static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr,	\
 					   unsigned long old,		\
@@ -229,6 +242,19 @@ __CMPXCHG_GEN(_mb)
 	__ret;									\
 })
 
+/* cmpxchg128 */
+#define system_has_cmpxchg128()		1
+
+#define arch_cmpxchg128(ptr, o, n)						\
+({										\
+	__cmpxchg128_mb((ptr), (o), (n));					\
+})
+
+#define arch_cmpxchg128_local(ptr, o, n)					\
+({										\
+	__cmpxchg128((ptr), (o), (n));						\
+})
+
 #define __CMPWAIT_CASE(w, sfx, sz)					\
 static inline void __cmpwait_case_##sz(volatile void *ptr,		\
 				       unsigned long val)		\
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -201,4 +201,37 @@ static __always_inline int __cmpxchg_dou
 			 (unsigned long)(n1), (unsigned long)(n2));	\
 })
 
+#define system_has_cmpxchg128()		1
+
+static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
+{
+	asm volatile(
+		"	cdsg	%[old],%[new],%[ptr]\n"
+		: [old] "+&d" (old)
+		: [new] "d" (new),
+		  [ptr] "QS" (*(unsigned long *)ptr)
+		: "memory", "cc");
+	return old;
+}
+
+static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
+{
+	u128 old = *oldp;
+	int cc;
+
+	asm volatile(
+		"	cdsg	%[old],%[new],%[ptr]\n"
+		"	ipm	%[cc]\n"
+		"	srl	%[cc],28\n"
+		: [cc] "=&d" (cc), [old] "+&d" (old)
+		: [new] "d" (new),
+		  [ptr] "QS" (*(unsigned long *)ptr)
+		: "memory", "cc");
+
+	if (unlikely(cc))
+		*oldp = old;
+
+	return likely(!cc);
+}
+
 #endif /* __ASM_CMPXCHG_H */
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -103,6 +103,7 @@ static inline bool __try_cmpxchg64(volat
 
 #endif
 
-#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
+#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX8)
+#define system_has_cmpxchg64()		boot_cpu_has(X86_FEATURE_CX8)
 
 #endif /* _ASM_X86_CMPXCHG_32_H */
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,6 +20,59 @@
 	arch_try_cmpxchg((ptr), (po), (n));				\
 })
 
-#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
+union __u128_halves {
+	u128 full;
+	struct {
+		u64 low, high;
+	};
+};
+
+static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
+{
+	union __u128_halves o = { .full = old, }, n = { .full = new, };
+
+	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
+		     : [ptr] "+m" (*ptr),
+		       "+a" (o.low), "+d" (o.high)
+		     : "b" (n.low), "c" (n.high)
+		     : "memory");
+
+	return o.full;
+}
+
+static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new)
+{
+	union __u128_halves o = { .full = old, }, n = { .full = new, };
+
+	asm volatile("cmpxchg16b %[ptr]"
+		     : [ptr] "+m" (*ptr),
+		       "+a" (o.low), "+d" (o.high)
+		     : "b" (n.low), "c" (n.high)
+		     : "memory");
+
+	return o.full;
+}
+
+static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *old, u128 new)
+{
+	union __u128_halves o = { .full = *old, }, n = { .full = new, };
+	bool ret;
+
+	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
+		     CC_SET(e)
+		     : CC_OUT(e) (ret),
+		       [ptr] "+m" (*ptr),
+		       "+a" (o.low), "+d" (o.high)
+		     : "b" (n.low), "c" (n.high)
+		     : "memory");
+
+	if (unlikely(!ret))
+		*old = o.full;
+
+	return likely(ret);
+}
+
+#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX16)
+#define system_has_cmpxchg128()		boot_cpu_has(X86_FEATURE_CX16)
 
 #endif /* _ASM_X86_CMPXCHG_64_H */




* [RFC][PATCH 06/12] instrumentation: Wire up cmpxchg128()
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (4 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}cmpxchg128{,_local}() Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-19 15:35 ` [RFC][PATCH 07/12] percpu: Wire up cmpxchg128 Peter Zijlstra
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Wire up the cmpxchg128 family in the atomic wrapper scripts.

These provide the generic cmpxchg128 family of functions from the
arch_-prefixed versions, adding explicit instrumentation where needed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/atomic/atomic-arch-fallback.h |   95 +++++++++++++++++++++++++++-
 include/linux/atomic/atomic-instrumented.h  |   77 ++++++++++++++++++++++
 scripts/atomic/gen-atomic-fallback.sh       |    4 -
 scripts/atomic/gen-atomic-instrumented.sh   |    4 -
 4 files changed, 174 insertions(+), 6 deletions(-)

--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -77,6 +77,29 @@
 
 #endif /* arch_cmpxchg64_relaxed */
 
+#ifndef arch_cmpxchg128_relaxed
+#define arch_cmpxchg128_acquire arch_cmpxchg128
+#define arch_cmpxchg128_release arch_cmpxchg128
+#define arch_cmpxchg128_relaxed arch_cmpxchg128
+#else /* arch_cmpxchg128_relaxed */
+
+#ifndef arch_cmpxchg128_acquire
+#define arch_cmpxchg128_acquire(...) \
+	__atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__)
+#endif
+
+#ifndef arch_cmpxchg128_release
+#define arch_cmpxchg128_release(...) \
+	__atomic_op_release(arch_cmpxchg128, __VA_ARGS__)
+#endif
+
+#ifndef arch_cmpxchg128
+#define arch_cmpxchg128(...) \
+	__atomic_op_fence(arch_cmpxchg128, __VA_ARGS__)
+#endif
+
+#endif /* arch_cmpxchg128_relaxed */
+
 #ifndef arch_try_cmpxchg_relaxed
 #ifdef arch_try_cmpxchg
 #define arch_try_cmpxchg_acquire arch_try_cmpxchg
@@ -217,6 +240,76 @@
 
 #endif /* arch_try_cmpxchg64_relaxed */
 
+#ifndef arch_try_cmpxchg128_relaxed
+#ifdef arch_try_cmpxchg128
+#define arch_try_cmpxchg128_acquire arch_try_cmpxchg128
+#define arch_try_cmpxchg128_release arch_try_cmpxchg128
+#define arch_try_cmpxchg128_relaxed arch_try_cmpxchg128
+#endif /* arch_try_cmpxchg128 */
+
+#ifndef arch_try_cmpxchg128
+#define arch_try_cmpxchg128(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = arch_cmpxchg128((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif /* arch_try_cmpxchg128 */
+
+#ifndef arch_try_cmpxchg128_acquire
+#define arch_try_cmpxchg128_acquire(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = arch_cmpxchg128_acquire((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif /* arch_try_cmpxchg128_acquire */
+
+#ifndef arch_try_cmpxchg128_release
+#define arch_try_cmpxchg128_release(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = arch_cmpxchg128_release((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif /* arch_try_cmpxchg128_release */
+
+#ifndef arch_try_cmpxchg128_relaxed
+#define arch_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = arch_cmpxchg128_relaxed((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif /* arch_try_cmpxchg128_relaxed */
+
+#else /* arch_try_cmpxchg128_relaxed */
+
+#ifndef arch_try_cmpxchg128_acquire
+#define arch_try_cmpxchg128_acquire(...) \
+	__atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__)
+#endif
+
+#ifndef arch_try_cmpxchg128_release
+#define arch_try_cmpxchg128_release(...) \
+	__atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__)
+#endif
+
+#ifndef arch_try_cmpxchg128
+#define arch_try_cmpxchg128(...) \
+	__atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__)
+#endif
+
+#endif /* arch_try_cmpxchg128_relaxed */
+
 #ifndef arch_atomic_read_acquire
 static __always_inline int
 arch_atomic_read_acquire(const atomic_t *v)
@@ -2456,4 +2549,4 @@ arch_atomic64_dec_if_positive(atomic64_t
 #endif
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// b5e87bdd5ede61470c29f7a7e4de781af3770f09
+// 46357a526de89c762d30fb238f35a7d5950a670b
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1968,6 +1968,36 @@ atomic_long_dec_if_positive(atomic_long_
 	arch_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \
 })
 
+#define cmpxchg128(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	arch_cmpxchg128(__ai_ptr, __VA_ARGS__); \
+})
+
+#define cmpxchg128_acquire(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	arch_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \
+})
+
+#define cmpxchg128_release(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_release(); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	arch_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \
+})
+
+#define cmpxchg128_relaxed(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	arch_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \
+})
+
 #define try_cmpxchg(ptr, oldp, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
@@ -2044,6 +2074,44 @@ atomic_long_dec_if_positive(atomic_long_
 	arch_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
+#define try_cmpxchg128(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	kcsan_mb(); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg128_acquire(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg128_release(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	kcsan_release(); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg128_relaxed(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
 #define cmpxchg_local(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
@@ -2058,6 +2126,13 @@ atomic_long_dec_if_positive(atomic_long_
 	arch_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \
 })
 
+#define cmpxchg128_local(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	arch_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
+})
+
 #define sync_cmpxchg(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
@@ -2083,4 +2158,4 @@ atomic_long_dec_if_positive(atomic_long_
 })
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
+// 27320c1ec2bf2878ecb9df3ea4816a7bc0c57a52
--- a/scripts/atomic/gen-atomic-fallback.sh
+++ b/scripts/atomic/gen-atomic-fallback.sh
@@ -217,11 +217,11 @@ cat << EOF
 
 EOF
 
-for xchg in "arch_xchg" "arch_cmpxchg" "arch_cmpxchg64"; do
+for xchg in "arch_xchg" "arch_cmpxchg" "arch_cmpxchg64" "arch_cmpxchg128"; do
 	gen_xchg_fallbacks "${xchg}"
 done
 
-for cmpxchg in "cmpxchg" "cmpxchg64"; do
+for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do
 	gen_try_cmpxchg_fallbacks "${cmpxchg}"
 done
 
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -166,14 +166,14 @@ grep '^[a-z]' "$1" | while read name met
 done
 
 
-for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg" "try_cmpxchg64"; do
+for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128" "try_cmpxchg" "try_cmpxchg64" "try_cmpxchg128"; do
 	for order in "" "_acquire" "_release" "_relaxed"; do
 		gen_xchg "${xchg}" "${order}" ""
 		printf "\n"
 	done
 done
 
-for xchg in "cmpxchg_local" "cmpxchg64_local" "sync_cmpxchg"; do
+for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg"; do
 	gen_xchg "${xchg}" "" ""
 	printf "\n"
 done




* [RFC][PATCH 07/12] percpu: Wire up cmpxchg128
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (5 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 06/12] instrumentation: Wire up cmpxchg128() Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-29 13:36   ` Arnd Bergmann
  2023-01-04 12:09   ` Heiko Carstens
  2022-12-19 15:35 ` [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128() Peter Zijlstra
                   ` (5 subsequent siblings)
  12 siblings, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

In order to replace cmpxchg_double() with the newly minted
cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
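
With the 16-byte case added to the size switch, a caller can then do
something like this (a sketch; the per-cpu variable is hypothetical):

	static DEFINE_PER_CPU(u128, demo_slot);

	u128 old = 0, cur;

	/* sizeof(demo_slot) == 16 now routes to this_cpu_cmpxchg_16() */
	cur = this_cpu_cmpxchg(demo_slot, old, ((u128)1 << 64) | 1);

this_cpu_cmpxchg() keeps its existing calling convention and returns
the previous value.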

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm64/include/asm/percpu.h |   24 ++++++++++++++++++
 arch/s390/include/asm/percpu.h  |   20 +++++++++++++++
 arch/x86/include/asm/percpu.h   |   52 ++++++++++++++++++++++++++++++++++++++++
 include/asm-generic/percpu.h    |    8 ++++++
 include/linux/percpu-defs.h     |   20 +++++++++++++--
 5 files changed, 122 insertions(+), 2 deletions(-)

--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -140,6 +140,10 @@ PERCPU_RET_OP(add, add, ldadd)
  * re-enabling preemption for preemptible kernels, but doing that in a way
  * which builds inside a module would mean messing directly with the preempt
  * count. If you do this, peterz and tglx will hunt you down.
+ *
+ * Not to mention it'll break the actual preemption model for missing a
+ * preemption point when TIF_NEED_RESCHED gets set while preemption is
+ * disabled.
  */
 #define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2)		\
 ({									\
@@ -240,6 +244,26 @@ PERCPU_RET_OP(add, add, ldadd)
 #define this_cpu_cmpxchg_8(pcp, o, n)	\
 	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
 
+#define __pcpu_cast_128(_exp, _val)					\
+	_Generic((_exp),						\
+		 u128: (_val),						\
+		 s128: (_val),						\
+		 default: (unsigned long)(_val))
+
+#define this_cpu_cmpxchg_16(pcp, o, n)					\
+({									\
+	u128 old__ = __pcpu_cast_128((o), (o));				\
+	u128 new__ = __pcpu_cast_128((n), (n));				\
+	typedef typeof(pcp) pcp_op_T__;					\
+	pcp_op_T__ *ptr__;						\
+	u128 ret__;							\
+	preempt_disable_notrace();					\
+	ptr__ = raw_cpu_ptr(&(pcp));					\
+	ret__ = cmpxchg128_local((void *)ptr__, old__, new__);		\
+	preempt_enable_notrace();					\
+	(typeof(pcp))__pcpu_cast_128(*ptr__, ret__);			\
+})
+
 #ifdef __KVM_NVHE_HYPERVISOR__
 extern unsigned long __hyp_per_cpu_offset(unsigned int cpu);
 #define __per_cpu_offset
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -148,6 +148,26 @@
 #define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
 
+#define __pcpu_cast_128(_exp, _val)					\
+	_Generic((_exp),						\
+		 u128: (_val),						\
+		 s128: (_val),						\
+		 default: (unsigned long)(_val))
+
+#define this_cpu_cmpxchg_16(pcp, oval, nval)				\
+({									\
+	u128 old__ = __pcpu_cast_128((oval), (oval));			\
+	u128 new__ = __pcpu_cast_128((nval), (nval));			\
+	typedef typeof(pcp) pcp_op_T__;					\
+	pcp_op_T__ *ptr__;						\
+	u128 ret__;							\
+	preempt_disable_notrace();					\
+	ptr__ = raw_cpu_ptr(&(pcp));					\
+	ret__ = cmpxchg128((void *)ptr__, old__, new__);		\
+	preempt_enable_notrace();					\
+	(typeof(pcp))__pcpu_cast_128(*ptr__, ret__);			\
+})
+
 #define arch_this_cpu_xchg(pcp, nval)					\
 ({									\
 	typeof(pcp) *ptr__;						\
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,58 @@ do {									\
 	(typeof(_var))(unsigned long) pco_old__;			\
 })
 
+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_CMPXCHG64)
+#define __pcpu_cast_64(_exp, _val)					\
+	_Generic((_exp),						\
+		 u64: (_val),						\
+		 s64: (_val),						\
+		 default: (unsigned long)(_val))
+
+#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)		\
+({									\
+	__pcpu_type_##size pco_old__ = __pcpu_cast_64((_oval), (_oval));\
+	__pcpu_type_##size pco_new__ = __pcpu_cast_64((_nval), (_nval));\
+	asm qual ("cmpxchg8b " __percpu_arg([var])			\
+		  : [var] "+m" (_var),					\
+		    "+A" (pco_old__)					\
+		  : "b" ((u32)pco_new__), "c" ((u32)(pco_new__ >> 32))	\
+		  : "memory");						\
+	(typeof(_var))__pcpu_cast_64(_var, pco_old__);			\
+})
+
+#define raw_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg64_op(8,         , pcp, oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+#endif
+
+#ifdef CONFIG_X86_64
+#define __pcpu_cast_128(_exp, _val)					\
+	_Generic((_exp),						\
+		 u128: (_val),						\
+		 s128: (_val),						\
+		 default: (unsigned long)(_val))
+
+#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)		\
+({									\
+	union __u128_halves pco_old__ = {				\
+		.full = __pcpu_cast_128((_oval), (_oval))		\
+	};								\
+	union __u128_halves pco_new__ = {				\
+		.full = __pcpu_cast_128((_nval), (_nval))		\
+	};								\
+	asm qual ("cmpxchg16b " __percpu_arg([var])			\
+		  : [var] "+m" (_var),					\
+		    "+a" (pco_old__.low),				\
+		    "+d" (pco_old__.high)				\
+		  : "b" (pco_new__.low),				\
+		    "c" (pco_new__.high)				\
+		  : "memory");						\
+	(typeof(_var))__pcpu_cast_128(_var, pco_old__.full);		\
+})
+
+#define raw_cpu_cmpxchg_16(pcp, oval, nval)	percpu_cmpxchg128_op(16,         , pcp, oval, nval)
+#define this_cpu_cmpxchg_16(pcp, oval, nval)	percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+#endif
+
 /*
  * this_cpu_read() makes gcc load the percpu variable every time it is
  * accessed while this_cpu_read_stable() allows the value to be cached.
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -298,6 +298,10 @@ do {									\
 #define raw_cpu_cmpxchg_8(pcp, oval, nval) \
 	raw_cpu_generic_cmpxchg(pcp, oval, nval)
 #endif
+#ifndef raw_cpu_cmpxchg_16
+#define raw_cpu_cmpxchg_16(pcp, oval, nval) \
+	raw_cpu_generic_cmpxchg(pcp, oval, nval)
+#endif
 
 #ifndef raw_cpu_cmpxchg_double_1
 #define raw_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \
@@ -423,6 +427,10 @@ do {									\
 #define this_cpu_cmpxchg_8(pcp, oval, nval) \
 	this_cpu_generic_cmpxchg(pcp, oval, nval)
 #endif
+#ifndef this_cpu_cmpxchg_16
+#define this_cpu_cmpxchg_16(pcp, oval, nval) \
+	this_cpu_generic_cmpxchg(pcp, oval, nval)
+#endif
 
 #ifndef this_cpu_cmpxchg_double_1
 #define this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -343,6 +343,22 @@ static inline void __this_cpu_preempt_ch
 	pscr2_ret__;							\
 })
 
+#define __pcpu_size16_call_return2(stem, variable, ...)			\
+({									\
+	typeof(variable) pscr2_ret__;					\
+	__verify_pcpu_ptr(&(variable));					\
+	switch(sizeof(variable)) {					\
+	case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break;	\
+	case 2: pscr2_ret__ = stem##2(variable, __VA_ARGS__); break;	\
+	case 4: pscr2_ret__ = stem##4(variable, __VA_ARGS__); break;	\
+	case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break;	\
+	case 16: pscr2_ret__ = stem##16(variable, __VA_ARGS__); break;	\
+	default:							\
+		__bad_size_call_parameter(); break;			\
+	}								\
+	pscr2_ret__;							\
+})
+
 /*
  * Special handling for cmpxchg_double.  cmpxchg_double is passed two
  * percpu variables.  The first has to be aligned to a double word
@@ -425,7 +441,7 @@ do {									\
 #define raw_cpu_add_return(pcp, val)	__pcpu_size_call_return2(raw_cpu_add_return_, pcp, val)
 #define raw_cpu_xchg(pcp, nval)		__pcpu_size_call_return2(raw_cpu_xchg_, pcp, nval)
 #define raw_cpu_cmpxchg(pcp, oval, nval) \
-	__pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
+	__pcpu_size16_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
 #define raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 	__pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
 
@@ -512,7 +528,7 @@ do {									\
 #define this_cpu_add_return(pcp, val)	__pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
 #define this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
 #define this_cpu_cmpxchg(pcp, oval, nval) \
-	__pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
+	__pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
 #define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 	__pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
 




* [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128()
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (6 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 07/12] percpu: Wire up cmpxchg128 Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2023-01-10  7:23   ` Heiko Carstens
  2022-12-19 15:35 ` [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double() Peter Zijlstra
                   ` (4 subsequent siblings)
  12 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

In order to deprecate cmpxchg_double(), replace all its usage with
cmpxchg128().
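
All three conversion sites below follow the same pattern (a sketch,
using the union introduced in this patch):

	union hws_flags_and_overflow old_fao = te->flags_and_overflow;
	union hws_flags_and_overflow new_fao;

	do {
		new_fao = (union hws_flags_and_overflow){
			.flags    = old_fao.flags,
			.overflow = 0,
		};
		/* ... adjust new_fao.flags for the specific site ... */
	} while (!try_cmpxchg128(&te->flags_and_overflow.full,
				 &old_fao.full, new_fao.full));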

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/s390/include/asm/cpu_mf.h  |   29 ++++++++++++-----
 arch/s390/kernel/perf_cpum_sf.c |   65 +++++++++++++++++++++++++---------------
 2 files changed, 63 insertions(+), 31 deletions(-)

--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -131,19 +131,32 @@ struct hws_combined_entry {
 	struct hws_diag_entry	diag;	/* Diagnostic-sampling data entry */
 } __packed;
 
+union hws_flags_and_overflow {
+	struct {
+		unsigned long flags;
+		unsigned long overflow;
+	};
+	u128	full;
+};
+
 struct hws_trailer_entry {
 	union {
 		struct {
-			unsigned int f:1;	/* 0 - Block Full Indicator   */
-			unsigned int a:1;	/* 1 - Alert request control  */
-			unsigned int t:1;	/* 2 - Timestamp format	      */
-			unsigned int :29;	/* 3 - 31: Reserved	      */
-			unsigned int bsdes:16;	/* 32-47: size of basic SDE   */
-			unsigned int dsdes:16;	/* 48-63: size of diagnostic SDE */
+			union {
+				struct {
+					unsigned int f:1;	/* 0 - Block Full Indicator   */
+					unsigned int a:1;	/* 1 - Alert request control  */
+					unsigned int t:1;	/* 2 - Timestamp format	      */
+					unsigned int :29;	/* 3 - 31: Reserved	      */
+					unsigned int bsdes:16;	/* 32-47: size of basic SDE   */
+					unsigned int dsdes:16;	/* 48-63: size of diagnostic SDE */
+				};
+				unsigned long long flags;	/* 0 - 63: All indicators     */
+			};
+			unsigned long long overflow;	 /* 64 - sample Overflow count	      */
 		};
-		unsigned long long flags;	/* 0 - 63: All indicators     */
+		union hws_flags_and_overflow flags_and_overflow;
 	};
-	unsigned long long overflow;	 /* 64 - sample Overflow count	      */
 	unsigned char timestamp[16];	 /* 16 - 31 timestamp		      */
 	unsigned long long reserved1;	 /* 32 -Reserved		      */
 	unsigned long long reserved2;	 /*				      */
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1227,6 +1227,8 @@ static void hw_collect_samples(struct pe
 	}
 }
 
+typedef union hws_flags_and_overflow fao_t;
+
 /* hw_perf_event_update() - Process sampling buffer
  * @event:	The perf event
  * @flush_all:	Flag to also flush partially filled sample-data-blocks
@@ -1243,10 +1245,11 @@ static void hw_collect_samples(struct pe
  */
 static void hw_perf_event_update(struct perf_event *event, int flush_all)
 {
+	unsigned long long event_overflow, sampl_overflow, num_sdb;
 	struct hw_perf_event *hwc = &event->hw;
 	struct hws_trailer_entry *te;
+	fao_t old_fao, new_fao;
 	unsigned long *sdbt;
-	unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
 	int done;
 
 	/*
@@ -1294,12 +1297,16 @@ static void hw_perf_event_update(struct
 		num_sdb++;
 
 		/* Reset trailer (using compare-double-and-swap) */
+		old_fao = te->flags_and_overflow;
 		do {
-			te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
-			te_flags |= SDB_TE_ALERT_REQ_MASK;
-		} while (!cmpxchg_double(&te->flags, &te->overflow,
-					 te->flags, te->overflow,
-					 te_flags, 0ULL));
+			new_fao = (fao_t){
+				.flags = old_fao.flags,
+				.overflow = 0,
+			};
+			new_fao.flags &= ~SDB_TE_BUFFER_FULL_MASK;
+			new_fao.flags |= SDB_TE_ALERT_REQ_MASK;
+		} while (!try_cmpxchg128(&te->flags_and_overflow.full,
+					 &old_fao.full, new_fao.full));
 
 		/* Advance to next sample-data-block */
 		sdbt++;
@@ -1475,14 +1482,19 @@ static int aux_output_begin(struct perf_
 static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
 			  unsigned long long *overflow)
 {
-	unsigned long long orig_overflow, orig_flags, new_flags;
 	struct hws_trailer_entry *te;
+	fao_t old_fao, new_fao;
 
 	te = aux_sdb_trailer(aux, alert_index);
+
+	old_fao = te->flags_and_overflow;
 	do {
-		orig_flags = te->flags;
-		*overflow = orig_overflow = te->overflow;
-		if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+		new_fao = (fao_t){
+			.flags = old_fao.flags,
+			.overflow = 0,
+		};
+		*overflow = old_fao.overflow;
+		if (new_fao.flags & SDB_TE_BUFFER_FULL_MASK) {
 			/*
 			 * SDB is already set by hardware.
 			 * Abort and try to set somewhere
@@ -1490,10 +1502,11 @@ static bool aux_set_alert(struct aux_buf
 			 */
 			return false;
 		}
-		new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
-	} while (!cmpxchg_double(&te->flags, &te->overflow,
-				 orig_flags, orig_overflow,
-				 new_flags, 0ULL));
+		new_fao.flags |= SDB_TE_ALERT_REQ_MASK;
+
+	} while (!try_cmpxchg128(&te->flags_and_overflow.full,
+				 &old_fao.full, new_fao.full));
+
 	return true;
 }
 
@@ -1522,9 +1535,10 @@ static bool aux_set_alert(struct aux_buf
 static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
 			     unsigned long long *overflow)
 {
-	unsigned long long orig_overflow, orig_flags, new_flags;
 	unsigned long i, range_scan, idx, idx_old;
+	unsigned long long orig_overflow;
 	struct hws_trailer_entry *te;
+	fao_t old_fao, new_fao;
 
 	debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
 			    "empty %ld\n", __func__, range, aux->head,
@@ -1554,17 +1568,22 @@ static bool aux_reset_buffer(struct aux_
 	idx_old = idx = aux->empty_mark + 1;
 	for (i = 0; i < range_scan; i++, idx++) {
 		te = aux_sdb_trailer(aux, idx);
+
+		old_fao = te->flags_and_overflow;
 		do {
-			orig_flags = te->flags;
-			orig_overflow = te->overflow;
-			new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+			new_fao = (fao_t){
+				.flags = old_fao.flags,
+				.overflow = 0,
+			};
+			orig_overflow = old_fao.overflow;
+			new_fao.flags &= ~SDB_TE_BUFFER_FULL_MASK;
 			if (idx == aux->alert_mark)
-				new_flags |= SDB_TE_ALERT_REQ_MASK;
+				new_fao.flags |= SDB_TE_ALERT_REQ_MASK;
 			else
-				new_flags &= ~SDB_TE_ALERT_REQ_MASK;
-		} while (!cmpxchg_double(&te->flags, &te->overflow,
-					 orig_flags, orig_overflow,
-					 new_flags, 0ULL));
+				new_fao.flags &= ~SDB_TE_ALERT_REQ_MASK;
+		} while (!try_cmpxchg128(&te->flags_and_overflow.full,
+					 &old_fao.full, new_fao.full));
+
 		*overflow += orig_overflow;
 	}
 




* [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double()
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (7 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128() Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-19 16:47   ` Niklas Schnelle
  2022-12-28  8:40   ` Vasant Hegde
  2022-12-19 15:35 ` [RFC][PATCH 10/12] x86,intel_iommu: " Peter Zijlstra
                   ` (3 subsequent siblings)
  12 siblings, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Replace the cmpxchg_double() usage in the AMD IOMMU IRTE update path
with try_cmpxchg128() on a single, naturally aligned 128-bit IRTE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/iommu/amd/amd_iommu_types.h |    9 +++++++--
 drivers/iommu/amd/iommu.c           |   10 ++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -979,8 +979,13 @@ union irte_ga_hi {
 };
 
 struct irte_ga {
-	union irte_ga_lo lo;
-	union irte_ga_hi hi;
+	union {
+		struct {
+			union irte_ga_lo lo;
+			union irte_ga_hi hi;
+		};
+		u128 irte;
+	};
 };
 
 struct irq_2_irte {
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2992,10 +2992,10 @@ static int alloc_irq_index(struct amd_io
 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
 			  struct irte_ga *irte, struct amd_ir_data *data)
 {
-	bool ret;
 	struct irq_remap_table *table;
-	unsigned long flags;
 	struct irte_ga *entry;
+	unsigned long flags;
+	u128 old;
 
 	table = get_irq_table(iommu, devid);
 	if (!table)
@@ -3006,16 +3006,14 @@ static int modify_irte_ga(struct amd_iom
 	entry = (struct irte_ga *)table->table;
 	entry = &entry[index];
 
-	ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
-			     entry->lo.val, entry->hi.val,
-			     irte->lo.val, irte->hi.val);
 	/*
 	 * We use cmpxchg16 to atomically update the 128-bit IRTE,
 	 * and it cannot be updated by the hardware or other processors
 	 * behind us, so the return value of cmpxchg16 should be the
 	 * same as the old value.
 	 */
-	WARN_ON(!ret);
+	old = entry->irte;
+	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
 
 	if (data)
 		data->ref = entry;




* [RFC][PATCH 10/12] x86,intel_iommu: Replace cmpxchg_double()
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (8 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double() Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-19 15:35 ` [RFC][PATCH 11/12] slub: " Peter Zijlstra
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/iommu/intel/irq_remapping.c |    8 --
 include/linux/dmar.h                |  125 +++++++++++++++++++-----------------
 2 files changed, 68 insertions(+), 65 deletions(-)

--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -174,18 +174,14 @@ static int modify_irte(struct irq_2_iomm
 	irte = &iommu->ir_table->base[index];
 
 	if ((irte->pst == 1) || (irte_modified->pst == 1)) {
-		bool ret;
-
-		ret = cmpxchg_double(&irte->low, &irte->high,
-				     irte->low, irte->high,
-				     irte_modified->low, irte_modified->high);
 		/*
 		 * We use cmpxchg16 to atomically update the 128-bit IRTE,
 		 * and it cannot be updated by the hardware or other processors
 		 * behind us, so the return value of cmpxchg16 should be the
 		 * same as the old value.
 		 */
-		WARN_ON(!ret);
+		u128 old = irte->irte;
+		WARN_ON(!try_cmpxchg128(&irte->irte, &old, irte_modified->irte));
 	} else {
 		WRITE_ONCE(irte->low, irte_modified->low);
 		WRITE_ONCE(irte->high, irte_modified->high);
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -201,67 +201,74 @@ static inline void detect_intel_iommu(vo
 
 struct irte {
 	union {
-		/* Shared between remapped and posted mode*/
 		struct {
-			__u64	present		: 1,  /*  0      */
-				fpd		: 1,  /*  1      */
-				__res0		: 6,  /*  2 -  6 */
-				avail		: 4,  /*  8 - 11 */
-				__res1		: 3,  /* 12 - 14 */
-				pst		: 1,  /* 15      */
-				vector		: 8,  /* 16 - 23 */
-				__res2		: 40; /* 24 - 63 */
+			union {
+				/* Shared between remapped and posted mode*/
+				struct {
+					__u64	present		: 1,  /*  0      */
+						fpd		: 1,  /*  1      */
+						__res0		: 6,  /*  2 -  6 */
+						avail		: 4,  /*  8 - 11 */
+						__res1		: 3,  /* 12 - 14 */
+						pst		: 1,  /* 15      */
+						vector		: 8,  /* 16 - 23 */
+						__res2		: 40; /* 24 - 63 */
+				};
+
+				/* Remapped mode */
+				struct {
+					__u64	r_present	: 1,  /*  0      */
+						r_fpd		: 1,  /*  1      */
+						dst_mode	: 1,  /*  2      */
+						redir_hint	: 1,  /*  3      */
+						trigger_mode	: 1,  /*  4      */
+						dlvry_mode	: 3,  /*  5 -  7 */
+						r_avail		: 4,  /*  8 - 11 */
+						r_res0		: 4,  /* 12 - 15 */
+						r_vector	: 8,  /* 16 - 23 */
+						r_res1		: 8,  /* 24 - 31 */
+						dest_id		: 32; /* 32 - 63 */
+				};
+
+				/* Posted mode */
+				struct {
+					__u64	p_present	: 1,  /*  0      */
+						p_fpd		: 1,  /*  1      */
+						p_res0		: 6,  /*  2 -  7 */
+						p_avail		: 4,  /*  8 - 11 */
+						p_res1		: 2,  /* 12 - 13 */
+						p_urgent	: 1,  /* 14      */
+						p_pst		: 1,  /* 15      */
+						p_vector	: 8,  /* 16 - 23 */
+						p_res2		: 14, /* 24 - 37 */
+						pda_l		: 26; /* 38 - 63 */
+				};
+				__u64 low;
+			};
+
+			union {
+				/* Shared between remapped and posted mode*/
+				struct {
+					__u64	sid		: 16,  /* 64 - 79  */
+						sq		: 2,   /* 80 - 81  */
+						svt		: 2,   /* 82 - 83  */
+						__res3		: 44;  /* 84 - 127 */
+				};
+
+				/* Posted mode*/
+				struct {
+					__u64	p_sid		: 16,  /* 64 - 79  */
+						p_sq		: 2,   /* 80 - 81  */
+						p_svt		: 2,   /* 82 - 83  */
+						p_res3		: 12,  /* 84 - 95  */
+						pda_h		: 32;  /* 96 - 127 */
+				};
+				__u64 high;
+			};
 		};
-
-		/* Remapped mode */
-		struct {
-			__u64	r_present	: 1,  /*  0      */
-				r_fpd		: 1,  /*  1      */
-				dst_mode	: 1,  /*  2      */
-				redir_hint	: 1,  /*  3      */
-				trigger_mode	: 1,  /*  4      */
-				dlvry_mode	: 3,  /*  5 -  7 */
-				r_avail		: 4,  /*  8 - 11 */
-				r_res0		: 4,  /* 12 - 15 */
-				r_vector	: 8,  /* 16 - 23 */
-				r_res1		: 8,  /* 24 - 31 */
-				dest_id		: 32; /* 32 - 63 */
-		};
-
-		/* Posted mode */
-		struct {
-			__u64	p_present	: 1,  /*  0      */
-				p_fpd		: 1,  /*  1      */
-				p_res0		: 6,  /*  2 -  7 */
-				p_avail		: 4,  /*  8 - 11 */
-				p_res1		: 2,  /* 12 - 13 */
-				p_urgent	: 1,  /* 14      */
-				p_pst		: 1,  /* 15      */
-				p_vector	: 8,  /* 16 - 23 */
-				p_res2		: 14, /* 24 - 37 */
-				pda_l		: 26; /* 38 - 63 */
-		};
-		__u64 low;
-	};
-
-	union {
-		/* Shared between remapped and posted mode*/
-		struct {
-			__u64	sid		: 16,  /* 64 - 79  */
-				sq		: 2,   /* 80 - 81  */
-				svt		: 2,   /* 82 - 83  */
-				__res3		: 44;  /* 84 - 127 */
-		};
-
-		/* Posted mode*/
-		struct {
-			__u64	p_sid		: 16,  /* 64 - 79  */
-				p_sq		: 2,   /* 80 - 81  */
-				p_svt		: 2,   /* 82 - 83  */
-				p_res3		: 12,  /* 84 - 95  */
-				pda_h		: 32;  /* 96 - 127 */
-		};
-		__u64 high;
+#ifdef CONFIG_IRQ_REMAP
+		__u128 irte;
+#endif
 	};
 };
 



^ permalink raw reply	[flat|nested] 57+ messages in thread

* [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (9 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 10/12] x86,intel_iommu: " Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2023-01-03 15:58   ` Vlastimil Babka
  2023-01-03 17:16   ` Heiko Carstens
  2022-12-19 15:35 ` [RFC][PATCH 12/12] arch: Remove cmpxchg_double Peter Zijlstra
  2022-12-22  1:21 ` [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Boqun Feng
  12 siblings, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/slub_def.h |   12 ++-
 mm/slab.h                |   41 +++++++++++--
 mm/slub.c                |  146 ++++++++++++++++++++++++++++-------------------
 3 files changed, 135 insertions(+), 64 deletions(-)

--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -39,15 +39,21 @@ enum stat_item {
 	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
 	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
 	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
-	NR_SLUB_STAT_ITEMS };
+	NR_SLUB_STAT_ITEMS
+};
 
 /*
  * When changing the layout, make sure freelist and tid are still compatible
  * with this_cpu_cmpxchg_double() alignment requirements.
  */
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to next available object */
-	unsigned long tid;	/* Globally unique transaction id */
+	union {
+		struct {
+			void **freelist;	/* Pointer to next available object */
+			unsigned long tid;	/* Globally unique transaction id */
+		};
+		freelist_aba_t freelist_tid;
+	};
 	struct slab *slab;	/* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	struct slab *partial;	/* Partially allocated frozen slabs */
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -5,6 +5,32 @@
  * Internal slab definitions
  */
 
+/*
+ * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
+ * problems with cmpxchg of just a pointer.
+ */
+typedef union {
+	struct {
+		void *freelist;
+		unsigned long counter;
+	};
+#ifdef CONFIG_64BIT
+	u128 full;
+#else
+	u64 full;
+#endif
+} freelist_aba_t;
+
+#ifdef CONFIG_64BIT
+# ifdef system_has_cmpxchg128
+# define system_has_freelist_aba() system_has_cmpxchg128()
+# endif
+#else /* CONFIG_64BIT */
+# ifdef system_has_cmpxchg64
+# define system_has_freelist_aba() system_has_cmpxchg64()
+# endif
+#endif /* CONFIG_64BIT */
+
 /* Reuses the bits in struct page */
 struct slab {
 	unsigned long __page_flags;
@@ -34,14 +60,19 @@ struct slab {
 	};
 	struct kmem_cache *slab_cache;
 	/* Double-word boundary */
-	void *freelist;		/* first free object */
 	union {
-		unsigned long counters;
 		struct {
-			unsigned inuse:16;
-			unsigned objects:15;
-			unsigned frozen:1;
+			void *freelist;		/* first free object */
+			union {
+				unsigned long counters;
+				struct {
+					unsigned inuse:16;
+					unsigned objects:15;
+					unsigned frozen:1;
+				};
+			};
 		};
+		freelist_aba_t freelist_counter;
 	};
 	unsigned int __unused;
 
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -280,7 +280,13 @@ static inline bool kmem_cache_has_cpu_pa
 /* Poison object */
 #define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
 /* Use cmpxchg_double */
+
+#if defined(system_has_freelist_aba) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 #define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
+#else
+#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0U)
+#endif
 
 /*
  * Tracking user of a slab.
@@ -496,6 +502,47 @@ static __always_inline void slab_unlock(
 	__bit_spin_unlock(PG_locked, &page->flags);
 }
 
+static inline bool
+__update_freelist_fast(struct slab *slab,
+		      void *freelist_old, unsigned long counters_old,
+		      void *freelist_new, unsigned long counters_new)
+{
+
+	bool ret = false;
+
+#ifdef system_has_freelist_aba
+	freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
+	freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
+
+#ifdef CONFIG_64BIT
+	ret = try_cmpxchg128(&slab->freelist_counter.full, &old.full, new.full);
+#else
+	ret = try_cmpxchg64(&slab->freelist_counter.full, &old.full, new.full);
+#endif
+#endif /* system_has_freelist_aba */
+
+	return ret;
+}
+
+static inline bool
+__update_freelist_slow(struct slab *slab,
+		      void *freelist_old, unsigned long counters_old,
+		      void *freelist_new, unsigned long counters_new)
+{
+	bool ret = false;
+
+	slab_lock(slab);
+	if (slab->freelist == freelist_old &&
+	    slab->counters == counters_old) {
+		slab->freelist = freelist_new;
+		slab->counters = counters_new;
+		ret = true;
+	}
+	slab_unlock(slab);
+
+	return ret;
+}
+
 /*
  * Interrupts must be disabled (for the fallback code to work right), typically
  * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
@@ -503,33 +550,25 @@ static __always_inline void slab_unlock(
  * allocation/ free operation in hardirq context. Therefore nothing can
  * interrupt the operation.
  */
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
 		void *freelist_old, unsigned long counters_old,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
+	bool ret;
+
 	if (USE_LOCKLESS_FAST_PATH())
 		lockdep_assert_irqs_disabled();
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
-    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+
 	if (s->flags & __CMPXCHG_DOUBLE) {
-		if (cmpxchg_double(&slab->freelist, &slab->counters,
-				   freelist_old, counters_old,
-				   freelist_new, counters_new))
-			return true;
-	} else
-#endif
-	{
-		slab_lock(slab);
-		if (slab->freelist == freelist_old &&
-					slab->counters == counters_old) {
-			slab->freelist = freelist_new;
-			slab->counters = counters_new;
-			slab_unlock(slab);
-			return true;
-		}
-		slab_unlock(slab);
+		ret = __update_freelist_fast(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
+	} else {
+		ret = __update_freelist_slow(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
 	}
+	if (likely(ret))
+		return true;
 
 	cpu_relax();
 	stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -541,36 +580,26 @@ static inline bool __cmpxchg_double_slab
 	return false;
 }
 
-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
 		void *freelist_old, unsigned long counters_old,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
-    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+	bool ret;
+
 	if (s->flags & __CMPXCHG_DOUBLE) {
-		if (cmpxchg_double(&slab->freelist, &slab->counters,
-				   freelist_old, counters_old,
-				   freelist_new, counters_new))
-			return true;
-	} else
-#endif
-	{
+		ret = __update_freelist_fast(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
+	} else {
 		unsigned long flags;
 
 		local_irq_save(flags);
-		slab_lock(slab);
-		if (slab->freelist == freelist_old &&
-					slab->counters == counters_old) {
-			slab->freelist = freelist_new;
-			slab->counters = counters_new;
-			slab_unlock(slab);
-			local_irq_restore(flags);
-			return true;
-		}
-		slab_unlock(slab);
+		ret = __update_freelist_slow(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
 		local_irq_restore(flags);
 	}
+	if (likely(ret))
+		return true;
 
 	cpu_relax();
 	stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -2168,7 +2197,7 @@ static inline void *acquire_slab(struct
 	VM_BUG_ON(new.frozen);
 	new.frozen = 1;
 
-	if (!__cmpxchg_double_slab(s, slab,
+	if (!__slab_update_freelist(s, slab,
 			freelist, counters,
 			new.freelist, new.counters,
 			"acquire_slab"))
@@ -2500,7 +2529,7 @@ static void deactivate_slab(struct kmem_
 	}
 
 
-	if (!cmpxchg_double_slab(s, slab,
+	if (!slab_update_freelist(s, slab,
 				old.freelist, old.counters,
 				new.freelist, new.counters,
 				"unfreezing slab")) {
@@ -2561,7 +2590,7 @@ static void __unfreeze_partials(struct k
 
 			new.frozen = 0;
 
-		} while (!__cmpxchg_double_slab(s, slab,
+		} while (!__slab_update_freelist(s, slab,
 				old.freelist, old.counters,
 				new.freelist, new.counters,
 				"unfreezing slab"));
@@ -3022,7 +3051,7 @@ static inline void *get_freelist(struct
 		new.inuse = slab->objects;
 		new.frozen = freelist != NULL;
 
-	} while (!__cmpxchg_double_slab(s, slab,
+	} while (!__slab_update_freelist(s, slab,
 		freelist, counters,
 		NULL, new.counters,
 		"get_freelist"));
@@ -3295,6 +3324,18 @@ static __always_inline void maybe_wipe_o
 			0, sizeof(void *));
 }
 
+static inline bool
+__update_cpu_freelist_fast(struct kmem_cache *s,
+			   void *freelist_old, void *freelist_new,
+			   unsigned long tid)
+{
+	freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+	freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+
+	return this_cpu_cmpxchg(s->cpu_slab->freelist_tid.full,
+				old.full, new.full) == old.full;
+}
+
 /*
  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
  * have the fastpath folded into their functions. So no function call
@@ -3379,11 +3420,7 @@ static __always_inline void *slab_alloc_
 		 * against code executing on this cpu *not* from access by
 		 * other cpus.
 		 */
-		if (unlikely(!this_cpu_cmpxchg_double(
-				s->cpu_slab->freelist, s->cpu_slab->tid,
-				object, tid,
-				next_object, next_tid(tid)))) {
-
+		if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
 			note_cmpxchg_failure("slab_alloc", s, tid);
 			goto redo;
 		}
@@ -3517,7 +3554,7 @@ static void __slab_free(struct kmem_cach
 			}
 		}
 
-	} while (!cmpxchg_double_slab(s, slab,
+	} while (!slab_update_freelist(s, slab,
 		prior, counters,
 		head, new.counters,
 		"__slab_free"));
@@ -3621,11 +3658,7 @@ static __always_inline void do_slab_free
 
 		set_freepointer(s, tail_obj, freelist);
 
-		if (unlikely(!this_cpu_cmpxchg_double(
-				s->cpu_slab->freelist, s->cpu_slab->tid,
-				freelist, tid,
-				head, next_tid(tid)))) {
-
+		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
 			note_cmpxchg_failure("slab_free", s, tid);
 			goto redo;
 		}
@@ -4319,11 +4352,12 @@ static int kmem_cache_open(struct kmem_c
 		}
 	}
 
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+#if defined(system_has_freelist_aba) && \
     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
-	if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
 		/* Enable fast mode */
 		s->flags |= __CMPXCHG_DOUBLE;
+	}
 #endif
 
 	/*



^ permalink raw reply	[flat|nested] 57+ messages in thread

* [RFC][PATCH 12/12] arch: Remove cmpxchg_double
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (10 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 11/12] slub: " Peter Zijlstra
@ 2022-12-19 15:35 ` Peter Zijlstra
  2022-12-22  1:21 ` [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Boqun Feng
  12 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 15:35 UTC (permalink / raw)
  To: torvalds
  Cc: corbet, will, peterz, boqun.feng, mark.rutland, catalin.marinas,
	dennis, tj, cl, hca, gor, agordeev, borntraeger, svens,
	Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

No moar users, remove the monster.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 Documentation/core-api/this_cpu_ops.rst    |    2 -
 arch/arm64/include/asm/atomic_ll_sc.h      |   33 ----------------
 arch/arm64/include/asm/atomic_lse.h        |   36 ------------------
 arch/arm64/include/asm/cmpxchg.h           |   46 -----------------------
 arch/arm64/include/asm/percpu.h            |   10 -----
 arch/s390/include/asm/cmpxchg.h            |   34 -----------------
 arch/s390/include/asm/percpu.h             |   18 ---------
 arch/x86/include/asm/cmpxchg.h             |   25 ------------
 arch/x86/include/asm/cmpxchg_32.h          |    1 
 arch/x86/include/asm/cmpxchg_64.h          |    1 
 arch/x86/include/asm/percpu.h              |   41 --------------------
 include/asm-generic/percpu.h               |   58 -----------------------------
 include/linux/atomic/atomic-instrumented.h |   17 --------
 include/linux/percpu-defs.h                |   38 -------------------
 scripts/atomic/gen-atomic-instrumented.sh  |   17 ++------
 15 files changed, 6 insertions(+), 371 deletions(-)

--- a/Documentation/core-api/this_cpu_ops.rst
+++ b/Documentation/core-api/this_cpu_ops.rst
@@ -53,7 +53,6 @@ are defined. These operations can be use
 	this_cpu_add_return(pcp, val)
 	this_cpu_xchg(pcp, nval)
 	this_cpu_cmpxchg(pcp, oval, nval)
-	this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
 	this_cpu_sub(pcp, val)
 	this_cpu_inc(pcp)
 	this_cpu_dec(pcp)
@@ -242,7 +241,6 @@ modifies the variable, then RMW actions
 	__this_cpu_add_return(pcp, val)
 	__this_cpu_xchg(pcp, nval)
 	__this_cpu_cmpxchg(pcp, oval, nval)
-	__this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
 	__this_cpu_sub(pcp, val)
 	__this_cpu_inc(pcp)
 	__this_cpu_dec(pcp)
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -294,39 +294,6 @@ __CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,
 
 #undef __CMPXCHG_CASE
 
-#define __CMPXCHG_DBL(name, mb, rel, cl)				\
-static __always_inline long						\
-__ll_sc__cmpxchg_double##name(unsigned long old1,			\
-				      unsigned long old2,		\
-				      unsigned long new1,		\
-				      unsigned long new2,		\
-				      volatile void *ptr)		\
-{									\
-	unsigned long tmp, ret;						\
-									\
-	asm volatile("// __cmpxchg_double" #name "\n"			\
-	"	prfm	pstl1strm, %2\n"				\
-	"1:	ldxp	%0, %1, %2\n"					\
-	"	eor	%0, %0, %3\n"					\
-	"	eor	%1, %1, %4\n"					\
-	"	orr	%1, %0, %1\n"					\
-	"	cbnz	%1, 2f\n"					\
-	"	st" #rel "xp	%w0, %5, %6, %2\n"			\
-	"	cbnz	%w0, 1b\n"					\
-	"	" #mb "\n"						\
-	"2:"								\
-	: "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr)	\
-	: "r" (old1), "r" (old2), "r" (new1), "r" (new2)		\
-	: cl);								\
-									\
-	return ret;							\
-}
-
-__CMPXCHG_DBL(   ,        ,  ,         )
-__CMPXCHG_DBL(_mb, dmb ish, l, "memory")
-
-#undef __CMPXCHG_DBL
-
 union __u128_halves {
 	u128 full;
 	struct {
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -288,42 +288,6 @@ __CMPXCHG_CASE(x,  ,  mb_, 64, al, "memo
 
 #undef __CMPXCHG_CASE
 
-#define __CMPXCHG_DBL(name, mb, cl...)					\
-static __always_inline long						\
-__lse__cmpxchg_double##name(unsigned long old1,				\
-					 unsigned long old2,		\
-					 unsigned long new1,		\
-					 unsigned long new2,		\
-					 volatile void *ptr)		\
-{									\
-	unsigned long oldval1 = old1;					\
-	unsigned long oldval2 = old2;					\
-	register unsigned long x0 asm ("x0") = old1;			\
-	register unsigned long x1 asm ("x1") = old2;			\
-	register unsigned long x2 asm ("x2") = new1;			\
-	register unsigned long x3 asm ("x3") = new2;			\
-	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
-	"	eor	%[old1], %[old1], %[oldval1]\n"			\
-	"	eor	%[old2], %[old2], %[oldval2]\n"			\
-	"	orr	%[old1], %[old1], %[old2]"			\
-	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
-	  [v] "+Q" (*(unsigned long *)ptr)				\
-	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
-	  [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)		\
-	: cl);								\
-									\
-	return x0;							\
-}
-
-__CMPXCHG_DBL(   ,   )
-__CMPXCHG_DBL(_mb, al, "memory")
-
-#undef __CMPXCHG_DBL
-
 #define __CMPXCHG128(name, mb, cl...)					\
 static __always_inline u128						\
 __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -131,22 +131,6 @@ __CMPXCHG_CASE(mb_, 64)
 
 #undef __CMPXCHG_CASE
 
-#define __CMPXCHG_DBL(name)						\
-static inline long __cmpxchg_double##name(unsigned long old1,		\
-					 unsigned long old2,		\
-					 unsigned long new1,		\
-					 unsigned long new2,		\
-					 volatile void *ptr)		\
-{									\
-	return __lse_ll_sc_body(_cmpxchg_double##name, 			\
-				old1, old2, new1, new2, ptr);		\
-}
-
-__CMPXCHG_DBL(   )
-__CMPXCHG_DBL(_mb)
-
-#undef __CMPXCHG_DBL
-
 #define __CMPXCHG128(name)						\
 static inline long __cmpxchg128##name(volatile u128 *ptr,		\
 				      u128 old, u128 new)		\
@@ -212,36 +196,6 @@ __CMPXCHG_GEN(_mb)
 #define arch_cmpxchg64			arch_cmpxchg
 #define arch_cmpxchg64_local		arch_cmpxchg_local
 
-/* cmpxchg_double */
-#define system_has_cmpxchg_double()     1
-
-#define __cmpxchg_double_check(ptr1, ptr2)					\
-({										\
-	if (sizeof(*(ptr1)) != 8)						\
-		BUILD_BUG();							\
-	VM_BUG_ON((unsigned long *)(ptr2) - (unsigned long *)(ptr1) != 1);	\
-})
-
-#define arch_cmpxchg_double(ptr1, ptr2, o1, o2, n1, n2)				\
-({										\
-	int __ret;								\
-	__cmpxchg_double_check(ptr1, ptr2);					\
-	__ret = !__cmpxchg_double_mb((unsigned long)(o1), (unsigned long)(o2),	\
-				     (unsigned long)(n1), (unsigned long)(n2),	\
-				     ptr1);					\
-	__ret;									\
-})
-
-#define arch_cmpxchg_double_local(ptr1, ptr2, o1, o2, n1, n2)			\
-({										\
-	int __ret;								\
-	__cmpxchg_double_check(ptr1, ptr2);					\
-	__ret = !__cmpxchg_double((unsigned long)(o1), (unsigned long)(o2),	\
-				  (unsigned long)(n1), (unsigned long)(n2),	\
-				  ptr1);					\
-	__ret;									\
-})
-
 /* cmpxchg128 */
 #define system_has_cmpxchg128()		1
 
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -145,16 +145,6 @@ PERCPU_RET_OP(add, add, ldadd)
  * preemption point when TIF_NEED_RESCHED gets set while preemption is
  * disabled.
  */
-#define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2)		\
-({									\
-	int __ret;							\
-	preempt_disable_notrace();					\
-	__ret = cmpxchg_double_local(	raw_cpu_ptr(&(ptr1)),		\
-					raw_cpu_ptr(&(ptr2)),		\
-					o1, o2, n1, n2);		\
-	preempt_enable_notrace();					\
-	__ret;								\
-})
 
 #define _pcp_protect(op, pcp, ...)					\
 ({									\
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -167,40 +167,6 @@ static __always_inline unsigned long __c
 #define arch_cmpxchg_local	arch_cmpxchg
 #define arch_cmpxchg64_local	arch_cmpxchg
 
-#define system_has_cmpxchg_double()	1
-
-static __always_inline int __cmpxchg_double(unsigned long p1, unsigned long p2,
-					    unsigned long o1, unsigned long o2,
-					    unsigned long n1, unsigned long n2)
-{
-	union register_pair old = { .even = o1, .odd = o2, };
-	union register_pair new = { .even = n1, .odd = n2, };
-	int cc;
-
-	asm volatile(
-		"	cdsg	%[old],%[new],%[ptr]\n"
-		"	ipm	%[cc]\n"
-		"	srl	%[cc],28\n"
-		: [cc] "=&d" (cc), [old] "+&d" (old.pair)
-		: [new] "d" (new.pair),
-		  [ptr] "QS" (*(unsigned long *)p1), "Q" (*(unsigned long *)p2)
-		: "memory", "cc");
-	return !cc;
-}
-
-#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2)			\
-({									\
-	typeof(p1) __p1 = (p1);						\
-	typeof(p2) __p2 = (p2);						\
-									\
-	BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long));			\
-	BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long));			\
-	VM_BUG_ON((unsigned long)((__p1) + 1) != (unsigned long)(__p2));\
-	__cmpxchg_double((unsigned long)__p1, (unsigned long)__p2,	\
-			 (unsigned long)(o1), (unsigned long)(o2),	\
-			 (unsigned long)(n1), (unsigned long)(n2));	\
-})
-
 #define system_has_cmpxchg128()		1
 
 static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -184,24 +184,6 @@
 #define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval)
 #define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval)
 
-#define arch_this_cpu_cmpxchg_double(pcp1, pcp2, o1, o2, n1, n2)	    \
-({									    \
-	typeof(pcp1) *p1__;						    \
-	typeof(pcp2) *p2__;						    \
-	int ret__;							    \
-									    \
-	preempt_disable_notrace();					    \
-	p1__ = raw_cpu_ptr(&(pcp1));					    \
-	p2__ = raw_cpu_ptr(&(pcp2));					    \
-	ret__ = __cmpxchg_double((unsigned long)p1__, (unsigned long)p2__,  \
-				 (unsigned long)(o1), (unsigned long)(o2),  \
-				 (unsigned long)(n1), (unsigned long)(n2)); \
-	preempt_enable_notrace();					    \
-	ret__;								    \
-})
-
-#define this_cpu_cmpxchg_double_8 arch_this_cpu_cmpxchg_double
-
 #include <asm-generic/percpu.h>
 
 #endif /* __ARCH_S390_PERCPU__ */
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -233,29 +233,4 @@ extern void __add_wrong_size(void)
 #define __xadd(ptr, inc, lock)	__xchg_op((ptr), (inc), xadd, lock)
 #define xadd(ptr, inc)		__xadd((ptr), (inc), LOCK_PREFIX)
 
-#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2)			\
-({									\
-	bool __ret;							\
-	__typeof__(*(p1)) __old1 = (o1), __new1 = (n1);			\
-	__typeof__(*(p2)) __old2 = (o2), __new2 = (n2);			\
-	BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long));			\
-	BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long));			\
-	VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long)));		\
-	VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2));	\
-	asm volatile(pfx "cmpxchg%c5b %1"				\
-		     CC_SET(e)						\
-		     : CC_OUT(e) (__ret),				\
-		       "+m" (*(p1)), "+m" (*(p2)),			\
-		       "+a" (__old1), "+d" (__old2)			\
-		     : "i" (2 * sizeof(long)),				\
-		       "b" (__new1), "c" (__new2));			\
-	__ret;								\
-})
-
-#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \
-	__cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2)
-
-#define arch_cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \
-	__cmpxchg_double(, p1, p2, o1, o2, n1, n2)
-
 #endif	/* ASM_X86_CMPXCHG_H */
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -103,7 +103,6 @@ static inline bool __try_cmpxchg64(volat
 
 #endif
 
-#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX8)
 #define system_has_cmpxchg64()		boot_cpu_has(X86_FEATURE_CX8)
 
 #endif /* _ASM_X86_CMPXCHG_32_H */
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -72,7 +72,6 @@ static __always_inline bool arch_try_cmp
 	return likely(ret);
 }
 
-#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX16)
 #define system_has_cmpxchg128()		boot_cpu_has(X86_FEATURE_CX16)
 
 #endif /* _ASM_X86_CMPXCHG_64_H */
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -339,23 +339,6 @@ do {									\
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
 
-#ifdef CONFIG_X86_CMPXCHG64
-#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2)		\
-({									\
-	bool __ret;							\
-	typeof(pcp1) __o1 = (o1), __n1 = (n1);				\
-	typeof(pcp2) __o2 = (o2), __n2 = (n2);				\
-	asm volatile("cmpxchg8b "__percpu_arg(1)			\
-		     CC_SET(z)						\
-		     : CC_OUT(z) (__ret), "+m" (pcp1), "+m" (pcp2), "+a" (__o1), "+d" (__o2) \
-		     : "b" (__n1), "c" (__n2));				\
-	__ret;								\
-})
-
-#define raw_cpu_cmpxchg_double_4	percpu_cmpxchg8b_double
-#define this_cpu_cmpxchg_double_4	percpu_cmpxchg8b_double
-#endif /* CONFIG_X86_CMPXCHG64 */
-
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -378,30 +361,6 @@ do {									\
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
-
-/*
- * Pretty complex macro to generate cmpxchg16 instruction.  The instruction
- * is not supported on early AMD64 processors so we must be able to emulate
- * it in software.  The address used in the cmpxchg16 instruction must be
- * aligned to a 16 byte boundary.
- */
-#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2)		\
-({									\
-	bool __ret;							\
-	typeof(pcp1) __o1 = (o1), __n1 = (n1);				\
-	typeof(pcp2) __o2 = (o2), __n2 = (n2);				\
-	alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
-		       "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t",	\
-		       X86_FEATURE_CX16,				\
-		       ASM_OUTPUT2("=a" (__ret), "+m" (pcp1),		\
-				   "+m" (pcp2), "+d" (__o2)),		\
-		       "b" (__n1), "c" (__n2), "a" (__o1) : "rsi");	\
-	__ret;								\
-})
-
-#define raw_cpu_cmpxchg_double_8	percpu_cmpxchg16b_double
-#define this_cpu_cmpxchg_double_8	percpu_cmpxchg16b_double
-
 #endif
 
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -99,19 +99,6 @@ do {									\
 	__ret;								\
 })
 
-#define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-({									\
-	typeof(pcp1) *__p1 = raw_cpu_ptr(&(pcp1));			\
-	typeof(pcp2) *__p2 = raw_cpu_ptr(&(pcp2));			\
-	int __ret = 0;							\
-	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
-		*__p1 = nval1;						\
-		*__p2 = nval2;						\
-		__ret = 1;						\
-	}								\
-	(__ret);							\
-})
-
 #define __this_cpu_generic_read_nopreempt(pcp)				\
 ({									\
 	typeof(pcp) ___ret;						\
@@ -180,17 +167,6 @@ do {									\
 	__ret;								\
 })
 
-#define this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
-({									\
-	int __ret;							\
-	unsigned long __flags;						\
-	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_generic_cmpxchg_double(pcp1, pcp2,		\
-			oval1, oval2, nval1, nval2);			\
-	raw_local_irq_restore(__flags);					\
-	__ret;								\
-})
-
 #ifndef raw_cpu_read_1
 #define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
@@ -303,23 +279,6 @@ do {									\
 	raw_cpu_generic_cmpxchg(pcp, oval, nval)
 #endif
 
-#ifndef raw_cpu_cmpxchg_double_1
-#define raw_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-#ifndef raw_cpu_cmpxchg_double_2
-#define raw_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-#ifndef raw_cpu_cmpxchg_double_4
-#define raw_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-#ifndef raw_cpu_cmpxchg_double_8
-#define raw_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-
 #ifndef this_cpu_read_1
 #define this_cpu_read_1(pcp)		this_cpu_generic_read(pcp)
 #endif
@@ -432,21 +391,4 @@ do {									\
 	this_cpu_generic_cmpxchg(pcp, oval, nval)
 #endif
 
-#ifndef this_cpu_cmpxchg_double_1
-#define this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-#ifndef this_cpu_cmpxchg_double_2
-#define this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-#ifndef this_cpu_cmpxchg_double_4
-#define this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-#ifndef this_cpu_cmpxchg_double_8
-#define this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-#endif
-
 #endif /* _ASM_GENERIC_PERCPU_H_ */
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -2141,21 +2141,6 @@ atomic_long_dec_if_positive(atomic_long_
 	arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
 })
 
-#define cmpxchg_double(ptr, ...) \
-({ \
-	typeof(ptr) __ai_ptr = (ptr); \
-	kcsan_mb(); \
-	instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \
-	arch_cmpxchg_double(__ai_ptr, __VA_ARGS__); \
-})
-
-
-#define cmpxchg_double_local(ptr, ...) \
-({ \
-	typeof(ptr) __ai_ptr = (ptr); \
-	instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \
-	arch_cmpxchg_double_local(__ai_ptr, __VA_ARGS__); \
-})
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 27320c1ec2bf2878ecb9df3ea4816a7bc0c57a52
+// 416a741acbd4d28dbfa45f1b2a2c1b714454229f
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -359,33 +359,6 @@ static inline void __this_cpu_preempt_ch
 	pscr2_ret__;							\
 })
 
-/*
- * Special handling for cmpxchg_double.  cmpxchg_double is passed two
- * percpu variables.  The first has to be aligned to a double word
- * boundary and the second has to follow directly thereafter.
- * We enforce this on all architectures even if they don't support
- * a double cmpxchg instruction, since it's a cheap requirement, and it
- * avoids breaking the requirement for architectures with the instruction.
- */
-#define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...)		\
-({									\
-	bool pdcrb_ret__;						\
-	__verify_pcpu_ptr(&(pcp1));					\
-	BUILD_BUG_ON(sizeof(pcp1) != sizeof(pcp2));			\
-	VM_BUG_ON((unsigned long)(&(pcp1)) % (2 * sizeof(pcp1)));	\
-	VM_BUG_ON((unsigned long)(&(pcp2)) !=				\
-		  (unsigned long)(&(pcp1)) + sizeof(pcp1));		\
-	switch(sizeof(pcp1)) {						\
-	case 1: pdcrb_ret__ = stem##1(pcp1, pcp2, __VA_ARGS__); break;	\
-	case 2: pdcrb_ret__ = stem##2(pcp1, pcp2, __VA_ARGS__); break;	\
-	case 4: pdcrb_ret__ = stem##4(pcp1, pcp2, __VA_ARGS__); break;	\
-	case 8: pdcrb_ret__ = stem##8(pcp1, pcp2, __VA_ARGS__); break;	\
-	default:							\
-		__bad_size_call_parameter(); break;			\
-	}								\
-	pdcrb_ret__;							\
-})
-
 #define __pcpu_size_call(stem, variable, ...)				\
 do {									\
 	__verify_pcpu_ptr(&(variable));					\
@@ -442,9 +415,6 @@ do {									\
 #define raw_cpu_xchg(pcp, nval)		__pcpu_size_call_return2(raw_cpu_xchg_, pcp, nval)
 #define raw_cpu_cmpxchg(pcp, oval, nval) \
 	__pcpu_size16_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
-#define raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	__pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
-
 #define raw_cpu_sub(pcp, val)		raw_cpu_add(pcp, -(val))
 #define raw_cpu_inc(pcp)		raw_cpu_add(pcp, 1)
 #define raw_cpu_dec(pcp)		raw_cpu_sub(pcp, 1)
@@ -504,11 +474,6 @@ do {									\
 	raw_cpu_cmpxchg(pcp, oval, nval);				\
 })
 
-#define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-({	__this_cpu_preempt_check("cmpxchg_double");			\
-	raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2);	\
-})
-
 #define __this_cpu_sub(pcp, val)	__this_cpu_add(pcp, -(typeof(pcp))(val))
 #define __this_cpu_inc(pcp)		__this_cpu_add(pcp, 1)
 #define __this_cpu_dec(pcp)		__this_cpu_sub(pcp, 1)
@@ -529,9 +494,6 @@ do {									\
 #define this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
 #define this_cpu_cmpxchg(pcp, oval, nval) \
 	__pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
-#define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
-	__pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
-
 #define this_cpu_sub(pcp, val)		this_cpu_add(pcp, -(typeof(pcp))(val))
 #define this_cpu_inc(pcp)		this_cpu_add(pcp, 1)
 #define this_cpu_dec(pcp)		this_cpu_sub(pcp, 1)
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -84,7 +84,6 @@ gen_xchg()
 {
 	local xchg="$1"; shift
 	local order="$1"; shift
-	local mult="$1"; shift
 
 	kcsan_barrier=""
 	if [ "${xchg%_local}" = "${xchg}" ]; then
@@ -104,8 +103,8 @@ cat <<EOF
 EOF
 [ -n "$kcsan_barrier" ] && printf "\t${kcsan_barrier}; \\\\\n"
 cat <<EOF
-	instrument_atomic_write(__ai_ptr, ${mult}sizeof(*__ai_ptr)); \\
-	instrument_atomic_write(__ai_oldp, ${mult}sizeof(*__ai_oldp)); \\
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \\
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \\
 	arch_${xchg}${order}(__ai_ptr, __ai_oldp, __VA_ARGS__); \\
 })
 EOF
@@ -119,7 +118,7 @@ cat <<EOF
 EOF
 [ -n "$kcsan_barrier" ] && printf "\t${kcsan_barrier}; \\\\\n"
 cat <<EOF
-	instrument_atomic_write(__ai_ptr, ${mult}sizeof(*__ai_ptr)); \\
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \\
 	arch_${xchg}${order}(__ai_ptr, __VA_ARGS__); \\
 })
 EOF
@@ -168,22 +167,16 @@ done
 
 for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128" "try_cmpxchg" "try_cmpxchg64" "try_cmpxchg128"; do
 	for order in "" "_acquire" "_release" "_relaxed"; do
-		gen_xchg "${xchg}" "${order}" ""
+		gen_xchg "${xchg}" "${order}"
 		printf "\n"
 	done
 done
 
 for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg"; do
-	gen_xchg "${xchg}" "" ""
+	gen_xchg "${xchg}" ""
 	printf "\n"
 done
 
-gen_xchg "cmpxchg_double" "" "2 * "
-
-printf "\n\n"
-
-gen_xchg "cmpxchg_double_local" "" "2 * "
-
 cat <<EOF
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 01/12] crypto: Remove u128 usage
  2022-12-19 15:35 ` [RFC][PATCH 01/12] crypto: Remove u128 usage Peter Zijlstra
@ 2022-12-19 15:56   ` Jason A. Donenfeld
  2022-12-19 17:00     ` Peter Zijlstra
  0 siblings, 1 reply; 57+ messages in thread
From: Jason A. Donenfeld @ 2022-12-19 15:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:26PM +0100, Peter Zijlstra wrote:
> As seems to be the common (majority) usage in crypto, use __uint128_t
> instead of u128.
> 
> This frees up u128 for definition in linux/types.h.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  lib/crypto/curve25519-hacl64.c |  142 ++++++++++++++++++++---------------------
>  lib/crypto/poly1305-donna64.c  |   22 ++----
>  2 files changed, 80 insertions(+), 84 deletions(-)
> 
> --- a/lib/crypto/curve25519-hacl64.c
> +++ b/lib/crypto/curve25519-hacl64.c
> @@ -14,8 +14,6 @@
>  #include <crypto/curve25519.h>
>  #include <linux/string.h>
>  
> -typedef __uint128_t u128;
> -
>  static __always_inline u64 u64_eq_mask(u64 a, u64 b)
>  {
>  	u64 x = a ^ b;
> @@ -50,77 +48,77 @@ static __always_inline void modulo_carry
>  	b[0] = b0_;
>  }
>  
> -static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
> +static __always_inline void fproduct_copy_from_wide_(u64 *output, __uint128_t *input)
>  {
>  	{
> -		u128 xi = input[0];
> +		__uint128_t xi = input[0];

Why not just use `u128` from types.h in this file?

Jason

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double()
  2022-12-19 15:35 ` [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double() Peter Zijlstra
@ 2022-12-19 16:47   ` Niklas Schnelle
  2022-12-28  8:40   ` Vasant Hegde
  1 sibling, 0 replies; 57+ messages in thread
From: Niklas Schnelle @ 2022-12-19 16:47 UTC (permalink / raw)
  To: Peter Zijlstra, torvalds
  Cc: corbet, will, boqun.feng, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, 2022-12-19 at 16:35 +0100, Peter Zijlstra wrote:
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  drivers/iommu/amd/amd_iommu_types.h |    9 +++++++--
>  drivers/iommu/amd/iommu.c           |   10 ++++------
>  2 files changed, 11 insertions(+), 8 deletions(-)
> 
> --- a/drivers/iommu/amd/amd_iommu_types.h
> +++ b/drivers/iommu/amd/amd_iommu_types.h
> @@ -979,8 +979,13 @@ union irte_ga_hi {
>  };
>  
>  struct irte_ga {
> -	union irte_ga_lo lo;
> -	union irte_ga_hi hi;
> +	union {
> +		struct {
> +			union irte_ga_lo lo;
> +			union irte_ga_hi hi;
> +		};
> +		u128 irte;
> +	};
>  };
>  
>  struct irq_2_irte {
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -2992,10 +2992,10 @@ static int alloc_irq_index(struct amd_io
>  static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
>  			  struct irte_ga *irte, struct amd_ir_data *data)
>  {
> -	bool ret;
>  	struct irq_remap_table *table;
> -	unsigned long flags;
>  	struct irte_ga *entry;
> +	unsigned long flags;
> +	u128 old;
>  
>  	table = get_irq_table(iommu, devid);
>  	if (!table)
> @@ -3006,16 +3006,14 @@ static int modify_irte_ga(struct amd_iom
>  	entry = (struct irte_ga *)table->table;
>  	entry = &entry[index];
>  
> -	ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
> -			     entry->lo.val, entry->hi.val,
> -			     irte->lo.val, irte->hi.val);
>  	/*
>  	 * We use cmpxchg16 to atomically update the 128-bit IRTE,
>  	 * and it cannot be updated by the hardware or other processors
>  	 * behind us, so the return value of cmpxchg16 should be the
>  	 * same as the old value.

The above comment seems to have been out of date already, but it could be
updated to say cmpxchg128 instead of cmpxchg16 anyway.
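
i.e. something like:

	/*
	 * We use try_cmpxchg128() to atomically update the 128-bit IRTE,
	 * and it cannot be updated by the hardware or other processors
	 * behind us, so the result should be the same as the old value.
	 */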

>  	 */
> -	WARN_ON(!ret);
> +	old = entry->irte;
> +	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
>  
>  	if (data)
>  		data->ref = entry;
> 
> 


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 01/12] crypto: Remove u128 usage
  2022-12-19 15:56   ` Jason A. Donenfeld
@ 2022-12-19 17:00     ` Peter Zijlstra
  2022-12-19 17:03       ` Jason A. Donenfeld
  0 siblings, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-19 17:00 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:56:33PM +0100, Jason A. Donenfeld wrote:

> Why not just use `u128` from types.h in this file?

Ordering, I can't very well introduce it in types.h while other
definitions exist in the tree. So I first have to clean up the u128
namespace.
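
That is, once the namespace is clean, linux/types.h can grow something
like this (sketch only; the exact guard and spellings may differ in the
actual patch):

	#ifdef CONFIG_ARCH_SUPPORTS_INT128
	typedef __s128 s128;
	typedef __u128 u128;
	#endif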

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 01/12] crypto: Remove u128 usage
  2022-12-19 17:00     ` Peter Zijlstra
@ 2022-12-19 17:03       ` Jason A. Donenfeld
  2022-12-20  3:50         ` Herbert Xu
  0 siblings, 1 reply; 57+ messages in thread
From: Jason A. Donenfeld @ 2022-12-19 17:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 6:01 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 19, 2022 at 04:56:33PM +0100, Jason A. Donenfeld wrote:
>
> > Why not just use `u128` from types.h in this file?
>
> Ordering, I can't very well introduce it in types.h while other
> definitions exist in the tree. So I first have to clean up the u128
> namespace.

Is there a patch at the end of the series that adds it back, so this file
can use u128?

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}() Peter Zijlstra
@ 2022-12-19 20:07   ` Boqun Feng
  2022-12-20 11:08     ` Peter Zijlstra
  2022-12-22  1:25   ` Boqun Feng
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 57+ messages in thread
From: Boqun Feng @ 2022-12-19 20:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
> 
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
>  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
>  
>  #undef __CMPXCHG_DBL
> +
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +#define __CMPXCHG128(name, mb, rel, cl)					\
> +static __always_inline u128						\
> +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +									\
> +	asm volatile("// __cmpxchg128" #name "\n"			\
> +	"	prfm	pstl1strm, %2\n"				\
> +	"1:	ldxp	%0, %1, %2\n"					\
> +	"	eor	%3, %0, %3\n"					\
> +	"	eor	%4, %1, %4\n"					\
> +	"	orr	%3, %4, %3\n"					\
> +	"	cbnz	%3, 2f\n"					\
> +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> +	"	cbnz	%w3, 1b\n"					\
> +	"	" #mb "\n"						\
> +	"2:"								\
> +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\

I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
long *)ptr)"? Because compilers may think only the 64-bit value pointed
to by "ptr" gets modified, and they are then allowed to do "useful"
optimizations.

Same for lse and s390.
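
To make the concern concrete, a minimal made-up arm64 example (sketch
only, not patch code); the sole difference between the two helpers is
the type of the "+Q" operand:

	/* With the narrow cast the compiler assumes just the first 8
	 * bytes of *p change, so a cached copy of the high half may
	 * legally be reused around the asm. The u128-typed operand
	 * makes all 16 bytes visible to the compiler.
	 */
	static inline void wr_narrow(u128 *p, u64 lo, u64 hi)
	{
		asm volatile("stp %1, %2, %0"
			     : "+Q" (*(unsigned long *)p)
			     : "r" (lo), "r" (hi));
	}

	static inline void wr_wide(u128 *p, u64 lo, u64 hi)
	{
		asm volatile("stp %1, %2, %0"
			     : "+Q" (*p)
			     : "r" (lo), "r" (hi));
	}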

Regards,
Boqun

> +	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
> +	: cl);								\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,        ,  ,         )
> +__CMPXCHG128(_mb, dmb ish, l, "memory")
> +
> +#undef __CMPXCHG128
> +
>  #undef K
>  
>  #endif	/* __ASM_ATOMIC_LL_SC_H */

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 01/12] crypto: Remove u128 usage
  2022-12-19 17:03       ` Jason A. Donenfeld
@ 2022-12-20  3:50         ` Herbert Xu
  2022-12-20  4:11           ` H. Peter Anvin
  0 siblings, 1 reply; 57+ messages in thread
From: Herbert Xu @ 2022-12-20  3:50 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Peter Zijlstra, torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 06:03:04PM +0100, Jason A. Donenfeld wrote:
>
> Is there a patch at the end of the series that adds it back in to use u128?

Could we do some ifdef trickery to reduce the amount of code churn
please? Changing everything away from u128 and then back to it seems
silly.
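
E.g. keep the u128 spelling in place and only guard the local typedef,
something like (sketch only, the exact guard is TBD):

	/* Transitional: goes away once linux/types.h provides u128. */
	#ifndef CONFIG_ARCH_SUPPORTS_INT128
	typedef __uint128_t u128;
	#endif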

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 01/12] crypto: Remove u128 usage
  2022-12-20  3:50         ` Herbert Xu
@ 2022-12-20  4:11           ` H. Peter Anvin
  2022-12-20  4:15             ` Herbert Xu
  0 siblings, 1 reply; 57+ messages in thread
From: H. Peter Anvin @ 2022-12-20  4:11 UTC (permalink / raw)
  To: Herbert Xu, Jason A. Donenfeld
  Cc: Peter Zijlstra, torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, davem, tglx, mingo, bp, dave.hansen, x86, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On December 19, 2022 7:50:47 PM PST, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>On Mon, Dec 19, 2022 at 06:03:04PM +0100, Jason A. Donenfeld wrote:
>>
>> Is there a patch at the end of the series that adds it back in to use u128?
>
>Could we do some ifdef trickery to reduce the amount of code churn
>please? Changing everything away from u128 and then back to it seems
>silly.
>
>Thanks,

Seems like "merging common code snippets" is something we at least used to do with single patches...

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 01/12] crypto: Remove u128 usage
  2022-12-20  4:11           ` H. Peter Anvin
@ 2022-12-20  4:15             ` Herbert Xu
  0 siblings, 0 replies; 57+ messages in thread
From: Herbert Xu @ 2022-12-20  4:15 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jason A. Donenfeld, Peter Zijlstra, torvalds, corbet, will,
	boqun.feng, mark.rutland, catalin.marinas, dennis, tj, cl, hca,
	gor, agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, joro, suravee.suthikulpanit, robin.murphy,
	dwmw2, baolu.lu, Arnd Bergmann, penberg, rientjes,
	iamjoonsoo.kim, Andrew Morton, vbabka, roman.gushchin, 42.hyeyoo,
	linux-doc, linux-kernel, linux-mm, linux-s390, linux-crypto,
	iommu, linux-arch

On Mon, Dec 19, 2022 at 08:11:37PM -0800, H. Peter Anvin wrote:
>
> Seems like "merging common code snippets" is something we at least used to do with single patches...

I certainly don't have any objections if we go down this route.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 02/12] crypto/ghash-clmulni: Use (struct) be128
  2022-12-19 15:35 ` [RFC][PATCH 02/12] crypto/ghash-clmulni: Use (struct) be128 Peter Zijlstra
@ 2022-12-20  5:45   ` Eric Biggers
  0 siblings, 0 replies; 57+ messages in thread
From: Eric Biggers @ 2022-12-20  5:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:27PM +0100, Peter Zijlstra wrote:
> Even though x86 is firmly little endian, use be128 because le128 is in
> fact the wrong way around :/ The actual code is already using be128 in
> ghash_setkey() so this shouldn't be more confusing.
> 
> This frees up the u128 name for a real u128 type.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

This patch doesn't make sense.  The x86 ghash code is definitely storing the key
as a little endian value, not big endian.  The reason be128 shows up in
ghash_setkey() is because the code is doing a byteswap from the original key
bytes.  Also, this patch causes 'sparse' warnings.
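
(For reference, the two layouts in include/crypto/b128ops.h, condensed:

	typedef struct { __be64 a, b; } be128;
	typedef struct { __le64 b, a; } le128;

so declaring the little-endian key halves as be128 is exactly what
sparse complains about.)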

Can you consider
https://lore.kernel.org/linux-crypto/20221220054042.188537-1-ebiggers@kernel.org/T/#u
instead?

- Eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 03/12] cyrpto/b128ops: Remove struct u128
  2022-12-19 15:35 ` [RFC][PATCH 03/12] cyrpto/b128ops: Remove struct u128 Peter Zijlstra
@ 2022-12-20  5:52   ` Eric Biggers
  0 siblings, 0 replies; 57+ messages in thread
From: Eric Biggers @ 2022-12-20  5:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:28PM +0100, Peter Zijlstra wrote:
> cyrpto/b128ops: Remove struct u128

cyrpto => crypto

> Per git-grep u128_xor() and its related struct u128 are unused except
> to implement {be,le}128_xor(). Remove them to free up the namespace.

There's still a reference to u128 in drivers/crypto/vmx/ghash.c.  But it's only
dereferenced by assembly code, so it should keep working even if u128 gets
redefined to a native int.  I don't speak PowerPC, so I'm not sure what the
"correct" type is there.

- Eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-19 20:07   ` Boqun Feng
@ 2022-12-20 11:08     ` Peter Zijlstra
  2022-12-20 14:31       ` Linus Torvalds
  2023-01-03 13:25       ` Mark Rutland
  0 siblings, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-20 11:08 UTC (permalink / raw)
  To: Boqun Feng
  Cc: torvalds, corbet, will, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > For all architectures that currently support cmpxchg_double()
> > implement the cmpxchg128() family of functions that is basically the
> > same but with a saner interface.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> >  6 files changed, 185 insertions(+), 3 deletions(-)
> > 
> > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> >  
> >  #undef __CMPXCHG_DBL
> > +
> > +union __u128_halves {
> > +	u128 full;
> > +	struct {
> > +		u64 low, high;
> > +	};
> > +};
> > +
> > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > +static __always_inline u128						\
> > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > +{									\
> > +	union __u128_halves r, o = { .full = (old) },			\
> > +			       n = { .full = (new) };			\
> > +									\
> > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > +	"	prfm	pstl1strm, %2\n"				\
> > +	"1:	ldxp	%0, %1, %2\n"					\
> > +	"	eor	%3, %0, %3\n"					\
> > +	"	eor	%4, %1, %4\n"					\
> > +	"	orr	%3, %4, %3\n"					\
> > +	"	cbnz	%3, 2f\n"					\
> > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > +	"	cbnz	%w3, 1b\n"					\
> > +	"	" #mb "\n"						\
> > +	"2:"								\
> > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> 
> I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> long *) ptr)"? Because compilers may think only 64bit value pointed by
> "ptr" gets modified, and they are allowed to do "useful" optimization.

In this I've copied the existing cmpxchg_double() code; I'll have to let
the arch folks speak here, I've no clue.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-20 11:08     ` Peter Zijlstra
@ 2022-12-20 14:31       ` Linus Torvalds
  2022-12-20 15:09         ` Peter Zijlstra
  2023-01-03 13:25       ` Mark Rutland
  1 sibling, 1 reply; 57+ messages in thread
From: Linus Torvalds @ 2022-12-20 14:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, corbet, will, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Dec 20, 2022 at 5:09 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> >
> > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > "ptr" gets modified, and they are allowed to do "useful" optimization.
>
> In this I've copied the existing cmpxchg_double() code; I'll have to let
> the arch folks speak here, I've no clue.

It does sound like the right thing to do. I doubt it ends up making a
difference in practice, but yes, the asm doesn't have a memory
clobber, so the input/output types should be the right ones for the
compiler to not possibly do something odd and cache the part that it
doesn't see as being accessed.
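
A minimal illustration of that hazard (simplified, arm64-flavoured,
not kernel code): with only a 64-bit wide output operand the compiler
may assume the high half is untouched and cache it across the asm:

unsigned long hazard(unsigned long *p)
{
	unsigned long hi_old = p[1];

	/* the asm only claims to write the 64 bits at p[0] */
	asm volatile("" : "+Q" (*p));

	/* may be constant-folded to 0; p[1] is never reloaded */
	return hi_old ^ p[1];
}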

              Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-20 14:31       ` Linus Torvalds
@ 2022-12-20 15:09         ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-20 15:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Boqun Feng, corbet, will, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Dec 20, 2022 at 08:31:19AM -0600, Linus Torvalds wrote:
> On Tue, Dec 20, 2022 at 5:09 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > >
> > > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > > "ptr" gets modified, and they are allowed to do "useful" optimization.
> >
> > In this I've copied the existing cmpxchg_double() code; I'll have to let
> > the arch folks speak here, I've no clue.
> 
> It does sound like the right thing to do. I doubt it ends up making a
> difference in practice, but yes, the asm doesn't have a memory
> clobber, so the input/output types should be the right ones for the
> compiler to not possibly do something odd and cache the part that it
> doesn't see as being accessed.

Right, and x86 does just *ptr, without trying to cast away the volatile
even.

I've pushed out a *(u128 *)ptr variant for arm64 and s390, then at least
we'll know if the compiler objects.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double()
  2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
                   ` (11 preceding siblings ...)
  2022-12-19 15:35 ` [RFC][PATCH 12/12] arch: Remove cmpxchg_double Peter Zijlstra
@ 2022-12-22  1:21 ` Boqun Feng
  12 siblings, 0 replies; 57+ messages in thread
From: Boqun Feng @ 2022-12-22  1:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Hi Peter,

On Mon, Dec 19, 2022 at 04:35:25PM +0100, Peter Zijlstra wrote:
> Hi,
> 
> Since Linus hated on cmpxchg_double(), a few patches to get rid of it, as proposed here:
> 
>   https://lkml.kernel.org/r/Y2U3WdU61FvYlpUh@hirez.programming.kicks-ass.net
> 
> based on tip/master because Linus' tree is moving a wee bit fast at the moment.
> 
> 0day robot is all green for building, very limited testing on arm64/s390
> for obvious raisins -- I tried to get the asm right, but please, double
> check.
> 

I added some test cases for the cmpxchg128 APIs, and found two issues.
I will reply separately to the patches. The test cases themselves are
at the end; let me know if you want me to send a proper patch.

Regards,
Boqun

------------------------------------------------------------>8
Subject: [PATCH] atomic: Add test cases for cmpxchg128 family

For 32bit and 64bit cmpxchg we currently only test via the
atomic_cmpxchg_* APIs; add tests via the plain cmpxchg* APIs while we
are at it.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 lib/atomic64_test.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index d9d170238165..7f79d0704ba8 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -76,12 +76,19 @@ do {								\
 	BUG_ON(atomic##bit##_read(&v) != expect);		\
 } while (0)
 
+#define TEST_ARGS_PLAIN(_, op, init, ret, expect, args...)	\
+do {								\
+	__WRITE_ONCE(n, init);					\
+	BUG_ON(op(&n, ##args) != ret);				\
+	BUG_ON(__READ_ONCE(n) != expect);			\
+} while (0)
+
 #define XCHG_FAMILY_TEST(bit, init, new)				\
 do {									\
 	FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new);	\
 } while (0)
 
-#define CMPXCHG_FAMILY_TEST(bit, init, new, wrong)			\
+#define ATOMIC_CMPXCHG_FAMILY_TEST(bit, init, new, wrong)		\
 do {									\
 	FAMILY_TEST(TEST_ARGS, bit, cmpxchg, 				\
 			init, init, new, init, new);			\
@@ -89,6 +96,14 @@ do {									\
 			init, init, init, wrong, new);			\
 } while (0)
 
+#define CMPXCHG_FAMILY_TEST(bit, init, new, wrong)			\
+do {									\
+	FAMILY_TEST(TEST_ARGS_PLAIN, _, cmpxchg##bit, 			\
+			init, init, new, init, new);			\
+	FAMILY_TEST(TEST_ARGS_PLAIN, _, cmpxchg##bit,			\
+			init, init, init, wrong, new);			\
+} while (0)
+
 #define INC_RETURN_FAMILY_TEST(bit, i)			\
 do {							\
 	FAMILY_TEST(TEST_ARGS, bit, inc_return,		\
@@ -109,6 +124,7 @@ static __init void test_atomic(void)
 	int one = 1;
 
 	atomic_t v;
+	int n;
 	int r;
 
 	TEST(, add, +=, onestwos);
@@ -139,6 +155,7 @@ static __init void test_atomic(void)
 	DEC_RETURN_FAMILY_TEST(, v0);
 
 	XCHG_FAMILY_TEST(, v0, v1);
+	ATOMIC_CMPXCHG_FAMILY_TEST(, v0, v1, onestwos);
 	CMPXCHG_FAMILY_TEST(, v0, v1, onestwos);
 
 }
@@ -155,6 +172,7 @@ static __init void test_atomic64(void)
 	int r_int;
 
 	atomic64_t v = ATOMIC64_INIT(v0);
+	long long n = 0;
 	long long r = v0;
 	BUG_ON(v.counter != r);
 
@@ -201,6 +219,7 @@ static __init void test_atomic64(void)
 	DEC_RETURN_FAMILY_TEST(64, v0);
 
 	XCHG_FAMILY_TEST(64, v0, v1);
+	ATOMIC_CMPXCHG_FAMILY_TEST(64, v0, v1, v2);
 	CMPXCHG_FAMILY_TEST(64, v0, v1, v2);
 
 	INIT(v0);
@@ -245,10 +264,30 @@ static __init void test_atomic64(void)
 	BUG_ON(!r_int);
 }
 
+#ifdef system_has_cmpxchg128
+static __init void test_atomic128(void)
+{
+	long long v0 = 0xaaa31337c001d00dLL;
+	long long v1 = 0xdeadbeefdeafcafeLL;
+	long long v2 = 0xfaceabadf00df001LL;
+	long long v3 = 0x8000000000000000LL;
+
+	s128 init = ((s128)v0 << 64) + v1;
+	s128 new = ((s128)v1 << 64) + v0;
+	s128 wrong = ((s128)v2 << 64) + v3;
+	s128 n = 1;
+
+	CMPXCHG_FAMILY_TEST(128, init, new, wrong);
+}
+#else
+static __init void test_atomic128(void) {}
+#endif
+
 static __init int test_atomics_init(void)
 {
 	test_atomic();
 	test_atomic64();
+	test_atomic128();
 
 #ifdef CONFIG_X86
 	pr_info("passed for %s platform %s CX8 and %s SSE\n",
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}() Peter Zijlstra
  2022-12-19 20:07   ` Boqun Feng
@ 2022-12-22  1:25   ` Boqun Feng
  2022-12-22 13:16     ` Peter Zijlstra
  2023-01-03 17:12   ` Heiko Carstens
  2023-01-09 18:50   ` Mark Rutland
  3 siblings, 1 reply; 57+ messages in thread
From: Boqun Feng @ 2022-12-22  1:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
> 
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
>  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
>  
>  #undef __CMPXCHG_DBL
> +
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +#define __CMPXCHG128(name, mb, rel, cl)					\
> +static __always_inline u128						\
> +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +									\
> +	asm volatile("// __cmpxchg128" #name "\n"			\
> +	"	prfm	pstl1strm, %2\n"				\
> +	"1:	ldxp	%0, %1, %2\n"					\
> +	"	eor	%3, %0, %3\n"					\
> +	"	eor	%4, %1, %4\n"					\
> +	"	orr	%3, %4, %3\n"					\
> +	"	cbnz	%3, 2f\n"					\
> +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> +	"	cbnz	%w3, 1b\n"					\
> +	"	" #mb "\n"						\
> +	"2:"								\
> +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> +	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
> +	: cl);								\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,        ,  ,         )
> +__CMPXCHG128(_mb, dmb ish, l, "memory")
> +
> +#undef __CMPXCHG128
> +
>  #undef K
>  
>  #endif	/* __ASM_ATOMIC_LL_SC_H */
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a
>  	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
>  	: [v] "+Q" (v->counter),					\
>  	  [old] "=r" (old)						\
> -	: [i] "r" (i) 							\
> +	: [i] "r" (i)							\
>  	: cl);								\
>  									\
>  	return old;							\
> @@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory")
>  
>  #undef __CMPXCHG_DBL
>  
> +#define __CMPXCHG128(name, mb, cl...)					\
> +static __always_inline u128						\
> +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +	register unsigned long x0 asm ("x0") = o.low;			\
> +	register unsigned long x1 asm ("x1") = o.high;			\
> +	register unsigned long x2 asm ("x2") = n.low;			\
> +	register unsigned long x3 asm ("x3") = n.high;			\
> +	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
> +									\
> +	asm volatile(							\
> +	__LSE_PREAMBLE							\
> +	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
> +	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
> +	  [v] "+Q" (*(unsigned long *)ptr)				\
> +	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\

Issue #1: the line below can be removed, otherwise..

> +	  [oldval1] "r" (r.low), [oldval2] "r" (r.high)			\

warning:

	./arch/arm64/include/asm/atomic_lse.h: In function '__lse__cmpxchg128_mb':
	./arch/arm64/include/asm/atomic_lse.h:309:27: warning: 'r.<U97b8>.low' is used uninitialized [-Wuninitialized]
	  309 |           [oldval1] "r" (r.low), [oldval2] "r" (r.high)


> +	: cl);								\
> +									\
> +	r.low = x0; r.high = x1;					\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,   )
> +__CMPXCHG128(_mb, al, "memory")
> +
> +#undef __CMPXCHG128
> +
>  #endif	/* __ASM_ATOMIC_LSE_H */
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb)
>  
>  #undef __CMPXCHG_DBL
>  
> +#define __CMPXCHG128(name)						\
> +static inline long __cmpxchg128##name(volatile u128 *ptr,		\

Issue #2: this should be

static inline u128 __cmpxchg128##name(..)

because cmpxchg* needs to return the old value.
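
That is, the wrapper wants to be (a sketch of the fix, reusing the
patch's own helper):

#define __CMPXCHG128(name)						\
static inline u128 __cmpxchg128##name(volatile u128 *ptr,		\
				      u128 old, u128 new)		\
{									\
	return __lse_ll_sc_body(_cmpxchg128##name, ptr, old, new);	\
}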

Regards,
Boqun

> +				      u128 old, u128 new)		\
> +{									\
> +	return __lse_ll_sc_body(_cmpxchg128##name,			\
> +				ptr, old, new);				\
> +}
> +
> +__CMPXCHG128(   )
> +__CMPXCHG128(_mb)
> +
> +#undef __CMPXCHG128
> +
>  #define __CMPXCHG_GEN(sfx)						\
>  static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr,	\
>  					   unsigned long old,		\
> @@ -229,6 +242,19 @@ __CMPXCHG_GEN(_mb)
>  	__ret;									\
>  })
>  
> +/* cmpxchg128 */
> +#define system_has_cmpxchg128()		1
> +
> +#define arch_cmpxchg128(ptr, o, n)						\
> +({										\
> +	__cmpxchg128_mb((ptr), (o), (n));					\
> +})
> +
> +#define arch_cmpxchg128_local(ptr, o, n)					\
> +({										\
> +	__cmpxchg128((ptr), (o), (n));						\
> +})
> +
>  #define __CMPWAIT_CASE(w, sfx, sz)					\
>  static inline void __cmpwait_case_##sz(volatile void *ptr,		\
>  				       unsigned long val)		\
> --- a/arch/s390/include/asm/cmpxchg.h
> +++ b/arch/s390/include/asm/cmpxchg.h
> @@ -201,4 +201,37 @@ static __always_inline int __cmpxchg_dou
>  			 (unsigned long)(n1), (unsigned long)(n2));	\
>  })
>  
> +#define system_has_cmpxchg128()		1
> +
> +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		: [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +	return old;
> +}
> +
> +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
> +{
> +	u128 old = *oldp;
> +	int cc;
> +
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		"	ipm	%[cc]\n"
> +		"	srl	%[cc],28\n"
> +		: [cc] "=&d" (cc), [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +
> +	if (unlikely(!cc))
> +		*oldp = old;
> +
> +	return likely(cc);
> +}
> +
>  #endif /* __ASM_CMPXCHG_H */
> --- a/arch/x86/include/asm/cmpxchg_32.h
> +++ b/arch/x86/include/asm/cmpxchg_32.h
> @@ -103,6 +103,7 @@ static inline bool __try_cmpxchg64(volat
>  
>  #endif
>  
> -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
> +#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX8)
> +#define system_has_cmpxchg64()		boot_cpu_has(X86_FEATURE_CX8)
>  
>  #endif /* _ASM_X86_CMPXCHG_32_H */
> --- a/arch/x86/include/asm/cmpxchg_64.h
> +++ b/arch/x86/include/asm/cmpxchg_64.h
> @@ -20,6 +20,59 @@
>  	arch_try_cmpxchg((ptr), (po), (n));				\
>  })
>  
> -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	union __u128_halves o = { .full = old, }, n = { .full = new, };
> +
> +	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
> +		     : [ptr] "+m" (*ptr),
> +		       "+a" (o.low), "+d" (o.high)
> +		     : "b" (n.low), "c" (n.high)
> +		     : "memory");
> +
> +	return o.full;
> +}
> +
> +static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	union __u128_halves o = { .full = old, }, n = { .full = new, };
> +
> +	asm volatile("cmpxchg16b %[ptr]"
> +		     : [ptr] "+m" (*ptr),
> +		       "+a" (o.low), "+d" (o.high)
> +		     : "b" (n.low), "c" (n.high)
> +		     : "memory");
> +
> +	return o.full;
> +}
> +
> +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *old, u128 new)
> +{
> +	union __u128_halves o = { .full = *old, }, n = { .full = new, };
> +	bool ret;
> +
> +	asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]"
> +		     CC_SET(e)
> +		     : CC_OUT(e) (ret),
> +		       [ptr] "+m" (*ptr),
> +		       "+a" (o.low), "+d" (o.high)
> +		     : "b" (n.low), "c" (n.high)
> +		     : "memory");
> +
> +	if (unlikely(!ret))
> +		*old = o.full;
> +
> +	return likely(ret);
> +}
> +
> +#define system_has_cmpxchg_double()	boot_cpu_has(X86_FEATURE_CX16)
> +#define system_has_cmpxchg128()		boot_cpu_has(X86_FEATURE_CX16)
>  
>  #endif /* _ASM_X86_CMPXCHG_64_H */
> 
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-22  1:25   ` Boqun Feng
@ 2022-12-22 13:16     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2022-12-22 13:16 UTC (permalink / raw)
  To: Boqun Feng
  Cc: torvalds, corbet, will, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Wed, Dec 21, 2022 at 05:25:20PM -0800, Boqun Feng wrote:

> > +#define __CMPXCHG128(name, mb, cl...)					\
> > +static __always_inline u128						\
> > +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
> > +{									\
> > +	union __u128_halves r, o = { .full = (old) },			\
> > +			       n = { .full = (new) };			\
> > +	register unsigned long x0 asm ("x0") = o.low;			\
> > +	register unsigned long x1 asm ("x1") = o.high;			\
> > +	register unsigned long x2 asm ("x2") = n.low;			\
> > +	register unsigned long x3 asm ("x3") = n.high;			\
> > +	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
> > +									\
> > +	asm volatile(							\
> > +	__LSE_PREAMBLE							\
> > +	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
> > +	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
> > +	  [v] "+Q" (*(unsigned long *)ptr)				\
> > +	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
> 
> Issue #1: the line below can be removed, otherwise..
> 
> > +	  [oldval1] "r" (r.low), [oldval2] "r" (r.high)			\
> 
> warning:
> 
> 	./arch/arm64/include/asm/atomic_lse.h: In function '__lse__cmpxchg128_mb':
> 	./arch/arm64/include/asm/atomic_lse.h:309:27: warning: 'r.<U97b8>.low' is used uninitialized [-Wuninitialized]
> 	  309 |           [oldval1] "r" (r.low), [oldval2] "r" (r.high)
> 
> 
> > +	: cl);								\
> > +									\
> > +	r.low = x0; r.high = x1;					\
> > +									\
> > +	return r.full;							\
> > +}
> > +
> > +__CMPXCHG128(   ,   )
> > +__CMPXCHG128(_mb, al, "memory")
> > +
> > +#undef __CMPXCHG128
> > +
> >  #endif	/* __ASM_ATOMIC_LSE_H */
> > --- a/arch/arm64/include/asm/cmpxchg.h
> > +++ b/arch/arm64/include/asm/cmpxchg.h
> > @@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb)
> >  
> >  #undef __CMPXCHG_DBL
> >  
> > +#define __CMPXCHG128(name)						\
> > +static inline long __cmpxchg128##name(volatile u128 *ptr,		\
> 
> Issue #2: this should be
> 
> static inline u128 __cmpxchg128##name(..)
> 
> because cmpxchg* needs to return the old value.

Duh.. fixed both. Pushed out to queue/core/wip-u128. I'll probably
continue all this in two weeks (yay xmas break!).

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double()
  2022-12-19 15:35 ` [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double() Peter Zijlstra
  2022-12-19 16:47   ` Niklas Schnelle
@ 2022-12-28  8:40   ` Vasant Hegde
  1 sibling, 0 replies; 57+ messages in thread
From: Vasant Hegde @ 2022-12-28  8:40 UTC (permalink / raw)
  To: Peter Zijlstra, torvalds
  Cc: corbet, will, boqun.feng, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On 12/19/2022 9:05 PM, Peter Zijlstra wrote:
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  drivers/iommu/amd/amd_iommu_types.h |    9 +++++++--
>  drivers/iommu/amd/iommu.c           |   10 ++++------
>  2 files changed, 11 insertions(+), 8 deletions(-)
> 
> --- a/drivers/iommu/amd/amd_iommu_types.h
> +++ b/drivers/iommu/amd/amd_iommu_types.h
> @@ -979,8 +979,13 @@ union irte_ga_hi {
>  };
>  
>  struct irte_ga {
> -	union irte_ga_lo lo;
> -	union irte_ga_hi hi;
> +	union {
> +		struct {
> +			union irte_ga_lo lo;
> +			union irte_ga_hi hi;
> +		};
> +		u128 irte;
> +	};
>  };
>  
>  struct irq_2_irte {
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -2992,10 +2992,10 @@ static int alloc_irq_index(struct amd_io
>  static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
>  			  struct irte_ga *irte, struct amd_ir_data *data)
>  {
> -	bool ret;
>  	struct irq_remap_table *table;
> -	unsigned long flags;
>  	struct irte_ga *entry;
> +	unsigned long flags;
> +	u128 old;
>  
>  	table = get_irq_table(iommu, devid);
>  	if (!table)
> @@ -3006,16 +3006,14 @@ static int modify_irte_ga(struct amd_iom
>  	entry = (struct irte_ga *)table->table;
>  	entry = &entry[index];
>  
> -	ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
> -			     entry->lo.val, entry->hi.val,
> -			     irte->lo.val, irte->hi.val);
>  	/*
>  	 * We use cmpxchg16 to atomically update the 128-bit IRTE,
>  	 * and it cannot be updated by the hardware or other processors
>  	 * behind us, so the return value of cmpxchg16 should be the
>  	 * same as the old value.
>  	 */
> -	WARN_ON(!ret);
> +	old = entry->irte;
> +	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));

Changes look good to me. I have tested it on an AMD system and it works fine.

-Vasant


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 04/12] types: Introduce [us]128
  2022-12-19 15:35 ` [RFC][PATCH 04/12] types: Introduce [us]128 Peter Zijlstra
@ 2022-12-29  8:30   ` Pavel Machek
  0 siblings, 0 replies; 57+ messages in thread
From: Pavel Machek @ 2022-12-29  8:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, hca, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

[-- Attachment #1: Type: text/plain, Size: 907 bytes --]

Hi!

> Introduce [us]128 (when available). Unlike [us]64, ensure they are
> always naturally aligned.
> 
> This also enables 128bit wide atomics (which require natural
> alignment) such as cmpxchg128().
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/types.h      |    5 +++++
>  include/uapi/linux/types.h |    4 ++++
>  2 files changed, 9 insertions(+)
> 
> --- a/include/linux/types.h
> +++ b/include/linux/types.h
> @@ -10,6 +10,11 @@
>  #define DECLARE_BITMAP(name,bits) \
>  	unsigned long name[BITS_TO_LONGS(bits)]
>  
> +#ifdef __SIZEOF_INT128__
> +typedef __s128 s128;
> +typedef __u128 u128;
> +#endif

Should this be noted in a comment here?

> Introduce [us]128 (when available). Unlike [us]64, ensure they are
> always naturally aligned.

BR,
							Pavel
-- 
People of Russia, stop Putin before his war on Ukraine escalates.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 07/12] percpu: Wire up cmpxchg128
  2022-12-19 15:35 ` [RFC][PATCH 07/12] percpu: Wire up cmpxchg128 Peter Zijlstra
@ 2022-12-29 13:36   ` Arnd Bergmann
  2023-01-04 12:09   ` Heiko Carstens
  1 sibling, 0 replies; 57+ messages in thread
From: Arnd Bergmann @ 2022-12-29 13:36 UTC (permalink / raw)
  To: Peter Zijlstra, Linus Torvalds
  Cc: Jonathan Corbet, Will Deacon, Boqun Feng, Mark Rutland,
	Catalin Marinas, dennis, Tejun Heo, Christoph Lameter, hca, gor,
	Alexander Gordeev, borntraeger, svens, Herbert Xu,
	David S . Miller, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, joro, suravee.suthikulpanit,
	Robin Murphy, dwmw2, baolu.lu, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Andrew Morton, Vlastimil Babka, Roman Gushchin,
	Hyeonggon Yoo, linux-doc, linux-kernel, linux-mm, linux-s390,
	linux-crypto, iommu, Linux-Arch

On Mon, Dec 19, 2022, at 16:35, Peter Zijlstra wrote:
> In order to replace cmpxchg_double() with the newly minted
> cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Does this work on x86 chips without X86_FEATURE_CX16?

As far as I can tell, the new percpu_cmpxchg128_op uses
the cmpxchg16b instruction unconditionally without checking
for the feature bit first, and is now used by this_cpu_cmpxchg()
unconditionally as well.

This is fine for the moment if the only user is mm/slub.c
and that retains the system_has_cmpxchg128() runtime check,
but I think a better interface would be to guarantee that
this_cpu_cmpxchg() always ends up either in working inline
asm or in the generic fallback, never in an invalid opcode.
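
Something of this shape, say (a hypothetical sketch, not a patch;
generic_this_cpu_cmpxchg128() is a made-up name for whatever the
fallback ends up being):

static __always_inline u128
this_cpu_cmpxchg128_checked(u128 *pcp, u128 old, u128 new)
{
	/* only emit cmpxchg16b when the CPU actually has it */
	if (likely(boot_cpu_has(X86_FEATURE_CX16)))
		return arch_cmpxchg128((volatile u128 *)pcp, old, new);

	/* hypothetical generic fallback, e.g. irqs-off read-modify-write */
	return generic_this_cpu_cmpxchg128(pcp, old, new);
}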

For consistency, I would also suggest this_cpu_cmpxchg() to
take the same argument types as cmpxchg(): at most 'unsigned
long' sized, with additional this_cpu_cmpxchg64() and
this_cpu_cmpxchg128() macros that take fixed-size arguments.
I have an older patch set that additionally converts all
8-bit and 16-bit cmpxchg()/xchg() calls to cmpxchg_8()/
xchg_8()/cmpxchg_16()/xchg_16() and leaves only the
fixed-32bit and variable typed 'unsigned long' sized
callers for the weakly typed variant.

       Arnd

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-20 11:08     ` Peter Zijlstra
  2022-12-20 14:31       ` Linus Torvalds
@ 2023-01-03 13:25       ` Mark Rutland
  2023-01-03 14:03         ` Mark Rutland
  1 sibling, 1 reply; 57+ messages in thread
From: Mark Rutland @ 2023-01-03 13:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, torvalds, corbet, will, catalin.marinas, dennis, tj,
	cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > > For all architectures that currently support cmpxchg_double()
> > > implement the cmpxchg128() family of functions that is basically the
> > > same but with a saner interface.
> > > 
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> > >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> > >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> > >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> > >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> > >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> > >  6 files changed, 185 insertions(+), 3 deletions(-)
> > > 
> > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> > >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> > >  
> > >  #undef __CMPXCHG_DBL
> > > +
> > > +union __u128_halves {
> > > +	u128 full;
> > > +	struct {
> > > +		u64 low, high;
> > > +	};
> > > +};
> > > +
> > > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > > +static __always_inline u128						\
> > > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > > +{									\
> > > +	union __u128_halves r, o = { .full = (old) },			\
> > > +			       n = { .full = (new) };			\
> > > +									\
> > > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > > +	"	prfm	pstl1strm, %2\n"				\
> > > +	"1:	ldxp	%0, %1, %2\n"					\
> > > +	"	eor	%3, %0, %3\n"					\
> > > +	"	eor	%4, %1, %4\n"					\
> > > +	"	orr	%3, %4, %3\n"					\
> > > +	"	cbnz	%3, 2f\n"					\
> > > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > > +	"	cbnz	%w3, 1b\n"					\
> > > +	"	" #mb "\n"						\
> > > +	"2:"								\
> > > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> > 
> > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > "ptr" gets modified, and they are allowed to do "useful" optimization.
> 
> In this I've copied the existing cmpxchg_double() code; I'll have to let
> the arch folks speak here, I've no clue.

We definitely need to ensure the compiler sees we poke the whole thing, or it
can get this horribly wrong, so that is a latent bug.

See commit:

  fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable")

... for examples of GCC being clever, where I overlooked the *_double() cases.

I'll go spin a quick fix for that so that we can have something go to stable
before we rejig the API.

Mark.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2023-01-03 13:25       ` Mark Rutland
@ 2023-01-03 14:03         ` Mark Rutland
  2023-01-03 16:19           ` Mark Rutland
  0 siblings, 1 reply; 57+ messages in thread
From: Mark Rutland @ 2023-01-03 14:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, torvalds, corbet, will, catalin.marinas, dennis, tj,
	cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> > On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > > On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > > > For all architectures that currently support cmpxchg_double()
> > > > implement the cmpxchg128() family of functions that is basically the
> > > > same but with a saner interface.
> > > > 
> > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > > ---
> > > >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> > > >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> > > >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> > > >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> > > >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> > > >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> > > >  6 files changed, 185 insertions(+), 3 deletions(-)
> > > > 
> > > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> > > >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> > > >  
> > > >  #undef __CMPXCHG_DBL
> > > > +
> > > > +union __u128_halves {
> > > > +	u128 full;
> > > > +	struct {
> > > > +		u64 low, high;
> > > > +	};
> > > > +};
> > > > +
> > > > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > > > +static __always_inline u128						\
> > > > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > > > +{									\
> > > > +	union __u128_halves r, o = { .full = (old) },			\
> > > > +			       n = { .full = (new) };			\
> > > > +									\
> > > > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > > > +	"	prfm	pstl1strm, %2\n"				\
> > > > +	"1:	ldxp	%0, %1, %2\n"					\
> > > > +	"	eor	%3, %0, %3\n"					\
> > > > +	"	eor	%4, %1, %4\n"					\
> > > > +	"	orr	%3, %4, %3\n"					\
> > > > +	"	cbnz	%3, 2f\n"					\
> > > > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > > > +	"	cbnz	%w3, 1b\n"					\
> > > > +	"	" #mb "\n"						\
> > > > +	"2:"								\
> > > > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> > > 
> > > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > > "ptr" gets modified, and they are allowed to do "useful" optimization.
> > 
> > In this I've copied the existing cmpxchg_double() code; I'll have to let
> > the arch folks speak here, I've no clue.
> 
> We definitely need to ensure the compiler sees we poke the whole thing, or it
> can get this horribly wrong, so that is a latent bug.
> 
> See commit:
> 
>   fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable")
> 
> ... for examples of GCC being clever, where I overlooked the *_double() cases.

Ugh; with GCC 12.1.0, arm64 defconfig, and the following:

| struct big {
|         u64 lo, hi;
| } __aligned(128);
| 
| unsigned long foo(struct big *b)
| {
|         u64 hi_old, hi_new;
| 
|         hi_old = b->hi;
| 
|         cmpxchg_double_local(&b->lo, &b->hi, 0x12, 0x34, 0x56, 0x78);
| 
|         hi_new = b->hi;
| 
|         return hi_old ^ hi_new;
| }

GCC clearly figures out the high half isn't modified, and constant folds hi_old
^ hi_new down to zero, regardless of whether we use LL/SC or LSE:

<foo>:
   0:   d503233f        paciasp
   4:   aa0003e4        mov     x4, x0
   8:   1400000e        b       40 <foo+0x40>
   c:   d2800240        mov     x0, #0x12                       // #18
  10:   d2800681        mov     x1, #0x34                       // #52
  14:   aa0003e5        mov     x5, x0
  18:   aa0103e6        mov     x6, x1
  1c:   d2800ac2        mov     x2, #0x56                       // #86
  20:   d2800f03        mov     x3, #0x78                       // #120
  24:   48207c82        casp    x0, x1, x2, x3, [x4]
  28:   ca050000        eor     x0, x0, x5
  2c:   ca060021        eor     x1, x1, x6
  30:   aa010000        orr     x0, x0, x1
  34:   d2800000        mov     x0, #0x0                        // #0    <--- BANG
  38:   d50323bf        autiasp
  3c:   d65f03c0        ret
  40:   d2800240        mov     x0, #0x12                       // #18
  44:   d2800681        mov     x1, #0x34                       // #52
  48:   d2800ac2        mov     x2, #0x56                       // #86
  4c:   d2800f03        mov     x3, #0x78                       // #120
  50:   f9800091        prfm    pstl1strm, [x4]
  54:   c87f1885        ldxp    x5, x6, [x4]
  58:   ca0000a5        eor     x5, x5, x0
  5c:   ca0100c6        eor     x6, x6, x1
  60:   aa0600a6        orr     x6, x5, x6
  64:   b5000066        cbnz    x6, 70 <foo+0x70>
  68:   c8250c82        stxp    w5, x2, x3, [x4]
  6c:   35ffff45        cbnz    w5, 54 <foo+0x54>
  70:   d2800000        mov     x0, #0x0                        // #0     <--- BANG
  74:   d50323bf        autiasp
  78:   d65f03c0        ret
  7c:   d503201f        nop

... so we *definitely* need to fix that.

Using __uint128_t instead, e.g.

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index 0890e4f568fb7..cbb3d961123b1 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -315,7 +315,7 @@ __ll_sc__cmpxchg_double##name(unsigned long old1,                   \
        "       cbnz    %w0, 1b\n"                                      \
        "       " #mb "\n"                                              \
        "2:"                                                            \
-       : "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr)        \
+       : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr)          \
        : "r" (old1), "r" (old2), "r" (new1), "r" (new2)                \
        : cl);                                                          \
                                                                        \
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 52075e93de6c0..a94d6dacc0292 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -311,7 +311,7 @@ __lse__cmpxchg_double##name(unsigned long old1,                             \
        "       eor     %[old2], %[old2], %[oldval2]\n"                 \
        "       orr     %[old1], %[old1], %[old2]"                      \
        : [old1] "+&r" (x0), [old2] "+&r" (x1),                         \
-         [v] "+Q" (*(unsigned long *)ptr)                              \
+         [v] "+Q" (*(__uint128_t *)ptr)                                \
        : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),             \
          [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)              \
        : cl);                                                          \

... makes GCC much happier:

<foo>:
   0:   f9400407        ldr     x7, [x0, #8]
   4:   d503233f        paciasp
   8:   aa0003e4        mov     x4, x0
   c:   1400000f        b       48 <foo+0x48>
  10:   d2800240        mov     x0, #0x12                       // #18
  14:   d2800681        mov     x1, #0x34                       // #52
  18:   aa0003e5        mov     x5, x0
  1c:   aa0103e6        mov     x6, x1
  20:   d2800ac2        mov     x2, #0x56                       // #86
  24:   d2800f03        mov     x3, #0x78                       // #120
  28:   48207c82        casp    x0, x1, x2, x3, [x4]
  2c:   ca050000        eor     x0, x0, x5
  30:   ca060021        eor     x1, x1, x6
  34:   aa010000        orr     x0, x0, x1
  38:   f9400480        ldr     x0, [x4, #8]
  3c:   d50323bf        autiasp
  40:   ca0000e0        eor     x0, x7, x0
  44:   d65f03c0        ret
  48:   d2800240        mov     x0, #0x12                       // #18
  4c:   d2800681        mov     x1, #0x34                       // #52
  50:   d2800ac2        mov     x2, #0x56                       // #86
  54:   d2800f03        mov     x3, #0x78                       // #120
  58:   f9800091        prfm    pstl1strm, [x4]
  5c:   c87f1885        ldxp    x5, x6, [x4]
  60:   ca0000a5        eor     x5, x5, x0
  64:   ca0100c6        eor     x6, x6, x1
  68:   aa0600a6        orr     x6, x5, x6
  6c:   b5000066        cbnz    x6, 78 <foo+0x78>
  70:   c8250c82        stxp    w5, x2, x3, [x4]
  74:   35ffff45        cbnz    w5, 5c <foo+0x5c>
  78:   f9400480        ldr     x0, [x4, #8]
  7c:   d50323bf        autiasp
  80:   ca0000e0        eor     x0, x7, x0
  84:   d65f03c0        ret
  88:   d503201f        nop
  8c:   d503201f        nop

... I'll go check whether clang is happy with that, and how far back that can
go, otherwise we'll need to blat the high half with a separate constraint that
(ideally) doesn't end up allocating a pointless address register.

Mark.

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2022-12-19 15:35 ` [RFC][PATCH 11/12] slub: " Peter Zijlstra
@ 2023-01-03 15:58   ` Vlastimil Babka
  2023-01-03 17:16   ` Heiko Carstens
  1 sibling, 0 replies; 57+ messages in thread
From: Vlastimil Babka @ 2023-01-03 15:58 UTC (permalink / raw)
  To: Peter Zijlstra, torvalds
  Cc: corbet, will, boqun.feng, mark.rutland, catalin.marinas, dennis,
	tj, cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu,
	davem, tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel, linux-mm,
	linux-s390, linux-crypto, iommu, linux-arch

On 12/19/22 16:35, Peter Zijlstra wrote:
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

LGTM and a nice cleanup also, thanks!
Acked-by: Vlastimil Babka <vbabka@suse.cz>



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2023-01-03 14:03         ` Mark Rutland
@ 2023-01-03 16:19           ` Mark Rutland
  2023-01-03 16:50             ` Arnd Bergmann
  0 siblings, 1 reply; 57+ messages in thread
From: Mark Rutland @ 2023-01-03 16:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, torvalds, corbet, will, catalin.marinas, dennis, tj,
	cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> > > On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote:
> > > > On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> > > > > For all architectures that currently support cmpxchg_double()
> > > > > implement the cmpxchg128() family of functions that is basically the
> > > > > same but with a saner interface.
> > > > > 
> > > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > > > ---
> > > > >  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
> > > > >  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
> > > > >  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
> > > > >  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
> > > > >  arch/x86/include/asm/cmpxchg_32.h     |    3 +
> > > > >  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
> > > > >  6 files changed, 185 insertions(+), 3 deletions(-)
> > > > > 
> > > > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > > @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
> > > > >  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
> > > > >  
> > > > >  #undef __CMPXCHG_DBL
> > > > > +
> > > > > +union __u128_halves {
> > > > > +	u128 full;
> > > > > +	struct {
> > > > > +		u64 low, high;
> > > > > +	};
> > > > > +};
> > > > > +
> > > > > +#define __CMPXCHG128(name, mb, rel, cl)					\
> > > > > +static __always_inline u128						\
> > > > > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> > > > > +{									\
> > > > > +	union __u128_halves r, o = { .full = (old) },			\
> > > > > +			       n = { .full = (new) };			\
> > > > > +									\
> > > > > +	asm volatile("// __cmpxchg128" #name "\n"			\
> > > > > +	"	prfm	pstl1strm, %2\n"				\
> > > > > +	"1:	ldxp	%0, %1, %2\n"					\
> > > > > +	"	eor	%3, %0, %3\n"					\
> > > > > +	"	eor	%4, %1, %4\n"					\
> > > > > +	"	orr	%3, %4, %3\n"					\
> > > > > +	"	cbnz	%3, 2f\n"					\
> > > > > +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> > > > > +	"	cbnz	%w3, 1b\n"					\
> > > > > +	"	" #mb "\n"						\
> > > > > +	"2:"								\
> > > > > +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> > > > 
> > > > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned
> > > > long *) ptr)"? Because compilers may think only 64bit value pointed by
> > > > "ptr" gets modified, and they are allowed to do "useful" optimization.
> > > 
> > > In this I've copied the existing cmpxchg_double() code; I'll have to let
> > > the arch folks speak here, I've no clue.
> > 
> > We definitely need to ensure the compiler sees we poke the whole thing, or it
> > can get this horribly wrong, so that is a latent bug.
> > 
> > See commit:
> > 
> >   fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable")
> > 
> > ... for examples of GCC being clever, where I overlooked the *_double() cases.

> Using __uint128_t instead, e.g.
> 
> diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> index 0890e4f568fb7..cbb3d961123b1 100644
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -315,7 +315,7 @@ __ll_sc__cmpxchg_double##name(unsigned long old1,                   \
>         "       cbnz    %w0, 1b\n"                                      \
>         "       " #mb "\n"                                              \
>         "2:"                                                            \
> -       : "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr)        \
> +       : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr)          \
>         : "r" (old1), "r" (old2), "r" (new1), "r" (new2)                \
>         : cl);                                                          \
>                                                                         \
> diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> index 52075e93de6c0..a94d6dacc0292 100644
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -311,7 +311,7 @@ __lse__cmpxchg_double##name(unsigned long old1,                             \
>         "       eor     %[old2], %[old2], %[oldval2]\n"                 \
>         "       orr     %[old1], %[old1], %[old2]"                      \
>         : [old1] "+&r" (x0), [old2] "+&r" (x1),                         \
> -         [v] "+Q" (*(unsigned long *)ptr)                              \
> +         [v] "+Q" (*(__uint128_t *)ptr)                                \
>         : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),             \
>           [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)              \
>         : cl);                                                          \
> 
> ... makes GCC much happier:

> ... I'll go check whether clang is happy with that, and how far back that can
> go, otherwise we'll need to blat the high half with a separate constraint that
> (ideally) doesn't end up allocating a pointless address register.

Hmm... from the commit history it looks like GCC prior to 5.1 might not be
happy with that, but that *might* just be if we actually do arithmetic on the
value, and we might be ok just using it for memory effects. I can't currently
get such an old GCC to run on my machines so I haven't been able to check.

I'll dig into this a bit more tomorrow, but it looks like the options (for a
backport-suitable fix) will be:

(a) use a __uint128_t input+output, as above, if we're lucky

(b) introduce a second 64-bit input+output for the high half (likely a
    "+o"; see the sketch after this list)

(c) use a full memory clobber for ancient compilers.
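
For (b), something like the following (hypothetical; the constraints
are illustrative, the point being that an "o" memory operand lets the
compiler see the high half written without allocating another address
register):

static inline void sketch_store_pair(unsigned long *p,
				     unsigned long lo, unsigned long hi)
{
	asm volatile("stp %2, %3, %0"
		     : "=Q" (p[0]),	/* low half: the address operand */
		       "=o" (p[1])	/* high half: hazard only */
		     : "r" (lo), "r" (hi));
}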

Mark.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2023-01-03 16:19           ` Mark Rutland
@ 2023-01-03 16:50             ` Arnd Bergmann
  2023-01-04 11:36               ` Mark Rutland
  0 siblings, 1 reply; 57+ messages in thread
From: Arnd Bergmann @ 2023-01-03 16:50 UTC (permalink / raw)
  To: Mark Rutland, Peter Zijlstra
  Cc: Boqun Feng, Linus Torvalds, Jonathan Corbet, Will Deacon,
	Catalin Marinas, dennis, Tejun Heo, Christoph Lameter,
	Heiko Carstens, gor, Alexander Gordeev, borntraeger, svens,
	Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, joro,
	suravee.suthikulpanit, Robin Murphy, dwmw2, baolu.lu,
	Pekka Enberg, David Rientjes, Joonsoo Kim, Andrew Morton,
	Vlastimil Babka, Roman Gushchin, Hyeonggon Yoo, linux-doc,
	linux-kernel, linux-mm, linux-s390, linux-crypto, iommu,
	Linux-Arch

On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote:
> On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
>> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
>> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:

>> ... makes GCC much happier:
>
>> ... I'll go check whether clang is happy with that, and how far back that can
>> go, otherwise we'll need to blat the high half with a separate constraint that
>> (ideally) doesn't end up allocating a pointless address register.
>
> Hmm... from the commit history it looks like GCC prior to 5.1 might not be
> happy with that, but that *might* just be if we actually do arithmetic on the
> value, and we might be ok just using it for memory effects. I can't currently
> get such an old GCC to run on my machines so I haven't been able to check.

gcc-5.1 is the oldest (barely) supported compiler; the minimum was
last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are
affected, we're good on mainline but may still want a fix for stable
kernels.

I checked that the cross-compiler binaries from [1] still work, but I noticed
that this version is missing the native aarch64-to-aarch64 compiler (x86 to
aarch64 and vice versa are there), and you need to install libmpfr4 [2]
as a dependency. The newer compilers (6.5.0 and up) don't have these problems.

     Arnd

[1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/arm64/5.5.0/
[2] http://ftp.uk.debian.org/debian/pool/main/m/mpfr4/libmpfr4_3.1.5-1_arm64.deb

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}() Peter Zijlstra
  2022-12-19 20:07   ` Boqun Feng
  2022-12-22  1:25   ` Boqun Feng
@ 2023-01-03 17:12   ` Heiko Carstens
  2023-01-09 18:50   ` Mark Rutland
  3 siblings, 0 replies; 57+ messages in thread
From: Heiko Carstens @ 2023-01-03 17:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
...
> --- a/arch/s390/include/asm/cmpxchg.h
> +++ b/arch/s390/include/asm/cmpxchg.h
> @@ -201,4 +201,37 @@ static __always_inline int __cmpxchg_dou
>  			 (unsigned long)(n1), (unsigned long)(n2));	\
>  })
>  
> +#define system_has_cmpxchg128()		1
> +
> +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
> +{
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		: [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +	return old;
> +}
> +
> +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
> +{
> +	u128 old = *oldp;
> +	int cc;
> +
> +	asm volatile(
> +		"	cdsg	%[old],%[new],%[ptr]\n"
> +		"	ipm	%[cc]\n"
> +		"	srl	%[cc],28\n"
> +		: [cc] "=&d" (cc), [old] "+&d" (old)
> +		: [new] "d" (new),
> +		  [ptr] "QS" (*(unsigned long *)ptr)
> +		: "memory", "cc");
> +
> +	if (unlikely(!cc))
> +		*oldp = old;
> +
> +	return likely(cc);
> +}
> +

I was wondering why arch_try_cmpxchg128() isn't even used with later
code. Turned out this is because of a missing

#define arch_try_cmpxchg128 arch_try_cmpxchg128

which in turn means that the generic fallback variant is used.
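
For context, the generic fallback keys off exactly that define; simplified
from the atomic fallback headers, the pattern is roughly:

#ifndef arch_try_cmpxchg128
#define arch_try_cmpxchg128(_ptr, _oldp, _new)				\
({									\
	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r;		\
	___r = arch_cmpxchg128((_ptr), ___o, (_new));			\
	if (unlikely(___r != ___o))					\
		*___op = ___r;						\
	likely(___r == ___o);						\
})
#endif

So without the define, the generic variant built on arch_cmpxchg128()
shadows the arch-specific static inline from the patch above.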

The above arch_try_cmpxchg128() implementation is broken, since it has
inversed condition code handling (cc == 0 means compare and swap
succeeded, cc == 1 means it failed).

However I would prefer to use the generic fallback variant anyway.
Could you please merge the below into your current patch?

It addresses also the oddity that *ptr within arch_cmpxchg128() is
only specified as input, while it should be input/output - it doesn't
matter due to the memory clobber, but let's have that correct anyway.

diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index 527c968945e8..0b98f57bbe9e 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -173,31 +173,12 @@ static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 n
 {
 	asm volatile(
 		"	cdsg	%[old],%[new],%[ptr]\n"
-		: [old] "+&d" (old)
-		: [new] "d" (new),
-		  [ptr] "QS" (*(u128 *)ptr)
+		: [old] "+d" (old), [ptr] "+QS" (*ptr)
+		: [new] "d" (new)
 		: "memory", "cc");
 	return old;
 }
 
-static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
-{
-	u128 old = *oldp;
-	int cc;
-
-	asm volatile(
-		"	cdsg	%[old],%[new],%[ptr]\n"
-		"	ipm	%[cc]\n"
-		"	srl	%[cc],28\n"
-		: [cc] "=&d" (cc), [old] "+&d" (old)
-		: [new] "d" (new),
-		  [ptr] "QS" (*(u128 *)ptr)
-		: "memory", "cc");
-
-	if (unlikely(!cc))
-		*oldp = old;
-
-	return likely(cc);
-}
+#define arch_cmpxchg128		arch_cmpxchg128
 
 #endif /* __ASM_CMPXCHG_H */

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2022-12-19 15:35 ` [RFC][PATCH 11/12] slub: " Peter Zijlstra
  2023-01-03 15:58   ` Vlastimil Babka
@ 2023-01-03 17:16   ` Heiko Carstens
  2023-01-03 19:08     ` Linus Torvalds
  1 sibling, 1 reply; 57+ messages in thread
From: Heiko Carstens @ 2023-01-03 17:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:36PM +0100, Peter Zijlstra wrote:
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/slub_def.h |   12 ++-
>  mm/slab.h                |   41 +++++++++++--
>  mm/slub.c                |  146 ++++++++++++++++++++++++++++-------------------
>  3 files changed, 135 insertions(+), 64 deletions(-)

Does this actually work? Just wondering since I end up with an instant
list corruption on s390. Might be endianness related, but I can't see
anything obvious at a first glance.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2023-01-03 17:16   ` Heiko Carstens
@ 2023-01-03 19:08     ` Linus Torvalds
  2023-01-04 12:07       ` Heiko Carstens
  2023-01-09 16:28       ` Peter Zijlstra
  0 siblings, 2 replies; 57+ messages in thread
From: Linus Torvalds @ 2023-01-03 19:08 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Peter Zijlstra, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Jan 3, 2023 at 9:17 AM Heiko Carstens <hca@linux.ibm.com> wrote:
>
> On Mon, Dec 19, 2022 at 04:35:36PM +0100, Peter Zijlstra wrote:
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  include/linux/slub_def.h |   12 ++-
> >  mm/slab.h                |   41 +++++++++++--
> >  mm/slub.c                |  146 ++++++++++++++++++++++++++++-------------------
> >  3 files changed, 135 insertions(+), 64 deletions(-)
>
> Does this actually work? Just wondering since I end up with an instant
> list corruption on s390. Might be endianness related, but I can't see
> anything obvious at a first glance.

I don't see anything that looks related to endianness, because while
there is that 128-bit union member, it's always either used in full,
or it's accessed as other union members.

But I *do* note that this patch seems to be the only one that depends
on the new this_cpu_cmpxchg() updates to make it just automatically do
the right thing for a 128-bit value. And I have to admit that all
those games with __pcpu_cast_128() make no sense to me. Why isn't it
just using "u128" everywhere without any odd _Generic() games?

I could also easily see that if the asm constraints are wrong (like
the "cast pointer to (unsigned long *) instead of keeping it pointing
to a 128-bit type" thing discussed earlier), then code like this:

+       freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+       freelist_aba_t new = { .freelist = freelist_new, .counter =
next_tid(tid) };
+
+       return this_cpu_cmpxchg(s->cpu_slab->freelist_tid.full,
+                               old.full, new.full) == old.full;

would easily make the compiler go "the second word of 'old' is never
used by the asm, so I won't initialize it".
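
For reference, the union those initializers build has roughly this shape
(reconstructed from the quoted patch; the exact field types may differ):

typedef union {
	struct {
		void *freelist;
		unsigned long counter;
	};
	u128 full;
} freelist_aba_t;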

But yeah, that patch is hard to read, so hard to say. Does everything
leading up to it work fine?

                 Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2023-01-03 16:50             ` Arnd Bergmann
@ 2023-01-04 11:36               ` Mark Rutland
  2023-01-04 13:55                 ` Mark Rutland
  0 siblings, 1 reply; 57+ messages in thread
From: Mark Rutland @ 2023-01-04 11:36 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Peter Zijlstra, Boqun Feng, Linus Torvalds, Jonathan Corbet,
	Will Deacon, Catalin Marinas, dennis, Tejun Heo,
	Christoph Lameter, Heiko Carstens, gor, Alexander Gordeev,
	borntraeger, svens, Herbert Xu, David S . Miller,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, joro, suravee.suthikulpanit, Robin Murphy, dwmw2,
	baolu.lu, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Vlastimil Babka, Roman Gushchin, Hyeonggon Yoo,
	linux-doc, linux-kernel, linux-mm, linux-s390, linux-crypto,
	iommu, Linux-Arch

On Tue, Jan 03, 2023 at 05:50:00PM +0100, Arnd Bergmann wrote:
> On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote:
> > On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
> >> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> >> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> 
> >> ... makes GCC much happier:
> >
> >> ... I'll go check whether clang is happy with that, and how far back that can
> >> go, otherwise we'll need to blat the high half with a separate constaint that
> >> (ideally) doesn't end up allocating a pointless address register.
> >
> > Hmm... from the commit history it looks like GCC prior to 5.1 might not be
> > happy with that, but that *might* just be if we actually do arithmetic on the
> > value, and we might be ok just using it for memory effects. I can't currently
> > get such an old GCC to run on my machines so I haven't been able to check.
> 
> gcc-5.1 is the oldest (barely) supported compiler, the minimum was
> last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are
> affected, we're good on mainline but may still want a fix for stable
> kernels.

Yup; I just wanted something that would easily backport to stable, at least as
far as linux-4.9.y (where I couldn't find the minimum GCC version when I looked
yesterday).

> I checked that the cross-compiler binaries from [1] still work, but I noticed
> that this version is missing the native aarch64-to-aarch64 compiler (x86 to
> aarch64 and vice versa are there), and you need to install libmpfr4 [2]
> as a dependency. The newer compilers (6.5.0 and up) don't have these problems.

I was trying the old kernel.org crosstool binaries, but I was either
missing a library or had an incompatible version on my x86_64 host. I'll
have another look today -- thanks for the pointers!

Mark.

>      Arnd
> 
> [1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/arm64/5.5.0/
> [2] http://ftp.uk.debian.org/debian/pool/main/m/mpfr4/libmpfr4_3.1.5-1_arm64.deb

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2023-01-03 19:08     ` Linus Torvalds
@ 2023-01-04 12:07       ` Heiko Carstens
  2023-01-09 16:28       ` Peter Zijlstra
  1 sibling, 0 replies; 57+ messages in thread
From: Heiko Carstens @ 2023-01-04 12:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Jan 03, 2023 at 11:08:29AM -0800, Linus Torvalds wrote:
> On Tue, Jan 3, 2023 at 9:17 AM Heiko Carstens <hca@linux.ibm.com> wrote:
> >
> > On Mon, Dec 19, 2022 at 04:35:36PM +0100, Peter Zijlstra wrote:
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > >  include/linux/slub_def.h |   12 ++-
> > >  mm/slab.h                |   41 +++++++++++--
> > >  mm/slub.c                |  146 ++++++++++++++++++++++++++++-------------------
> > >  3 files changed, 135 insertions(+), 64 deletions(-)
> >
> > Does this actually work? Just wondering since I end up with an instant
> > list corruption on s390. Might be endianness related, but I can't see
> > anything obvious at a first glance.
...
> the right thing for a 128-bit value. And I have to admit that all
> those games with __pcpu_cast_128() make no sense to me. Why isn't it
> just using "u128" everywhere without any odd _Generic() games?

That would have been my question as well, but the good thing is that
you pointed me to the percpu patch - initially I didn't expect any s390
specific code in there, but that is where the bug is.
I'll reply to that patch.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 07/12] percpu: Wire up cmpxchg128
  2022-12-19 15:35 ` [RFC][PATCH 07/12] percpu: Wire up cmpxchg128 Peter Zijlstra
  2022-12-29 13:36   ` Arnd Bergmann
@ 2023-01-04 12:09   ` Heiko Carstens
  2023-01-09 16:29     ` Peter Zijlstra
  1 sibling, 1 reply; 57+ messages in thread
From: Heiko Carstens @ 2023-01-04 12:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:32PM +0100, Peter Zijlstra wrote:
> In order to replace cmpxchg_double() with the newly minted
> cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
...
> --- a/arch/s390/include/asm/percpu.h
> +++ b/arch/s390/include/asm/percpu.h
> +#define this_cpu_cmpxchg_16(pcp, oval, nval)				\
> +({									\
> +	u128 old__ = __pcpu_cast_128((nval), (nval));			\
> +	u128 new__ = __pcpu_cast_128((oval), (oval));			\

spot the bug... please merge the below into this patch.

diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 24a4d9d644c0..d1997d01892a 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -156,8 +156,8 @@
 
 #define this_cpu_cmpxchg_16(pcp, oval, nval)				\
 ({									\
-	u128 old__ = __pcpu_cast_128((nval), (nval));			\
-	u128 new__ = __pcpu_cast_128((oval), (oval));			\
+	u128 old__ = __pcpu_cast_128((oval), (oval));			\
+	u128 new__ = __pcpu_cast_128((nval), (nval));			\
 	typedef typeof(pcp) pcp_op_T__;					\
 	pcp_op_T__ *ptr__;						\
 	u128 ret__;							\

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2023-01-04 11:36               ` Mark Rutland
@ 2023-01-04 13:55                 ` Mark Rutland
  0 siblings, 0 replies; 57+ messages in thread
From: Mark Rutland @ 2023-01-04 13:55 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Peter Zijlstra, Boqun Feng, Linus Torvalds, Jonathan Corbet,
	Will Deacon, Catalin Marinas, dennis, Tejun Heo,
	Christoph Lameter, Heiko Carstens, gor, Alexander Gordeev,
	borntraeger, svens, Herbert Xu, David S . Miller,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, joro, suravee.suthikulpanit, Robin Murphy, dwmw2,
	baolu.lu, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Vlastimil Babka, Roman Gushchin, Hyeonggon Yoo,
	linux-doc, linux-kernel, linux-mm, linux-s390, linux-crypto,
	iommu, Linux-Arch

On Wed, Jan 04, 2023 at 11:36:18AM +0000, Mark Rutland wrote:
> On Tue, Jan 03, 2023 at 05:50:00PM +0100, Arnd Bergmann wrote:
> > On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote:
> > > On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote:
> > >> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote:
> > >> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote:
> > 
> > >> ... makes GCC much happier:
> > >
> > >> ... I'll go check whether clang is happy with that, and how far back that can
> > >> go, otherwise we'll need to blat the high half with a separate constraint that
> > >> (ideally) doesn't end up allocating a pointless address register.
> > >
> > > Hmm... from the commit history it looks like GCC prior to 5.1 might not be
> > > happy with that, but that *might* just be if we actually do arithmetic on the
> > > value, and we might be ok just using it for memory effects. I can't currently
> > > get such an old GCC to run on my machines so I haven't been able to check.
> > 
> > gcc-5.1 is the oldest (barely) supported compiler, the minimum was
> > last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are
> > affected, we're good on mainline but may still want a fix for stable
> > kernels.
> 
> Yup; I just wanted something that would easily backport to stable, at least as
> far as linux-4.9.y (where I couldn't find the minimum GCC version when I looked
> yesterday).

I'd missed that we backported commit:

  dca5244d2f5b94f1 ("compiler.h: Raise minimum version of GCC to 5.1 for arm64")

... all the way back to v4.4.y, so we can assume v5.1 even in stable.

The earliest toolchain I could get running was GCC 4.8.5, and that was happy
with the __uint128_t cast for the asm.

Looking back through the history, the reason for the GCC 5.1 check was that
prior to GCC 5.1 GCC would output library calls for arithmetic on 128-bit
types, as noted in commit:

  fb8722735f50cd51 ("arm64: support __int128 on gcc 5+")

... but since we're not doing any actual manipulation of the value, that should
be fine.
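
To illustrate the distinction (my example, not from the thread): merely
moving a 128-bit value around stays inline, while arithmetic is what old
GCC may lower to libgcc helpers:

__uint128_t load128(__uint128_t *p)
{
	return *p;		/* plain register/memory moves */
}

__uint128_t mul128(__uint128_t a, __uint128_t b)
{
	return a * b;		/* pre-5.1 GCC may emit a __multi3 call */
}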

I'll go write a commit message and send that out as a fix.

> > I checked that the cross-compiler binaries from [1] still work, but I noticed
> > that this version is missing the native aarch64-to-aarch64 compiler (x86 to
> > aarch64 and vice versa are there), and you need to install libmpfr4 [2]
> > as a dependency. The newer compilers (6.5.0 and up) don't have these problems.
> 
> I was trying the old kernel.org crosstool binaries, but I was either missing a
> library (or I have an incompatible version) on my x86_64 host. I'll have
> another look today -- thanks for the pointers!

It turns out I'd just missed that at some point the prefix used by the
kernel.org cross compilers changed from:

  aarch64-linux-gnu-

to:

  aarch64-linux-

... and I'd become so used to the latter that I was trying to invoke a binary
that didn't exist. With the older prefix I could use the kernel.org GCC 4.8.5
without issue.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2023-01-03 19:08     ` Linus Torvalds
  2023-01-04 12:07       ` Heiko Carstens
@ 2023-01-09 16:28       ` Peter Zijlstra
  2023-01-09 22:02         ` Linus Torvalds
  1 sibling, 1 reply; 57+ messages in thread
From: Peter Zijlstra @ 2023-01-09 16:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Heiko Carstens, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Jan 03, 2023 at 11:08:29AM -0800, Linus Torvalds wrote:

> But I *do* note that this patch seems to be the only one that depends
> on the new this_cpu_cmpxchg() updates to make it just automatically do
> the right thing for a 128-bit value. And I have to admit that all
> those games with __pcpu_cast_128() make no sense to me. Why isn't it
> just using "u128" everywhere without any odd _Generic() games?

I ran into a ton of casting trouble when compiling kernel/fork.c which
uses this_cpu_cmpxchg() on a pointer type and the compiler hates casting
pointers to an integer that is not the exact same size.
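
Roughly this (kernel/fork.c caches vm_struct pointers per-CPU):

	if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
		continue;

Every sizeof() arm of the underlying switch gets type-checked, so the
never-taken 16-byte arm ends up casting a pointer to u128, which trips
-Wpointer-to-int-cast without the extra casting games.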


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 07/12] percpu: Wire up cmpxchg128
  2023-01-04 12:09   ` Heiko Carstens
@ 2023-01-09 16:29     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2023-01-09 16:29 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Wed, Jan 04, 2023 at 01:09:16PM +0100, Heiko Carstens wrote:
> On Mon, Dec 19, 2022 at 04:35:32PM +0100, Peter Zijlstra wrote:
> > In order to replace cmpxchg_double() with the newly minted
> > cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
> > 
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ...
> > --- a/arch/s390/include/asm/percpu.h
> > +++ b/arch/s390/include/asm/percpu.h
> > +#define this_cpu_cmpxchg_16(pcp, oval, nval)				\
> > +({									\
> > +	u128 old__ = __pcpu_cast_128((nval), (nval));			\
> > +	u128 new__ = __pcpu_cast_128((oval), (oval));			\
> 
> spot the bug... please merge the below into this patch.

D'oh, luckily I got a fresh pile of brown paper bags for xmas.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}() Peter Zijlstra
                     ` (2 preceding siblings ...)
  2023-01-03 17:12   ` Heiko Carstens
@ 2023-01-09 18:50   ` Mark Rutland
  2023-01-12 10:35     ` Peter Zijlstra
  3 siblings, 1 reply; 57+ messages in thread
From: Mark Rutland @ 2023-01-09 18:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, corbet, will, boqun.feng, catalin.marinas, dennis, tj,
	cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

Hi Peter,

On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote:
> For all architectures that currently support cmpxchg_double()
> implement the cmpxchg128() family of functions that is basically the
> same but with a saner interface.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

I tried giving this a go, specifically your queue core/wip-u128 branch with
HEAD commit c05419246aa69cd3, but it locked up at boot.

I spotted a couple of problems there which also apply here, noted below with
suggested fixes.

> ---
>  arch/arm64/include/asm/atomic_ll_sc.h |   38 +++++++++++++++++++++++
>  arch/arm64/include/asm/atomic_lse.h   |   33 +++++++++++++++++++-
>  arch/arm64/include/asm/cmpxchg.h      |   26 ++++++++++++++++
>  arch/s390/include/asm/cmpxchg.h       |   33 ++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_32.h     |    3 +
>  arch/x86/include/asm/cmpxchg_64.h     |   55 +++++++++++++++++++++++++++++++++-
>  6 files changed, 185 insertions(+), 3 deletions(-)
> 
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -326,6 +326,44 @@ __CMPXCHG_DBL(   ,        ,  ,         )
>  __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
>  
>  #undef __CMPXCHG_DBL
> +
> +union __u128_halves {
> +	u128 full;
> +	struct {
> +		u64 low, high;
> +	};
> +};
> +
> +#define __CMPXCHG128(name, mb, rel, cl)					\
> +static __always_inline u128						\
> +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)	\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +									\
> +	asm volatile("// __cmpxchg128" #name "\n"			\
> +	"	prfm	pstl1strm, %2\n"				\
> +	"1:	ldxp	%0, %1, %2\n"					\
> +	"	eor	%3, %0, %3\n"					\
> +	"	eor	%4, %1, %4\n"					\
> +	"	orr	%3, %4, %3\n"					\
> +	"	cbnz	%3, 2f\n"					\

These lines clobber %3 and %4, but per below, those are input operands, and so this blows up.

> +	"	st" #rel "xp	%w3, %5, %6, %2\n"			\
> +	"	cbnz	%w3, 1b\n"					\
> +	"	" #mb "\n"						\
> +	"2:"								\
> +	: "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr)	\
> +	: "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high)		\
> +	: cl);								\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,        ,  ,         )
> +__CMPXCHG128(_mb, dmb ish, l, "memory")
> +
> +#undef __CMPXCHG128

I think we can do this simpler and more clearly if we use the u128 operand
directly, with the 'H' modifier to get at the high register of the pair:

| #define __CMPXCHG128(name, mb, rel, cl...)                              \
| static __always_inline u128                                             \
| __ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)       \
| {                                                                       \
|         u128 ret;                                                       \
|         unsigned int tmp;                                               \
|                                                                         \
|         asm volatile("// __cmpxchg128" #name "\n"                       \
|         "       prfm    pstl1strm, %[v]\n"                              \
|         "1:     ldxp    %[ret], %H[ret], %[v]\n"                        \
|         "       cmp     %[ret], %[old]\n"                               \
|         "       ccmp    %H[ret], %H[old], #4, ne\n"                     \
|         "       b.ne    2f\n"                                           \
|         "       st" #rel "xp %w[tmp], %[new], %H[new], %[v]\n"          \
|         "       cbnz    %w[tmp], 1b\n"                                  \
|         "       " #mb "\n"                                              \
|         "2:"                                                            \
|         : [ret] "=&r" (ret),                                            \
|           [tmp] "=&r" (tmp),                                            \
|           [v] "+Q" (*ptr)                                               \
|         : [old] "r" (old),                                              \
|           [new] "r" (new)                                               \
|         : "cc" , ##cl);                                                 \
|                                                                         \
|         return ret;                                                     \
| }
| 
| __CMPXCHG128(   ,        ,  )
| __CMPXCHG128(_mb, dmb ish, l, "memory")
| 
| #undef __CMPXCHG128

Note: I've used CMP and CCMP to simplify the equality check, which clobbers the
flags/condition-codes ("cc"), but requires two fewer GPRs. I'm assuming that's
the better tradeoff here.

The existing cmpxchg_double() code clobbers the loaded value as part of
checking whether it was equal, but since we need to preserve the value and
be able to replay the loop (which for hilarious LL/SC reasons *must* be in
asm), we can't do the same here.

I've boot-tested the suggestion with GCC 12.1.0.

> +
>  #undef K
>  
>  #endif	/* __ASM_ATOMIC_LL_SC_H */
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a
>  	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
>  	: [v] "+Q" (v->counter),					\
>  	  [old] "=r" (old)						\
> -	: [i] "r" (i) 							\
> +	: [i] "r" (i)							\
>  	: cl);								\
>  									\
>  	return old;							\

Spurious whitespace change?

> @@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory")
>  
>  #undef __CMPXCHG_DBL
>  
> +#define __CMPXCHG128(name, mb, cl...)					\
> +static __always_inline u128						\
> +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)		\
> +{									\
> +	union __u128_halves r, o = { .full = (old) },			\
> +			       n = { .full = (new) };			\
> +	register unsigned long x0 asm ("x0") = o.low;			\
> +	register unsigned long x1 asm ("x1") = o.high;			\
> +	register unsigned long x2 asm ("x2") = n.low;			\
> +	register unsigned long x3 asm ("x3") = n.high;			\
> +	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
> +									\
> +	asm volatile(							\
> +	__LSE_PREAMBLE							\
> +	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
> +	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
> +	  [v] "+Q" (*(unsigned long *)ptr)				\
> +	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
> +	  [oldval1] "r" (r.low), [oldval2] "r" (r.high)			\
> +	: cl);								\
> +									\
> +	r.low = x0; r.high = x1;					\
> +									\
> +	return r.full;							\
> +}
> +
> +__CMPXCHG128(   ,   )
> +__CMPXCHG128(_mb, al, "memory")
> +
> +#undef __CMPXCHG128

Similarly, I'd suggest:

| #define __CMPXCHG128(name, mb, cl...)                                   \
| static __always_inline u128                                             \
| __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)         \
| {                                                                       \
|         asm volatile(                                                   \
|         __LSE_PREAMBLE                                                  \
|         "       casp" #mb "\t%[old], %H[old], %[new], %H[new], %[v]\n"  \
|         : [old] "+&r" (old),                                            \
|           [v] "+Q" (*(u128 *)ptr)                                       \
|         : [new] "r" (new)                                               \
|         : cl);                                                          \
|                                                                         \
|         return old;                                                     \
| }
| 
| __CMPXCHG128(   ,   )   
| __CMPXCHG128(_mb, al, "memory")
| 
| #undef __CMPXCHG128

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2023-01-09 16:28       ` Peter Zijlstra
@ 2023-01-09 22:02         ` Linus Torvalds
  2023-01-09 22:22           ` H. Peter Anvin
  2023-01-10 10:28           ` Peter Zijlstra
  0 siblings, 2 replies; 57+ messages in thread
From: Linus Torvalds @ 2023-01-09 22:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Heiko Carstens, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Jan 9, 2023 at 10:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> I ran into a ton of casting trouble when compiling kernel/fork.c which
> uses this_cpu_cmpxchg() on a pointer type and the compiler hates casting
> pointers to an integer that is not the exact same size.

Ahh. Yeah - not because that code needs or wants the 128-bit case, but
because the macro expands to all sizes in a switch statement, so the
compiler sees all the cases even if only one is then statically
picked.

So the silly casts are for all the cases that never matter.

Annoying.

I wonder if the "this_cpu_cmpxchg()" macro could be made to use
_Generic() to pick out the pointer case first, and then only use
'sizeof()' for the integer types, so that we don't have this kind of
"every architecture needs to deal with the nasty situation" code.

Ok, it's not actually the this_cpu_cmpxchg() macro, it's
__pcpu_size_call_return() and friends, but whatever.

Another alternative is to try to avoid casting to "u64" as long as
humanly possible, and use only "typeof((*ptr))" everywhere. Then when
the type actually *is* 128-bit, it all works out fine, because it
won't be a pointer. That's the approach the uaccess macros tend to
take, and then they hit the reverse issue on clang, where using the
"byte register" constraints would cause warnings for non-byte
accesses, and we had to do

                unsigned char x_u8__;
                __get_user_asm(x_u8__, ptr, "b", "=q", label);
                (x) = x_u8__;

because using '(x)' directly would then warn when 'x' wasn't a
char-sized thing - even if that asm case never actually was _used_ for
that case, since it was all inside a "switch (sizeof) case 1:"
statement.

            Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2023-01-09 22:02         ` Linus Torvalds
@ 2023-01-09 22:22           ` H. Peter Anvin
  2023-01-10  2:09             ` H. Peter Anvin
  2023-01-10 10:28           ` Peter Zijlstra
  1 sibling, 1 reply; 57+ messages in thread
From: H. Peter Anvin @ 2023-01-09 22:22 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra
  Cc: Heiko Carstens, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On January 9, 2023 2:02:33 PM PST, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>On Mon, Jan 9, 2023 at 10:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> I ran into a ton of casting trouble when compiling kernel/fork.c which
>> uses this_cpu_cmpxchg() on a pointer type and the compiler hates casting
>> pointers to an integer that is not the exact same size.
>
>Ahh. Yeah - not because that code needs or wants the 128-bit case, but
>because the macro expands to all sizes in a switch statement, so the
>compiler sees all the cases even if only one is then statically
>picked.
>
>So the silly casts are for all the cases that never matter.
>
>Annoying.
>
>I wonder if the "this_cpu_cmpxchg()" macro could be made to use
>_Generic() to pick out the pointer case first, and then only use
>'sizeof()' for the integer types, so that we don't have this kind of
>"every architecture needs to deal with the nasty situation" code.
>
>Ok, it's not actually the this_cpu_cmpxchg() macro, it's
>__pcpu_size_call_return() and friends, but whatever.
>
>Another alternative is to try to avoid casting to "u64" as long as
>humanly possible, and use only "typeof((*ptr))" everywhere. Then when
>the type actually *is* 128-bit, it all works out fine, because it
>won't be a pointer. That's the approach the uaccess macros tend to
>take, and then they hit the reverse issue on clang, where using the
>"byte register" constraints would cause warnings for non-byte
>accesses, and we had to do
>
>                unsigned char x_u8__;
>                __get_user_asm(x_u8__, ptr, "b", "=q", label);
>                (x) = x_u8__;
>
>because using '(x)' directly would then warn when 'x' wasn't a
>char-sized thing - even if that asm case never actually was _used_ for
>that case, since it was all inside a "switch (sizeof) case 1:"
>statement.
>
>            Linus

I wrote a crazy macro for dealing with exactly this at one point, basically producing the "right type" to cast to. It would need to have 128-bit support added to it, but that should be trivial. It is called something like int_type() ... not in front of a computer right now so can't double check.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2023-01-09 22:22           ` H. Peter Anvin
@ 2023-01-10  2:09             ` H. Peter Anvin
  0 siblings, 0 replies; 57+ messages in thread
From: H. Peter Anvin @ 2023-01-10  2:09 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra
  Cc: Heiko Carstens, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On 1/9/23 14:22, H. Peter Anvin wrote:
>>
>> Another alternative is to try to avoid casting to "u64" as long as
>> humanly possible, and use only "typeof((*ptr))" everywhere. Then when
>> the type actually *is* 128-bit, it all works out fine, because it
>> won't be a pointer. That's the approach the uaccess macros tend to
>> take, and then they hit the reverse issue on clang, where using the
>> "byte register" constraints would cause warnings for non-byte
>> accesses, and we had to do
>>
>>                 unsigned char x_u8__;
>>                 __get_user_asm(x_u8__, ptr, "b", "=q", label);
>>                 (x) = x_u8__;
>>
>> because using '(x)' directly would then warn when 'x' wasn't a
>> char-sized thing - even if that asm case never actually was _used_ for
>> that case, since it was all inside a "switch (sizeof) case 1:"
>> statement.
>>
>>             Linus
> 

> I wrote a crazy macro for dealing with exactly this at one point,
> basically producing the "right type" to cast to. It would need to
> have 128-bit support added to it, but that should be trivial. It is
> called something like int_type() ... not in front of a computer right
> now so can't double check.
Right, it is called __inttype and is defined in
arch/x86/include/asm/uaccess.h (and, apparently, a few other
architectures; it probably should be centralized).

It has been rewritten since my first version using a nice little macro
called __typefits, which would also be worth centralizing.
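
For reference, the x86 version looks roughly like this (quoted from memory,
so possibly not verbatim):

/* Pick the smallest unsigned integer type that x fits in: */
#define __typefits(x,type,not) \
	__builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)

#define __inttype(x) \
	__typeof__(__typefits(x,char, \
		   __typefits(x,short, \
		   __typefits(x,int, \
		   __typefits(x,long,0ULL)))))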

	-hpa

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128()
  2022-12-19 15:35 ` [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128() Peter Zijlstra
@ 2023-01-10  7:23   ` Heiko Carstens
  2023-01-10  8:32     ` Peter Zijlstra
  0 siblings, 1 reply; 57+ messages in thread
From: Heiko Carstens @ 2023-01-10  7:23 UTC (permalink / raw)
  To: Peter Zijlstra, Thomas Richter
  Cc: torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Dec 19, 2022 at 04:35:33PM +0100, Peter Zijlstra wrote:
> In order to deprecate cmpxchg_double(), replace all its usage with
> cmpxchg128().
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/s390/include/asm/cpu_mf.h  |   29 ++++++++++++-----
>  arch/s390/kernel/perf_cpum_sf.c |   65 +++++++++++++++++++++++++---------------
>  2 files changed, 63 insertions(+), 31 deletions(-)

So, Alexander Gordeev reported that this code was potentially broken even
prior to your changes, due to the missing READ_ONCE() within the
cmpxchg_double() loops.

In order to fix that and have a patch that can be backported I would go
with something like the patch below, which I would also plan to send for
-rc4, unless there are objections.

This can then easily be converted to the new cmpxchg128() later.


From 7b271f42946b306620a748c0da5f07f8c786888d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Thu, 5 Jan 2023 15:44:20 +0100
Subject: [PATCH] s390/cpum_sf: add READ_ONCE() semantics to compare and swap
 loops

The current cmpxchg_double() loops within the perf hw sampling code do not
have READ_ONCE() semantics to read the old value from memory. This allows
the compiler to generate code which reads the "old" value several times
from memory, which again allows for inconsistencies.

For example:

        /* Reset trailer (using compare-double-and-swap) */
        do {
                te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
                te_flags |= SDB_TE_ALERT_REQ_MASK;
        } while (!cmpxchg_double(&te->flags, &te->overflow,
                 te->flags, te->overflow,
                 te_flags, 0ULL));

The compiler could generate code where te->flags used within the
cmpxchg_double() call is refetched from memory and is not necessarily
identical to the previously read value that was used to generate
te_flags. This in turn means that an incorrect update could happen.

Fix this by adding READ_ONCE() semantics to all cmpxchg_double()
loops. Given that READ_ONCE() cannot generate code on s390 which atomically
reads 16 bytes, use a private compare-and-swap-double implementation to
achieve that.

Also replace cmpxchg_double() with the private implementation to be able to
re-use the old value within the loops.

As a side effect this converts the whole code to only use bit fields
to read and modify bits within the hws trailer header.

Reported-by: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Hendrik Brueckner <brueckner@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/include/asm/cpu_mf.h  |  31 +++++-----
 arch/s390/kernel/perf_cpum_sf.c | 101 ++++++++++++++++++++------------
 2 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index feaba12dbecb..efa103b52a1a 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -131,19 +131,21 @@ struct hws_combined_entry {
 	struct hws_diag_entry	diag;	/* Diagnostic-sampling data entry */
 } __packed;
 
-struct hws_trailer_entry {
-	union {
-		struct {
-			unsigned int f:1;	/* 0 - Block Full Indicator   */
-			unsigned int a:1;	/* 1 - Alert request control  */
-			unsigned int t:1;	/* 2 - Timestamp format	      */
-			unsigned int :29;	/* 3 - 31: Reserved	      */
-			unsigned int bsdes:16;	/* 32-47: size of basic SDE   */
-			unsigned int dsdes:16;	/* 48-63: size of diagnostic SDE */
-		};
-		unsigned long long flags;	/* 0 - 63: All indicators     */
+union hws_trailer_header {
+	struct {
+		unsigned int f:1;	/* 0 - Block Full Indicator   */
+		unsigned int a:1;	/* 1 - Alert request control  */
+		unsigned int t:1;	/* 2 - Timestamp format	      */
+		unsigned int :29;	/* 3 - 31: Reserved	      */
+		unsigned int bsdes:16;	/* 32-47: size of basic SDE   */
+		unsigned int dsdes:16;	/* 48-63: size of diagnostic SDE */
+		unsigned long long overflow; /* 64 - Overflow Count   */
 	};
-	unsigned long long overflow;	 /* 64 - sample Overflow count	      */
+	__uint128_t val;
+};
+
+struct hws_trailer_entry {
+	union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count     */
 	unsigned char timestamp[16];	 /* 16 - 31 timestamp		      */
 	unsigned long long reserved1;	 /* 32 -Reserved		      */
 	unsigned long long reserved2;	 /*				      */
@@ -290,14 +292,11 @@ static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
 	return USEC_PER_SEC * qsi->cpu_speed / rate;
 }
 
-#define SDB_TE_ALERT_REQ_MASK	0x4000000000000000UL
-#define SDB_TE_BUFFER_FULL_MASK 0x8000000000000000UL
-
 /* Return TOD timestamp contained in an trailer entry */
 static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
 {
 	/* TOD in STCKE format */
-	if (te->t)
+	if (te->header.t)
 		return *((unsigned long long *) &te->timestamp[1]);
 
 	/* TOD in STCK format */
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 332a49965130..ce886a03545a 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -163,14 +163,15 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
 
 static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
 {
-	unsigned long sdb, *trailer;
+	struct hws_trailer_entry *te;
+	unsigned long sdb;
 
 	/* Allocate and initialize sample-data-block */
 	sdb = get_zeroed_page(gfp_flags);
 	if (!sdb)
 		return -ENOMEM;
-	trailer = trailer_entry_ptr(sdb);
-	*trailer = SDB_TE_ALERT_REQ_MASK;
+	te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+	te->header.a = 1;
 
 	/* Link SDB into the sample-data-block-table */
 	*sdbt = sdb;
@@ -1206,7 +1207,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
 					    "%s: Found unknown"
 					    " sampling data entry: te->f %i"
 					    " basic.def %#4x (%p)\n", __func__,
-					    te->f, sample->def, sample);
+					    te->header.f, sample->def, sample);
 			/* Sample slot is not yet written or other record.
 			 *
 			 * This condition can occur if the buffer was reused
@@ -1217,7 +1218,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
 			 * that are not full.  Stop processing if the first
 			 * invalid format was detected.
 			 */
-			if (!te->f)
+			if (!te->header.f)
 				break;
 		}
 
@@ -1227,6 +1228,16 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
 	}
 }
 
+static inline __uint128_t __cdsg(__uint128_t *ptr, __uint128_t old, __uint128_t new)
+{
+	asm volatile(
+		"	cdsg	%[old],%[new],%[ptr]\n"
+		: [old] "+d" (old), [ptr] "+QS" (*ptr)
+		: [new] "d" (new)
+		: "memory", "cc");
+	return old;
+}
+
 /* hw_perf_event_update() - Process sampling buffer
  * @event:	The perf event
  * @flush_all:	Flag to also flush partially filled sample-data-blocks
@@ -1243,10 +1254,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
  */
 static void hw_perf_event_update(struct perf_event *event, int flush_all)
 {
+	unsigned long long event_overflow, sampl_overflow, num_sdb;
+	union hws_trailer_header old, prev, new;
 	struct hw_perf_event *hwc = &event->hw;
 	struct hws_trailer_entry *te;
 	unsigned long *sdbt;
-	unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
 	int done;
 
 	/*
@@ -1266,25 +1278,25 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
 		te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
 
 		/* Leave loop if no more work to do (block full indicator) */
-		if (!te->f) {
+		if (!te->header.f) {
 			done = 1;
 			if (!flush_all)
 				break;
 		}
 
 		/* Check the sample overflow count */
-		if (te->overflow)
+		if (te->header.overflow)
 			/* Account sample overflows and, if a particular limit
 			 * is reached, extend the sampling buffer.
 			 * For details, see sfb_account_overflows().
 			 */
-			sampl_overflow += te->overflow;
+			sampl_overflow += te->header.overflow;
 
 		/* Timestamps are valid for full sample-data-blocks only */
 		debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx "
 				    "overflow %llu timestamp %#llx\n",
-				    __func__, (unsigned long)sdbt, te->overflow,
-				    (te->f) ? trailer_timestamp(te) : 0ULL);
+				    __func__, (unsigned long)sdbt, te->header.overflow,
+				    (te->header.f) ? trailer_timestamp(te) : 0ULL);
 
 		/* Collect all samples from a single sample-data-block and
 		 * flag if an (perf) event overflow happened.  If so, the PMU
@@ -1294,12 +1306,16 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
 		num_sdb++;
 
 		/* Reset trailer (using compare-double-and-swap) */
+		/* READ_ONCE() 16 byte header */
+		prev.val = __cdsg(&te->header.val, 0, 0);
 		do {
-			te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
-			te_flags |= SDB_TE_ALERT_REQ_MASK;
-		} while (!cmpxchg_double(&te->flags, &te->overflow,
-					 te->flags, te->overflow,
-					 te_flags, 0ULL));
+			old.val = prev.val;
+			new.val = prev.val;
+			new.f = 0;
+			new.a = 1;
+			new.overflow = 0;
+			prev.val = __cdsg(&te->header.val, old.val, new.val);
+		} while (prev.val != old.val);
 
 		/* Advance to next sample-data-block */
 		sdbt++;
@@ -1384,7 +1400,7 @@ static void aux_output_end(struct perf_output_handle *handle)
 	range_scan = AUX_SDB_NUM_ALERT(aux);
 	for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
 		te = aux_sdb_trailer(aux, idx);
-		if (!(te->flags & SDB_TE_BUFFER_FULL_MASK))
+		if (!te->header.f)
 			break;
 	}
 	/* i is num of SDBs which are full */
@@ -1392,7 +1408,7 @@ static void aux_output_end(struct perf_output_handle *handle)
 
 	/* Remove alert indicators in the buffer */
 	te = aux_sdb_trailer(aux, aux->alert_mark);
-	te->flags &= ~SDB_TE_ALERT_REQ_MASK;
+	te->header.a = 0;
 
 	debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n",
 			    __func__, i, range_scan, aux->head);
@@ -1437,9 +1453,9 @@ static int aux_output_begin(struct perf_output_handle *handle,
 		idx = aux->empty_mark + 1;
 		for (i = 0; i < range_scan; i++, idx++) {
 			te = aux_sdb_trailer(aux, idx);
-			te->flags &= ~(SDB_TE_BUFFER_FULL_MASK |
-				       SDB_TE_ALERT_REQ_MASK);
-			te->overflow = 0;
+			te->header.f = 0;
+			te->header.a = 0;
+			te->header.overflow = 0;
 		}
 		/* Save the position of empty SDBs */
 		aux->empty_mark = aux->head + range - 1;
@@ -1448,7 +1464,7 @@ static int aux_output_begin(struct perf_output_handle *handle,
 	/* Set alert indicator */
 	aux->alert_mark = aux->head + range/2 - 1;
 	te = aux_sdb_trailer(aux, aux->alert_mark);
-	te->flags = te->flags | SDB_TE_ALERT_REQ_MASK;
+	te->header.a = 1;
 
 	/* Reset hardware buffer head */
 	head = AUX_SDB_INDEX(aux, aux->head);
@@ -1475,14 +1491,17 @@ static int aux_output_begin(struct perf_output_handle *handle,
 static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
 			  unsigned long long *overflow)
 {
-	unsigned long long orig_overflow, orig_flags, new_flags;
+	union hws_trailer_header old, prev, new;
 	struct hws_trailer_entry *te;
 
 	te = aux_sdb_trailer(aux, alert_index);
+	/* READ_ONCE() 16 byte header */
+	prev.val = __cdsg(&te->header.val, 0, 0);
 	do {
-		orig_flags = te->flags;
-		*overflow = orig_overflow = te->overflow;
-		if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+		old.val = prev.val;
+		new.val = prev.val;
+		*overflow = old.overflow;
+		if (old.f) {
 			/*
 			 * SDB is already set by hardware.
 			 * Abort and try to set somewhere
@@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
 			 */
 			return false;
 		}
-		new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
-	} while (!cmpxchg_double(&te->flags, &te->overflow,
-				 orig_flags, orig_overflow,
-				 new_flags, 0ULL));
+		new.a = 1;
+		new.overflow = 0;
+		prev.val = __cdsg(&te->header.val, old.val, new.val);
+	} while (prev.val != old.val);
 	return true;
 }
 
@@ -1522,8 +1541,9 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
 static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
 			     unsigned long long *overflow)
 {
-	unsigned long long orig_overflow, orig_flags, new_flags;
 	unsigned long i, range_scan, idx, idx_old;
+	union hws_trailer_header old, prev, new;
+	unsigned long long orig_overflow;
 	struct hws_trailer_entry *te;
 
 	debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
@@ -1554,17 +1574,20 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
 	idx_old = idx = aux->empty_mark + 1;
 	for (i = 0; i < range_scan; i++, idx++) {
 		te = aux_sdb_trailer(aux, idx);
+		/* READ_ONCE() 16 byte header */
+		prev.val = __cdsg(&te->header.val, 0, 0);
 		do {
-			orig_flags = te->flags;
-			orig_overflow = te->overflow;
-			new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+			old.val = prev.val;
+			new.val = prev.val;
+			orig_overflow = old.overflow;
+			new.f = 0;
+			new.overflow = 0;
 			if (idx == aux->alert_mark)
-				new_flags |= SDB_TE_ALERT_REQ_MASK;
+				new.a = 1;
 			else
-				new_flags &= ~SDB_TE_ALERT_REQ_MASK;
-		} while (!cmpxchg_double(&te->flags, &te->overflow,
-					 orig_flags, orig_overflow,
-					 new_flags, 0ULL));
+				new.a = 0;
+			prev.val = __cdsg(&te->header.val, old.val, new.val);
+		} while (prev.val != old.val);
 		*overflow += orig_overflow;
 	}
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128()
  2023-01-10  7:23   ` Heiko Carstens
@ 2023-01-10  8:32     ` Peter Zijlstra
  2023-01-10 11:27       ` Mark Rutland
  2023-01-10 11:46       ` Heiko Carstens
  0 siblings, 2 replies; 57+ messages in thread
From: Peter Zijlstra @ 2023-01-10  8:32 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Thomas Richter, torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Jan 10, 2023 at 08:23:05AM +0100, Heiko Carstens wrote:

> So, Alexander Gordeev reported that this code was potentially broken even
> prior to your changes, due to the missing READ_ONCE() within the
> cmpxchg_double() loops.

Unless there's an early exit, that shouldn't matter. If you managed to
read garbage the cmpxchg itself will simply fail and the loop retries.

> @@ -1294,12 +1306,16 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
>  		num_sdb++;
>  
>  		/* Reset trailer (using compare-double-and-swap) */
> +		/* READ_ONCE() 16 byte header */
> +		prev.val = __cdsg(&te->header.val, 0, 0);
>  		do {
> +			old.val = prev.val;
> +			new.val = prev.val;
> +			new.f = 0;
> +			new.a = 1;
> +			new.overflow = 0;
> +			prev.val = __cdsg(&te->header.val, old.val, new.val);
> +		} while (prev.val != old.val);

So this, and

> +		/* READ_ONCE() 16 byte header */
> +		prev.val = __cdsg(&te->header.val, 0, 0);
>  		do {
> +			old.val = prev.val;
> +			new.val = prev.val;
> +			orig_overflow = old.overflow;
> +			new.f = 0;
> +			new.overflow = 0;
>  			if (idx == aux->alert_mark)
> +				new.a = 1;
>  			else
> +				new.a = 0;
> +			prev.val = __cdsg(&te->header.val, old.val, new.val);
> +		} while (prev.val != old.val);

this case are just silly and expensive. If that initial read is split
and manages to read gibberish the cmpxchg will fail and we retry anyway.

> +	/* READ_ONCE() 16 byte header */
> +	prev.val = __cdsg(&te->header.val, 0, 0);
>  	do {
> +		old.val = prev.val;
> +		new.val = prev.val;
> +		*overflow = old.overflow;
> +		if (old.f) {
>  			/*
>  			 * SDB is already set by hardware.
>  			 * Abort and try to set somewhere
> @@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
>  			 */
>  			return false;
>  		}
> +		new.a = 1;
> +		new.overflow = 0;
> +		prev.val = __cdsg(&te->header.val, old.val, new.val);
> +	} while (prev.val != old.val);


And while this case has an early exit, it only cares about a single bit
(although you made it a full word) and so also shouldn't care. If
aux_reset_buffer() returns false, @overflow isn't consumed.


So I really don't see the point of this patch.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
  2023-01-09 22:02         ` Linus Torvalds
  2023-01-09 22:22           ` H. Peter Anvin
@ 2023-01-10 10:28           ` Peter Zijlstra
  1 sibling, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2023-01-10 10:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Heiko Carstens, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Jan 09, 2023 at 04:02:33PM -0600, Linus Torvalds wrote:
> On Mon, Jan 9, 2023 at 10:29 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > I ran into a ton of casting trouble when compiling kernel/fork.c which
> > uses this_cpu_cmpxchg() on a pointer type and the compiler hates casting
> > pointers to an integer that is not the exact same size.
> 
> Ahh. Yeah - not because that code needs or wants the 128-bit case, but
> because the macro expands to all sizes in a switch statement, so the
> compiler sees all the cases even if only one is then statically
> picked.
> 
> So the silly casts are for all the cases that never matter.
> 
> Annoying.

Yes, very.

This seems to compile (and boot). Let me go update the others and push
it out for the robots to have a go.

#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)		\
({									\
	union {								\
		typeof(_var) full;					\
		struct {						\
			u64 low, high;					\
		};							\
	} old__, new__;							\
									\
	old__.full = _oval;						\
	new__.full = _nval;						\
									\
	asm qual ("cmpxchg16b " __percpu_arg([var])			\
		  : [var] "+m" (_var),					\
		    "+a" (old__.low),					\
		    "+d" (old__.high)					\
		  : "b" (new__.low),					\
		    "c" (new__.high)					\
		  : "memory");						\
									\
	old__.full;							\
})
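
with the size variants presumably wired up along the usual lines (my
sketch, not necessarily the exact names):

#define raw_cpu_cmpxchg128(pcp, oval, nval) \
	percpu_cmpxchg128_op(16,         , pcp, oval, nval)
#define this_cpu_cmpxchg128(pcp, oval, nval) \
	percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)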

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128()
  2023-01-10  8:32     ` Peter Zijlstra
@ 2023-01-10 11:27       ` Mark Rutland
  2023-01-10 11:46       ` Heiko Carstens
  1 sibling, 0 replies; 57+ messages in thread
From: Mark Rutland @ 2023-01-10 11:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Heiko Carstens, Thomas Richter, torvalds, corbet, will,
	boqun.feng, catalin.marinas, dennis, tj, cl, gor, agordeev,
	borntraeger, svens, Herbert Xu, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, joro, suravee.suthikulpanit, robin.murphy,
	dwmw2, baolu.lu, Arnd Bergmann, penberg, rientjes,
	iamjoonsoo.kim, Andrew Morton, vbabka, roman.gushchin, 42.hyeyoo,
	linux-doc, linux-kernel, linux-mm, linux-s390, linux-crypto,
	iommu, linux-arch

On Tue, Jan 10, 2023 at 09:32:55AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 10, 2023 at 08:23:05AM +0100, Heiko Carstens wrote:
> 
> > So, Alexander Gordeev reported that this code was already prior to your
> > changes potentially broken with respect to missing READ_ONCE() within the
> > cmpxchg_double() loops.
> 
> Unless there's an early exit, that shouldn't matter. If you managed to
> read garbage the cmpxchg itself will simply fail and the loop retries.

I don't think that's true; without READ_ONCE() the compiler could (but is
very unlikely to) read multiple times, and that could cause problems.

For example:

| 	prev = *ptr;
| 
| 	do {
| 		new = some_function_of(prev);
| 		old = cmpxchg(ptr, prev, new);
| 	} while (old != prev);

Could effectively become:

| 	prev1 = *ptr;
|	prev2 = *ptr;
|
| 	do {
| 		new = some_function_of(prev1)
| 		old = cmpxchg(ptr, prev2, new);
| 	} while (old != prev2);

... which would effectively update from a stale value, throwing away prev2.
On top of that, the two generated reads could happen in either order.

So I do think it's warranted to use READ_ONCE() for the prev value feeding into
a cmpxchg operation, even if that's only for the "once" part rather than lack
of tearing.
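
FWIW, written with try_cmpxchg() -- which updates the expected value on
failure -- the loop only needs the one explicit load, and that load is
exactly where the READ_ONCE() belongs. A sketch:

| 	old = READ_ONCE(*ptr);
| 	do {
| 		new = some_function_of(old);
| 	} while (!try_cmpxchg(ptr, &old, new));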

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128()
  2023-01-10  8:32     ` Peter Zijlstra
  2023-01-10 11:27       ` Mark Rutland
@ 2023-01-10 11:46       ` Heiko Carstens
  2023-01-12 11:12         ` Alexander Gordeev
  1 sibling, 1 reply; 57+ messages in thread
From: Heiko Carstens @ 2023-01-10 11:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Richter, torvalds, corbet, will, boqun.feng, mark.rutland,
	catalin.marinas, dennis, tj, cl, gor, agordeev, borntraeger,
	svens, Herbert Xu, davem, tglx, mingo, bp, dave.hansen, x86, hpa,
	joro, suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Tue, Jan 10, 2023 at 09:32:55AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 10, 2023 at 08:23:05AM +0100, Heiko Carstens wrote:
> > So, Alexander Gordeev reported that this code was already prior to your
> > changes potentially broken with respect to missing READ_ONCE() within the
> > cmpxchg_double() loops.
> 
> Unless there's an early exit, that shouldn't matter. If you managed to
> read garbage the cmpxchg itself will simply fail and the loop retries.
> 
> > @@ -1294,12 +1306,16 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
> >  		num_sdb++;
> >  
> >  		/* Reset trailer (using compare-double-and-swap) */
> > +		/* READ_ONCE() 16 byte header */
> > +		prev.val = __cdsg(&te->header.val, 0, 0);
> >  		do {
> > +			old.val = prev.val;
> > +			new.val = prev.val;
> > +			new.f = 0;
> > +			new.a = 1;
> > +			new.overflow = 0;
> > +			prev.val = __cdsg(&te->header.val, old.val, new.val);
> > +		} while (prev.val != old.val);
> 
> So this, and
...
> this case are just silly and expensive. If that initial read is split
> and manages to read gibberish the cmpxchg will fail and we retry anyway.

While I do agree that there is no need to read the whole 16 bytes
atomically in advance here, there is still the problem of the missing
initial READ_ONCE() in the original code.
As I tried to outline here:

    For example:
    
            /* Reset trailer (using compare-double-and-swap) */
            do {
                    te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
                    te_flags |= SDB_TE_ALERT_REQ_MASK;
            } while (!cmpxchg_double(&te->flags, &te->overflow,
                     te->flags, te->overflow,
                     te_flags, 0ULL));
    
    The compiler could generate code where te->flags, as used within the
    cmpxchg_double() call, is refetched from memory and is therefore not
    necessarily identical to the previously read value that was used to
    generate te_flags. Which in turn means that an incorrect update could
    happen.

Is there anything that prevents te->flags from being read several times?
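
Keeping the old cmpxchg_double() structure, the minimal fix I would have
expected looks something like this (sketch only; te_overflow and
te_flags_new are new locals):

        do {
                te_flags = READ_ONCE(te->flags);
                te_overflow = READ_ONCE(te->overflow);
                te_flags_new = (te_flags & ~SDB_TE_BUFFER_FULL_MASK) |
                               SDB_TE_ALERT_REQ_MASK;
        } while (!cmpxchg_double(&te->flags, &te->overflow,
                                 te_flags, te_overflow,
                                 te_flags_new, 0ULL));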

> > +	/* READ_ONCE() 16 byte header */
> > +	prev.val = __cdsg(&te->header.val, 0, 0);
> >  	do {
> > +		old.val = prev.val;
> > +		new.val = prev.val;
> > +		*overflow = old.overflow;
> > +		if (old.f) {
> >  			/*
> >  			 * SDB is already set by hardware.
> >  			 * Abort and try to set somewhere
> > @@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
> >  			 */
> >  			return false;
> >  		}
> > +		new.a = 1;
> > +		new.overflow = 0;
> > +		prev.val = __cdsg(&te->header.val, old.val, new.val);
> > +	} while (prev.val != old.val);
> 
> And while this case has an early exit, it only cares about a single bit
> (although you made it a full word) and so also shouldn't care. If
> aux_reset_buffer() returns false, @overflow isn't consumed.

Yes, except that it is anything but obvious that @overflow isn't consumed.

> So I really don't see the point of this patch.

As stated above: READ_ONCE() is missing. And while at it I wanted to have a
complete, consistent previous value - also considering that cdsg is not very
expensive.
The new code also reuses the values returned by cdsg, instead of throwing
them away and reading from memory again in a split and potentially
inconsistent way.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}()
  2023-01-09 18:50   ` Mark Rutland
@ 2023-01-12 10:35     ` Peter Zijlstra
  0 siblings, 0 replies; 57+ messages in thread
From: Peter Zijlstra @ 2023-01-12 10:35 UTC (permalink / raw)
  To: Mark Rutland
  Cc: torvalds, corbet, will, boqun.feng, catalin.marinas, dennis, tj,
	cl, hca, gor, agordeev, borntraeger, svens, Herbert Xu, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, joro,
	suravee.suthikulpanit, robin.murphy, dwmw2, baolu.lu,
	Arnd Bergmann, penberg, rientjes, iamjoonsoo.kim, Andrew Morton,
	vbabka, roman.gushchin, 42.hyeyoo, linux-doc, linux-kernel,
	linux-mm, linux-s390, linux-crypto, iommu, linux-arch

On Mon, Jan 09, 2023 at 06:50:24PM +0000, Mark Rutland wrote:

> Similarly, I'd suggest:
> 
> | #define __CMPXCHG128(name, mb, cl...)                                   \
> | static __always_inline u128                                             \
> | __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)         \
> | {                                                                       \
> |         asm volatile(                                                   \
> |         __LSE_PREAMBLE                                                  \
> |         "       casp" #mb "\t%[old], %H[old], %[new], %H[new], %[v]\n"  \
> |         : [old] "+&r" (old),                                            \
> |           [v] "+Q" (*(u128 *)ptr)                                       \
> |         : [new] "r" (new)                                               \
> |         : cl);                                                          \
> |                                                                         \
> |         return old;                                                     \
> | }
> | 
> | __CMPXCHG128(   ,   )   
> | __CMPXCHG128(_mb, al, "memory")
> | 
> | #undef __CMPXCHG128

clang-16 seems to hate on this like:

arch/arm64/include/asm/atomic_lse.h:342:1: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
arch/arm64/include/asm/atomic_lse.h:334:17: note: expanded from macro '__CMPXCHG128'
	: [old] "+&r" (old),                                            \
		       ^

(much the same for the ll_sc version; if you want the full build thing,
holler and I'll bounce you the robot mail).

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128()
  2023-01-10 11:46       ` Heiko Carstens
@ 2023-01-12 11:12         ` Alexander Gordeev
  0 siblings, 0 replies; 57+ messages in thread
From: Alexander Gordeev @ 2023-01-12 11:12 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Peter Zijlstra, Thomas Richter, torvalds, corbet, will,
	boqun.feng, mark.rutland, catalin.marinas, dennis, tj, cl, gor,
	borntraeger, svens, Herbert Xu, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, joro, suravee.suthikulpanit, robin.murphy,
	dwmw2, baolu.lu, Arnd Bergmann, penberg, rientjes,
	iamjoonsoo.kim, Andrew Morton, vbabka, roman.gushchin, 42.hyeyoo,
	linux-doc, linux-kernel, linux-mm, linux-s390, linux-crypto,
	iommu, linux-arch

On Tue, Jan 10, 2023 at 12:46:44PM +0100, Heiko Carstens wrote:
> > > +	/* READ_ONCE() 16 byte header */
> > > +	prev.val = __cdsg(&te->header.val, 0, 0);
> > >  	do {
> > > +		old.val = prev.val;
> > > +		new.val = prev.val;
> > > +		*overflow = old.overflow;

I guess it would also make sense to place the write to *overflow
after the while loop, so the output variable is left intact in
case the function bails out. Not sure whether that should be part
of this patch though.
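
I.e. roughly (sketch):

	do {
		...
		prev.val = __cdsg(&te->header.val, old.val, new.val);
	} while (prev.val != old.val);

	*overflow = old.overflow;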

> > > +		if (old.f) {
> > >  			/*
> > >  			 * SDB is already set by hardware.
> > >  			 * Abort and try to set somewhere
> > > @@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
> > >  			 */
> > >  			return false;
> > >  		}
> > > +		new.a = 1;
> > > +		new.overflow = 0;
> > > +		prev.val = __cdsg(&te->header.val, old.val, new.val);
> > > +	} while (prev.val != old.val);
> > 
> > And while this case has an early exit, it only cares about a single bit
> > (although you made it a full word) and so also shouldn't care. If
> > aux_reset_buffer() returns false, @overflow isn't consumed.
> 
> Yes, except that it is anything but obvious that @overflow isn't consumed.

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2023-01-12 11:24 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 01/12] crypto: Remove u128 usage Peter Zijlstra
2022-12-19 15:56   ` Jason A. Donenfeld
2022-12-19 17:00     ` Peter Zijlstra
2022-12-19 17:03       ` Jason A. Donenfeld
2022-12-20  3:50         ` Herbert Xu
2022-12-20  4:11           ` H. Peter Anvin
2022-12-20  4:15             ` Herbert Xu
2022-12-19 15:35 ` [RFC][PATCH 02/12] crypto/ghash-clmulni: Use (struct) be128 Peter Zijlstra
2022-12-20  5:45   ` Eric Biggers
2022-12-19 15:35 ` [RFC][PATCH 03/12] cyrpto/b128ops: Remove struct u128 Peter Zijlstra
2022-12-20  5:52   ` Eric Biggers
2022-12-19 15:35 ` [RFC][PATCH 04/12] types: Introduce [us]128 Peter Zijlstra
2022-12-29  8:30   ` Pavel Machek
2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}() Peter Zijlstra
2022-12-19 20:07   ` Boqun Feng
2022-12-20 11:08     ` Peter Zijlstra
2022-12-20 14:31       ` Linus Torvalds
2022-12-20 15:09         ` Peter Zijlstra
2023-01-03 13:25       ` Mark Rutland
2023-01-03 14:03         ` Mark Rutland
2023-01-03 16:19           ` Mark Rutland
2023-01-03 16:50             ` Arnd Bergmann
2023-01-04 11:36               ` Mark Rutland
2023-01-04 13:55                 ` Mark Rutland
2022-12-22  1:25   ` Boqun Feng
2022-12-22 13:16     ` Peter Zijlstra
2023-01-03 17:12   ` Heiko Carstens
2023-01-09 18:50   ` Mark Rutland
2023-01-12 10:35     ` Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 06/12] instrumentation: Wire up cmpxchg128() Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 07/12] percpu: Wire up cmpxchg128 Peter Zijlstra
2022-12-29 13:36   ` Arnd Bergmann
2023-01-04 12:09   ` Heiko Carstens
2023-01-09 16:29     ` Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128() Peter Zijlstra
2023-01-10  7:23   ` Heiko Carstens
2023-01-10  8:32     ` Peter Zijlstra
2023-01-10 11:27       ` Mark Rutland
2023-01-10 11:46       ` Heiko Carstens
2023-01-12 11:12         ` Alexander Gordeev
2022-12-19 15:35 ` [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double() Peter Zijlstra
2022-12-19 16:47   ` Niklas Schnelle
2022-12-28  8:40   ` Vasant Hegde
2022-12-19 15:35 ` [RFC][PATCH 10/12] x86,intel_iommu: " Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 11/12] slub: " Peter Zijlstra
2023-01-03 15:58   ` Vlastimil Babka
2023-01-03 17:16   ` Heiko Carstens
2023-01-03 19:08     ` Linus Torvalds
2023-01-04 12:07       ` Heiko Carstens
2023-01-09 16:28       ` Peter Zijlstra
2023-01-09 22:02         ` Linus Torvalds
2023-01-09 22:22           ` H. Peter Anvin
2023-01-10  2:09             ` H. Peter Anvin
2023-01-10 10:28           ` Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 12/12] arch: Remove cmpxchg_double Peter Zijlstra
2022-12-22  1:21 ` [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Boqun Feng
