[PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks

linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks
@ 2018-11-13 23:39 Ard Biesheuvel
  2018-11-13 23:39 ` [PATCH 1/3] arm64/atomics: refactor LL/SC base asm templates Ard Biesheuvel
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2018-11-13 23:39 UTC (permalink / raw)
  To: linux-arm-kernel

Refactor the LL/SC atomics code so we can emit the LL/SC fallbacks for the
LSE atomics as subsections that get instantiated at each call site rather
than as out of line functions that get called from inline asm (without the
awareness of the compiler).

This should allow slightly better LSE code, and removes stack spilling and
potential PLT indirection for the LL/SC fallbacks.

Ard Biesheuvel (3):
  arm64/atomics: refactor LL/SC base asm templates
  arm64/atomics: use subsections for out of line LL/SC alternatives
  arm64/atomics: remove out of line LL/SC alternatives

 arch/arm64/include/asm/atomic_ll_sc.h | 190 ++++---
 arch/arm64/include/asm/atomic_lse.h   | 558 ++++++++++----------
 arch/arm64/include/asm/lse.h          |  13 -
 arch/arm64/lib/Makefile               |  19 -
 arch/arm64/lib/atomic_ll_sc.c         |   3 -
 5 files changed, 372 insertions(+), 411 deletions(-)
 delete mode 100644 arch/arm64/lib/atomic_ll_sc.c

-- 
2.17.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/3] arm64/atomics: refactor LL/SC base asm templates
  2018-11-13 23:39 [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Ard Biesheuvel
@ 2018-11-13 23:39 ` Ard Biesheuvel
  2018-11-13 23:39 ` [PATCH 2/3] arm64/atomics: use subsections for out of line LL/SC alternatives Ard Biesheuvel
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2018-11-13 23:39 UTC (permalink / raw)
  To: linux-arm-kernel

Refactor the asm templates that emit the LL/SC instruction
sequences so that we will be able to reuse them in the LSE
code, which will emit them out of line, but without the use
of function calls.

This involves factoring out the core instruction sequences
and using named operands throughout.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/atomic_ll_sc.h | 139 ++++++++++----------
 1 file changed, 72 insertions(+), 67 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index f5a2d09afb38..5f55f6b8dd7e 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -36,6 +36,51 @@
  * this file, which unfortunately don't work on a per-function basis
  * (the optimize attribute silently ignores these options).
  */
+#define __LL_SC_ATOMIC_OP(asm_op, w)					\
+"	prfm		pstl1strm, %[v]				\n"	\
+"1:	ldxr		%" #w "[res], %[v]			\n"	\
+"	" #asm_op "	%" #w "[res], %" #w "[res], %" #w "[i]	\n"	\
+"	stxr		%w[tmp], %" #w "[res], %[v]		\n"	\
+"	cbnz		%w[tmp], 1b"
+
+#define __LL_SC_ATOMIC_OP_RETURN(asm_op, mb, acq, rel, w)		\
+"	prfm		pstl1strm, %[v]				\n"	\
+"1:	ld" #acq "xr	%" #w "[res], %[v]			\n"	\
+"	" #asm_op "	%" #w "[res], %" #w "[res], %" #w "[i]	\n"	\
+"	st" #rel "xr	%w[tmp], %" #w "[res], %[v]		\n"	\
+"	cbnz		%w[tmp], 1b				\n"	\
+"	" #mb
+
+#define __LL_SC_ATOMIC_FETCH_OP(asm_op, mb, acq, rel, w)		\
+"	prfm		pstl1strm, %[v]				\n"	\
+"1:	ld" #acq "xr	%" #w "[res], %[v]			\n"	\
+"	" #asm_op "	%" #w "[val], %" #w "[res], %" #w "[i]	\n"	\
+"	st" #rel "xr	%w[tmp], %" #w "[val], %[v]		\n"	\
+"	cbnz		%w[tmp], 1b				\n"	\
+"	" #mb								\
+
+#define __LL_SC_CMPXCHG_BASE_OP(w, sz, name, mb, acq, rel)		\
+"	prfm			pstl1strm, %[v]			\n"	\
+"1:	ld" #acq "xr" #sz "	%" #w "[oldval], %[v]		\n"	\
+"	eor			%" #w "[tmp], %" #w "[oldval], "	\
+"					      %" #w "[old]	\n"	\
+"	cbnz			%" #w "[tmp], 2f		\n"	\
+"	st" #rel "xr" #sz "	%w[tmp], %" #w "[new], %[v]	\n"	\
+"	cbnz			%w[tmp], 1b			\n"	\
+"	" #mb "							\n"	\
+"2:"
+
+#define __LL_SC_CMPXCHG_DBL_OP(mb, rel)					\
+"	prfm		pstl1strm, %[v]				\n"	\
+"1:	ldxp		%[tmp], %[ret], %[v]			\n"	\
+"	eor		%[tmp], %[tmp], %[old1]			\n"	\
+"	eor		%[ret], %[ret], %[old2]			\n"	\
+"	orr		%[ret], %[tmp], %[ret]			\n"	\
+"	cbnz		%[ret], 2f				\n"	\
+"	st" #rel "xp	%w[tmp], %[new1], %[new2], %[v]		\n"	\
+"	cbnz		%w[tmp], 1b				\n"	\
+"	" #mb "							\n"	\
+"2:"									\
 
 #define ATOMIC_OP(op, asm_op)						\
 __LL_SC_INLINE void							\
@@ -44,14 +89,10 @@ __LL_SC_PREFIX(atomic_##op(int i, atomic_t *v))				\
 	unsigned long tmp;						\
 	int result;							\
 									\
-	asm volatile("// atomic_" #op "\n"				\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ldxr	%w0, %2\n"						\
-"	" #asm_op "	%w0, %w0, %w3\n"				\
-"	stxr	%w1, %w0, %2\n"						\
-"	cbnz	%w1, 1b"						\
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i));							\
+	asm volatile("	// atomic_" #op "\n"				\
+	__LL_SC_ATOMIC_OP(asm_op, w)					\
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
+	: [i]"Ir" (i));							\
 }									\
 __LL_SC_EXPORT(atomic_##op);
 
@@ -63,14 +104,9 @@ __LL_SC_PREFIX(atomic_##op##_return##name(int i, atomic_t *v))		\
 	int result;							\
 									\
 	asm volatile("// atomic_" #op "_return" #name "\n"		\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ld" #acq "xr	%w0, %2\n"					\
-"	" #asm_op "	%w0, %w0, %w3\n"				\
-"	st" #rel "xr	%w1, %w0, %2\n"					\
-"	cbnz	%w1, 1b\n"						\
-"	" #mb								\
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i)							\
+	__LL_SC_ATOMIC_OP_RETURN(asm_op, mb, acq, rel, w)		\
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
+	: [i]"Ir" (i)							\
 	: cl);								\
 									\
 	return result;							\
@@ -85,14 +121,10 @@ __LL_SC_PREFIX(atomic_fetch_##op##name(int i, atomic_t *v))		\
 	int val, result;						\
 									\
 	asm volatile("// atomic_fetch_" #op #name "\n"			\
-"	prfm	pstl1strm, %3\n"					\
-"1:	ld" #acq "xr	%w0, %3\n"					\
-"	" #asm_op "	%w1, %w0, %w4\n"				\
-"	st" #rel "xr	%w2, %w1, %3\n"					\
-"	cbnz	%w2, 1b\n"						\
-"	" #mb								\
-	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
-	: "Ir" (i)							\
+	__LL_SC_ATOMIC_FETCH_OP(asm_op, mb, acq, rel, w)		\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"Ir" (i)							\
 	: cl);								\
 									\
 	return result;							\
@@ -139,13 +171,9 @@ __LL_SC_PREFIX(atomic64_##op(long i, atomic64_t *v))			\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_" #op "\n"				\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ldxr	%0, %2\n"						\
-"	" #asm_op "	%0, %0, %3\n"					\
-"	stxr	%w1, %0, %2\n"						\
-"	cbnz	%w1, 1b"						\
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i));							\
+	__LL_SC_ATOMIC_OP(asm_op, )					\
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
+	: [i]"Ir" (i));							\
 }									\
 __LL_SC_EXPORT(atomic64_##op);
 
@@ -157,14 +185,9 @@ __LL_SC_PREFIX(atomic64_##op##_return##name(long i, atomic64_t *v))	\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_" #op "_return" #name "\n"		\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ld" #acq "xr	%0, %2\n"					\
-"	" #asm_op "	%0, %0, %3\n"					\
-"	st" #rel "xr	%w1, %0, %2\n"					\
-"	cbnz	%w1, 1b\n"						\
-"	" #mb								\
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i)							\
+	__LL_SC_ATOMIC_OP_RETURN(asm_op, mb, acq, rel, )		\
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
+	: [i]"Ir" (i)							\
 	: cl);								\
 									\
 	return result;							\
@@ -179,14 +202,10 @@ __LL_SC_PREFIX(atomic64_fetch_##op##name(long i, atomic64_t *v))	\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_fetch_" #op #name "\n"		\
-"	prfm	pstl1strm, %3\n"					\
-"1:	ld" #acq "xr	%0, %3\n"					\
-"	" #asm_op "	%1, %0, %4\n"					\
-"	st" #rel "xr	%w2, %1, %3\n"					\
-"	cbnz	%w2, 1b\n"						\
-"	" #mb								\
-	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
-	: "Ir" (i)							\
+	__LL_SC_ATOMIC_FETCH_OP(asm_op, mb, acq, rel, )			\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"Ir" (i)							\
 	: cl);								\
 									\
 	return result;							\
@@ -257,14 +276,7 @@ __LL_SC_PREFIX(__cmpxchg_case_##name(volatile void *ptr,		\
 	unsigned long tmp, oldval;					\
 									\
 	asm volatile(							\
-	"	prfm	pstl1strm, %[v]\n"				\
-	"1:	ld" #acq "xr" #sz "\t%" #w "[oldval], %[v]\n"		\
-	"	eor	%" #w "[tmp], %" #w "[oldval], %" #w "[old]\n"	\
-	"	cbnz	%" #w "[tmp], 2f\n"				\
-	"	st" #rel "xr" #sz "\t%w[tmp], %" #w "[new], %[v]\n"	\
-	"	cbnz	%w[tmp], 1b\n"					\
-	"	" #mb "\n"						\
-	"2:"								\
+	__LL_SC_CMPXCHG_BASE_OP(w, sz, name, mb, acq, rel)		\
 	: [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),			\
 	  [v] "+Q" (*(unsigned long *)ptr)				\
 	: [old] "Lr" (old), [new] "r" (new)				\
@@ -304,18 +316,11 @@ __LL_SC_PREFIX(__cmpxchg_double##name(unsigned long old1,		\
 	unsigned long tmp, ret;						\
 									\
 	asm volatile("// __cmpxchg_double" #name "\n"			\
-	"	prfm	pstl1strm, %2\n"				\
-	"1:	ldxp	%0, %1, %2\n"					\
-	"	eor	%0, %0, %3\n"					\
-	"	eor	%1, %1, %4\n"					\
-	"	orr	%1, %0, %1\n"					\
-	"	cbnz	%1, 2f\n"					\
-	"	st" #rel "xp	%w0, %5, %6, %2\n"			\
-	"	cbnz	%w0, 1b\n"					\
-	"	" #mb "\n"						\
-	"2:"								\
-	: "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr)	\
-	: "r" (old1), "r" (old2), "r" (new1), "r" (new2)		\
+	__LL_SC_CMPXCHG_DBL_OP(mb, rel)					\
+	: [tmp]"=&r" (tmp), [ret]"=&r" (ret),				\
+	  [v]"+Q" (*(unsigned long *)ptr)				\
+	: [old1]"r" (old1), [old2]"r" (old2), [new1]"r" (new1),		\
+	  [new2]"r" (new2)						\
 	: cl);								\
 									\
 	return ret;							\
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/3] arm64/atomics: use subsections for out of line LL/SC alternatives
  2018-11-13 23:39 [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Ard Biesheuvel
  2018-11-13 23:39 ` [PATCH 1/3] arm64/atomics: refactor LL/SC base asm templates Ard Biesheuvel
@ 2018-11-13 23:39 ` Ard Biesheuvel
  2018-11-13 23:39 ` [PATCH 3/3] arm64/atomics: remove " Ard Biesheuvel
  2018-11-27 19:30 ` [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Will Deacon
  3 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2018-11-13 23:39 UTC (permalink / raw)
  To: linux-arm-kernel

When building with support for LSE atomics, the LL/SC alternatives are
emitted as functions, and 'bl' instructions are patched into the code
stream to call them instead of the LSE instructions when the CPU does
not support LSE.

This has some downsides: the LSE atomics are coded up in such a way
that the asm input registers are always aligned with the prototypes
of the out of line alternatives, limiting the freedom of the compiler
to allocate registers. Also, the registers x16, x17 and x30 need to
be added to the clobber list, given that those may be corrupted by
a function call. Unfortunately, we still end up with stack spills in
the out of line alternatives that require 3 temp registers, due to
the fact that x30 needs to be preserved to be able to return.

Also, doing function calls from inline assembler is likely to become
more of a maintenance burden going forward, due to the introduction
of live patching (which requires reliable stack traces), control flow
integrity, Clang support, etc.

So instead, let's use subsections to emit the out of line alternatives,
but as part of the inline asm expansion so that the compiler is in
charge of the register allocation. The number of allocated registers
still exceeds what LSE actually needs, but the compiler is free to
select them, and the out of line alternatives will no longer need
to spill x30.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/atomic_ll_sc.h |   3 +
 arch/arm64/include/asm/atomic_lse.h   | 558 ++++++++++----------
 arch/arm64/include/asm/lse.h          |   4 -
 3 files changed, 285 insertions(+), 280 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index 5f55f6b8dd7e..10d0b7360747 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -82,6 +82,8 @@
 "	" #mb "							\n"	\
 "2:"									\
 
+#if !(defined(CONFIG_ARM64_LSE_ATOMICS) && defined(CONFIG_AS_LSE))
+
 #define ATOMIC_OP(op, asm_op)						\
 __LL_SC_INLINE void							\
 __LL_SC_PREFIX(atomic_##op(int i, atomic_t *v))				\
@@ -332,4 +334,5 @@ __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
 
 #undef __CMPXCHG_DBL
 
+#endif	/* !(defined(CONFIG_ARM64_LSE_ATOMICS) && defined(CONFIG_AS_LSE)) */
 #endif	/* __ASM_ATOMIC_LL_SC_H */
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index f9b0b09153e0..70bae15cc8d6 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -25,510 +25,516 @@
 #error "please don't include this file directly"
 #endif
 
-#define __LL_SC_ATOMIC(op)	__LL_SC_CALL(atomic_##op)
-#define ATOMIC_OP(op, asm_op)						\
+#include <asm/atomic_ll_sc.h>
+
+#define __LL_SC_ATOMIC(op)						\
+"	b		3f					\n"	\
+"	.subsection	1					\n"	\
+"3:			" op "					\n"	\
+"	b		4f					\n"	\
+"	.previous						\n"	\
+"4:								\n"
+
+#define ATOMIC_OP(op, llsc_op, lse_op)					\
 static inline void atomic_##op(int i, atomic_t *v)			\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	int result;							\
 									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(__LL_SC_ATOMIC(op),		\
-"	" #asm_op "	%w[i], %[v]\n")					\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS);						\
+	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP(llsc_op, w)),			\
+"	" #lse_op "	%w[i], %[v]\n")					\
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
+	: [i]"r" (i));							\
 }
 
-ATOMIC_OP(andnot, stclr)
-ATOMIC_OP(or, stset)
-ATOMIC_OP(xor, steor)
-ATOMIC_OP(add, stadd)
+ATOMIC_OP(andnot, bic, stclr)
+ATOMIC_OP(or,     orr, stset)
+ATOMIC_OP(xor,    eor, steor)
+ATOMIC_OP(add,    add, stadd)
 
 #undef ATOMIC_OP
 
-#define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)			\
+#define ATOMIC_FETCH_OP(name, ac, rl, mb, op, llsc_op, lse_op, cl...)	\
 static inline int atomic_fetch_##op##name(int i, atomic_t *v)		\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	int val, result;						\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC(fetch_##op##name),				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_FETCH_OP(llsc_op, mb, ac, rl, w)),\
 	/* LSE atomics */						\
-"	" #asm_op #mb "	%w[i], %w[i], %[v]")				\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+"	" #lse_op #ac #rl " %w[i], %w[res], %[v]")			\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return w0;							\
+	return result;							\
 }
 
-#define ATOMIC_FETCH_OPS(op, asm_op)					\
-	ATOMIC_FETCH_OP(_relaxed,   , op, asm_op)			\
-	ATOMIC_FETCH_OP(_acquire,  a, op, asm_op, "memory")		\
-	ATOMIC_FETCH_OP(_release,  l, op, asm_op, "memory")		\
-	ATOMIC_FETCH_OP(        , al, op, asm_op, "memory")
+#define ATOMIC_FETCH_OPS(op, llsc_op, lse_op)					\
+	ATOMIC_FETCH_OP(_relaxed,  ,  ,        , op, llsc_op, lse_op)		\
+	ATOMIC_FETCH_OP(_acquire, a,  ,        , op, llsc_op, lse_op, "memory")	\
+	ATOMIC_FETCH_OP(_release,  , l,        , op, llsc_op, lse_op, "memory")	\
+	ATOMIC_FETCH_OP(        , a, l, dmb ish, op, llsc_op, lse_op, "memory")
 
-ATOMIC_FETCH_OPS(andnot, ldclr)
-ATOMIC_FETCH_OPS(or, ldset)
-ATOMIC_FETCH_OPS(xor, ldeor)
-ATOMIC_FETCH_OPS(add, ldadd)
+ATOMIC_FETCH_OPS(andnot, bic, ldclr)
+ATOMIC_FETCH_OPS(or,     orr, ldset)
+ATOMIC_FETCH_OPS(xor,    eor, ldeor)
+ATOMIC_FETCH_OPS(add,    add, ldadd)
 
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_FETCH_OPS
 
-#define ATOMIC_OP_ADD_RETURN(name, mb, cl...)				\
+#define ATOMIC_OP_ADD_RETURN(name, ac, rl, mb, cl...)			\
 static inline int atomic_add_return##name(int i, atomic_t *v)		\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	int result;							\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC(add_return##name)				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP_RETURN(add, mb, ac, rl, w))	\
 	__nops(1),							\
 	/* LSE atomics */						\
-	"	ldadd" #mb "	%w[i], w30, %[v]\n"			\
-	"	add	%w[i], %w[i], w30")				\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	ldadd" #ac #rl " %w[i], %w[res], %[v]\n"		\
+	"	add		%w[res], %w[res], %w[i]")		\
+	: [v]"+Q" (v->counter), [res]"=&r" (result), [tmp]"=&r" (tmp)	\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return w0;							\
+	return result;							\
 }
 
-ATOMIC_OP_ADD_RETURN(_relaxed,   )
-ATOMIC_OP_ADD_RETURN(_acquire,  a, "memory")
-ATOMIC_OP_ADD_RETURN(_release,  l, "memory")
-ATOMIC_OP_ADD_RETURN(        , al, "memory")
+ATOMIC_OP_ADD_RETURN(_relaxed,  ,  ,        )
+ATOMIC_OP_ADD_RETURN(_acquire, a,  ,        , "memory")
+ATOMIC_OP_ADD_RETURN(_release,  , l,        , "memory")
+ATOMIC_OP_ADD_RETURN(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC_OP_ADD_RETURN
 
 static inline void atomic_and(int i, atomic_t *v)
 {
-	register int w0 asm ("w0") = i;
-	register atomic_t *x1 asm ("x1") = v;
+	unsigned long tmp;
+	int result;
 
 	asm volatile(ARM64_LSE_ATOMIC_INSN(
 	/* LL/SC */
-	__LL_SC_ATOMIC(and)
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP(and, w))
 	__nops(1),
 	/* LSE atomics */
-	"	mvn	%w[i], %w[i]\n"
-	"	stclr	%w[i], %[v]")
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	mvn	%w[tmp], %w[i]\n"
+	"	stclr	%w[tmp], %[v]")
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)
+	: [i]"r" (i));
 }
 
-#define ATOMIC_FETCH_OP_AND(name, mb, cl...)				\
+#define ATOMIC_FETCH_OP_AND(name, ac, rl, mb, cl...)			\
 static inline int atomic_fetch_and##name(int i, atomic_t *v)		\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	int val, result;						\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC(fetch_and##name)					\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_FETCH_OP(and, mb, ac, rl, w))	\
 	__nops(1),							\
 	/* LSE atomics */						\
-	"	mvn	%w[i], %w[i]\n"					\
-	"	ldclr" #mb "	%w[i], %w[i], %[v]")			\
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	mvn		%w[res], %w[i]\n"			\
+	"	ldclr" #ac #rl " %w[res], %w[res], %[v]")		\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return w0;							\
+	return result;							\
 }
 
-ATOMIC_FETCH_OP_AND(_relaxed,   )
-ATOMIC_FETCH_OP_AND(_acquire,  a, "memory")
-ATOMIC_FETCH_OP_AND(_release,  l, "memory")
-ATOMIC_FETCH_OP_AND(        , al, "memory")
+ATOMIC_FETCH_OP_AND(_relaxed,  ,  ,        )
+ATOMIC_FETCH_OP_AND(_acquire, a,  ,        , "memory")
+ATOMIC_FETCH_OP_AND(_release,  , l,        , "memory")
+ATOMIC_FETCH_OP_AND(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC_FETCH_OP_AND
 
 static inline void atomic_sub(int i, atomic_t *v)
 {
-	register int w0 asm ("w0") = i;
-	register atomic_t *x1 asm ("x1") = v;
+	unsigned long tmp;
+	int result;
 
 	asm volatile(ARM64_LSE_ATOMIC_INSN(
 	/* LL/SC */
-	__LL_SC_ATOMIC(sub)
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP(sub, w))
 	__nops(1),
 	/* LSE atomics */
-	"	neg	%w[i], %w[i]\n"
-	"	stadd	%w[i], %[v]")
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	neg	%w[tmp], %w[i]\n"
+	"	stadd	%w[tmp], %[v]")
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)
+	: [i]"r" (i));
 }
 
-#define ATOMIC_OP_SUB_RETURN(name, mb, cl...)				\
+#define ATOMIC_OP_SUB_RETURN(name, ac, rl, mb, cl...)			\
 static inline int atomic_sub_return##name(int i, atomic_t *v)		\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	int result;							\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC(sub_return##name)				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP_RETURN(sub, mb, ac, rl, w))	\
 	__nops(2),							\
 	/* LSE atomics */						\
-	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], w30, %[v]\n"			\
-	"	add	%w[i], %w[i], w30")				\
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS , ##cl);					\
+	"	neg		%w[tmp], %w[i]\n"			\
+	"	ldadd" #ac #rl " %w[tmp], %w[res], %[v]\n"		\
+	"	add		%w[res], %w[res], %w[tmp]")		\
+	: [v]"+Q" (v->counter), [res]"=&r" (result), [tmp]"=&r" (tmp)	\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return w0;							\
+	return result;							\
 }
 
-ATOMIC_OP_SUB_RETURN(_relaxed,   )
-ATOMIC_OP_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC_OP_SUB_RETURN(_release,  l, "memory")
-ATOMIC_OP_SUB_RETURN(        , al, "memory")
+ATOMIC_OP_SUB_RETURN(_relaxed,  ,  ,        )
+ATOMIC_OP_SUB_RETURN(_acquire, a,  ,        , "memory")
+ATOMIC_OP_SUB_RETURN(_release,  , l,        , "memory")
+ATOMIC_OP_SUB_RETURN(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC_OP_SUB_RETURN
 
-#define ATOMIC_FETCH_OP_SUB(name, mb, cl...)				\
+#define ATOMIC_FETCH_OP_SUB(name, ac, rl, mb, cl...)			\
 static inline int atomic_fetch_sub##name(int i, atomic_t *v)		\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	int val, result;						\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC(fetch_sub##name)					\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_FETCH_OP(sub, mb, ac, rl, w))	\
 	__nops(1),							\
 	/* LSE atomics */						\
-	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], %w[i], %[v]")			\
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	neg		%w[tmp], %w[i]\n"			\
+	"	ldadd" #ac #rl " %w[tmp], %w[res], %[v]")		\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return w0;							\
+	return result;							\
 }
 
-ATOMIC_FETCH_OP_SUB(_relaxed,   )
-ATOMIC_FETCH_OP_SUB(_acquire,  a, "memory")
-ATOMIC_FETCH_OP_SUB(_release,  l, "memory")
-ATOMIC_FETCH_OP_SUB(        , al, "memory")
+ATOMIC_FETCH_OP_SUB(_relaxed,  ,  ,        )
+ATOMIC_FETCH_OP_SUB(_acquire, a,  ,        , "memory")
+ATOMIC_FETCH_OP_SUB(_release,  , l,        , "memory")
+ATOMIC_FETCH_OP_SUB(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC_FETCH_OP_SUB
-#undef __LL_SC_ATOMIC
 
-#define __LL_SC_ATOMIC64(op)	__LL_SC_CALL(atomic64_##op)
-#define ATOMIC64_OP(op, asm_op)						\
+#define ATOMIC64_OP(op, llsc_op, lse_op)				\
 static inline void atomic64_##op(long i, atomic64_t *v)			\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
+	long result;							\
+	unsigned long tmp;						\
 									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(__LL_SC_ATOMIC64(op),	\
-"	" #asm_op "	%[i], %[v]\n")					\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS);						\
+	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP(llsc_op, )),			\
+"	" #lse_op "	%[i], %[v]\n")					\
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
+	: [i]"r" (i));							\
 }
 
-ATOMIC64_OP(andnot, stclr)
-ATOMIC64_OP(or, stset)
-ATOMIC64_OP(xor, steor)
-ATOMIC64_OP(add, stadd)
+ATOMIC64_OP(andnot, bic, stclr)
+ATOMIC64_OP(or,     orr, stset)
+ATOMIC64_OP(xor,    eor, steor)
+ATOMIC64_OP(add,    add, stadd)
 
 #undef ATOMIC64_OP
 
-#define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)			\
+#define ATOMIC64_FETCH_OP(name, ac, rl, mb, op, llsc_op, lse_op, cl...)	\
 static inline long atomic64_fetch_##op##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
+	long result, val;						\
+	unsigned long tmp;						\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC64(fetch_##op##name),				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_FETCH_OP(llsc_op, mb, ac, rl, )),	\
 	/* LSE atomics */						\
-"	" #asm_op #mb "	%[i], %[i], %[v]")				\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+"	" #lse_op #ac #rl "	%[i], %[res], %[v]")			\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return x0;							\
+	return result;							\
 }
 
-#define ATOMIC64_FETCH_OPS(op, asm_op)					\
-	ATOMIC64_FETCH_OP(_relaxed,   , op, asm_op)			\
-	ATOMIC64_FETCH_OP(_acquire,  a, op, asm_op, "memory")		\
-	ATOMIC64_FETCH_OP(_release,  l, op, asm_op, "memory")		\
-	ATOMIC64_FETCH_OP(        , al, op, asm_op, "memory")
+#define ATOMIC64_FETCH_OPS(op, llsc_op, lse_op)						\
+	ATOMIC64_FETCH_OP(_relaxed,  ,  ,        , op, llsc_op, lse_op)			\
+	ATOMIC64_FETCH_OP(_acquire, a,  ,        , op, llsc_op, lse_op, "memory")	\
+	ATOMIC64_FETCH_OP(_release,  , l,        , op, llsc_op, lse_op, "memory")	\
+	ATOMIC64_FETCH_OP(        , a, l, dmb ish, op, llsc_op, lse_op, "memory")
 
-ATOMIC64_FETCH_OPS(andnot, ldclr)
-ATOMIC64_FETCH_OPS(or, ldset)
-ATOMIC64_FETCH_OPS(xor, ldeor)
-ATOMIC64_FETCH_OPS(add, ldadd)
+ATOMIC64_FETCH_OPS(andnot, bic, ldclr)
+ATOMIC64_FETCH_OPS(or,     orr, ldset)
+ATOMIC64_FETCH_OPS(xor,    eor, ldeor)
+ATOMIC64_FETCH_OPS(add,    add, ldadd)
 
 #undef ATOMIC64_FETCH_OP
 #undef ATOMIC64_FETCH_OPS
 
-#define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)				\
+#define ATOMIC64_OP_ADD_RETURN(name, ac, rl, mb, cl...)			\
 static inline long atomic64_add_return##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	long result;							\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC64(add_return##name)				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP_RETURN(add, mb, ac, rl, ))	\
 	__nops(1),							\
 	/* LSE atomics */						\
-	"	ldadd" #mb "	%[i], x30, %[v]\n"			\
-	"	add	%[i], %[i], x30")				\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	ldadd" #ac #rl " %[i], %[res], %[v]\n"			\
+	"	add		%[res], %[res], %[i]")			\
+	: [v]"+Q" (v->counter), [res]"=&r" (result), [tmp]"=&r" (tmp)	\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return x0;							\
+	return result;							\
 }
 
-ATOMIC64_OP_ADD_RETURN(_relaxed,   )
-ATOMIC64_OP_ADD_RETURN(_acquire,  a, "memory")
-ATOMIC64_OP_ADD_RETURN(_release,  l, "memory")
-ATOMIC64_OP_ADD_RETURN(        , al, "memory")
+ATOMIC64_OP_ADD_RETURN(_relaxed,  ,  ,        )
+ATOMIC64_OP_ADD_RETURN(_acquire, a,  ,        , "memory")
+ATOMIC64_OP_ADD_RETURN(_release,  , l,        , "memory")
+ATOMIC64_OP_ADD_RETURN(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC64_OP_ADD_RETURN
 
 static inline void atomic64_and(long i, atomic64_t *v)
 {
-	register long x0 asm ("x0") = i;
-	register atomic64_t *x1 asm ("x1") = v;
+	long result;
+	unsigned long tmp;
 
 	asm volatile(ARM64_LSE_ATOMIC_INSN(
 	/* LL/SC */
-	__LL_SC_ATOMIC64(and)
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP(and, ))
 	__nops(1),
 	/* LSE atomics */
-	"	mvn	%[i], %[i]\n"
-	"	stclr	%[i], %[v]")
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	mvn	%[tmp], %[i]\n"
+	"	stclr	%[tmp], %[v]")
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)
+	: [i]"r" (i));
 }
 
-#define ATOMIC64_FETCH_OP_AND(name, mb, cl...)				\
+#define ATOMIC64_FETCH_OP_AND(name, ac, rl, mb, cl...)			\
 static inline long atomic64_fetch_and##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
+	long result, val;						\
+	unsigned long tmp;						\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC64(fetch_and##name)				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_FETCH_OP(and, mb, ac, rl, ))	\
 	__nops(1),							\
 	/* LSE atomics */						\
-	"	mvn	%[i], %[i]\n"					\
-	"	ldclr" #mb "	%[i], %[i], %[v]")			\
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	mvn		%[res], %[i]\n"				\
+	"	ldclr" #ac #rl " %[res], %[res], %[v]")			\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return x0;							\
+	return result;							\
 }
 
-ATOMIC64_FETCH_OP_AND(_relaxed,   )
-ATOMIC64_FETCH_OP_AND(_acquire,  a, "memory")
-ATOMIC64_FETCH_OP_AND(_release,  l, "memory")
-ATOMIC64_FETCH_OP_AND(        , al, "memory")
+ATOMIC64_FETCH_OP_AND(_relaxed,  ,  ,        )
+ATOMIC64_FETCH_OP_AND(_acquire, a,  ,        , "memory")
+ATOMIC64_FETCH_OP_AND(_release,  , l,        , "memory")
+ATOMIC64_FETCH_OP_AND(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC64_FETCH_OP_AND
 
 static inline void atomic64_sub(long i, atomic64_t *v)
 {
-	register long x0 asm ("x0") = i;
-	register atomic64_t *x1 asm ("x1") = v;
+	long result;
+	unsigned long tmp;
 
 	asm volatile(ARM64_LSE_ATOMIC_INSN(
 	/* LL/SC */
-	__LL_SC_ATOMIC64(sub)
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP(sub, ))
 	__nops(1),
 	/* LSE atomics */
-	"	neg	%[i], %[i]\n"
-	"	stadd	%[i], %[v]")
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	neg	%[tmp], %[i]\n"
+	"	stadd	%[tmp], %[v]")
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)
+	: [i]"r" (i));
 }
 
-#define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)				\
+#define ATOMIC64_OP_SUB_RETURN(name, ac, rl, mb, cl...)			\
 static inline long atomic64_sub_return##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
+	unsigned long tmp;						\
+	long result;							\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC64(sub_return##name)				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_OP_RETURN(sub, mb, ac, rl, ))	\
 	__nops(2),							\
 	/* LSE atomics */						\
-	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], x30, %[v]\n"			\
-	"	add	%[i], %[i], x30")				\
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	neg		%[tmp], %[i]\n"				\
+	"	ldadd" #ac #rl " %[tmp], %[res], %[v]\n"			\
+	"	add		%[res], %[res], %[tmp]")		\
+	: [v]"+Q" (v->counter), [res]"=&r" (result), [tmp]"=&r" (tmp)	\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return x0;							\
+	return result;							\
 }
 
-ATOMIC64_OP_SUB_RETURN(_relaxed,   )
-ATOMIC64_OP_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC64_OP_SUB_RETURN(_release,  l, "memory")
-ATOMIC64_OP_SUB_RETURN(        , al, "memory")
+ATOMIC64_OP_SUB_RETURN(_relaxed,  ,  ,        )
+ATOMIC64_OP_SUB_RETURN(_acquire, a,  ,        , "memory")
+ATOMIC64_OP_SUB_RETURN(_release,  , l,        , "memory")
+ATOMIC64_OP_SUB_RETURN(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC64_OP_SUB_RETURN
 
-#define ATOMIC64_FETCH_OP_SUB(name, mb, cl...)				\
+#define ATOMIC64_FETCH_OP_SUB(name, ac, rl, mb, cl...)			\
 static inline long atomic64_fetch_sub##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
+	long result, val;						\
+	unsigned long tmp;						\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_ATOMIC64(fetch_sub##name)				\
+	__LL_SC_ATOMIC(__LL_SC_ATOMIC_FETCH_OP(sub, mb, ac, rl, ))	\
 	__nops(1),							\
 	/* LSE atomics */						\
-	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], %[i], %[v]")			\
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	neg		%[tmp], %[i]\n"				\
+	"	ldadd" #ac #rl " %[tmp], %[res], %[v]")			\
+	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
+	  [v]"+Q" (v->counter)						\
+	: [i]"r" (i)							\
+	: cl);								\
 									\
-	return x0;							\
+	return result;							\
 }
 
-ATOMIC64_FETCH_OP_SUB(_relaxed,   )
-ATOMIC64_FETCH_OP_SUB(_acquire,  a, "memory")
-ATOMIC64_FETCH_OP_SUB(_release,  l, "memory")
-ATOMIC64_FETCH_OP_SUB(        , al, "memory")
+ATOMIC64_FETCH_OP_SUB(_relaxed,  ,  ,        )
+ATOMIC64_FETCH_OP_SUB(_acquire, a,  ,        , "memory")
+ATOMIC64_FETCH_OP_SUB(_release,  , l,        , "memory")
+ATOMIC64_FETCH_OP_SUB(        , a, l, dmb ish, "memory")
 
 #undef ATOMIC64_FETCH_OP_SUB
 
 static inline long atomic64_dec_if_positive(atomic64_t *v)
 {
-	register long x0 asm ("x0") = (long)v;
+	long tmp, result;
 
 	asm volatile(ARM64_LSE_ATOMIC_INSN(
 	/* LL/SC */
-	__LL_SC_ATOMIC64(dec_if_positive)
-	__nops(6),
+	"	prfm	pstl1strm, %[v]\n"
+	"1:	ldxr	%[tmp], %[v]\n"
+	"	subs	%[res], %[tmp], #1\n"
+	"	b.lt	2f\n"
+	"	stlxr	%w[tmp], %[res], %[v]\n"
+	"	cbnz	%w[tmp], 1b\n"
+	"	dmb	ish\n"
+	"2:",
 	/* LSE atomics */
-	"1:	ldr	x30, %[v]\n"
-	"	subs	%[ret], x30, #1\n"
+	"1:	ldr	%[tmp], %[v]\n"
+	"	subs	%[res], %[tmp], #1\n"
 	"	b.lt	2f\n"
-	"	casal	x30, %[ret], %[v]\n"
-	"	sub	x30, x30, #1\n"
-	"	sub	x30, x30, %[ret]\n"
-	"	cbnz	x30, 1b\n"
+	"	casal	%[tmp], %[res], %[v]\n"
+	"	sub	%[tmp], %[tmp], #1\n"
+	"	sub	%[tmp], %[tmp], %[res]\n"
+	"	cbnz	%[tmp], 1b\n"
 	"2:")
-	: [ret] "+&r" (x0), [v] "+Q" (v->counter)
+	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)
 	:
-	: __LL_SC_CLOBBERS, "cc", "memory");
+	: "cc", "memory");
 
-	return x0;
+	return result;
 }
 
 #undef __LL_SC_ATOMIC64
 
-#define __LL_SC_CMPXCHG(op)	__LL_SC_CALL(__cmpxchg_case_##op)
-
-#define __CMPXCHG_CASE(w, sz, name, mb, cl...)				\
+#define __CMPXCHG_CASE(w, sz, name, ac, rl, mb, cl...)			\
 static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,	\
 						  unsigned long old,	\
 						  unsigned long new)	\
 {									\
-	register unsigned long x0 asm ("x0") = (unsigned long)ptr;	\
-	register unsigned long x1 asm ("x1") = old;			\
-	register unsigned long x2 asm ("x2") = new;			\
+	unsigned long tmp, oldval;					\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_CMPXCHG(name)						\
-	__nops(2),							\
+	__LL_SC_ATOMIC(__LL_SC_CMPXCHG_BASE_OP(w, sz, name, mb, ac, rl))\
+	__nops(1),							\
 	/* LSE atomics */						\
-	"	mov	" #w "30, %" #w "[old]\n"			\
-	"	cas" #mb #sz "\t" #w "30, %" #w "[new], %[v]\n"		\
-	"	mov	%" #w "[ret], " #w "30")			\
-	: [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr)		\
-	: [old] "r" (x1), [new] "r" (x2)				\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	mov			%" #w "[oldval], %" #w "[old]\n"\
+	"	cas" #ac #rl #sz "	%" #w "[oldval], %" #w "[new], "\
+	"						 %[v]\n")	\
+	: [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),			\
+	  [v] "+Q" (*(unsigned long *)ptr)				\
+	: [old] "Lr" (old), [new] "r" (new)				\
+	: cl);								\
 									\
-	return x0;							\
+	return oldval;							\
 }
 
-__CMPXCHG_CASE(w, b,     1,   )
-__CMPXCHG_CASE(w, h,     2,   )
-__CMPXCHG_CASE(w,  ,     4,   )
-__CMPXCHG_CASE(x,  ,     8,   )
-__CMPXCHG_CASE(w, b, acq_1,  a, "memory")
-__CMPXCHG_CASE(w, h, acq_2,  a, "memory")
-__CMPXCHG_CASE(w,  , acq_4,  a, "memory")
-__CMPXCHG_CASE(x,  , acq_8,  a, "memory")
-__CMPXCHG_CASE(w, b, rel_1,  l, "memory")
-__CMPXCHG_CASE(w, h, rel_2,  l, "memory")
-__CMPXCHG_CASE(w,  , rel_4,  l, "memory")
-__CMPXCHG_CASE(x,  , rel_8,  l, "memory")
-__CMPXCHG_CASE(w, b,  mb_1, al, "memory")
-__CMPXCHG_CASE(w, h,  mb_2, al, "memory")
-__CMPXCHG_CASE(w,  ,  mb_4, al, "memory")
-__CMPXCHG_CASE(x,  ,  mb_8, al, "memory")
-
-#undef __LL_SC_CMPXCHG
-#undef __CMPXCHG_CASE
+__CMPXCHG_CASE(w, b,     1,  ,  ,        )
+__CMPXCHG_CASE(w, h,     2,  ,  ,        )
+__CMPXCHG_CASE(w,  ,     4,  ,  ,        )
+__CMPXCHG_CASE(x,  ,     8,  ,  ,        )
+__CMPXCHG_CASE(w, b, acq_1, a,  ,        , "memory")
+__CMPXCHG_CASE(w, h, acq_2, a,  ,        , "memory")
+__CMPXCHG_CASE(w,  , acq_4, a,  ,        , "memory")
+__CMPXCHG_CASE(x,  , acq_8, a,  ,        , "memory")
+__CMPXCHG_CASE(w, b, rel_1,  , l,        , "memory")
+__CMPXCHG_CASE(w, h, rel_2,  , l,        , "memory")
+__CMPXCHG_CASE(w,  , rel_4,  , l,        , "memory")
+__CMPXCHG_CASE(x,  , rel_8,  , l,        , "memory")
+__CMPXCHG_CASE(w, b,  mb_1, a, l, dmb ish, "memory")
+__CMPXCHG_CASE(w, h,  mb_2, a, l, dmb ish, "memory")
+__CMPXCHG_CASE(w,  ,  mb_4, a, l, dmb ish, "memory")
+__CMPXCHG_CASE(x,  ,  mb_8, a, l, dmb ish, "memory")
 
-#define __LL_SC_CMPXCHG_DBL(op)	__LL_SC_CALL(__cmpxchg_double##op)
+#undef __CMPXCHG_CASE
 
-#define __CMPXCHG_DBL(name, mb, cl...)					\
+#define __CMPXCHG_DBL(name, ac, rl, mb, cl...)				\
 static inline long __cmpxchg_double##name(unsigned long old1,		\
 					 unsigned long old2,		\
 					 unsigned long new1,		\
 					 unsigned long new2,		\
 					 volatile void *ptr)		\
 {									\
-	unsigned long oldval1 = old1;					\
-	unsigned long oldval2 = old2;					\
-	register unsigned long x0 asm ("x0") = old1;			\
-	register unsigned long x1 asm ("x1") = old2;			\
+	register unsigned long oldval1 asm ("x0") = old1;		\
+	register unsigned long oldval2 asm ("x1") = old2;		\
 	register unsigned long x2 asm ("x2") = new1;			\
 	register unsigned long x3 asm ("x3") = new2;			\
-	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
 	/* LL/SC */							\
-	__LL_SC_CMPXCHG_DBL(name)					\
+	__LL_SC_ATOMIC(__LL_SC_CMPXCHG_DBL_OP(mb, rl))			\
 	__nops(3),							\
 	/* LSE atomics */						\
-	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
-	"	eor	%[old1], %[old1], %[oldval1]\n"			\
-	"	eor	%[old2], %[old2], %[oldval2]\n"			\
-	"	orr	%[old1], %[old1], %[old2]")			\
-	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
-	  [v] "+Q" (*(unsigned long *)ptr)				\
-	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
-	  [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)		\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	casp" #ac #rl "	%[ret], %[tmp], %[new1], %[new2], "	\
+	"			%[v]\n"					\
+	"	eor		%[ret], %[ret], %[old1]\n"		\
+	"	eor		%[tmp], %[tmp], %[old2]\n"		\
+	"	orr		%[ret], %[ret], %[tmp]")		\
+	: [ret]"+&r" (oldval1), [tmp]"+&r" (oldval2),			\
+	  [v]"+Q" (*(unsigned long *)ptr)				\
+	: [old1]"r" (old1), [old2]"r" (old2), [new1]"r" (x2),		\
+	  [new2]"r" (x3)						\
+	: cl);								\
 									\
-	return x0;							\
+	return oldval1;							\
 }
 
-__CMPXCHG_DBL(   ,   )
-__CMPXCHG_DBL(_mb, al, "memory")
+__CMPXCHG_DBL(   ,  ,  ,        )
+__CMPXCHG_DBL(_mb, a, l, dmb ish, "memory")
 
-#undef __LL_SC_CMPXCHG_DBL
 #undef __CMPXCHG_DBL
 
 #endif	/* __ASM_ATOMIC_LSE_H */
diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index 8262325e2fc6..ef70f62ea25e 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -27,10 +27,6 @@ __asm__(".arch_extension	lse");
 #define __LL_SC_PREFIX(x)	__ll_sc_##x
 #define __LL_SC_EXPORT(x)	EXPORT_SYMBOL(__LL_SC_PREFIX(x))
 
-/* Macro for constructing calls to out-of-line ll/sc atomics */
-#define __LL_SC_CALL(op)	"bl\t" __stringify(__LL_SC_PREFIX(op)) "\n"
-#define __LL_SC_CLOBBERS	"x16", "x17", "x30"
-
 /* In-line patching at runtime */
 #define ARM64_LSE_ATOMIC_INSN(llsc, lse)				\
 	ALTERNATIVE(llsc, lse, ARM64_HAS_LSE_ATOMICS)
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/3] arm64/atomics: remove out of line LL/SC alternatives
  2018-11-13 23:39 [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Ard Biesheuvel
  2018-11-13 23:39 ` [PATCH 1/3] arm64/atomics: refactor LL/SC base asm templates Ard Biesheuvel
  2018-11-13 23:39 ` [PATCH 2/3] arm64/atomics: use subsections for out of line LL/SC alternatives Ard Biesheuvel
@ 2018-11-13 23:39 ` Ard Biesheuvel
  2018-11-27 19:30 ` [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Will Deacon
  3 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2018-11-13 23:39 UTC (permalink / raw)
  To: linux-arm-kernel

Now that we are no longer emitting calls to the out of line LL/SC
alternatives from the LSE implementation, drop the exports, the
prototype decorations and the Makefile rules that build the object
file that contains them.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/atomic_ll_sc.h | 48 ++++++--------------
 arch/arm64/include/asm/lse.h          |  9 ----
 arch/arm64/lib/Makefile               | 19 --------
 arch/arm64/lib/atomic_ll_sc.c         |  3 --
 4 files changed, 15 insertions(+), 64 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index 10d0b7360747..0aa65d7a9f27 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -85,8 +85,7 @@
 #if !(defined(CONFIG_ARM64_LSE_ATOMICS) && defined(CONFIG_AS_LSE))
 
 #define ATOMIC_OP(op, asm_op)						\
-__LL_SC_INLINE void							\
-__LL_SC_PREFIX(atomic_##op(int i, atomic_t *v))				\
+static inline void atomic_##op(int i, atomic_t *v)			\
 {									\
 	unsigned long tmp;						\
 	int result;							\
@@ -96,11 +95,9 @@ __LL_SC_PREFIX(atomic_##op(int i, atomic_t *v))				\
 	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
 	: [i]"Ir" (i));							\
 }									\
-__LL_SC_EXPORT(atomic_##op);
 
 #define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op)		\
-__LL_SC_INLINE int							\
-__LL_SC_PREFIX(atomic_##op##_return##name(int i, atomic_t *v))		\
+static inline int atomic_##op##_return##name(int i, atomic_t *v)	\
 {									\
 	unsigned long tmp;						\
 	int result;							\
@@ -113,11 +110,9 @@ __LL_SC_PREFIX(atomic_##op##_return##name(int i, atomic_t *v))		\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic_##op##_return##name);
 
 #define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op)		\
-__LL_SC_INLINE int							\
-__LL_SC_PREFIX(atomic_fetch_##op##name(int i, atomic_t *v))		\
+static inline int atomic_fetch_##op##name(int i, atomic_t *v)		\
 {									\
 	unsigned long tmp;						\
 	int val, result;						\
@@ -131,7 +126,6 @@ __LL_SC_PREFIX(atomic_fetch_##op##name(int i, atomic_t *v))		\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic_fetch_##op##name);
 
 #define ATOMIC_OPS(...)							\
 	ATOMIC_OP(__VA_ARGS__)						\
@@ -166,8 +160,7 @@ ATOMIC_OPS(xor, eor)
 #undef ATOMIC_OP
 
 #define ATOMIC64_OP(op, asm_op)						\
-__LL_SC_INLINE void							\
-__LL_SC_PREFIX(atomic64_##op(long i, atomic64_t *v))			\
+static inline void atomic64_##op(long i, atomic64_t *v)			\
 {									\
 	long result;							\
 	unsigned long tmp;						\
@@ -177,11 +170,9 @@ __LL_SC_PREFIX(atomic64_##op(long i, atomic64_t *v))			\
 	: [res]"=&r" (result), [tmp]"=&r" (tmp), [v]"+Q" (v->counter)	\
 	: [i]"Ir" (i));							\
 }									\
-__LL_SC_EXPORT(atomic64_##op);
 
 #define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op)		\
-__LL_SC_INLINE long							\
-__LL_SC_PREFIX(atomic64_##op##_return##name(long i, atomic64_t *v))	\
+static inline long atomic64_##op##_return##name(long i, atomic64_t *v)	\
 {									\
 	long result;							\
 	unsigned long tmp;						\
@@ -194,11 +185,9 @@ __LL_SC_PREFIX(atomic64_##op##_return##name(long i, atomic64_t *v))	\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic64_##op##_return##name);
 
 #define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op)		\
-__LL_SC_INLINE long							\
-__LL_SC_PREFIX(atomic64_fetch_##op##name(long i, atomic64_t *v))	\
+static inline long atomic64_fetch_##op##name(long i, atomic64_t *v)	\
 {									\
 	long result, val;						\
 	unsigned long tmp;						\
@@ -212,7 +201,6 @@ __LL_SC_PREFIX(atomic64_fetch_##op##name(long i, atomic64_t *v))	\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic64_fetch_##op##name);
 
 #define ATOMIC64_OPS(...)						\
 	ATOMIC64_OP(__VA_ARGS__)					\
@@ -246,8 +234,7 @@ ATOMIC64_OPS(xor, eor)
 #undef ATOMIC64_OP_RETURN
 #undef ATOMIC64_OP
 
-__LL_SC_INLINE long
-__LL_SC_PREFIX(atomic64_dec_if_positive(atomic64_t *v))
+static inline long atomic64_dec_if_positive(atomic64_t *v)
 {
 	long result;
 	unsigned long tmp;
@@ -267,13 +254,11 @@ __LL_SC_PREFIX(atomic64_dec_if_positive(atomic64_t *v))
 
 	return result;
 }
-__LL_SC_EXPORT(atomic64_dec_if_positive);
 
 #define __CMPXCHG_CASE(w, sz, name, mb, acq, rel, cl)			\
-__LL_SC_INLINE unsigned long						\
-__LL_SC_PREFIX(__cmpxchg_case_##name(volatile void *ptr,		\
-				     unsigned long old,			\
-				     unsigned long new))		\
+static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,	\
+						  unsigned long old,	\
+						  unsigned long new)	\
 {									\
 	unsigned long tmp, oldval;					\
 									\
@@ -286,7 +271,6 @@ __LL_SC_PREFIX(__cmpxchg_case_##name(volatile void *ptr,		\
 									\
 	return oldval;							\
 }									\
-__LL_SC_EXPORT(__cmpxchg_case_##name);
 
 __CMPXCHG_CASE(w, b,     1,        ,  ,  ,         )
 __CMPXCHG_CASE(w, h,     2,        ,  ,  ,         )
@@ -308,12 +292,11 @@ __CMPXCHG_CASE( ,  ,  mb_8, dmb ish,  , l, "memory")
 #undef __CMPXCHG_CASE
 
 #define __CMPXCHG_DBL(name, mb, rel, cl)				\
-__LL_SC_INLINE long							\
-__LL_SC_PREFIX(__cmpxchg_double##name(unsigned long old1,		\
-				      unsigned long old2,		\
-				      unsigned long new1,		\
-				      unsigned long new2,		\
-				      volatile void *ptr))		\
+static inline long __cmpxchg_double##name(unsigned long old1,		\
+					  unsigned long old2,		\
+					  unsigned long new1,		\
+					  unsigned long new2,		\
+					  volatile void *ptr)		\
 {									\
 	unsigned long tmp, ret;						\
 									\
@@ -327,7 +310,6 @@ __LL_SC_PREFIX(__cmpxchg_double##name(unsigned long old1,		\
 									\
 	return ret;							\
 }									\
-__LL_SC_EXPORT(__cmpxchg_double##name);
 
 __CMPXCHG_DBL(   ,        ,  ,         )
 __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index ef70f62ea25e..3ae4f9a6c207 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -22,11 +22,6 @@
 
 __asm__(".arch_extension	lse");
 
-/* Move the ll/sc atomics out-of-line */
-#define __LL_SC_INLINE		notrace
-#define __LL_SC_PREFIX(x)	__ll_sc_##x
-#define __LL_SC_EXPORT(x)	EXPORT_SYMBOL(__LL_SC_PREFIX(x))
-
 /* In-line patching at runtime */
 #define ARM64_LSE_ATOMIC_INSN(llsc, lse)				\
 	ALTERNATIVE(llsc, lse, ARM64_HAS_LSE_ATOMICS)
@@ -42,10 +37,6 @@ __asm__(".arch_extension	lse");
 
 #else	/* __ASSEMBLER__ */
 
-#define __LL_SC_INLINE		static inline
-#define __LL_SC_PREFIX(x)	x
-#define __LL_SC_EXPORT(x)
-
 #define ARM64_LSE_ATOMIC_INSN(llsc, lse)	llsc
 
 #endif	/* __ASSEMBLER__ */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 69ff9887f724..39be2f7f0084 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -5,25 +5,6 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
 		   strchr.o strrchr.o tishift.o
 
-# Tell the compiler to treat all general purpose registers (with the
-# exception of the IP registers, which are already handled by the caller
-# in case of a PLT) as callee-saved, which allows for efficient runtime
-# patching of the bl instruction in the caller with an atomic instruction
-# when supported by the CPU. Result and argument registers are handled
-# correctly, based on the function prototype.
-lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o
-CFLAGS_atomic_ll_sc.o	:= -ffixed-x1 -ffixed-x2        		\
-		   -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6		\
-		   -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9		\
-		   -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12	\
-		   -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15	\
-		   -fcall-saved-x18 -fomit-frame-pointer
-CFLAGS_REMOVE_atomic_ll_sc.o := -pg
-GCOV_PROFILE_atomic_ll_sc.o	:= n
-KASAN_SANITIZE_atomic_ll_sc.o	:= n
-KCOV_INSTRUMENT_atomic_ll_sc.o	:= n
-UBSAN_SANITIZE_atomic_ll_sc.o	:= n
-
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 
 obj-$(CONFIG_CRC32) += crc32.o
diff --git a/arch/arm64/lib/atomic_ll_sc.c b/arch/arm64/lib/atomic_ll_sc.c
deleted file mode 100644
index b0c538b0da28..000000000000
--- a/arch/arm64/lib/atomic_ll_sc.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <asm/atomic.h>
-#define __ARM64_IN_ATOMIC_IMPL
-#include <asm/atomic_ll_sc.h>
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks
  2018-11-13 23:39 [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Ard Biesheuvel
                   ` (2 preceding siblings ...)
  2018-11-13 23:39 ` [PATCH 3/3] arm64/atomics: remove " Ard Biesheuvel
@ 2018-11-27 19:30 ` Will Deacon
  2018-11-28  9:16   ` Ard Biesheuvel
  3 siblings, 1 reply; 7+ messages in thread
From: Will Deacon @ 2018-11-27 19:30 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Ard,

On Tue, Nov 13, 2018 at 03:39:20PM -0800, Ard Biesheuvel wrote:
> Refactor the LL/SC atomics code so we can emit the LL/SC fallbacks for the
> LSE atomics as subsections that get instantiated at each call site rather
> than as out of line functions that get called from inline asm (without the
> awareness of the compiler)
> 
> This should allow slightly better LSE code, and removes stack spilling and
> potential PLT indirection for the LL/SC fallbacks.

Thanks, I much prefer using subsections to the current approach. However,
a downside of your patches is that some of the asm operands passed
to the LSE implementation are redundant, for example, in the fetch-ops:

"	" #lse_op #ac #rl " %w[i], %w[res], %[v]")			\
	: [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),	\
	  [v]"+Q" (v->counter)						\

I'd have thought we could avoid this by splitting up the asms and using
a static key to dispatch them. For example, the really crude hacking
below resulted in reasonable code generation:

000000000000040 <will_atomic_add>:
  40:   14000004        b       50 <will_atomic_add+0x10>	// Patched with NOP once features are determined
  44:   14000007        b       60 <will_atomic_add+0x20>	// Patched with NOP if LSE
  48:   b820003f        stadd   w0, [x1]
  4c:   d65f03c0        ret
  50:   90000002        adrp    x2, 0 <cpu_hwcaps>
  54:   f9400042        ldr     x2, [x2]
  58:   721b005f        tst     w2, #0x20
  5c:   54ffff61        b.ne    48 <will_atomic_add+0x8>  // b.any
  60:   14000002        b       68 <will_atomic_add+0x28>
  64:   d65f03c0        ret
  68:   f9800031        prfm    pstl1strm, [x1]
  6c:   885f7c22        ldxr    w2, [x1]
  70:   0b000042        add     w2, w2, w0
  74:   88037c22        stxr    w3, w2, [x1]
  78:   35ffffa3        cbnz    w3, 6c <will_atomic_add+0x2c>
  7c:   17fffffa        b       64 <will_atomic_add+0x24>

So if we tweaked the existing code so that we can generate the LL/SC
versions either in a subsection or not depending on LSE, then we could
probably play this sort of trick using a static key.

What do you think?

Will

--->8

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 7e2ec64aa414..ec7bfa40ee85 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -369,7 +369,7 @@ static inline bool __cpus_have_const_cap(int num)
 {
 	if (num >= ARM64_NCAPS)
 		return false;
-	return static_branch_unlikely(&cpu_hwcap_keys[num]);
+	return static_branch_likely(&cpu_hwcap_keys[num]);
 }
 
 static inline bool cpus_have_cap(unsigned int num)
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f4fc1e0544b7..f44080ef7188 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -405,3 +405,36 @@ static int __init register_kernel_offset_dumper(void)
 	return 0;
 }
 __initcall(register_kernel_offset_dumper);
+
+static inline void ll_sc_atomic_add(int i, atomic_t *v)
+{
+	unsigned long tmp;
+	int result;
+
+	asm volatile(
+"	b	3f\n"
+"	.subsection 1\n"
+"3:	prfm	pstl1strm, %2\n"
+"1:	ldxr	%w0, %2\n"
+"	add	%w0, %w0, %w3\n"
+"	stxr	%w1, %w0, %2\n"
+"	cbnz	%w1, 1b\n"
+"	b	4f\n"
+"	.previous\n"
+"4:"
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i));
+}
+
+void will_atomic_add(int i, atomic_t *v)
+{
+	if (!cpus_have_const_cap(ARM64_HAS_LSE_ATOMICS)) {
+		ll_sc_atomic_add(i, v);
+	} else {
+		asm volatile("stadd	%w[i], %[v]"
+		: [v] "+Q" (v->counter)
+		: [i] "r" (i));
+	}
+
+	return;
+}

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks
  2018-11-27 19:30 ` [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Will Deacon
@ 2018-11-28  9:16   ` Ard Biesheuvel
  2018-11-28  9:33     ` Ard Biesheuvel
  0 siblings, 1 reply; 7+ messages in thread
From: Ard Biesheuvel @ 2018-11-28  9:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 27 Nov 2018 at 20:30, Will Deacon <will.deacon@arm.com> wrote:
>
> Hi Ard,
>
> On Tue, Nov 13, 2018 at 03:39:20PM -0800, Ard Biesheuvel wrote:
> > Refactor the LL/SC atomics code so we can emit the LL/SC fallbacks for the
> > LSE atomics as subsections that get instantiated at each call site rather
> > than as out of line functions that get called from inline asm (without the
> > awareness of the compiler)
> >
> > This should allow slightly better LSE code, and removes stack spilling and
> > potential PLT indirection for the LL/SC fallbacks.
>
> Thanks, I much prefer using subsections to the current approach. However,
> a downside of your patches is that the some of the asm operands passed
> to the LSE implementation are redundant, for example, in the fetch-ops:
>
> "       " #lse_op #ac #rl " %w[i], %w[res], %[v]")                      \
>         : [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),      \
>           [v]"+Q" (v->counter)                                          \
>

That applies to most of the ops, but note that
- compared to the current situation, where x16, x17 and x30 are always
considered clobbered, the situation actually improves in some cases,
and doesn't get worse in any of them
- I tweaked the patterns a bit so that, for instance in the example
you quote, [i] is now only an input register, which may permit the
compiler to reuse a live register holding the value of i

> I'd have thought we could avoid this by splitting up the asms and using
> a static key to dispatch them. For example, the really crude hacking
> below resulted in reasonable code generation:
>
> 000000000000040 <will_atomic_add>:
>   40:   14000004        b       50 <will_atomic_add+0x10>       // Patched with NOP once features are determined
>   44:   14000007        b       60 <will_atomic_add+0x20>       // Patched with NOP if LSE
>   48:   b820003f        stadd   w0, [x1]
>   4c:   d65f03c0        ret
>   50:   90000002        adrp    x2, 0 <cpu_hwcaps>
>   54:   f9400042        ldr     x2, [x2]
>   58:   721b005f        tst     w2, #0x20
>   5c:   54ffff61        b.ne    48 <will_atomic_add+0x8>  // b.any
>   60:   14000002        b       68 <will_atomic_add+0x28>
>   64:   d65f03c0        ret
>   68:   f9800031        prfm    pstl1strm, [x1]
>   6c:   885f7c22        ldxr    w2, [x1]
>   70:   0b000042        add     w2, w2, w0
>   74:   88037c22        stxr    w3, w2, [x1]
>   78:   35ffffa3        cbnz    w3, 6c <will_atomic_add+0x2c>
>   7c:   17fffffa        b       64 <will_atomic_add+0x24>
>
> So if we tweaked the existing code so that we can generate the LL/SC
> versions either in a subsection or not depending on LSE, then we could
> probably play this sort of trick using a static key.
>
> What do you think?
>

If you inline that function, the register allocation will be exactly
the same, no? The compiler will still need to spill whatever is in the
registers that the LL/SC code may use if executed. Or will it only do
so when taking that path?

Also, AIUI, the point was to optimize for LSE, and fall back to LL/SC
(potentially taking a slight performance hit in doing so). Does the
compiler actually move the unlikely path out of line? I can't tell
from just this function, but if it doesn't, the ret at 0x4c will
become another branch in reality, branching over the LL/SC code.

So it really depends on how well the compiler behaves when emitting
these inline. Does it really help that much to use fewer registers?
Enough to justify adding unconditional branches, and potentially lower
I-cache utilization?

> --->8
>
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 7e2ec64aa414..ec7bfa40ee85 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -369,7 +369,7 @@ static inline bool __cpus_have_const_cap(int num)
>  {
>         if (num >= ARM64_NCAPS)
>                 return false;
> -       return static_branch_unlikely(&cpu_hwcap_keys[num]);
> +       return static_branch_likely(&cpu_hwcap_keys[num]);
>  }
>
>  static inline bool cpus_have_cap(unsigned int num)
> diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
> index f4fc1e0544b7..f44080ef7188 100644
> --- a/arch/arm64/kernel/setup.c
> +++ b/arch/arm64/kernel/setup.c
> @@ -405,3 +405,36 @@ static int __init register_kernel_offset_dumper(void)
>         return 0;
>  }
>  __initcall(register_kernel_offset_dumper);
> +
> +static inline void ll_sc_atomic_add(int i, atomic_t *v)
> +{
> +       unsigned long tmp;
> +       int result;
> +
> +       asm volatile(
> +"      b       3f\n"
> +"      .subsection 1\n"
> +"3:    prfm    pstl1strm, %2\n"
> +"1:    ldxr    %w0, %2\n"
> +"      add     %w0, %w0, %w3\n"
> +"      stxr    %w1, %w0, %2\n"
> +"      cbnz    %w1, 1b\n"
> +"      b       4f\n"
> +"      .previous\n"
> +"4:"
> +       : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
> +       : "Ir" (i));
> +}
> +
> +void will_atomic_add(int i, atomic_t *v)
> +{
> +       if (!cpus_have_const_cap(ARM64_HAS_LSE_ATOMICS)) {
> +               ll_sc_atomic_add(i, v);
> +       } else {
> +               asm volatile("stadd     %w[i], %[v]"
> +               : [v] "+Q" (v->counter)
> +               : [i] "r" (i));
> +       }
> +
> +       return;
> +}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks
  2018-11-28  9:16   ` Ard Biesheuvel
@ 2018-11-28  9:33     ` Ard Biesheuvel
  0 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2018-11-28  9:33 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 28 Nov 2018 at 10:16, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> On Tue, 27 Nov 2018 at 20:30, Will Deacon <will.deacon@arm.com> wrote:
> >
> > Hi Ard,
> >
> > On Tue, Nov 13, 2018 at 03:39:20PM -0800, Ard Biesheuvel wrote:
> > > Refactor the LL/SC atomics code so we can emit the LL/SC fallbacks for the
> > > LSE atomics as subsections that get instantiated at each call site rather
> > > than as out of line functions that get called from inline asm (without the
> > > awareness of the compiler)
> > >
> > > This should allow slightly better LSE code, and removes stack spilling and
> > > potential PLT indirection for the LL/SC fallbacks.
> >
> > Thanks, I much prefer using subsections to the current approach. However,
> > a downside of your patches is that some of the asm operands passed
> > to the LSE implementation are redundant, for example, in the fetch-ops:
> >
> > "       " #lse_op #ac #rl " %w[i], %w[res], %[v]")                      \
> >         : [res]"=&r" (result), [val]"=&r" (val), [tmp]"=&r" (tmp),      \
> >           [v]"+Q" (v->counter)                                          \
> >
>
> That applies to most of the ops, but note that
> - compared to the current situation, where x16, x17 and x30 are always
> considered clobbered, the situation actually improves in some cases,
> and doesn't get worse in any of them
> - I tweaked the patterns a bit so that, for instance, in the example
> you quote, [i] is now only an input register, which may permit the
> compiler to reuse a live register holding the value of i
>
> > I'd have thought we could avoid this by splitting up the asms and using
> > a static key to dispatch them. For example, the really crude hacking
> > below resulted in reasonable code generation:
> >
> > 000000000000040 <will_atomic_add>:
> >   40:   14000004        b       50 <will_atomic_add+0x10>       // Patched with NOP once features are determined
> >   44:   14000007        b       60 <will_atomic_add+0x20>       // Patched with NOP if LSE
> >   48:   b820003f        stadd   w0, [x1]
> >   4c:   d65f03c0        ret
> >   50:   90000002        adrp    x2, 0 <cpu_hwcaps>
> >   54:   f9400042        ldr     x2, [x2]
> >   58:   721b005f        tst     w2, #0x20
> >   5c:   54ffff61        b.ne    48 <will_atomic_add+0x8>  // b.any
> >   60:   14000002        b       68 <will_atomic_add+0x28>
> >   64:   d65f03c0        ret
> >   68:   f9800031        prfm    pstl1strm, [x1]
> >   6c:   885f7c22        ldxr    w2, [x1]
> >   70:   0b000042        add     w2, w2, w0
> >   74:   88037c22        stxr    w3, w2, [x1]
> >   78:   35ffffa3        cbnz    w3, 6c <will_atomic_add+0x2c>
> >   7c:   17fffffa        b       64 <will_atomic_add+0x24>
> >
> > So if we tweaked the existing code so that we can generate the LL/SC
> > versions either in a subsection or not depending on LSE, then we could
> > probably play this sort of trick using a static key.
> >
> > What do you think?
> >
>
> If you inline that function, the register allocation will be exactly
> the same, no? The compiler will still need to spill whatever is in the
> registers that the LL/SC code may use if executed. Or will it only do
> so when taking that path?
>

Never mind, I just noticed that the LL/SC code is still in a
subsection, so it will be moved out of the code path in any case.

> Also, AIUI, the point was to optimize for LSE, and fall back to LL/SC
> (potentially taking a slight performance hit in doing so). Does the
> compiler actually move the unlikely path out of line? I can't tell
> from just this function, but if it doesn't, the ret at 0x4c will
> become another branch in reality, branching over the LL/SC code.
>
> So it really depends on how well the compiler behaves when emitting
> these inline. Does it really help that much to use fewer registers?
> Enough to justify adding unconditional branches, and potentially lower
> I-cache utilization?
>

OK, so the LL/SC code is not a concern. What about the const cap
checks? Will they pollute the I-cache or be moved out of line as well?

In general, I think your approach looks cleaner and more maintainable
in any case; I'm just skeptical that it will make a noticeable
difference in terms of register pressure.

> > --->8
> >
> > diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> > index 7e2ec64aa414..ec7bfa40ee85 100644
> > --- a/arch/arm64/include/asm/cpufeature.h
> > +++ b/arch/arm64/include/asm/cpufeature.h
> > @@ -369,7 +369,7 @@ static inline bool __cpus_have_const_cap(int num)
> >  {
> >         if (num >= ARM64_NCAPS)
> >                 return false;
> > -       return static_branch_unlikely(&cpu_hwcap_keys[num]);
> > +       return static_branch_likely(&cpu_hwcap_keys[num]);
> >  }
> >
> >  static inline bool cpus_have_cap(unsigned int num)
> > diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
> > index f4fc1e0544b7..f44080ef7188 100644
> > --- a/arch/arm64/kernel/setup.c
> > +++ b/arch/arm64/kernel/setup.c
> > @@ -405,3 +405,36 @@ static int __init register_kernel_offset_dumper(void)
> >         return 0;
> >  }
> >  __initcall(register_kernel_offset_dumper);
> > +
> > +static inline void ll_sc_atomic_add(int i, atomic_t *v)
> > +{
> > +       unsigned long tmp;
> > +       int result;
> > +
> > +       asm volatile(
> > +"      b       3f\n"
> > +"      .subsection 1\n"
> > +"3:    prfm    pstl1strm, %2\n"
> > +"1:    ldxr    %w0, %2\n"
> > +"      add     %w0, %w0, %w3\n"
> > +"      stxr    %w1, %w0, %2\n"
> > +"      cbnz    %w1, 1b\n"
> > +"      b       4f\n"
> > +"      .previous\n"
> > +"4:"
> > +       : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
> > +       : "Ir" (i));
> > +}
> > +
> > +void will_atomic_add(int i, atomic_t *v)
> > +{
> > +       if (!cpus_have_const_cap(ARM64_HAS_LSE_ATOMICS)) {
> > +               ll_sc_atomic_add(i, v);
> > +       } else {
> > +               asm volatile("stadd     %w[i], %[v]"
> > +               : [v] "+Q" (v->counter)
> > +               : [i] "r" (i));
> > +       }
> > +
> > +       return;
> > +}

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-11-28  9:33 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-13 23:39 [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Ard Biesheuvel
2018-11-13 23:39 ` [PATCH 1/3] arm64/atomics: refactor LL/SC base asm templates Ard Biesheuvel
2018-11-13 23:39 ` [PATCH 2/3] arm64/atomics: use subsections for out of line LL/SC alternatives Ard Biesheuvel
2018-11-13 23:39 ` [PATCH 3/3] arm64/atomics: remove " Ard Biesheuvel
2018-11-27 19:30 ` [PATCH 0/3] arm64: use subsections instead of function calls for LL/SC fallbacks Will Deacon
2018-11-28  9:16   ` Ard Biesheuvel
2018-11-28  9:33     ` Ard Biesheuvel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).