All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] Support rseq on arm64
@ 2018-06-25 17:54 ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, arnd, mathieu.desnoyers, peterz, paulmck,
	boqun.feng, catalin.marinas, peter.maydell, mark.rutland,
	Will Deacon

Hi all,

This patch wires up rseq for native and compat tasks under arm64. Both
have been tested with the selftests and they pass successfully on my Seattle
box.

Cheers,

Will

--->8

Will Deacon (3):
  arm64: rseq: Implement backend rseq calls and select HAVE_RSEQ
  asm-generic: unistd.h: Wire up sys_rseq
  rseq/selftests: Add support for arm64

 arch/arm64/Kconfig                        |   1 +
 arch/arm64/include/asm/unistd.h           |   2 +-
 arch/arm64/include/asm/unistd32.h         |   2 +
 arch/arm64/kernel/entry.S                 |   2 +
 arch/arm64/kernel/ptrace.c                |   2 +
 arch/arm64/kernel/signal.c                |   3 +
 include/uapi/asm-generic/unistd.h         |   4 +-
 tools/testing/selftests/rseq/param_test.c |  20 +
 tools/testing/selftests/rseq/rseq-arm64.h | 594 ++++++++++++++++++++++++++++++
 tools/testing/selftests/rseq/rseq.h       |   2 +
 10 files changed, 630 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-arm64.h

-- 
2.1.4


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 0/3] Support rseq on arm64
@ 2018-06-25 17:54 ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel

Hi all,

This patch wires up rseq for native and compat tasks under arm64. Both
have been tested with the selftests and they pass successfully on my Seattle
box.

Cheers,

Will

--->8

Will Deacon (3):
  arm64: rseq: Implement backend rseq calls and select HAVE_RSEQ
  asm-generic: unistd.h: Wire up sys_rseq
  rseq/selftests: Add support for arm64

 arch/arm64/Kconfig                        |   1 +
 arch/arm64/include/asm/unistd.h           |   2 +-
 arch/arm64/include/asm/unistd32.h         |   2 +
 arch/arm64/kernel/entry.S                 |   2 +
 arch/arm64/kernel/ptrace.c                |   2 +
 arch/arm64/kernel/signal.c                |   3 +
 include/uapi/asm-generic/unistd.h         |   4 +-
 tools/testing/selftests/rseq/param_test.c |  20 +
 tools/testing/selftests/rseq/rseq-arm64.h | 594 ++++++++++++++++++++++++++++++
 tools/testing/selftests/rseq/rseq.h       |   2 +
 10 files changed, 630 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-arm64.h

-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 1/3] arm64: rseq: Implement backend rseq calls and select HAVE_RSEQ
  2018-06-25 17:54 ` Will Deacon
@ 2018-06-25 17:54   ` Will Deacon
  -1 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, arnd, mathieu.desnoyers, peterz, paulmck,
	boqun.feng, catalin.marinas, peter.maydell, mark.rutland,
	Will Deacon

Implement calls to rseq_signal_deliver, rseq_handle_notify_resume
and rseq_syscall so that we can select HAVE_RSEQ on arm64.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig                | 1 +
 arch/arm64/include/asm/unistd.h   | 2 +-
 arch/arm64/include/asm/unistd32.h | 2 ++
 arch/arm64/kernel/entry.S         | 2 ++
 arch/arm64/kernel/ptrace.c        | 2 ++
 arch/arm64/kernel/signal.c        | 3 +++
 6 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 42c090cf0292..26cb550673b2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -127,6 +127,7 @@ config ARM64
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RCU_TABLE_FREE
+	select HAVE_RSEQ
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_KPROBES
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index a0baa9af5487..e0d0f5b856e7 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -43,7 +43,7 @@
 #define __ARM_NR_compat_cacheflush	(__ARM_NR_COMPAT_BASE+2)
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE+5)
 
-#define __NR_compat_syscalls		398
+#define __NR_compat_syscalls		399
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index ef292160748c..0fdc7ef8a776 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -817,6 +817,8 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
 __SYSCALL(__NR_pkey_free, sys_pkey_free)
 #define __NR_statx 397
 __SYSCALL(__NR_statx, sys_statx)
+#define __NR_rseq 398
+__SYSCALL(__NR_rseq, sys_rseq)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 28ad8799406f..1eda9e1a1f4a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -904,6 +904,7 @@ ENDPROC(el0_error)
 ret_fast_syscall:
 	disable_daif
 	str	x0, [sp, #S_X0]			// returned x0
+#ifndef CONFIG_DEBUG_RSEQ
 	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
 	and	x2, x1, #_TIF_SYSCALL_WORK
 	cbnz	x2, ret_fast_syscall_trace
@@ -911,6 +912,7 @@ ret_fast_syscall:
 	cbnz	x2, work_pending
 	enable_step_tsk x1, x2
 	kernel_exit 0
+#endif
 ret_fast_syscall_trace:
 	enable_daif
 	b	__sys_trace_return_skipped	// we already saved x0
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 5c338ce5a7fa..9f479c111675 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1656,6 +1656,8 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
+
+	rseq_syscall(regs);
 }
 
 /*
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 511af13e8d8f..e3b1d1b0aee8 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -802,6 +802,8 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	int usig = ksig->sig;
 	int ret;
 
+	rseq_signal_deliver(ksig, regs);
+
 	/*
 	 * Set up the stack frame
 	 */
@@ -940,6 +942,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 			if (thread_flags & _TIF_NOTIFY_RESUME) {
 				clear_thread_flag(TIF_NOTIFY_RESUME);
 				tracehook_notify_resume(regs);
+				rseq_handle_notify_resume(NULL, regs);
 			}
 
 			if (thread_flags & _TIF_FOREIGN_FPSTATE)
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 1/3] arm64: rseq: Implement backend rseq calls and select HAVE_RSEQ
@ 2018-06-25 17:54   ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel

Implement calls to rseq_signal_deliver, rseq_handle_notify_resume
and rseq_syscall so that we can select HAVE_RSEQ on arm64.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig                | 1 +
 arch/arm64/include/asm/unistd.h   | 2 +-
 arch/arm64/include/asm/unistd32.h | 2 ++
 arch/arm64/kernel/entry.S         | 2 ++
 arch/arm64/kernel/ptrace.c        | 2 ++
 arch/arm64/kernel/signal.c        | 3 +++
 6 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 42c090cf0292..26cb550673b2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -127,6 +127,7 @@ config ARM64
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RCU_TABLE_FREE
+	select HAVE_RSEQ
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_KPROBES
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index a0baa9af5487..e0d0f5b856e7 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -43,7 +43,7 @@
 #define __ARM_NR_compat_cacheflush	(__ARM_NR_COMPAT_BASE+2)
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE+5)
 
-#define __NR_compat_syscalls		398
+#define __NR_compat_syscalls		399
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index ef292160748c..0fdc7ef8a776 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -817,6 +817,8 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
 __SYSCALL(__NR_pkey_free, sys_pkey_free)
 #define __NR_statx 397
 __SYSCALL(__NR_statx, sys_statx)
+#define __NR_rseq 398
+__SYSCALL(__NR_rseq, sys_rseq)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 28ad8799406f..1eda9e1a1f4a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -904,6 +904,7 @@ ENDPROC(el0_error)
 ret_fast_syscall:
 	disable_daif
 	str	x0, [sp, #S_X0]			// returned x0
+#ifndef CONFIG_DEBUG_RSEQ
 	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
 	and	x2, x1, #_TIF_SYSCALL_WORK
 	cbnz	x2, ret_fast_syscall_trace
@@ -911,6 +912,7 @@ ret_fast_syscall:
 	cbnz	x2, work_pending
 	enable_step_tsk x1, x2
 	kernel_exit 0
+#endif
 ret_fast_syscall_trace:
 	enable_daif
 	b	__sys_trace_return_skipped	// we already saved x0
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 5c338ce5a7fa..9f479c111675 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1656,6 +1656,8 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
+
+	rseq_syscall(regs);
 }
 
 /*
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 511af13e8d8f..e3b1d1b0aee8 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -802,6 +802,8 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	int usig = ksig->sig;
 	int ret;
 
+	rseq_signal_deliver(ksig, regs);
+
 	/*
 	 * Set up the stack frame
 	 */
@@ -940,6 +942,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 			if (thread_flags & _TIF_NOTIFY_RESUME) {
 				clear_thread_flag(TIF_NOTIFY_RESUME);
 				tracehook_notify_resume(regs);
+				rseq_handle_notify_resume(NULL, regs);
 			}
 
 			if (thread_flags & _TIF_FOREIGN_FPSTATE)
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/3] asm-generic: unistd.h: Wire up sys_rseq
  2018-06-25 17:54 ` Will Deacon
@ 2018-06-25 17:54   ` Will Deacon
  -1 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, arnd, mathieu.desnoyers, peterz, paulmck,
	boqun.feng, catalin.marinas, peter.maydell, mark.rutland,
	Will Deacon

The new rseq call arrived in 4.18-rc1, so provide it in the asm-generic
unistd.h for architectures such as arm64.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/uapi/asm-generic/unistd.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 42990676a55e..df4bedb9b01c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -734,9 +734,11 @@ __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 __SYSCALL(__NR_statx,     sys_statx)
 #define __NR_io_pgetevents 292
 __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
+#define __NR_rseq 293
+__SYSCALL(__NR_rseq, sys_rseq)
 
 #undef __NR_syscalls
-#define __NR_syscalls 293
+#define __NR_syscalls 294
 
 /*
  * 32 bit systems traditionally used different
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/3] asm-generic: unistd.h: Wire up sys_rseq
@ 2018-06-25 17:54   ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel

The new rseq call arrived in 4.18-rc1, so provide it in the asm-generic
unistd.h for architectures such as arm64.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/uapi/asm-generic/unistd.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 42990676a55e..df4bedb9b01c 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -734,9 +734,11 @@ __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 __SYSCALL(__NR_statx,     sys_statx)
 #define __NR_io_pgetevents 292
 __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
+#define __NR_rseq 293
+__SYSCALL(__NR_rseq, sys_rseq)
 
 #undef __NR_syscalls
-#define __NR_syscalls 293
+#define __NR_syscalls 294
 
 /*
  * 32 bit systems traditionally used different
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-06-25 17:54 ` Will Deacon
@ 2018-06-25 17:54   ` Will Deacon
  -1 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, arnd, mathieu.desnoyers, peterz, paulmck,
	boqun.feng, catalin.marinas, peter.maydell, mark.rutland,
	Will Deacon

Hook up arm64 support to the rseq selftests.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 tools/testing/selftests/rseq/param_test.c |  20 +
 tools/testing/selftests/rseq/rseq-arm64.h | 594 ++++++++++++++++++++++++++++++
 tools/testing/selftests/rseq/rseq.h       |   2 +
 3 files changed, 616 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/rseq-arm64.h

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index 615252331813..fa144c556371 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -114,6 +114,26 @@ unsigned int yield_mod_cnt, nr_abort;
 	"bne 222b\n\t" \
 	"333:\n\t"
 
+#elif defined(__AARCH64EL__)
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1] "Qo" (loop_cnt[1]) \
+	, [loop_cnt_2] "Qo" (loop_cnt[2]) \
+	, [loop_cnt_3] "Qo" (loop_cnt[3]) \
+	, [loop_cnt_4] "Qo" (loop_cnt[4]) \
+	, [loop_cnt_5] "Qo" (loop_cnt[5]) \
+	, [loop_cnt_6] "Qo" (loop_cnt[6])
+
+#define INJECT_ASM_REG	RSEQ_ASM_TMP_REG32
+
+#define RSEQ_INJECT_ASM(n) \
+	"	ldr	" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n"	\
+	"	cbz	" INJECT_ASM_REG ", 333f\n"			\
+	"222:\n"							\
+	"	sub	" INJECT_ASM_REG ", " INJECT_ASM_REG ", #1\n"	\
+	"	cbnz	" INJECT_ASM_REG ", 222b\n"			\
+	"333:\n"
+
 #elif __PPC__
 
 #define RSEQ_INJECT_INPUT \
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
new file mode 100644
index 000000000000..599788f74137
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-arm64.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
+ */
+
+#define RSEQ_SIG	0xd428bc00	/* BRK #0x45E0 */
+
+#define rseq_smp_mb()	__asm__ __volatile__ ("dmb ish" ::: "memory")
+#define rseq_smp_rmb()	__asm__ __volatile__ ("dmb ishld" ::: "memory")
+#define rseq_smp_wmb()	__asm__ __volatile__ ("dmb ishst" ::: "memory")
+
+#define rseq_smp_load_acquire(p)						\
+__extension__ ({								\
+	__typeof(*p) ____p1;							\
+	switch (sizeof(*p)) {							\
+	case 1:									\
+		asm volatile ("ldarb %w0, %1"					\
+			: "=r" (*(__u8 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	case 2:									\
+		asm volatile ("ldarh %w0, %1"					\
+			: "=r" (*(__u16 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	case 4:									\
+		asm volatile ("ldar %w0, %1"					\
+			: "=r" (*(__u32 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	case 8:									\
+		asm volatile ("ldar %0, %1"					\
+			: "=r" (*(__u64 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	}									\
+	____p1;									\
+})
+
+#define rseq_smp_acquire__after_ctrl_dep()	rseq_smp_rmb()
+
+#define rseq_smp_store_release(p, v)						\
+do {										\
+	switch (sizeof(*p)) {							\
+	case 1:									\
+		asm volatile ("stlrb %w1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u8)v)					\
+				: "memory");					\
+		break;								\
+	case 2:									\
+		asm volatile ("stlrh %w1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u16)v)				\
+				: "memory");					\
+		break;								\
+	case 4:									\
+		asm volatile ("stlr %w1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u32)v)				\
+				: "memory");					\
+		break;								\
+	case 8:									\
+		asm volatile ("stlr %1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u64)v)				\
+				: "memory");					\
+		break;								\
+	}									\
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+#define RSEQ_ASM_TMP_REG32	"w15"
+#define RSEQ_ASM_TMP_REG	"x15"
+#define RSEQ_ASM_TMP_REG_2	"x14"
+
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
+				post_commit_offset, abort_ip)			\
+	"	.pushsection	__rseq_table, \"aw\"\n"				\
+	"	.balign	32\n"							\
+	__rseq_str(label) ":\n"							\
+	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
+	"	.quad	" __rseq_str(start_ip) ", "				\
+			  __rseq_str(post_commit_offset) ", "			\
+			  __rseq_str(abort_ip) "\n"				\
+	"	.popsection\n"
+
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
+	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
+				(post_commit_ip - start_ip), abort_ip)
+
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
+	RSEQ_INJECT_ASM(1)							\
+	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
+	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", :lo12:" __rseq_str(cs_label) "\n"			\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
+	__rseq_str(label) ":\n"
+
+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
+	"	.pushsection	__rseq_failure, \"ax\"\n"			\
+	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+	__rseq_str(label) ":\n"							\
+	"	b	%l[" __rseq_str(abort_label) "]\n"			\
+	"	.popsection\n"
+
+#define RSEQ_ASM_OP_STORE(value, var)						\
+	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_STORE_RELEASE(value, var)					\
+	"	stlr	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label)			\
+	RSEQ_ASM_OP_STORE(value, var)						\
+	__rseq_str(post_commit_label) ":\n"
+
+#define RSEQ_ASM_OP_FINAL_STORE_RELEASE(value, var, post_commit_label)		\
+	RSEQ_ASM_OP_STORE_RELEASE(value, var)					\
+	__rseq_str(post_commit_label) ":\n"
+
+#define RSEQ_ASM_OP_CMPEQ(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	"	sub	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(expect) "]\n"				\
+	"	cbnz	" RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+
+#define RSEQ_ASM_OP_CMPEQ32(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG32 ", %[" __rseq_str(var) "]\n"	\
+	"	sub	" RSEQ_ASM_TMP_REG32 ", " RSEQ_ASM_TMP_REG32		\
+			", %w[" __rseq_str(expect) "]\n"			\
+	"	cbnz	" RSEQ_ASM_TMP_REG32 ", " __rseq_str(label) "\n"
+
+#define RSEQ_ASM_OP_CMPNE(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	"	sub	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(expect) "]\n"				\
+	"	cbz	" RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label)			\
+	RSEQ_INJECT_ASM(2)							\
+	RSEQ_ASM_OP_CMPEQ32(current_cpu_id, cpu_id, label)
+
+#define RSEQ_ASM_OP_R_LOAD(var)							\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_R_STORE(var)						\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_R_LOAD_OFF(offset)						\
+	"	ldr	" RSEQ_ASM_TMP_REG ", [" RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(offset) "]]\n"
+
+#define RSEQ_ASM_OP_R_ADD(count)						\
+	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(count) "]\n"
+
+#define RSEQ_ASM_OP_R_FINAL_STORE(var, post_commit_label)			\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	__rseq_str(post_commit_label) ":\n"
+
+#define RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)					\
+	"	cbz	%[" __rseq_str(len) "], 333f\n"				\
+	"	mov	" RSEQ_ASM_TMP_REG_2 ", %[" __rseq_str(len) "]\n"	\
+	"222:	sub	" RSEQ_ASM_TMP_REG_2 ", " RSEQ_ASM_TMP_REG_2 ", #1\n"	\
+	"	ldrb	" RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(src) "]"	\
+			", " RSEQ_ASM_TMP_REG_2 "]\n"				\
+	"	strb	" RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(dst) "]"	\
+			", " RSEQ_ASM_TMP_REG_2 "]\n"				\
+	"	cbnz	" RSEQ_ASM_TMP_REG_2 ", 222b\n"				\
+	"333:\n"
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+			       off_t voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_STORE(load)
+		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [load]		"Qo" (*load),
+		  [voffp]		"r" (voffp)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_ADD(count)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [count]		"r" (count)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [v2]			"Qo" (*v2),
+		  [newv2]		"r" (newv2)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+					 intptr_t *v2, intptr_t newv2,
+					 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [v2]			"Qo" (*v2),
+		  [newv2]		"r" (newv2)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [v2]			"Qo" (*v2),
+		  [expect2]		"r" (expect2),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+error3:
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+					 void *dst, void *src, size_t len,
+					 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index a4684112676c..b5d94087fe31 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -71,6 +71,8 @@ extern __thread volatile struct rseq __rseq_abi;
 #include <rseq-x86.h>
 #elif defined(__ARMEL__)
 #include <rseq-arm.h>
+#elif defined (__AARCH64EL__)
+#include <rseq-arm64.h>
 #elif defined(__PPC__)
 #include <rseq-ppc.h>
 #elif defined(__mips__)
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-06-25 17:54   ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-25 17:54 UTC (permalink / raw)
  To: linux-arm-kernel

Hook up arm64 support to the rseq selftests.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 tools/testing/selftests/rseq/param_test.c |  20 +
 tools/testing/selftests/rseq/rseq-arm64.h | 594 ++++++++++++++++++++++++++++++
 tools/testing/selftests/rseq/rseq.h       |   2 +
 3 files changed, 616 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/rseq-arm64.h

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index 615252331813..fa144c556371 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -114,6 +114,26 @@ unsigned int yield_mod_cnt, nr_abort;
 	"bne 222b\n\t" \
 	"333:\n\t"
 
+#elif defined(__AARCH64EL__)
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1] "Qo" (loop_cnt[1]) \
+	, [loop_cnt_2] "Qo" (loop_cnt[2]) \
+	, [loop_cnt_3] "Qo" (loop_cnt[3]) \
+	, [loop_cnt_4] "Qo" (loop_cnt[4]) \
+	, [loop_cnt_5] "Qo" (loop_cnt[5]) \
+	, [loop_cnt_6] "Qo" (loop_cnt[6])
+
+#define INJECT_ASM_REG	RSEQ_ASM_TMP_REG32
+
+#define RSEQ_INJECT_ASM(n) \
+	"	ldr	" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n"	\
+	"	cbz	" INJECT_ASM_REG ", 333f\n"			\
+	"222:\n"							\
+	"	sub	" INJECT_ASM_REG ", " INJECT_ASM_REG ", #1\n"	\
+	"	cbnz	" INJECT_ASM_REG ", 222b\n"			\
+	"333:\n"
+
 #elif __PPC__
 
 #define RSEQ_INJECT_INPUT \
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
new file mode 100644
index 000000000000..599788f74137
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-arm64.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
+ */
+
+#define RSEQ_SIG	0xd428bc00	/* BRK #0x45E0 */
+
+#define rseq_smp_mb()	__asm__ __volatile__ ("dmb ish" ::: "memory")
+#define rseq_smp_rmb()	__asm__ __volatile__ ("dmb ishld" ::: "memory")
+#define rseq_smp_wmb()	__asm__ __volatile__ ("dmb ishst" ::: "memory")
+
+#define rseq_smp_load_acquire(p)						\
+__extension__ ({								\
+	__typeof(*p) ____p1;							\
+	switch (sizeof(*p)) {							\
+	case 1:									\
+		asm volatile ("ldarb %w0, %1"					\
+			: "=r" (*(__u8 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	case 2:									\
+		asm volatile ("ldarh %w0, %1"					\
+			: "=r" (*(__u16 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	case 4:									\
+		asm volatile ("ldar %w0, %1"					\
+			: "=r" (*(__u32 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	case 8:									\
+		asm volatile ("ldar %0, %1"					\
+			: "=r" (*(__u64 *)p)					\
+			: "Q" (*p) : "memory");					\
+		break;								\
+	}									\
+	____p1;									\
+})
+
+#define rseq_smp_acquire__after_ctrl_dep()	rseq_smp_rmb()
+
+#define rseq_smp_store_release(p, v)						\
+do {										\
+	switch (sizeof(*p)) {							\
+	case 1:									\
+		asm volatile ("stlrb %w1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u8)v)					\
+				: "memory");					\
+		break;								\
+	case 2:									\
+		asm volatile ("stlrh %w1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u16)v)				\
+				: "memory");					\
+		break;								\
+	case 4:									\
+		asm volatile ("stlr %w1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u32)v)				\
+				: "memory");					\
+		break;								\
+	case 8:									\
+		asm volatile ("stlr %1, %0"					\
+				: "=Q" (*p)					\
+				: "r" ((__u64)v)				\
+				: "memory");					\
+		break;								\
+	}									\
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+#define RSEQ_ASM_TMP_REG32	"w15"
+#define RSEQ_ASM_TMP_REG	"x15"
+#define RSEQ_ASM_TMP_REG_2	"x14"
+
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
+				post_commit_offset, abort_ip)			\
+	"	.pushsection	__rseq_table, \"aw\"\n"				\
+	"	.balign	32\n"							\
+	__rseq_str(label) ":\n"							\
+	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
+	"	.quad	" __rseq_str(start_ip) ", "				\
+			  __rseq_str(post_commit_offset) ", "			\
+			  __rseq_str(abort_ip) "\n"				\
+	"	.popsection\n"
+
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
+	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
+				(post_commit_ip - start_ip), abort_ip)
+
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
+	RSEQ_INJECT_ASM(1)							\
+	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
+	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", :lo12:" __rseq_str(cs_label) "\n"			\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
+	__rseq_str(label) ":\n"
+
+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
+	"	.pushsection	__rseq_failure, \"ax\"\n"			\
+	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+	__rseq_str(label) ":\n"							\
+	"	b	%l[" __rseq_str(abort_label) "]\n"			\
+	"	.popsection\n"
+
+#define RSEQ_ASM_OP_STORE(value, var)						\
+	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_STORE_RELEASE(value, var)					\
+	"	stlr	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label)			\
+	RSEQ_ASM_OP_STORE(value, var)						\
+	__rseq_str(post_commit_label) ":\n"
+
+#define RSEQ_ASM_OP_FINAL_STORE_RELEASE(value, var, post_commit_label)		\
+	RSEQ_ASM_OP_STORE_RELEASE(value, var)					\
+	__rseq_str(post_commit_label) ":\n"
+
+#define RSEQ_ASM_OP_CMPEQ(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	"	sub	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(expect) "]\n"				\
+	"	cbnz	" RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+
+#define RSEQ_ASM_OP_CMPEQ32(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG32 ", %[" __rseq_str(var) "]\n"	\
+	"	sub	" RSEQ_ASM_TMP_REG32 ", " RSEQ_ASM_TMP_REG32		\
+			", %w[" __rseq_str(expect) "]\n"			\
+	"	cbnz	" RSEQ_ASM_TMP_REG32 ", " __rseq_str(label) "\n"
+
+#define RSEQ_ASM_OP_CMPNE(var, expect, label)					\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	"	sub	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(expect) "]\n"				\
+	"	cbz	" RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label)			\
+	RSEQ_INJECT_ASM(2)							\
+	RSEQ_ASM_OP_CMPEQ32(current_cpu_id, cpu_id, label)
+
+#define RSEQ_ASM_OP_R_LOAD(var)							\
+	"	ldr	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_R_STORE(var)						\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+
+#define RSEQ_ASM_OP_R_LOAD_OFF(offset)						\
+	"	ldr	" RSEQ_ASM_TMP_REG ", [" RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(offset) "]]\n"
+
+#define RSEQ_ASM_OP_R_ADD(count)						\
+	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
+			", %[" __rseq_str(count) "]\n"
+
+#define RSEQ_ASM_OP_R_FINAL_STORE(var, post_commit_label)			\
+	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"		\
+	__rseq_str(post_commit_label) ":\n"
+
+#define RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)					\
+	"	cbz	%[" __rseq_str(len) "], 333f\n"				\
+	"	mov	" RSEQ_ASM_TMP_REG_2 ", %[" __rseq_str(len) "]\n"	\
+	"222:	sub	" RSEQ_ASM_TMP_REG_2 ", " RSEQ_ASM_TMP_REG_2 ", #1\n"	\
+	"	ldrb	" RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(src) "]"	\
+			", " RSEQ_ASM_TMP_REG_2 "]\n"				\
+	"	strb	" RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(dst) "]"	\
+			", " RSEQ_ASM_TMP_REG_2 "]\n"				\
+	"	cbnz	" RSEQ_ASM_TMP_REG_2 ", 222b\n"				\
+	"333:\n"
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+			       off_t voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_STORE(load)
+		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [load]		"Qo" (*load),
+		  [voffp]		"r" (voffp)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_ADD(count)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [count]		"r" (count)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [v2]			"Qo" (*v2),
+		  [newv2]		"r" (newv2)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+					 intptr_t *v2, intptr_t newv2,
+					 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [v2]			"Qo" (*v2),
+		  [newv2]		"r" (newv2)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [v2]			"Qo" (*v2),
+		  [expect2]		"r" (expect2),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+error3:
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+					 void *dst, void *src, size_t len,
+					 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
+		  [rseq_cs]		"m" (__rseq_abi.rseq_cs),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index a4684112676c..b5d94087fe31 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -71,6 +71,8 @@ extern __thread volatile struct rseq __rseq_abi;
 #include <rseq-x86.h>
 #elif defined(__ARMEL__)
 #include <rseq-arm.h>
+#elif defined (__AARCH64EL__)
+#include <rseq-arm64.h>
 #elif defined(__PPC__)
 #include <rseq-ppc.h>
 #elif defined(__mips__)
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-06-25 17:54   ` Will Deacon
@ 2018-06-25 18:10     ` Mathieu Desnoyers
  -1 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-06-25 18:10 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-arm-kernel, linux-kernel, Arnd Bergmann, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Catalin Marinas, peter maydell,
	Mark Rutland

----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon@arm.com wrote:

[...]

> +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
> +				post_commit_offset, abort_ip)			\
> +	"	.pushsection	__rseq_table, \"aw\"\n"				\
> +	"	.balign	32\n"							\
> +	__rseq_str(label) ":\n"							\
> +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
> +	"	.quad	" __rseq_str(start_ip) ", "				\
> +			  __rseq_str(post_commit_offset) ", "			\
> +			  __rseq_str(abort_ip) "\n"				\
> +	"	.popsection\n"
> +
> +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
> +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
> +				(post_commit_ip - start_ip), abort_ip)
> +
> +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
> +	RSEQ_INJECT_ASM(1)							\
> +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
> +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
> +			", :lo12:" __rseq_str(cs_label) "\n"			\
> +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
> +	__rseq_str(label) ":\n"
> +
> +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> +	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> +	__rseq_str(label) ":\n"							\
> +	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> +	"	.popsection\n"

Thanks Will for porting rseq to arm64!

I notice you are using the instructions

  adrp
  add
  str

to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
performance-wise with an approach using a literal pool
near the instruction pointer like I did on arm32 ?

With that approach, this ends up being simply

  adr
  str

which provides significantly better performance on my test
platform over loading a pointer targeting a separate data
section.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-06-25 18:10     ` Mathieu Desnoyers
  0 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-06-25 18:10 UTC (permalink / raw)
  To: linux-arm-kernel

----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon at arm.com wrote:

[...]

> +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
> +				post_commit_offset, abort_ip)			\
> +	"	.pushsection	__rseq_table, \"aw\"\n"				\
> +	"	.balign	32\n"							\
> +	__rseq_str(label) ":\n"							\
> +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
> +	"	.quad	" __rseq_str(start_ip) ", "				\
> +			  __rseq_str(post_commit_offset) ", "			\
> +			  __rseq_str(abort_ip) "\n"				\
> +	"	.popsection\n"
> +
> +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
> +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
> +				(post_commit_ip - start_ip), abort_ip)
> +
> +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
> +	RSEQ_INJECT_ASM(1)							\
> +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
> +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
> +			", :lo12:" __rseq_str(cs_label) "\n"			\
> +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
> +	__rseq_str(label) ":\n"
> +
> +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> +	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> +	__rseq_str(label) ":\n"							\
> +	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> +	"	.popsection\n"

Thanks Will for porting rseq to arm64!

I notice you are using the instructions

  adrp
  add
  str

to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
performance-wise with an approach using a literal pool
near the instruction pointer like I did on arm32 ?

With that approach, this ends up being simply

  adr
  str

which provides significantly better performance on my test
platform over loading a pointer targeting a separate data
section.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] arm64: rseq: Implement backend rseq calls and select HAVE_RSEQ
  2018-06-25 17:54   ` Will Deacon
@ 2018-06-26 10:31     ` Mark Rutland
  -1 siblings, 0 replies; 24+ messages in thread
From: Mark Rutland @ 2018-06-26 10:31 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-arm-kernel, linux-kernel, arnd, mathieu.desnoyers, peterz,
	paulmck, boqun.feng, catalin.marinas, peter.maydell

On Mon, Jun 25, 2018 at 06:54:43PM +0100, Will Deacon wrote:
>  /*
>   * Please add new compat syscalls above this comment and update
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 28ad8799406f..1eda9e1a1f4a 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -904,6 +904,7 @@ ENDPROC(el0_error)
>  ret_fast_syscall:
>  	disable_daif
>  	str	x0, [sp, #S_X0]			// returned x0
> +#ifndef CONFIG_DEBUG_RSEQ
>  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
>  	and	x2, x1, #_TIF_SYSCALL_WORK
>  	cbnz	x2, ret_fast_syscall_trace
> @@ -911,6 +912,7 @@ ret_fast_syscall:
>  	cbnz	x2, work_pending
>  	enable_step_tsk x1, x2
>  	kernel_exit 0
> +#endif
>  ret_fast_syscall_trace:
>  	enable_daif
>  	b	__sys_trace_return_skipped	// we already saved x0

I *think* this is ok, since we re-check the TIF bits in
syscall_trace_exit().

This does mean that we'd now always call audit_syscall_exit(),
regardless of TIF_AUDIT, but there are already cases when we call that
with TIF_AUDIT clear, so I think if that's a problem it's a latent bug.
Likewise for audit_syscall_entry().

It seems we're in the same boat as other architectures there,
regardless.

FWIW, for the entry bits:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Mark.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 1/3] arm64: rseq: Implement backend rseq calls and select HAVE_RSEQ
@ 2018-06-26 10:31     ` Mark Rutland
  0 siblings, 0 replies; 24+ messages in thread
From: Mark Rutland @ 2018-06-26 10:31 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Jun 25, 2018 at 06:54:43PM +0100, Will Deacon wrote:
>  /*
>   * Please add new compat syscalls above this comment and update
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 28ad8799406f..1eda9e1a1f4a 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -904,6 +904,7 @@ ENDPROC(el0_error)
>  ret_fast_syscall:
>  	disable_daif
>  	str	x0, [sp, #S_X0]			// returned x0
> +#ifndef CONFIG_DEBUG_RSEQ
>  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
>  	and	x2, x1, #_TIF_SYSCALL_WORK
>  	cbnz	x2, ret_fast_syscall_trace
> @@ -911,6 +912,7 @@ ret_fast_syscall:
>  	cbnz	x2, work_pending
>  	enable_step_tsk x1, x2
>  	kernel_exit 0
> +#endif
>  ret_fast_syscall_trace:
>  	enable_daif
>  	b	__sys_trace_return_skipped	// we already saved x0

I *think* this is ok, since we re-check the TIF bits in
syscall_trace_exit().

This does mean that we'd now always call audit_syscall_exit(),
regardless of TIF_AUDIT, but there are already cases when we call that
with TIF_AUDIT clear, so I think if that's a problem it's a latent bug.
Likewise for audit_syscall_entry().

It seems we're in the same boat as other architectures there,
regardless.

FWIW, for the entry bits:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Mark.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-06-25 18:10     ` Mathieu Desnoyers
@ 2018-06-26 15:14       ` Will Deacon
  -1 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-26 15:14 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-arm-kernel, linux-kernel, Arnd Bergmann, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Catalin Marinas, peter maydell,
	Mark Rutland

Hi Mathieu,

On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon@arm.com wrote:
> > +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
> > +				post_commit_offset, abort_ip)			\
> > +	"	.pushsection	__rseq_table, \"aw\"\n"				\
> > +	"	.balign	32\n"							\
> > +	__rseq_str(label) ":\n"							\
> > +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
> > +	"	.quad	" __rseq_str(start_ip) ", "				\
> > +			  __rseq_str(post_commit_offset) ", "			\
> > +			  __rseq_str(abort_ip) "\n"				\
> > +	"	.popsection\n"
> > +
> > +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
> > +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
> > +				(post_commit_ip - start_ip), abort_ip)
> > +
> > +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
> > +	RSEQ_INJECT_ASM(1)							\
> > +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
> > +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
> > +			", :lo12:" __rseq_str(cs_label) "\n"			\
> > +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
> > +	__rseq_str(label) ":\n"
> > +
> > +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> > +	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> > +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> > +	__rseq_str(label) ":\n"							\
> > +	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> > +	"	.popsection\n"
> 
> Thanks Will for porting rseq to arm64 !

That's ok, it was good fun :)

I'm going to chat with our compiler guys to see if there's any room for
improving the flexibility in the critical section, since having a temporary
in the clobber list is pretty grotty.

> I notice you are using the instructions
> 
>   adrp
>   add
>   str
> 
> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
> performance-wise with an approach using a literal pool
> near the instruction pointer like I did on arm32 ?

I didn't, no. Do you have a benchmark to hand so I can give this a go?
The two reasons I didn't go down this route are:

1. It introduces data which is mapped as executable. I don't have a
   specific security concern here, but the way things have gone so far
   this year, I've realised that I'm not bright enough to anticipate
   these things.

2. It introduces a branch over the table on the fast path, which is likely
   to have a relatively higher branch misprediction cost on more advanced
   CPUs.

I also find it grotty that we emit two tables so that debuggers can cope,
but that's just a cosmetic nit.

> With that approach, this ends up being simply
> 
>   adr
>   str
> 
> which provides significantly better performance on my test
> platform over loading a pointer targeting a separate data
> section.

My understanding is that your test platform is based on Cortex-A7, so I'd
be wary about concluding too much about general performance from that CPU
since it's a pretty straightforward in-order design.

Will

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-06-26 15:14       ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-26 15:14 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Mathieu,

On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon at arm.com wrote:
> > +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
> > +				post_commit_offset, abort_ip)			\
> > +	"	.pushsection	__rseq_table, \"aw\"\n"				\
> > +	"	.balign	32\n"							\
> > +	__rseq_str(label) ":\n"							\
> > +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
> > +	"	.quad	" __rseq_str(start_ip) ", "				\
> > +			  __rseq_str(post_commit_offset) ", "			\
> > +			  __rseq_str(abort_ip) "\n"				\
> > +	"	.popsection\n"
> > +
> > +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
> > +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
> > +				(post_commit_ip - start_ip), abort_ip)
> > +
> > +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
> > +	RSEQ_INJECT_ASM(1)							\
> > +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
> > +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
> > +			", :lo12:" __rseq_str(cs_label) "\n"			\
> > +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
> > +	__rseq_str(label) ":\n"
> > +
> > +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> > +	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> > +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> > +	__rseq_str(label) ":\n"							\
> > +	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> > +	"	.popsection\n"
> 
> Thanks Will for porting rseq to arm64 !

That's ok, it was good fun :)

I'm going to chat with our compiler guys to see if there's any room for
improving the flexibility in the critical section, since having a temporary
in the clobber list is pretty grotty.

> I notice you are using the instructions
> 
>   adrp
>   add
>   str
> 
> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
> performance-wise with an approach using a literal pool
> near the instruction pointer like I did on arm32 ?

I didn't, no. Do you have a benchmark to hand so I can give this a go?
The two reasons I didn't go down this route are:

1. It introduces data which is mapped as executable. I don't have a
   specific security concern here, but the way things have gone so far
   this year, I've realised that I'm not bright enough to anticipate
   these things.

2. It introduces a branch over the table on the fast path, which is likely
   to have a relatively higher branch misprediction cost on more advanced
   CPUs.

I also find it grotty that we emit two tables so that debuggers can cope,
but that's just a cosmetic nit.

> With that approach, this ends up being simply
> 
>   adr
>   str
> 
> which provides significantly better performance on my test
> platform over loading a pointer targeting a separate data
> section.

My understanding is that your test platform is based on Cortex-A7, so I'd
be wary about concluding too much about general performance from that CPU
since it's a pretty straightforward in-order design.

Will

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-06-26 15:14       ` Will Deacon
@ 2018-06-26 16:11         ` Mathieu Desnoyers
  -1 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-06-26 16:11 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-arm-kernel, linux-kernel, Arnd Bergmann, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Catalin Marinas, peter maydell,
	Mark Rutland



----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:

> Hi Mathieu,
> 
> On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
>> ----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon@arm.com wrote:
>> > +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
>> > +				post_commit_offset, abort_ip)			\
>> > +	"	.pushsection	__rseq_table, \"aw\"\n"				\
>> > +	"	.balign	32\n"							\
>> > +	__rseq_str(label) ":\n"							\
>> > +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
>> > +	"	.quad	" __rseq_str(start_ip) ", "				\
>> > +			  __rseq_str(post_commit_offset) ", "			\
>> > +			  __rseq_str(abort_ip) "\n"				\
>> > +	"	.popsection\n"
>> > +
>> > +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
>> > +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
>> > +				(post_commit_ip - start_ip), abort_ip)
>> > +
>> > +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
>> > +	RSEQ_INJECT_ASM(1)							\
>> > +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
>> > +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
>> > +			", :lo12:" __rseq_str(cs_label) "\n"			\
>> > +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
>> > +	__rseq_str(label) ":\n"
>> > +
>> > +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
>> > +	"	.pushsection	__rseq_failure, \"ax\"\n"			\
>> > +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
>> > +	__rseq_str(label) ":\n"							\
>> > +	"	b	%l[" __rseq_str(abort_label) "]\n"			\
>> > +	"	.popsection\n"
>> 
>> Thanks Will for porting rseq to arm64 !
> 
> That's ok, it was good fun :)
> 
> I'm going to chat with our compiler guys to see if there's any room for
> improving the flexibility in the critical section, since having a temporary
> in the clobber list is pretty grotty.

Let me know how it goes!

> 
>> I notice you are using the instructions
>> 
>>   adrp
>>   add
>>   str
>> 
>> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
>> performance-wise with an approach using a literal pool
>> near the instruction pointer like I did on arm32 ?
> 
> I didn't, no. Do you have a benchmark to hand so I can give this a go?

see tools/testing/selftests/rseq/param_test_benchmark --help

It's a stripped-down version of param_test, without all the code for
delay loops and testing checks.

Example use for counter increment with 4 threads, doing 5G counter
increments per thread:

time ./param_test_benchmark -T i -t 4 -r 5000000000

> The two reasons I didn't go down this route are:
> 
> 1. It introduces data which is mapped as executable. I don't have a
>   specific security concern here, but the way things have gone so far
>   this year, I've realised that I'm not bright enough to anticipate
>   these things.

So far I've been able to dig up that "pure code" or "execute only" code
is explicitly requested by compiler flags (-mno-pc-relative-literal-loads
on aarch64, -mpure-code on arm32 when the moon cycle is aligned). It's a
shame that it's not more standard, or that there does not appear to be any
preprocessor define available to test this within code.

I'm all for allowing end users to choose whether they want to use literal
pools in code or not, but I think it should be configurable at compile
time, and we should make it similar on arm32 and arm64. Given that compilers
don't emit a preprocessor define, perhaps we need to introduce our own
RSEQ_NO_PC_RELATIVE_LITERAL_LOADS (or perhaps a shorter name ?) define to
select behavior at compile-time.

> 2. It introduces a branch over the table on the fast path, which is likely
>   to have a relatively higher branch misprediction cost on more advanced
>   CPUs.

Hrm, wait a second... I see that your comparison of the cpu number requires:

+#define RSEQ_ASM_OP_CMPEQ32(var, expect, label)                                        \
+        "        ldr        " RSEQ_ASM_TMP_REG32 ", %[" __rseq_str(var) "]\n"        \
+        "        sub        " RSEQ_ASM_TMP_REG32 ", " RSEQ_ASM_TMP_REG32                \
+                        ", %w[" __rseq_str(expect) "]\n"                        \
+        "        cbnz        " RSEQ_ASM_TMP_REG32 ", " __rseq_str(label) "\n"

because the abort code is emitted in a separate section:

+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)                                \
+        "        .pushsection        __rseq_failure, \"ax\"\n"                        \
+        "        .long         "        __rseq_str(RSEQ_SIG) "\n"                        \
+        __rseq_str(label) ":\n"                                                        \
+        "        b        %l[" __rseq_str(abort_label) "]\n"                        \
+        "        .popsection\n"

Like I did on x86. But the cbnz instruction requires the branch target to be
within +/- 1MB from the instruction (http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch06s04.html),
which clearly is not guaranteed when you place the abort label in a separate
section.

Also, using cbnz to jump to a label that is outside of the assembly
(e.g. %l[error1]) does not ensure that the branch target is within
1MB of the code.

I've had assembler issues on arm32 due to those kinds of constraints
when integrating rseq headers into larger code-bases.

So, one way to fix the fast-path so cpu number comparison can branch
to a close location is to put the abort code near the fast-path, and
you end up having to unconditionally jump over the abort code from
the fast-path on success. So once you bite the bullet and jump over
abort, you just have to ensure you place the struct rseq_cs data
near the abort code, so you end up jumping over both at the same time.

> 
> I also find it grotty that we emit two tables so that debuggers can cope,
> but that's just a cosmetic nit.
> 
>> With that approach, this ends up being simply
>> 
>>   adr
>>   str
>> 
>> which provides significantly better performance on my test
>> platform over loading a pointer targeting a separate data
>> section.
> 
> My understanding is that your test platform is based on Cortex-A7, so I'd
> be wary about concluding too much about general performance from that CPU
since it's a pretty straightforward in-order design.

I did benchmarks on our Wandboard (Cortex A9) as well as the Cubietruck. I
could only use perf to do detailed breakdown of the fast-path overhead on
the Cubie because I could not get it to work on our Wandboard, but overall
speed was better on Wandboard as well (as I recall) with the literal pool.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-06-26 16:11         ` Mathieu Desnoyers
  0 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-06-26 16:11 UTC (permalink / raw)
  To: linux-arm-kernel



----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon at arm.com wrote:

> Hi Mathieu,
> 
> On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
>> ----- On Jun 25, 2018, at 1:54 PM, Will Deacon will.deacon at arm.com wrote:
>> > +#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
>> > +				post_commit_offset, abort_ip)			\
>> > +	"	.pushsection	__rseq_table, \"aw\"\n"				\
>> > +	"	.balign	32\n"							\
>> > +	__rseq_str(label) ":\n"							\
>> > +	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
>> > +	"	.quad	" __rseq_str(start_ip) ", "				\
>> > +			  __rseq_str(post_commit_offset) ", "			\
>> > +			  __rseq_str(abort_ip) "\n"				\
>> > +	"	.popsection\n"
>> > +
>> > +#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
>> > +	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
>> > +				(post_commit_ip - start_ip), abort_ip)
>> > +
>> > +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
>> > +	RSEQ_INJECT_ASM(1)							\
>> > +	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
>> > +	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
>> > +			", :lo12:" __rseq_str(cs_label) "\n"			\
>> > +	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
>> > +	__rseq_str(label) ":\n"
>> > +
>> > +#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
>> > +	"	.pushsection	__rseq_failure, \"ax\"\n"			\
>> > +	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
>> > +	__rseq_str(label) ":\n"							\
>> > +	"	b	%l[" __rseq_str(abort_label) "]\n"			\
>> > +	"	.popsection\n"
>> 
>> Thanks Will for porting rseq to arm64 !
> 
> That's ok, it was good fun :)
> 
> I'm going to chat with our compiler guys to see if there's any room for
> improving the flexibility in the critical section, since having a temporary
> in the clobber list is pretty grotty.

Let me know how it goes!

> 
>> I notice you are using the instructions
>> 
>>   adrp
>>   add
>>   str
>> 
>> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
>> performance-wise with an approach using a literal pool
>> near the instruction pointer like I did on arm32 ?
> 
> I didn't, no. Do you have a benchmark to hand so I can give this a go?

see tools/testing/selftests/rseq/param_test_benchmark --help

It's a stripped-down version of param_test, without all the code for
delay loops and testing checks.

Example use for counter increment with 4 threads, doing 5G counter
increments per thread:

time ./param_test_benchmark -T i -t 4 -r 5000000000

> The two reasons I didn't go down this route are:
> 
> 1. It introduces data which is mapped as executable. I don't have a
>   specific security concern here, but the way things have gone so far
>   this year, I've realised that I'm not bright enough to anticipate
>   these things.

So far I've been able to dig up that "pure code" or "execute only" code
is explicitly requested by compiler flags (-mno-pc-relative-literal-loads
on aarch64, -mpure-code on arm32 when the moon cycle is aligned). It's a
shame that it's not more standard, or that there does not appear to be any
preprocessor define available to test this within code.

I'm all for allowing end users to choose whether they want to use literal
pools in code or not, but I think it should be configurable at compile
time, and we should make it similar on arm32 and arm64. Given that compilers
don't emit a preprocessor define, perhaps we need to introduce our own
RSEQ_NO_PC_RELATIVE_LITERAL_LOADS (or perhaps a shorter name ?) define to
select behavior at compile-time.

> 2. It introduces a branch over the table on the fast path, which is likely
>   to have a relatively higher branch misprediction cost on more advanced
>   CPUs.

Hrm, wait a second... I see that your comparison of the cpu number requires:

+#define RSEQ_ASM_OP_CMPEQ32(var, expect, label)                                        \
+        "        ldr        " RSEQ_ASM_TMP_REG32 ", %[" __rseq_str(var) "]\n"        \
+        "        sub        " RSEQ_ASM_TMP_REG32 ", " RSEQ_ASM_TMP_REG32                \
+                        ", %w[" __rseq_str(expect) "]\n"                        \
+        "        cbnz        " RSEQ_ASM_TMP_REG32 ", " __rseq_str(label) "\n"

because the abort code is emitted in a separate section:

+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)                                \
+        "        .pushsection        __rseq_failure, \"ax\"\n"                        \
+        "        .long         "        __rseq_str(RSEQ_SIG) "\n"                        \
+        __rseq_str(label) ":\n"                                                        \
+        "        b        %l[" __rseq_str(abort_label) "]\n"                        \
+        "        .popsection\n"

Like I did on x86. But the cbnz instruction requires the branch target to be
within +/- 1MB from the instruction (http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch06s04.html),
which clearly is not guaranteed when you place the abort label in a separate
section.

Also, using cbnz to jump to a label that is outside of the assembly
(e.g. %l[error1]) does not ensure that the branch target is within
1MB of the code.

I've had assembler issues on arm32 due to those kinds of constraints
when integrating rseq headers into larger code-bases.

So, one way to fix the fast-path so cpu number comparison can branch
to a close location is to put the abort code near the fast-path, and
you end up having to unconditionally jump over the abort code from
the fast-path on success. So once you bite the bullet and jump over
abort, you just have to ensure you place the struct rseq_cs data
near the abort code, so you end up jumping over both at the same time.

> 
> I also find it grotty that we emit two tables so that debuggers can cope,
> but that's just a cosmetic nit.
> 
>> With that approach, this ends up being simply
>> 
>>   adr
>>   str
>> 
>> which provides significantly better performance on my test
>> platform over loading a pointer targeting a separate data
>> section.
> 
> My understanding is that your test platform is based on Cortex-A7, so I'd
> be wary about concluding too much about general performance from that CPU
since it's a pretty straightforward in-order design.

I did benchmarks on our Wandboard (Cortex A9) as well as the Cubietruck. I
could only use perf to do detailed breakdown of the fast-path overhead on
the Cubie because I could not get it to work on our Wandboard, but overall
speed was better on Wandboard as well (as I recall) with the literal pool.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-06-26 16:11         ` Mathieu Desnoyers
@ 2018-06-28 16:47           ` Will Deacon
  -1 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-28 16:47 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-arm-kernel, linux-kernel, Arnd Bergmann, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Catalin Marinas, peter maydell,
	Mark Rutland

Hi Mathieu,

On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:
> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
> >> I notice you are using the instructions
> >> 
> >>   adrp
> >>   add
> >>   str
> >> 
> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
> >> performance-wise with an approach using a literal pool
> >> near the instruction pointer like I did on arm32 ?
> > 
> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
> 
> see tools/testing/selftests/rseq/param_test_benchmark --help
> 
> It's a stripped-down version of param_test, without all the code for
> delay loops and testing checks.
> 
> Example use for counter increment with 4 threads, doing 5G counter
> increments per thread:
> 
> time ./param_test_benchmark -T i -t 4 -r 5000000000

Thanks. I ran that on a few arm64 systems I have access to, with three
configurations of the selftest:

1. As I posted
2. With the abort signature and branch in-lined, so as to avoid the CBNZ
   address limitations in large codebases
3. With both the abort handler and the table inlined (i.e. the same thing
   as 32-bit).

There isn't a reliably measurable difference between (1) and (2), but I take
between 12% and 27% hit between (2) and (3).

So I'll post a v2 based on (2).

Will

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-06-28 16:47           ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-06-28 16:47 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Mathieu,

On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon at arm.com wrote:
> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
> >> I notice you are using the instructions
> >> 
> >>   adrp
> >>   add
> >>   str
> >> 
> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
> >> performance-wise with an approach using a literal pool
> >> near the instruction pointer like I did on arm32 ?
> > 
> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
> 
> see tools/testing/selftests/rseq/param_test_benchmark --help
> 
> It's a stripped-down version of param_test, without all the code for
> delay loops and testing checks.
> 
> Example use for counter increment with 4 threads, doing 5G counter
> increments per thread:
> 
> time ./param_test_benchmark -T i -t 4 -r 5000000000

Thanks. I ran that on a few arm64 systems I have access to, with three
configurations of the selftest:

1. As I posted
2. With the abort signature and branch in-lined, so as to avoid the CBNZ
   address limitations in large codebases
3. With both the abort handler and the table inlined (i.e. the same thing
   as 32-bit).

There isn't a reliably measurable difference between (1) and (2), but I take
between 12% and 27% hit between (2) and (3).

So I'll post a v2 based on (2).

Will

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-06-28 16:47           ` Will Deacon
@ 2018-06-28 20:50             ` Mathieu Desnoyers
  -1 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-06-28 20:50 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-arm-kernel, linux-kernel, Arnd Bergmann, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Catalin Marinas, peter maydell,
	Mark Rutland

----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon@arm.com wrote:

> Hi Mathieu,
> 
> On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
>> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:
>> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
>> >> I notice you are using the instructions
>> >> 
>> >>   adrp
>> >>   add
>> >>   str
>> >> 
>> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
>> >> performance-wise with an approach using a literal pool
>> >> near the instruction pointer like I did on arm32 ?
>> > 
>> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
>> 
>> see tools/testing/selftests/rseq/param_test_benchmark --help
>> 
>> It's a stripped-down version of param_test, without all the code for
>> delay loops and testing checks.
>> 
>> Example use for counter increment with 4 threads, doing 5G counter
>> increments per thread:
>> 
>> time ./param_test_benchmark -T i -t 4 -r 5000000000
> 
> Thanks. I ran that on a few arm64 systems I have access to, with three
> configurations of the selftest:
> 
> 1. As I posted
> 2. With the abort signature and branch in-lined, so as to avoid the CBNZ
>   address limitations in large codebases
> 3. With both the abort handler and the table inlined (i.e. the same thing
>   as 32-bit).
> 
> There isn't a reliably measurable difference between (1) and (2), but I take
> between 12% and 27% hit between (2) and (3).

Those results puzzle me. Do you have the actual code snippets of each
implementation nearby ?

Thanks,

Mathieu

> 
> So I'll post a v2 based on (2).
> 
> Will

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-06-28 20:50             ` Mathieu Desnoyers
  0 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-06-28 20:50 UTC (permalink / raw)
  To: linux-arm-kernel

----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon at arm.com wrote:

> Hi Mathieu,
> 
> On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
>> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon at arm.com wrote:
>> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
>> >> I notice you are using the instructions
>> >> 
>> >>   adrp
>> >>   add
>> >>   str
>> >> 
>> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
>> >> performance-wise with an approach using a literal pool
>> >> near the instruction pointer like I did on arm32 ?
>> > 
>> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
>> 
>> see tools/testing/selftests/rseq/param_test_benchmark --help
>> 
>> It's a stripped-down version of param_test, without all the code for
>> delay loops and testing checks.
>> 
>> Example use for counter increment with 4 threads, doing 5G counter
>> increments per thread:
>> 
>> time ./param_test_benchmark -T i -t 4 -r 5000000000
> 
> Thanks. I ran that on a few arm64 systems I have access to, with three
> configurations of the selftest:
> 
> 1. As I posted
> 2. With the abort signature and branch in-lined, so as to avoid the CBNZ
>   address limitations in large codebases
> 3. With both the abort handler and the table inlined (i.e. the same thing
>   as 32-bit).
> 
> There isn't a reliably measurable difference between (1) and (2), but I take
> between 12% and 27% hit between (2) and (3).

Those results puzzle me. Do you have the actual code snippets of each
implementation nearby ?

Thanks,

Mathieu

> 
> So I'll post a v2 based on (2).
> 
> Will

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-06-28 20:50             ` Mathieu Desnoyers
@ 2018-07-02 16:49               ` Will Deacon
  -1 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-07-02 16:49 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-arm-kernel, linux-kernel, Arnd Bergmann, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Catalin Marinas, peter maydell,
	Mark Rutland

On Thu, Jun 28, 2018 at 04:50:40PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon@arm.com wrote:
> > On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
> >> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:
> >> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
> >> >> I notice you are using the instructions
> >> >> 
> >> >>   adrp
> >> >>   add
> >> >>   str
> >> >> 
> >> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
> >> >> performance-wise with an approach using a literal pool
> >> >> near the instruction pointer like I did on arm32 ?
> >> > 
> >> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
> >> 
> >> see tools/testing/selftests/rseq/param_test_benchmark --help
> >> 
> >> It's a stripped-down version of param_test, without all the code for
> >> delay loops and testing checks.
> >> 
> >> Example use for counter increment with 4 threads, doing 5G counter
> >> increments per thread:
> >> 
> >> time ./param_test_benchmark -T i -t 4 -r 5000000000
> > 
> > Thanks. I ran that on a few arm64 systems I have access to, with three
> > configurations of the selftest:
> > 
> > 1. As I posted
> > 2. With the abort signature and branch in-lined, so as to avoid the CBNZ
> >   address limitations in large codebases
> > 3. With both the abort handler and the table inlined (i.e. the same thing
> >   as 32-bit).
> > 
> > There isn't a reliably measurable difference between (1) and (2), but I take
> > between 12% and 27% hit between (2) and (3).
> 
> Those results puzzle me. Do you have the actual code snippets of each
> implementation nearby ?

Sure, I've included the diffs for (2) and (3) below. They both apply on top
of my branch at:

git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git rseq

Will

--->8

diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 599788f74137..954f34671ca6 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -104,11 +104,11 @@ do {										\
 	__rseq_str(label) ":\n"
 
 #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
-	"	.pushsection	__rseq_failure, \"ax\"\n"			\
-	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+	"	b	222f\n"							\
+	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
 	__rseq_str(label) ":\n"							\
 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
-	"	.popsection\n"
+	"222:\n"
 
 #define RSEQ_ASM_OP_STORE(value, var)						\
 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"

--->8

diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 599788f74137..2554aa17acf3 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -80,35 +80,37 @@ do {										\
 #define RSEQ_ASM_TMP_REG	"x15"
 #define RSEQ_ASM_TMP_REG_2	"x14"
 
-#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
+#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip,			\
 				post_commit_offset, abort_ip)			\
-	"	.pushsection	__rseq_table, \"aw\"\n"				\
-	"	.balign	32\n"							\
-	__rseq_str(label) ":\n"							\
 	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
 	"	.quad	" __rseq_str(start_ip) ", "				\
 			  __rseq_str(post_commit_offset) ", "			\
-			  __rseq_str(abort_ip) "\n"				\
-	"	.popsection\n"
+			  __rseq_str(abort_ip) "\n"
 
-#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
-	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
-				(post_commit_ip - start_ip), abort_ip)
+#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip)		\
+	"	.pushsection	__rseq_table, \"aw\"\n"				\
+	"	.balign	32\n"							\
+	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
+				(post_commit_ip - start_ip), abort_ip)		\
+	"	.popsection\n"
 
-#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
+#define RSEQ_ASM_STORE_RSEQ_CS(label, table_label, rseq_cs)			\
 	RSEQ_INJECT_ASM(1)							\
-	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
-	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
-			", :lo12:" __rseq_str(cs_label) "\n"			\
+	"	adr	" RSEQ_ASM_TMP_REG ", " __rseq_str(table_label) "\n"	\
 	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
 	__rseq_str(label) ":\n"
 
-#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
-	"	.pushsection	__rseq_failure, \"ax\"\n"			\
-	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+#define RSEQ_ASM_DEFINE_ABORT(table_label, start_ip, post_commit_ip, label,	\
+			      abort_label)					\
+	"	b	222f\n"							\
+	"	.balign 32\n"							\
+	__rseq_str(table_label) ":\n"						\
+	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
+				(post_commit_ip - start_ip), label ## f)	\
+	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
 	__rseq_str(label) ":\n"							\
 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
-	"	.popsection\n"
+	"222:\n"
 
 #define RSEQ_ASM_OP_STORE(value, var)						\
 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
@@ -181,8 +183,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -191,9 +193,9 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
 #endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -230,8 +232,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
@@ -243,9 +245,9 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 		RSEQ_ASM_OP_R_LOAD(v)
 		RSEQ_ASM_OP_R_STORE(load)
 		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -281,8 +283,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 #ifdef RSEQ_COMPARE_TWICE
@@ -290,9 +292,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 #endif
 		RSEQ_ASM_OP_R_LOAD(v)
 		RSEQ_ASM_OP_R_ADD(count)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
 		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -324,8 +326,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -336,9 +338,9 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_STORE(newv2, v2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -378,8 +380,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -390,9 +392,9 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_STORE(newv2, v2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -432,8 +434,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -445,9 +447,9 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
 		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
 #endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -489,8 +491,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -501,9 +503,9 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -544,8 +546,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -556,9 +558,9 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-07-02 16:49               ` Will Deacon
  0 siblings, 0 replies; 24+ messages in thread
From: Will Deacon @ 2018-07-02 16:49 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Jun 28, 2018 at 04:50:40PM -0400, Mathieu Desnoyers wrote:
> ----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon at arm.com wrote:
> > On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
> >> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon at arm.com wrote:
> >> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
> >> >> I notice you are using the instructions
> >> >> 
> >> >>   adrp
> >> >>   add
> >> >>   str
> >> >> 
> >> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
> >> >> performance-wise with an approach using a literal pool
> >> >> near the instruction pointer like I did on arm32 ?
> >> > 
> >> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
> >> 
> >> see tools/testing/selftests/rseq/param_test_benchmark --help
> >> 
> >> It's a stripped-down version of param_test, without all the code for
> >> delay loops and testing checks.
> >> 
> >> Example use for counter increment with 4 threads, doing 5G counter
> >> increments per thread:
> >> 
> >> time ./param_test_benchmark -T i -t 4 -r 5000000000
> > 
> > Thanks. I ran that on a few arm64 systems I have access to, with three
> > configurations of the selftest:
> > 
> > 1. As I posted
> > 2. With the abort signature and branch in-lined, so as to avoid the CBNZ
> >   address limitations in large codebases
> > 3. With both the abort handler and the table inlined (i.e. the same thing
> >   as 32-bit).
> > 
> > There isn't a reliably measurable difference between (1) and (2), but I take
> > between 12% and 27% hit between (2) and (3).
> 
> Those results puzzle me. Do you have the actual code snippets of each
> implementation nearby ?

Sure, I've included the diffs for (2) and (3) below. They both apply on top
of my branch at:

git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git rseq

Will

--->8

diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 599788f74137..954f34671ca6 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -104,11 +104,11 @@ do {										\
 	__rseq_str(label) ":\n"
 
 #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
-	"	.pushsection	__rseq_failure, \"ax\"\n"			\
-	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+	"	b	222f\n"							\
+	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
 	__rseq_str(label) ":\n"							\
 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
-	"	.popsection\n"
+	"222:\n"
 
 #define RSEQ_ASM_OP_STORE(value, var)						\
 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"

--->8

diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 599788f74137..2554aa17acf3 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -80,35 +80,37 @@ do {										\
 #define RSEQ_ASM_TMP_REG	"x15"
 #define RSEQ_ASM_TMP_REG_2	"x14"
 
-#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
+#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip,			\
 				post_commit_offset, abort_ip)			\
-	"	.pushsection	__rseq_table, \"aw\"\n"				\
-	"	.balign	32\n"							\
-	__rseq_str(label) ":\n"							\
 	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
 	"	.quad	" __rseq_str(start_ip) ", "				\
 			  __rseq_str(post_commit_offset) ", "			\
-			  __rseq_str(abort_ip) "\n"				\
-	"	.popsection\n"
+			  __rseq_str(abort_ip) "\n"
 
-#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
-	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
-				(post_commit_ip - start_ip), abort_ip)
+#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip)		\
+	"	.pushsection	__rseq_table, \"aw\"\n"				\
+	"	.balign	32\n"							\
+	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
+				(post_commit_ip - start_ip), abort_ip)		\
+	"	.popsection\n"
 
-#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
+#define RSEQ_ASM_STORE_RSEQ_CS(label, table_label, rseq_cs)			\
 	RSEQ_INJECT_ASM(1)							\
-	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
-	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
-			", :lo12:" __rseq_str(cs_label) "\n"			\
+	"	adr	" RSEQ_ASM_TMP_REG ", " __rseq_str(table_label) "\n"	\
 	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
 	__rseq_str(label) ":\n"
 
-#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
-	"	.pushsection	__rseq_failure, \"ax\"\n"			\
-	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
+#define RSEQ_ASM_DEFINE_ABORT(table_label, start_ip, post_commit_ip, label,	\
+			      abort_label)					\
+	"	b	222f\n"							\
+	"	.balign 32\n"							\
+	__rseq_str(table_label) ":\n"						\
+	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
+				(post_commit_ip - start_ip), label ## f)	\
+	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
 	__rseq_str(label) ":\n"							\
 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
-	"	.popsection\n"
+	"222:\n"
 
 #define RSEQ_ASM_OP_STORE(value, var)						\
 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
@@ -181,8 +183,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -191,9 +193,9 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
 #endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -230,8 +232,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
@@ -243,9 +245,9 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 		RSEQ_ASM_OP_R_LOAD(v)
 		RSEQ_ASM_OP_R_STORE(load)
 		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -281,8 +283,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 #ifdef RSEQ_COMPARE_TWICE
@@ -290,9 +292,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 #endif
 		RSEQ_ASM_OP_R_LOAD(v)
 		RSEQ_ASM_OP_R_ADD(count)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
 		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -324,8 +326,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -336,9 +338,9 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_STORE(newv2, v2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -378,8 +380,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -390,9 +392,9 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_STORE(newv2, v2)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -432,8 +434,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -445,9 +447,9 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
 		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
 #endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -489,8 +491,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -501,9 +503,9 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
@@ -544,8 +546,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 	RSEQ_INJECT_C(9)
 
 	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
 		RSEQ_INJECT_ASM(3)
 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
@@ -556,9 +558,9 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 #endif
 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
 		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
 		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
 		: /* gcc asm goto does not allow outputs */
 		: [cpu_id]		"r" (cpu),
 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] rseq/selftests: Add support for arm64
  2018-07-02 16:49               ` Will Deacon
@ 2018-07-02 17:47                 ` Mathieu Desnoyers
  -1 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-07-02 17:47 UTC (permalink / raw)
  To: Will Deacon
  Cc: linux-arm-kernel, linux-kernel, Arnd Bergmann, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Catalin Marinas, peter maydell,
	Mark Rutland

----- On Jul 2, 2018, at 12:49 PM, Will Deacon will.deacon@arm.com wrote:

> On Thu, Jun 28, 2018 at 04:50:40PM -0400, Mathieu Desnoyers wrote:
>> ----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon@arm.com wrote:
>> > On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
>> >> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon@arm.com wrote:
>> >> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
>> >> >> I notice you are using the instructions
>> >> >> 
>> >> >>   adrp
>> >> >>   add
>> >> >>   str
>> >> >> 
>> >> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
>> >> >> performance-wise with an approach using a literal pool
>> >> >> near the instruction pointer like I did on arm32 ?
>> >> > 
>> >> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
>> >> 
>> >> see tools/testing/selftests/rseq/param_test_benchmark --help
>> >> 
>> >> It's a stripped-down version of param_test, without all the code for
>> >> delay loops and testing checks.
>> >> 
>> >> Example use for counter increment with 4 threads, doing 5G counter
>> >> increments per thread:
>> >> 
>> >> time ./param_test_benchmark -T i -t 4 -r 5000000000
>> > 
>> > Thanks. I ran that on a few arm64 systems I have access to, with three
>> > configurations of the selftest:
>> > 
>> > 1. As I posted
>> > 2. With the abort signature and branch in-lined, so as to avoid the CBNZ
>> >   address limitations in large codebases
>> > 3. With both the abort handler and the table inlined (i.e. the same thing
>> >   as 32-bit).
>> > 
>> > There isn't a reliably measurable difference between (1) and (2), but I take
>> > between 12% and 27% hit between (2) and (3).
>> 
>> Those results puzzle me. Do you have the actual code snippets of each
>> implementation nearby ?
> 
> Sure, I've included the diffs for (2) and (3) below. They both apply on top
> of my branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git rseq
> 
> Will

I figured out that ADRP+ADD are optimized on Cortex A57 to have a 1 cycle
latency. This would explain why they are doing comparatively well compared
to ADR.

And I guess having more compact code wins here.

So I'm OK with your patchset with the modification for (2), which ensures
the abort label is not too far away on large code-bases.

Thanks!

Mathieu

> 
> --->8
> 
> diff --git a/tools/testing/selftests/rseq/rseq-arm64.h
> b/tools/testing/selftests/rseq/rseq-arm64.h
> index 599788f74137..954f34671ca6 100644
> --- a/tools/testing/selftests/rseq/rseq-arm64.h
> +++ b/tools/testing/selftests/rseq/rseq-arm64.h
> @@ -104,11 +104,11 @@ do {										\
> 	__rseq_str(label) ":\n"
> 
> #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> -	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> -	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> +	"	b	222f\n"							\
> +	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
> 	__rseq_str(label) ":\n"							\
> 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> -	"	.popsection\n"
> +	"222:\n"
> 
> #define RSEQ_ASM_OP_STORE(value, var)						\
> 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
> 
> --->8
> 
> diff --git a/tools/testing/selftests/rseq/rseq-arm64.h
> b/tools/testing/selftests/rseq/rseq-arm64.h
> index 599788f74137..2554aa17acf3 100644
> --- a/tools/testing/selftests/rseq/rseq-arm64.h
> +++ b/tools/testing/selftests/rseq/rseq-arm64.h
> @@ -80,35 +80,37 @@ do {										\
> #define RSEQ_ASM_TMP_REG	"x15"
> #define RSEQ_ASM_TMP_REG_2	"x14"
> 
> -#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
> +#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip,			\
> 				post_commit_offset, abort_ip)			\
> -	"	.pushsection	__rseq_table, \"aw\"\n"				\
> -	"	.balign	32\n"							\
> -	__rseq_str(label) ":\n"							\
> 	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
> 	"	.quad	" __rseq_str(start_ip) ", "				\
> 			  __rseq_str(post_commit_offset) ", "			\
> -			  __rseq_str(abort_ip) "\n"				\
> -	"	.popsection\n"
> +			  __rseq_str(abort_ip) "\n"
> 
> -#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
> -	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
> -				(post_commit_ip - start_ip), abort_ip)
> +#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip)		\
> +	"	.pushsection	__rseq_table, \"aw\"\n"				\
> +	"	.balign	32\n"							\
> +	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
> +				(post_commit_ip - start_ip), abort_ip)		\
> +	"	.popsection\n"
> 
> -#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
> +#define RSEQ_ASM_STORE_RSEQ_CS(label, table_label, rseq_cs)			\
> 	RSEQ_INJECT_ASM(1)							\
> -	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
> -	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
> -			", :lo12:" __rseq_str(cs_label) "\n"			\
> +	"	adr	" RSEQ_ASM_TMP_REG ", " __rseq_str(table_label) "\n"	\
> 	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
> 	__rseq_str(label) ":\n"
> 
> -#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> -	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> -	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> +#define RSEQ_ASM_DEFINE_ABORT(table_label, start_ip, post_commit_ip, label,	\
> +			      abort_label)					\
> +	"	b	222f\n"							\
> +	"	.balign 32\n"							\
> +	__rseq_str(table_label) ":\n"						\
> +	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
> +				(post_commit_ip - start_ip), label ## f)	\
> +	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
> 	__rseq_str(label) ":\n"							\
> 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> -	"	.popsection\n"
> +	"222:\n"
> 
> #define RSEQ_ASM_OP_STORE(value, var)						\
> 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
> @@ -181,8 +183,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect,
> intptr_t newv, int cpu)
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -191,9 +193,9 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect,
> intptr_t newv, int cpu)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
> #endif
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -230,8 +232,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t
> expectnot,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
> @@ -243,9 +245,9 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t
> expectnot,
> 		RSEQ_ASM_OP_R_LOAD(v)
> 		RSEQ_ASM_OP_R_STORE(load)
> 		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
> -		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
> +		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -281,8 +283,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> #ifdef RSEQ_COMPARE_TWICE
> @@ -290,9 +292,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
> #endif
> 		RSEQ_ASM_OP_R_LOAD(v)
> 		RSEQ_ASM_OP_R_ADD(count)
> -		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
> +		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
> 		RSEQ_INJECT_ASM(4)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -324,8 +326,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t
> expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -336,9 +338,9 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t
> expect,
> #endif
> 		RSEQ_ASM_OP_STORE(newv2, v2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -378,8 +380,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v,
> intptr_t expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -390,9 +392,9 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v,
> intptr_t expect,
> #endif
> 		RSEQ_ASM_OP_STORE(newv2, v2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -432,8 +434,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -445,9 +447,9 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
> 		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
> #endif
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -489,8 +491,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t
> expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -501,9 +503,9 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t
> expect,
> #endif
> 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -544,8 +546,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v,
> intptr_t expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -556,9 +558,9 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v,
> intptr_t expect,
> #endif
> 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
>  		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] rseq/selftests: Add support for arm64
@ 2018-07-02 17:47                 ` Mathieu Desnoyers
  0 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2018-07-02 17:47 UTC (permalink / raw)
  To: linux-arm-kernel

----- On Jul 2, 2018, at 12:49 PM, Will Deacon will.deacon at arm.com wrote:

> On Thu, Jun 28, 2018 at 04:50:40PM -0400, Mathieu Desnoyers wrote:
>> ----- On Jun 28, 2018, at 12:47 PM, Will Deacon will.deacon at arm.com wrote:
>> > On Tue, Jun 26, 2018 at 12:11:52PM -0400, Mathieu Desnoyers wrote:
>> >> ----- On Jun 26, 2018, at 11:14 AM, Will Deacon will.deacon at arm.com wrote:
>> >> > On Mon, Jun 25, 2018 at 02:10:10PM -0400, Mathieu Desnoyers wrote:
>> >> >> I notice you are using the instructions
>> >> >> 
>> >> >>   adrp
>> >> >>   add
>> >> >>   str
>> >> >> 
>> >> >> to implement RSEQ_ASM_STORE_RSEQ_CS(). Did you compare
>> >> >> performance-wise with an approach using a literal pool
>> >> >> near the instruction pointer like I did on arm32 ?
>> >> > 
>> >> > I didn't, no. Do you have a benchmark to hand so I can give this a go?
>> >> 
>> >> see tools/testing/selftests/rseq/param_test_benchmark --help
>> >> 
>> >> It's a stripped-down version of param_test, without all the code for
>> >> delay loops and testing checks.
>> >> 
>> >> Example use for counter increment with 4 threads, doing 5G counter
>> >> increments per thread:
>> >> 
>> >> time ./param_test_benchmark -T i -t 4 -r 5000000000
>> > 
>> > Thanks. I ran that on a few arm64 systems I have access to, with three
>> > configurations of the selftest:
>> > 
>> > 1. As I posted
>> > 2. With the abort signature and branch in-lined, so as to avoid the CBNZ
>> >   address limitations in large codebases
>> > 3. With both the abort handler and the table inlined (i.e. the same thing
>> >   as 32-bit).
>> > 
>> > There isn't a reliably measurable difference between (1) and (2), but I take
>> > between 12% and 27% hit between (2) and (3).
>> 
>> Those results puzzle me. Do you have the actual code snippets of each
>> implementation nearby ?
> 
> Sure, I've included the diffs for (2) and (3) below. They both apply on top
> of my branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git rseq
> 
> Will

I figured out that ADRP+ADD are optimized on Cortex A57 to have a 1 cycle
latency. This would explain why they are doing comparatively well compared
to ADR.

And I guess having more compact code wins here.

So I'm OK with your patchset with the modification for (2), which ensures
the abort label is not too far away on large codebases.

Thanks!

Mathieu

> 
> --->8
> 
> diff --git a/tools/testing/selftests/rseq/rseq-arm64.h
> b/tools/testing/selftests/rseq/rseq-arm64.h
> index 599788f74137..954f34671ca6 100644
> --- a/tools/testing/selftests/rseq/rseq-arm64.h
> +++ b/tools/testing/selftests/rseq/rseq-arm64.h
> @@ -104,11 +104,11 @@ do {										\
> 	__rseq_str(label) ":\n"
> 
> #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> -	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> -	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> +	"	b	222f\n"							\
> +	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
> 	__rseq_str(label) ":\n"							\
> 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> -	"	.popsection\n"
> +	"222:\n"
> 
> #define RSEQ_ASM_OP_STORE(value, var)						\
> 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
> 
> --->8
> 
> diff --git a/tools/testing/selftests/rseq/rseq-arm64.h
> b/tools/testing/selftests/rseq/rseq-arm64.h
> index 599788f74137..2554aa17acf3 100644
> --- a/tools/testing/selftests/rseq/rseq-arm64.h
> +++ b/tools/testing/selftests/rseq/rseq-arm64.h
> @@ -80,35 +80,37 @@ do {										\
> #define RSEQ_ASM_TMP_REG	"x15"
> #define RSEQ_ASM_TMP_REG_2	"x14"
> 
> -#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,		\
> +#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip,			\
> 				post_commit_offset, abort_ip)			\
> -	"	.pushsection	__rseq_table, \"aw\"\n"				\
> -	"	.balign	32\n"							\
> -	__rseq_str(label) ":\n"							\
> 	"	.long	" __rseq_str(version) ", " __rseq_str(flags) "\n"	\
> 	"	.quad	" __rseq_str(start_ip) ", "				\
> 			  __rseq_str(post_commit_offset) ", "			\
> -			  __rseq_str(abort_ip) "\n"				\
> -	"	.popsection\n"
> +			  __rseq_str(abort_ip) "\n"
> 
> -#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)	\
> -	__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,			\
> -				(post_commit_ip - start_ip), abort_ip)
> +#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip)		\
> +	"	.pushsection	__rseq_table, \"aw\"\n"				\
> +	"	.balign	32\n"							\
> +	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
> +				(post_commit_ip - start_ip), abort_ip)		\
> +	"	.popsection\n"
> 
> -#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)			\
> +#define RSEQ_ASM_STORE_RSEQ_CS(label, table_label, rseq_cs)			\
> 	RSEQ_INJECT_ASM(1)							\
> -	"	adrp	" RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"	\
> -	"	add	" RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG		\
> -			", :lo12:" __rseq_str(cs_label) "\n"			\
> +	"	adr	" RSEQ_ASM_TMP_REG ", " __rseq_str(table_label) "\n"	\
> 	"	str	" RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n"	\
> 	__rseq_str(label) ":\n"
> 
> -#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)				\
> -	"	.pushsection	__rseq_failure, \"ax\"\n"			\
> -	"	.long 	"	__rseq_str(RSEQ_SIG) "\n"			\
> +#define RSEQ_ASM_DEFINE_ABORT(table_label, start_ip, post_commit_ip, label,	\
> +			      abort_label)					\
> +	"	b	222f\n"							\
> +	"	.balign 32\n"							\
> +	__rseq_str(table_label) ":\n"						\
> +	__RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,				\
> +				(post_commit_ip - start_ip), label ## f)	\
> +	"	.inst 	"	__rseq_str(RSEQ_SIG) "\n"			\
> 	__rseq_str(label) ":\n"							\
> 	"	b	%l[" __rseq_str(abort_label) "]\n"			\
> -	"	.popsection\n"
> +	"222:\n"
> 
> #define RSEQ_ASM_OP_STORE(value, var)						\
> 	"	str	%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
> @@ -181,8 +183,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect,
> intptr_t newv, int cpu)
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -191,9 +193,9 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect,
> intptr_t newv, int cpu)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
> #endif
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -230,8 +232,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t
> expectnot,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
> @@ -243,9 +245,9 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t
> expectnot,
> 		RSEQ_ASM_OP_R_LOAD(v)
> 		RSEQ_ASM_OP_R_STORE(load)
> 		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
> -		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
> +		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -281,8 +283,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> #ifdef RSEQ_COMPARE_TWICE
> @@ -290,9 +292,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
> #endif
> 		RSEQ_ASM_OP_R_LOAD(v)
> 		RSEQ_ASM_OP_R_ADD(count)
> -		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
> +		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
> 		RSEQ_INJECT_ASM(4)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -324,8 +326,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t
> expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -336,9 +338,9 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t
> expect,
> #endif
> 		RSEQ_ASM_OP_STORE(newv2, v2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -378,8 +380,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v,
> intptr_t expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -390,9 +392,9 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v,
> intptr_t expect,
> #endif
> 		RSEQ_ASM_OP_STORE(newv2, v2)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -432,8 +434,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -445,9 +447,9 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
> 		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
> #endif
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -489,8 +491,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t
> expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -501,9 +503,9 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t
> expect,
> #endif
> 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
> 		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),
> @@ -544,8 +546,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v,
> intptr_t expect,
> 	RSEQ_INJECT_C(9)
> 
> 	__asm__ __volatile__ goto (
> -		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
> -		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
> +		RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f)
> +		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
> 		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
> 		RSEQ_INJECT_ASM(3)
> 		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
> @@ -556,9 +558,9 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v,
> intptr_t expect,
> #endif
> 		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
> 		RSEQ_INJECT_ASM(5)
> -		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
> +		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 2)
> 		RSEQ_INJECT_ASM(6)
> -		RSEQ_ASM_DEFINE_ABORT(4, abort)
> +		RSEQ_ASM_DEFINE_ABORT(3, 1b, 2b, 4, abort)
> 		: /* gcc asm goto does not allow outputs */
> 		: [cpu_id]		"r" (cpu),
>  		  [current_cpu_id]	"Qo" (__rseq_abi.cpu_id),

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2018-07-02 17:47 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-06-25 17:54 [PATCH 0/3] Support rseq on arm64 Will Deacon
2018-06-25 17:54 ` Will Deacon
2018-06-25 17:54 ` [PATCH 1/3] arm64: rseq: Implement backend rseq calls and select HAVE_RSEQ Will Deacon
2018-06-25 17:54   ` Will Deacon
2018-06-26 10:31   ` Mark Rutland
2018-06-26 10:31     ` Mark Rutland
2018-06-25 17:54 ` [PATCH 2/3] asm-generic: unistd.h: Wire up sys_rseq Will Deacon
2018-06-25 17:54   ` Will Deacon
2018-06-25 17:54 ` [PATCH 3/3] rseq/selftests: Add support for arm64 Will Deacon
2018-06-25 17:54   ` Will Deacon
2018-06-25 18:10   ` Mathieu Desnoyers
2018-06-25 18:10     ` Mathieu Desnoyers
2018-06-26 15:14     ` Will Deacon
2018-06-26 15:14       ` Will Deacon
2018-06-26 16:11       ` Mathieu Desnoyers
2018-06-26 16:11         ` Mathieu Desnoyers
2018-06-28 16:47         ` Will Deacon
2018-06-28 16:47           ` Will Deacon
2018-06-28 20:50           ` Mathieu Desnoyers
2018-06-28 20:50             ` Mathieu Desnoyers
2018-07-02 16:49             ` Will Deacon
2018-07-02 16:49               ` Will Deacon
2018-07-02 17:47               ` Mathieu Desnoyers
2018-07-02 17:47                 ` Mathieu Desnoyers

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.