All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
To: Peter Zijlstra <peterz@infradead.org>,
	"Paul E . McKenney" <paulmck@linux.vnet.ibm.com>,
	Boqun Feng <boqun.feng@gmail.com>,
	Andy Lutomirski <luto@amacapital.net>,
	Dave Watson <davejwatson@fb.com>
Cc: linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
	Paul Turner <pjt@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Russell King <linux@arm.linux.org.uk>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, "H . Peter Anvin" <hpa@zytor.com>,
	Andrew Hunter <ahh@google.com>, Andi Kleen <andi@firstfloor.org>,
	Chris Lameter <cl@linux.com>, Ben Maurer <bmaurer@fb.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Josh Triplett <josh@joshtriplett.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will.deacon@arm.com>,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Joel Fernandes <joelaf@google.com>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	Shuah Khan <shuahkh@osg.samsung.com>,
	linux-kselftest@vger.kernel.org
Subject: [RFC PATCH for 4.18 15/16] rseq: selftests: Provide parametrized tests (v2)
Date: Sat,  2 Jun 2018 08:44:07 -0400	[thread overview]
Message-ID: <20180602124408.8430-16-mathieu.desnoyers@efficios.com> (raw)
In-Reply-To: <20180602124408.8430-1-mathieu.desnoyers@efficios.com>

"param_test" is a parametrizable restartable sequences test. See
the "--help" output for usage.

"param_test_benchmark" is the same as "param_test", but it removes
testing book-keeping code to allow accurate benchmarks.

"param_test_compare_twice" is the same as "param_test", but it performs
each comparison within rseq critical section twice, thus validating
invariants. If any of the second comparisons fails, an error message
is printed and the test aborts.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Shuah Khan <shuahkh@osg.samsung.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Paul Turner <pjt@google.com>
CC: Andrew Hunter <ahh@google.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andi Kleen <andi@firstfloor.org>
CC: Dave Watson <davejwatson@fb.com>
CC: Chris Lameter <cl@linux.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Ben Maurer <bmaurer@fb.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: linux-kselftest@vger.kernel.org
CC: linux-api@vger.kernel.org
---
Changes since v1:
- Use only rseq, remove use of cpu_opv.
---
 tools/testing/selftests/rseq/param_test.c | 1260 +++++++++++++++++++++++++++++
 1 file changed, 1260 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/param_test.c

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
new file mode 100644
index 000000000000..6a9f602a8718
--- /dev/null
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -0,0 +1,1260 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
+#include <stddef.h>
+
+static inline pid_t gettid(void)
+{
+	return syscall(__NR_gettid);
+}
+
+#define NR_INJECT	9
+static int loop_cnt[NR_INJECT + 1];
+
+static int loop_cnt_1 asm("asm_loop_cnt_1") __attribute__((used));
+static int loop_cnt_2 asm("asm_loop_cnt_2") __attribute__((used));
+static int loop_cnt_3 asm("asm_loop_cnt_3") __attribute__((used));
+static int loop_cnt_4 asm("asm_loop_cnt_4") __attribute__((used));
+static int loop_cnt_5 asm("asm_loop_cnt_5") __attribute__((used));
+static int loop_cnt_6 asm("asm_loop_cnt_6") __attribute__((used));
+
+static int opt_modulo, verbose;
+
+static int opt_yield, opt_signal, opt_sleep,
+		opt_disable_rseq, opt_threads = 200,
+		opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+
+#ifndef RSEQ_SKIP_FASTPATH
+static long long opt_reps = 5000;
+#else
+static long long opt_reps = 100;
+#endif
+
+static __thread __attribute__((tls_model("initial-exec")))
+unsigned int signals_delivered;
+
+#ifndef BENCHMARK
+
+static __thread __attribute__((tls_model("initial-exec"), unused))
+unsigned int yield_mod_cnt, nr_abort;
+
+#define printf_verbose(fmt, ...)			\
+	do {						\
+		if (verbose)				\
+			printf(fmt, ## __VA_ARGS__);	\
+	} while (0)
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define INJECT_ASM_REG	"eax"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#ifdef __i386__
+
+#define RSEQ_INJECT_ASM(n) \
+	"mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#elif defined(__x86_64__)
+
+#define RSEQ_INJECT_ASM(n) \
+	"lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG "\n\t" \
+	"mov (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#else
+#error "Unsupported architecture"
+#endif
+
+#elif defined(__ARMEL__)
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r4"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmp " INJECT_ASM_REG ", #0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subs " INJECT_ASM_REG ", #1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+
+#elif __PPC__
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r18"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmpwi %%" INJECT_ASM_REG ", 0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+#else
+#error unsupported target
+#endif
+
+#define RSEQ_INJECT_FAILED \
+	nr_abort++;
+
+#define RSEQ_INJECT_C(n) \
+{ \
+	int loc_i, loc_nr_loops = loop_cnt[n]; \
+	\
+	for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \
+		rseq_barrier(); \
+	} \
+	if (loc_nr_loops == -1 && opt_modulo) { \
+		if (yield_mod_cnt == opt_modulo - 1) { \
+			if (opt_sleep > 0) \
+				poll(NULL, 0, opt_sleep); \
+			if (opt_yield) \
+				sched_yield(); \
+			if (opt_signal) \
+				raise(SIGUSR1); \
+			yield_mod_cnt = 0; \
+		} else { \
+			yield_mod_cnt++; \
+		} \
+	} \
+}
+
+#else
+
+#define printf_verbose(fmt, ...)
+
+#endif /* BENCHMARK */
+
+#include "rseq.h"
+
+struct percpu_lock_entry {
+	intptr_t v;
+} __attribute__((aligned(128)));
+
+struct percpu_lock {
+	struct percpu_lock_entry c[CPU_SETSIZE];
+};
+
+struct test_data_entry {
+	intptr_t count;
+} __attribute__((aligned(128)));
+
+struct spinlock_test_data {
+	struct percpu_lock lock;
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct spinlock_thread_test_data {
+	struct spinlock_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct inc_test_data {
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct inc_thread_test_data {
+	struct inc_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct percpu_list_node {
+	intptr_t data;
+	struct percpu_list_node *next;
+};
+
+struct percpu_list_entry {
+	struct percpu_list_node *head;
+} __attribute__((aligned(128)));
+
+struct percpu_list {
+	struct percpu_list_entry c[CPU_SETSIZE];
+};
+
+#define BUFFER_ITEM_PER_CPU	100
+
+struct percpu_buffer_node {
+	intptr_t data;
+};
+
+struct percpu_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_buffer_node **array;
+} __attribute__((aligned(128)));
+
+struct percpu_buffer {
+	struct percpu_buffer_entry c[CPU_SETSIZE];
+};
+
+#define MEMCPY_BUFFER_ITEM_PER_CPU	100
+
+struct percpu_memcpy_buffer_node {
+	intptr_t data1;
+	uint64_t data2;
+};
+
+struct percpu_memcpy_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_memcpy_buffer_node *array;
+} __attribute__((aligned(128)));
+
+struct percpu_memcpy_buffer {
+	struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
+};
+
+/* A simple percpu spinlock. Grabs lock on current cpu. */
+static int rseq_this_cpu_lock(struct percpu_lock *lock)
+{
+	int cpu;
+
+	for (;;) {
+		int ret;
+
+		cpu = rseq_cpu_start();
+		ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
+					 0, 1, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	/*
+	 * Acquire semantic when taking lock after control dependency.
+	 * Matches rseq_smp_store_release().
+	 */
+	rseq_smp_acquire__after_ctrl_dep();
+	return cpu;
+}
+
+static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+	assert(lock->c[cpu].v == 1);
+	/*
+	 * Release lock, with release semantic. Matches
+	 * rseq_smp_acquire__after_ctrl_dep().
+	 */
+	rseq_smp_store_release(&lock->c[cpu].v, 0);
+}
+
+void *test_percpu_spinlock_thread(void *arg)
+{
+	struct spinlock_thread_test_data *thread_data = arg;
+	struct spinlock_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int cpu = rseq_cpu_start();
+
+		cpu = rseq_this_cpu_lock(&data->lock);
+		data->c[cpu].count++;
+		rseq_percpu_unlock(&data->lock, cpu);
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+/*
+ * A simple test which implements a sharded counter using a per-cpu
+ * lock.  Obviously real applications might prefer to simply use a
+ * per-cpu increment; however, this is reasonable for a test and the
+ * lock can be extended to synchronize more complicated operations.
+ */
+void test_percpu_spinlock(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct spinlock_test_data data;
+	struct spinlock_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_spinlock_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void *test_percpu_inc_thread(void *arg)
+{
+	struct inc_thread_test_data *thread_data = arg;
+	struct inc_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int ret;
+
+		do {
+			int cpu;
+
+			cpu = rseq_cpu_start();
+			ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+		} while (rseq_unlikely(ret));
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+void test_percpu_inc(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct inc_test_data data;
+	struct inc_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_inc_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void this_cpu_list_push(struct percpu_list *list,
+			struct percpu_list_node *node,
+			int *_cpu)
+{
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval, expect;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load list->c[cpu].head with single-copy atomicity. */
+		expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+		newval = (intptr_t)node;
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		node->next = (struct percpu_list_node *)expect;
+		ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+}
+
+/*
+ * Unlike a traditional lock-less linked list; the availability of a
+ * rseq primitive allows us to implement pop without concerns over
+ * ABA-type races.
+ */
+struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+					   int *_cpu)
+{
+	struct percpu_list_node *node = NULL;
+	int cpu;
+
+	for (;;) {
+		struct percpu_list_node *head;
+		intptr_t *targetptr, expectnot, *load;
+		off_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		expectnot = (intptr_t)NULL;
+		offset = offsetof(struct percpu_list_node, next);
+		load = (intptr_t *)&head;
+		ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+						   offset, load, cpu);
+		if (rseq_likely(!ret)) {
+			node = head;
+			break;
+		}
+		if (ret > 0)
+			break;
+		/* Retry if rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return node;
+}
+
+/*
+ * __percpu_list_pop is not safe against concurrent accesses. Should
+ * only be used on lists that are not concurrently modified.
+ */
+struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+{
+	struct percpu_list_node *node;
+
+	node = list->c[cpu].head;
+	if (!node)
+		return NULL;
+	list->c[cpu].head = node->next;
+	return node;
+}
+
+void *test_percpu_list_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_list *list = (struct percpu_list *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_list_node *node;
+
+		node = this_cpu_list_pop(list, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node)
+			this_cpu_list_push(list, node, NULL);
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu linked list from many threads.  */
+void test_percpu_list(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_list list;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&list, 0, sizeof(list));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		for (j = 1; j <= 100; j++) {
+			struct percpu_list_node *node;
+
+			expected_sum += j;
+
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			node->next = list.c[i].head;
+			list.c[i].head = node;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_list_thread, &list);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_list_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_list_pop(&list, i))) {
+			sum += node->data;
+			free(node);
+		}
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_buffer_push(struct percpu_buffer *buffer,
+			  struct percpu_buffer_node *node,
+			  int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_spec, newval_spec;
+		intptr_t *targetptr_final, newval_final;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		newval_spec = (intptr_t)node;
+		targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trystorev_storev_release(
+				targetptr_final, offset, targetptr_spec,
+				newval_spec, newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
+				offset, targetptr_spec, newval_spec,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
+					       int *_cpu)
+{
+	struct percpu_buffer_node *head;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0) {
+			head = NULL;
+			break;
+		}
+		head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
+		newval = offset - 1;
+		targetptr = (intptr_t *)&buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
+			(intptr_t *)&buffer->c[cpu].array[offset - 1],
+			(intptr_t)head, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return head;
+}
+
+/*
+ * __percpu_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
+					       int cpu)
+{
+	struct percpu_buffer_node *head;
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return NULL;
+	head = buffer->c[cpu].array[offset - 1];
+	buffer->c[cpu].offset = offset - 1;
+	return head;
+}
+
+void *test_percpu_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_buffer *buffer = (struct percpu_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_buffer_node *node;
+
+		node = this_cpu_buffer_pop(buffer, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node) {
+			if (!this_cpu_buffer_push(buffer, node, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
+			struct percpu_buffer_node *node;
+
+			expected_sum += j;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			buffer.c[i].array[j - 1] = node;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_buffer_thread, &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_buffer_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_buffer_pop(&buffer, i))) {
+			sum += node->data;
+			free(node);
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
+				 struct percpu_memcpy_buffer_node item,
+				 int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		destptr = (char *)&buffer->c[cpu].array[offset];
+		srcptr = (char *)&item;
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(item);
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trymemcpy_storev_release(
+				targetptr_final, offset,
+				destptr, srcptr, copylen,
+				newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+				offset, destptr, srcptr, copylen,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0)
+			break;
+		destptr = (char *)item;
+		srcptr = (char *)&buffer->c[cpu].array[offset - 1];
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(*item);
+		newval_final = offset - 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+			offset, destptr, srcptr, copylen,
+			newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+/*
+ * __percpu_memcpy_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int cpu)
+{
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return false;
+	memcpy(item, &buffer->c[cpu].array[offset - 1], sizeof(*item));
+	buffer->c[cpu].offset = offset - 1;
+	return true;
+}
+
+void *test_percpu_memcpy_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_memcpy_buffer_node item;
+		bool result;
+
+		result = this_cpu_memcpy_buffer_pop(buffer, &item, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (result) {
+			if (!this_cpu_memcpy_buffer_push(buffer, item, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_memcpy_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_memcpy_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       MEMCPY_BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
+			expected_sum += 2 * j + 1;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			buffer.c[i].array[j - 1].data1 = j;
+			buffer.c[i].array[j - 1].data2 = j + 1;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_memcpy_buffer_thread,
+				     &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_memcpy_buffer_node item;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
+			sum += item.data1;
+			sum += item.data2;
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+static void test_signal_interrupt_handler(int signo)
+{
+	signals_delivered++;
+}
+
+static int set_signal_handler(void)
+{
+	int ret = 0;
+	struct sigaction sa;
+	sigset_t sigset;
+
+	ret = sigemptyset(&sigset);
+	if (ret < 0) {
+		perror("sigemptyset");
+		return ret;
+	}
+
+	sa.sa_handler = test_signal_interrupt_handler;
+	sa.sa_mask = sigset;
+	sa.sa_flags = 0;
+	ret = sigaction(SIGUSR1, &sa, NULL);
+	if (ret < 0) {
+		perror("sigaction");
+		return ret;
+	}
+
+	printf_verbose("Signal handler set for SIGUSR1\n");
+
+	return ret;
+}
+
+static void show_usage(int argc, char **argv)
+{
+	printf("Usage : %s <OPTIONS>\n",
+		argv[0]);
+	printf("OPTIONS:\n");
+	printf("	[-1 loops] Number of loops for delay injection 1\n");
+	printf("	[-2 loops] Number of loops for delay injection 2\n");
+	printf("	[-3 loops] Number of loops for delay injection 3\n");
+	printf("	[-4 loops] Number of loops for delay injection 4\n");
+	printf("	[-5 loops] Number of loops for delay injection 5\n");
+	printf("	[-6 loops] Number of loops for delay injection 6\n");
+	printf("	[-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n");
+	printf("	[-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n");
+	printf("	[-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n");
+	printf("	[-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n");
+	printf("	[-y] Yield\n");
+	printf("	[-k] Kill thread with signal\n");
+	printf("	[-s S] S: =0: disabled (default), >0: sleep time (ms)\n");
+	printf("	[-t N] Number of threads (default 200)\n");
+	printf("	[-r N] Number of repetitions per thread (default 5000)\n");
+	printf("	[-d] Disable rseq system call (no initialization)\n");
+	printf("	[-D M] Disable rseq for each M threads\n");
+	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+	printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
+	printf("	[-v] Verbose output.\n");
+	printf("	[-h] Show this help.\n");
+	printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+
+	for (i = 1; i < argc; i++) {
+		if (argv[i][0] != '-')
+			continue;
+		switch (argv[i][1]) {
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]);
+			i++;
+			break;
+		case 'm':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_modulo = atol(argv[i + 1]);
+			if (opt_modulo < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 's':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_sleep = atol(argv[i + 1]);
+			if (opt_sleep < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'y':
+			opt_yield = 1;
+			break;
+		case 'k':
+			opt_signal = 1;
+			break;
+		case 'd':
+			opt_disable_rseq = 1;
+			break;
+		case 'D':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_disable_mod = atol(argv[i + 1]);
+			if (opt_disable_mod < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 't':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_threads = atol(argv[i + 1]);
+			if (opt_threads < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'r':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_reps = atoll(argv[i + 1]);
+			if (opt_reps < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'h':
+			show_usage(argc, argv);
+			goto end;
+		case 'T':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_test = *argv[i + 1];
+			switch (opt_test) {
+			case 's':
+			case 'l':
+			case 'i':
+			case 'b':
+			case 'm':
+				break;
+			default:
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'v':
+			verbose = 1;
+			break;
+		case 'M':
+			opt_mb = 1;
+			break;
+		default:
+			show_usage(argc, argv);
+			goto error;
+		}
+	}
+
+	loop_cnt_1 = loop_cnt[1];
+	loop_cnt_2 = loop_cnt[2];
+	loop_cnt_3 = loop_cnt[3];
+	loop_cnt_4 = loop_cnt[4];
+	loop_cnt_5 = loop_cnt[5];
+	loop_cnt_6 = loop_cnt[6];
+
+	if (set_signal_handler())
+		goto error;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		goto error;
+	switch (opt_test) {
+	case 's':
+		printf_verbose("spinlock\n");
+		test_percpu_spinlock();
+		break;
+	case 'l':
+		printf_verbose("linked list\n");
+		test_percpu_list();
+		break;
+	case 'b':
+		printf_verbose("buffer\n");
+		test_percpu_buffer();
+		break;
+	case 'm':
+		printf_verbose("memcpy buffer\n");
+		test_percpu_memcpy_buffer();
+		break;
+	case 'i':
+		printf_verbose("counter increment\n");
+		test_percpu_inc();
+		break;
+	}
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+end:
+	return 0;
+
+error:
+	return -1;
+}
-- 
2.11.0

WARNING: multiple messages have this Message-ID (diff)
From: mathieu.desnoyers at efficios.com (Mathieu Desnoyers)
Subject: [RFC PATCH for 4.18 15/16] rseq: selftests: Provide parametrized tests (v2)
Date: Sat,  2 Jun 2018 08:44:07 -0400	[thread overview]
Message-ID: <20180602124408.8430-16-mathieu.desnoyers@efficios.com> (raw)
In-Reply-To: <20180602124408.8430-1-mathieu.desnoyers@efficios.com>

"param_test" is a parametrizable restartable sequences test. See
the "--help" output for usage.

"param_test_benchmark" is the same as "param_test", but it removes
testing book-keeping code to allow accurate benchmarks.

"param_test_compare_twice" is the same as "param_test", but it performs
each comparison within rseq critical section twice, thus validating
invariants. If any of the second comparisons fails, an error message
is printed and the test aborts.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at efficios.com>
CC: Shuah Khan <shuahkh at osg.samsung.com>
CC: Russell King <linux at arm.linux.org.uk>
CC: Catalin Marinas <catalin.marinas at arm.com>
CC: Will Deacon <will.deacon at arm.com>
CC: Thomas Gleixner <tglx at linutronix.de>
CC: Paul Turner <pjt at google.com>
CC: Andrew Hunter <ahh at google.com>
CC: Peter Zijlstra <peterz at infradead.org>
CC: Andy Lutomirski <luto at amacapital.net>
CC: Andi Kleen <andi at firstfloor.org>
CC: Dave Watson <davejwatson at fb.com>
CC: Chris Lameter <cl at linux.com>
CC: Ingo Molnar <mingo at redhat.com>
CC: "H. Peter Anvin" <hpa at zytor.com>
CC: Ben Maurer <bmaurer at fb.com>
CC: Steven Rostedt <rostedt at goodmis.org>
CC: "Paul E. McKenney" <paulmck at linux.vnet.ibm.com>
CC: Josh Triplett <josh at joshtriplett.org>
CC: Linus Torvalds <torvalds at linux-foundation.org>
CC: Andrew Morton <akpm at linux-foundation.org>
CC: Boqun Feng <boqun.feng at gmail.com>
CC: linux-kselftest at vger.kernel.org
CC: linux-api at vger.kernel.org
---
Changes since v1:
- Use only rseq, remove use of cpu_opv.
---
 tools/testing/selftests/rseq/param_test.c | 1260 +++++++++++++++++++++++++++++
 1 file changed, 1260 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/param_test.c

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
new file mode 100644
index 000000000000..6a9f602a8718
--- /dev/null
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -0,0 +1,1260 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
+#include <stddef.h>
+
+static inline pid_t gettid(void)
+{
+	return syscall(__NR_gettid);
+}
+
+#define NR_INJECT	9
+static int loop_cnt[NR_INJECT + 1];
+
+static int loop_cnt_1 asm("asm_loop_cnt_1") __attribute__((used));
+static int loop_cnt_2 asm("asm_loop_cnt_2") __attribute__((used));
+static int loop_cnt_3 asm("asm_loop_cnt_3") __attribute__((used));
+static int loop_cnt_4 asm("asm_loop_cnt_4") __attribute__((used));
+static int loop_cnt_5 asm("asm_loop_cnt_5") __attribute__((used));
+static int loop_cnt_6 asm("asm_loop_cnt_6") __attribute__((used));
+
+static int opt_modulo, verbose;
+
+static int opt_yield, opt_signal, opt_sleep,
+		opt_disable_rseq, opt_threads = 200,
+		opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+
+#ifndef RSEQ_SKIP_FASTPATH
+static long long opt_reps = 5000;
+#else
+static long long opt_reps = 100;
+#endif
+
+static __thread __attribute__((tls_model("initial-exec")))
+unsigned int signals_delivered;
+
+#ifndef BENCHMARK
+
+static __thread __attribute__((tls_model("initial-exec"), unused))
+unsigned int yield_mod_cnt, nr_abort;
+
+#define printf_verbose(fmt, ...)			\
+	do {						\
+		if (verbose)				\
+			printf(fmt, ## __VA_ARGS__);	\
+	} while (0)
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define INJECT_ASM_REG	"eax"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#ifdef __i386__
+
+#define RSEQ_INJECT_ASM(n) \
+	"mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#elif defined(__x86_64__)
+
+#define RSEQ_INJECT_ASM(n) \
+	"lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG "\n\t" \
+	"mov (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#else
+#error "Unsupported architecture"
+#endif
+
+#elif defined(__ARMEL__)
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r4"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmp " INJECT_ASM_REG ", #0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subs " INJECT_ASM_REG ", #1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+
+#elif __PPC__
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r18"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmpwi %%" INJECT_ASM_REG ", 0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+#else
+#error unsupported target
+#endif
+
+#define RSEQ_INJECT_FAILED \
+	nr_abort++;
+
+#define RSEQ_INJECT_C(n) \
+{ \
+	int loc_i, loc_nr_loops = loop_cnt[n]; \
+	\
+	for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \
+		rseq_barrier(); \
+	} \
+	if (loc_nr_loops == -1 && opt_modulo) { \
+		if (yield_mod_cnt == opt_modulo - 1) { \
+			if (opt_sleep > 0) \
+				poll(NULL, 0, opt_sleep); \
+			if (opt_yield) \
+				sched_yield(); \
+			if (opt_signal) \
+				raise(SIGUSR1); \
+			yield_mod_cnt = 0; \
+		} else { \
+			yield_mod_cnt++; \
+		} \
+	} \
+}
+
+#else
+
+#define printf_verbose(fmt, ...)
+
+#endif /* BENCHMARK */
+
+#include "rseq.h"
+
+struct percpu_lock_entry {
+	intptr_t v;
+} __attribute__((aligned(128)));
+
+struct percpu_lock {
+	struct percpu_lock_entry c[CPU_SETSIZE];
+};
+
+struct test_data_entry {
+	intptr_t count;
+} __attribute__((aligned(128)));
+
+struct spinlock_test_data {
+	struct percpu_lock lock;
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct spinlock_thread_test_data {
+	struct spinlock_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct inc_test_data {
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct inc_thread_test_data {
+	struct inc_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct percpu_list_node {
+	intptr_t data;
+	struct percpu_list_node *next;
+};
+
+struct percpu_list_entry {
+	struct percpu_list_node *head;
+} __attribute__((aligned(128)));
+
+struct percpu_list {
+	struct percpu_list_entry c[CPU_SETSIZE];
+};
+
+#define BUFFER_ITEM_PER_CPU	100
+
+struct percpu_buffer_node {
+	intptr_t data;
+};
+
+struct percpu_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_buffer_node **array;
+} __attribute__((aligned(128)));
+
+struct percpu_buffer {
+	struct percpu_buffer_entry c[CPU_SETSIZE];
+};
+
+#define MEMCPY_BUFFER_ITEM_PER_CPU	100
+
+struct percpu_memcpy_buffer_node {
+	intptr_t data1;
+	uint64_t data2;
+};
+
+struct percpu_memcpy_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_memcpy_buffer_node *array;
+} __attribute__((aligned(128)));
+
+struct percpu_memcpy_buffer {
+	struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
+};
+
+/* A simple percpu spinlock. Grabs lock on current cpu. */
+static int rseq_this_cpu_lock(struct percpu_lock *lock)
+{
+	int cpu;
+
+	for (;;) {
+		int ret;
+
+		cpu = rseq_cpu_start();
+		ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
+					 0, 1, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	/*
+	 * Acquire semantic when taking lock after control dependency.
+	 * Matches rseq_smp_store_release().
+	 */
+	rseq_smp_acquire__after_ctrl_dep();
+	return cpu;
+}
+
+static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+	assert(lock->c[cpu].v == 1);
+	/*
+	 * Release lock, with release semantic. Matches
+	 * rseq_smp_acquire__after_ctrl_dep().
+	 */
+	rseq_smp_store_release(&lock->c[cpu].v, 0);
+}
+
+void *test_percpu_spinlock_thread(void *arg)
+{
+	struct spinlock_thread_test_data *thread_data = arg;
+	struct spinlock_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int cpu = rseq_cpu_start();
+
+		cpu = rseq_this_cpu_lock(&data->lock);
+		data->c[cpu].count++;
+		rseq_percpu_unlock(&data->lock, cpu);
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+/*
+ * A simple test which implements a sharded counter using a per-cpu
+ * lock.  Obviously real applications might prefer to simply use a
+ * per-cpu increment; however, this is reasonable for a test and the
+ * lock can be extended to synchronize more complicated operations.
+ */
+void test_percpu_spinlock(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct spinlock_test_data data;
+	struct spinlock_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_spinlock_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void *test_percpu_inc_thread(void *arg)
+{
+	struct inc_thread_test_data *thread_data = arg;
+	struct inc_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int ret;
+
+		do {
+			int cpu;
+
+			cpu = rseq_cpu_start();
+			ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+		} while (rseq_unlikely(ret));
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+void test_percpu_inc(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct inc_test_data data;
+	struct inc_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_inc_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void this_cpu_list_push(struct percpu_list *list,
+			struct percpu_list_node *node,
+			int *_cpu)
+{
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval, expect;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load list->c[cpu].head with single-copy atomicity. */
+		expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+		newval = (intptr_t)node;
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		node->next = (struct percpu_list_node *)expect;
+		ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+}
+
+/*
+ * Unlike a traditional lock-less linked list; the availability of a
+ * rseq primitive allows us to implement pop without concerns over
+ * ABA-type races.
+ */
+struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+					   int *_cpu)
+{
+	struct percpu_list_node *node = NULL;
+	int cpu;
+
+	for (;;) {
+		struct percpu_list_node *head;
+		intptr_t *targetptr, expectnot, *load;
+		off_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		expectnot = (intptr_t)NULL;
+		offset = offsetof(struct percpu_list_node, next);
+		load = (intptr_t *)&head;
+		ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+						   offset, load, cpu);
+		if (rseq_likely(!ret)) {
+			node = head;
+			break;
+		}
+		if (ret > 0)
+			break;
+		/* Retry if rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return node;
+}
+
+/*
+ * __percpu_list_pop is not safe against concurrent accesses. Should
+ * only be used on lists that are not concurrently modified.
+ */
+struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+{
+	struct percpu_list_node *node;
+
+	node = list->c[cpu].head;
+	if (!node)
+		return NULL;
+	list->c[cpu].head = node->next;
+	return node;
+}
+
+void *test_percpu_list_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_list *list = (struct percpu_list *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_list_node *node;
+
+		node = this_cpu_list_pop(list, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node)
+			this_cpu_list_push(list, node, NULL);
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu linked list from many threads.  */
+void test_percpu_list(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_list list;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&list, 0, sizeof(list));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		for (j = 1; j <= 100; j++) {
+			struct percpu_list_node *node;
+
+			expected_sum += j;
+
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			node->next = list.c[i].head;
+			list.c[i].head = node;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_list_thread, &list);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_list_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_list_pop(&list, i))) {
+			sum += node->data;
+			free(node);
+		}
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_buffer_push(struct percpu_buffer *buffer,
+			  struct percpu_buffer_node *node,
+			  int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_spec, newval_spec;
+		intptr_t *targetptr_final, newval_final;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		newval_spec = (intptr_t)node;
+		targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trystorev_storev_release(
+				targetptr_final, offset, targetptr_spec,
+				newval_spec, newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
+				offset, targetptr_spec, newval_spec,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
+					       int *_cpu)
+{
+	struct percpu_buffer_node *head;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0) {
+			head = NULL;
+			break;
+		}
+		head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
+		newval = offset - 1;
+		targetptr = (intptr_t *)&buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
+			(intptr_t *)&buffer->c[cpu].array[offset - 1],
+			(intptr_t)head, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return head;
+}
+
+/*
+ * __percpu_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
+					       int cpu)
+{
+	struct percpu_buffer_node *head;
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return NULL;
+	head = buffer->c[cpu].array[offset - 1];
+	buffer->c[cpu].offset = offset - 1;
+	return head;
+}
+
+void *test_percpu_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_buffer *buffer = (struct percpu_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_buffer_node *node;
+
+		node = this_cpu_buffer_pop(buffer, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node) {
+			if (!this_cpu_buffer_push(buffer, node, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
+			struct percpu_buffer_node *node;
+
+			expected_sum += j;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			buffer.c[i].array[j - 1] = node;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_buffer_thread, &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_buffer_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_buffer_pop(&buffer, i))) {
+			sum += node->data;
+			free(node);
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
+				 struct percpu_memcpy_buffer_node item,
+				 int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		destptr = (char *)&buffer->c[cpu].array[offset];
+		srcptr = (char *)&item;
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(item);
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trymemcpy_storev_release(
+				targetptr_final, offset,
+				destptr, srcptr, copylen,
+				newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+				offset, destptr, srcptr, copylen,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0)
+			break;
+		destptr = (char *)item;
+		srcptr = (char *)&buffer->c[cpu].array[offset - 1];
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(*item);
+		newval_final = offset - 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+			offset, destptr, srcptr, copylen,
+			newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+/*
+ * __percpu_memcpy_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int cpu)
+{
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return false;
+	memcpy(item, &buffer->c[cpu].array[offset - 1], sizeof(*item));
+	buffer->c[cpu].offset = offset - 1;
+	return true;
+}
+
+void *test_percpu_memcpy_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_memcpy_buffer_node item;
+		bool result;
+
+		result = this_cpu_memcpy_buffer_pop(buffer, &item, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (result) {
+			if (!this_cpu_memcpy_buffer_push(buffer, item, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_memcpy_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_memcpy_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       MEMCPY_BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
+			expected_sum += 2 * j + 1;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			buffer.c[i].array[j - 1].data1 = j;
+			buffer.c[i].array[j - 1].data2 = j + 1;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_memcpy_buffer_thread,
+				     &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_memcpy_buffer_node item;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
+			sum += item.data1;
+			sum += item.data2;
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+static void test_signal_interrupt_handler(int signo)
+{
+	signals_delivered++;
+}
+
+static int set_signal_handler(void)
+{
+	int ret = 0;
+	struct sigaction sa;
+	sigset_t sigset;
+
+	ret = sigemptyset(&sigset);
+	if (ret < 0) {
+		perror("sigemptyset");
+		return ret;
+	}
+
+	sa.sa_handler = test_signal_interrupt_handler;
+	sa.sa_mask = sigset;
+	sa.sa_flags = 0;
+	ret = sigaction(SIGUSR1, &sa, NULL);
+	if (ret < 0) {
+		perror("sigaction");
+		return ret;
+	}
+
+	printf_verbose("Signal handler set for SIGUSR1\n");
+
+	return ret;
+}
+
+static void show_usage(int argc, char **argv)
+{
+	printf("Usage : %s <OPTIONS>\n",
+		argv[0]);
+	printf("OPTIONS:\n");
+	printf("	[-1 loops] Number of loops for delay injection 1\n");
+	printf("	[-2 loops] Number of loops for delay injection 2\n");
+	printf("	[-3 loops] Number of loops for delay injection 3\n");
+	printf("	[-4 loops] Number of loops for delay injection 4\n");
+	printf("	[-5 loops] Number of loops for delay injection 5\n");
+	printf("	[-6 loops] Number of loops for delay injection 6\n");
+	printf("	[-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n");
+	printf("	[-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n");
+	printf("	[-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n");
+	printf("	[-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n");
+	printf("	[-y] Yield\n");
+	printf("	[-k] Kill thread with signal\n");
+	printf("	[-s S] S: =0: disabled (default), >0: sleep time (ms)\n");
+	printf("	[-t N] Number of threads (default 200)\n");
+	printf("	[-r N] Number of repetitions per thread (default 5000)\n");
+	printf("	[-d] Disable rseq system call (no initialization)\n");
+	printf("	[-D M] Disable rseq for each M threads\n");
+	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+	printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
+	printf("	[-v] Verbose output.\n");
+	printf("	[-h] Show this help.\n");
+	printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+
+	for (i = 1; i < argc; i++) {
+		if (argv[i][0] != '-')
+			continue;
+		switch (argv[i][1]) {
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]);
+			i++;
+			break;
+		case 'm':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_modulo = atol(argv[i + 1]);
+			if (opt_modulo < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 's':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_sleep = atol(argv[i + 1]);
+			if (opt_sleep < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'y':
+			opt_yield = 1;
+			break;
+		case 'k':
+			opt_signal = 1;
+			break;
+		case 'd':
+			opt_disable_rseq = 1;
+			break;
+		case 'D':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_disable_mod = atol(argv[i + 1]);
+			if (opt_disable_mod < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 't':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_threads = atol(argv[i + 1]);
+			if (opt_threads < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'r':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_reps = atoll(argv[i + 1]);
+			if (opt_reps < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'h':
+			show_usage(argc, argv);
+			goto end;
+		case 'T':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_test = *argv[i + 1];
+			switch (opt_test) {
+			case 's':
+			case 'l':
+			case 'i':
+			case 'b':
+			case 'm':
+				break;
+			default:
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'v':
+			verbose = 1;
+			break;
+		case 'M':
+			opt_mb = 1;
+			break;
+		default:
+			show_usage(argc, argv);
+			goto error;
+		}
+	}
+
+	loop_cnt_1 = loop_cnt[1];
+	loop_cnt_2 = loop_cnt[2];
+	loop_cnt_3 = loop_cnt[3];
+	loop_cnt_4 = loop_cnt[4];
+	loop_cnt_5 = loop_cnt[5];
+	loop_cnt_6 = loop_cnt[6];
+
+	if (set_signal_handler())
+		goto error;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		goto error;
+	switch (opt_test) {
+	case 's':
+		printf_verbose("spinlock\n");
+		test_percpu_spinlock();
+		break;
+	case 'l':
+		printf_verbose("linked list\n");
+		test_percpu_list();
+		break;
+	case 'b':
+		printf_verbose("buffer\n");
+		test_percpu_buffer();
+		break;
+	case 'm':
+		printf_verbose("memcpy buffer\n");
+		test_percpu_memcpy_buffer();
+		break;
+	case 'i':
+		printf_verbose("counter increment\n");
+		test_percpu_inc();
+		break;
+	}
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+end:
+	return 0;
+
+error:
+	return -1;
+}
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kselftest" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

WARNING: multiple messages have this Message-ID (diff)
From: mathieu.desnoyers@efficios.com (Mathieu Desnoyers)
Subject: [RFC PATCH for 4.18 15/16] rseq: selftests: Provide parametrized tests (v2)
Date: Sat,  2 Jun 2018 08:44:07 -0400	[thread overview]
Message-ID: <20180602124408.8430-16-mathieu.desnoyers@efficios.com> (raw)
Message-ID: <20180602124407.q-lIZ8SCxdZU6DND4D5J_MhTetQxm_ezvX9Q0f8zyns@z> (raw)
In-Reply-To: <20180602124408.8430-1-mathieu.desnoyers@efficios.com>

"param_test" is a parametrizable restartable sequences test. See
the "--help" output for usage.

"param_test_benchmark" is the same as "param_test", but it removes
testing book-keeping code to allow accurate benchmarks.

"param_test_compare_twice" is the same as "param_test", but it performs
each comparison within rseq critical section twice, thus validating
invariants. If any of the second comparisons fails, an error message
is printed and the test aborts.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at efficios.com>
CC: Shuah Khan <shuahkh at osg.samsung.com>
CC: Russell King <linux at arm.linux.org.uk>
CC: Catalin Marinas <catalin.marinas at arm.com>
CC: Will Deacon <will.deacon at arm.com>
CC: Thomas Gleixner <tglx at linutronix.de>
CC: Paul Turner <pjt at google.com>
CC: Andrew Hunter <ahh at google.com>
CC: Peter Zijlstra <peterz at infradead.org>
CC: Andy Lutomirski <luto at amacapital.net>
CC: Andi Kleen <andi at firstfloor.org>
CC: Dave Watson <davejwatson at fb.com>
CC: Chris Lameter <cl at linux.com>
CC: Ingo Molnar <mingo at redhat.com>
CC: "H. Peter Anvin" <hpa at zytor.com>
CC: Ben Maurer <bmaurer at fb.com>
CC: Steven Rostedt <rostedt at goodmis.org>
CC: "Paul E. McKenney" <paulmck at linux.vnet.ibm.com>
CC: Josh Triplett <josh at joshtriplett.org>
CC: Linus Torvalds <torvalds at linux-foundation.org>
CC: Andrew Morton <akpm at linux-foundation.org>
CC: Boqun Feng <boqun.feng at gmail.com>
CC: linux-kselftest at vger.kernel.org
CC: linux-api at vger.kernel.org
---
Changes since v1:
- Use only rseq, remove use of cpu_opv.
---
 tools/testing/selftests/rseq/param_test.c | 1260 +++++++++++++++++++++++++++++
 1 file changed, 1260 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/param_test.c

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
new file mode 100644
index 000000000000..6a9f602a8718
--- /dev/null
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -0,0 +1,1260 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
+#include <stddef.h>
+
+static inline pid_t gettid(void)
+{
+	return syscall(__NR_gettid);
+}
+
+#define NR_INJECT	9
+static int loop_cnt[NR_INJECT + 1];
+
+static int loop_cnt_1 asm("asm_loop_cnt_1") __attribute__((used));
+static int loop_cnt_2 asm("asm_loop_cnt_2") __attribute__((used));
+static int loop_cnt_3 asm("asm_loop_cnt_3") __attribute__((used));
+static int loop_cnt_4 asm("asm_loop_cnt_4") __attribute__((used));
+static int loop_cnt_5 asm("asm_loop_cnt_5") __attribute__((used));
+static int loop_cnt_6 asm("asm_loop_cnt_6") __attribute__((used));
+
+static int opt_modulo, verbose;
+
+static int opt_yield, opt_signal, opt_sleep,
+		opt_disable_rseq, opt_threads = 200,
+		opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+
+#ifndef RSEQ_SKIP_FASTPATH
+static long long opt_reps = 5000;
+#else
+static long long opt_reps = 100;
+#endif
+
+static __thread __attribute__((tls_model("initial-exec")))
+unsigned int signals_delivered;
+
+#ifndef BENCHMARK
+
+static __thread __attribute__((tls_model("initial-exec"), unused))
+unsigned int yield_mod_cnt, nr_abort;
+
+#define printf_verbose(fmt, ...)			\
+	do {						\
+		if (verbose)				\
+			printf(fmt, ## __VA_ARGS__);	\
+	} while (0)
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define INJECT_ASM_REG	"eax"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#ifdef __i386__
+
+#define RSEQ_INJECT_ASM(n) \
+	"mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#elif defined(__x86_64__)
+
+#define RSEQ_INJECT_ASM(n) \
+	"lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG "\n\t" \
+	"mov (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#else
+#error "Unsupported architecture"
+#endif
+
+#elif defined(__ARMEL__)
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r4"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmp " INJECT_ASM_REG ", #0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subs " INJECT_ASM_REG ", #1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+
+#elif __PPC__
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r18"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmpwi %%" INJECT_ASM_REG ", 0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+#else
+#error unsupported target
+#endif
+
+#define RSEQ_INJECT_FAILED \
+	nr_abort++;
+
+#define RSEQ_INJECT_C(n) \
+{ \
+	int loc_i, loc_nr_loops = loop_cnt[n]; \
+	\
+	for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \
+		rseq_barrier(); \
+	} \
+	if (loc_nr_loops == -1 && opt_modulo) { \
+		if (yield_mod_cnt == opt_modulo - 1) { \
+			if (opt_sleep > 0) \
+				poll(NULL, 0, opt_sleep); \
+			if (opt_yield) \
+				sched_yield(); \
+			if (opt_signal) \
+				raise(SIGUSR1); \
+			yield_mod_cnt = 0; \
+		} else { \
+			yield_mod_cnt++; \
+		} \
+	} \
+}
+
+#else
+
+#define printf_verbose(fmt, ...)
+
+#endif /* BENCHMARK */
+
+#include "rseq.h"
+
+struct percpu_lock_entry {
+	intptr_t v;
+} __attribute__((aligned(128)));
+
+struct percpu_lock {
+	struct percpu_lock_entry c[CPU_SETSIZE];
+};
+
+struct test_data_entry {
+	intptr_t count;
+} __attribute__((aligned(128)));
+
+struct spinlock_test_data {
+	struct percpu_lock lock;
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct spinlock_thread_test_data {
+	struct spinlock_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct inc_test_data {
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct inc_thread_test_data {
+	struct inc_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct percpu_list_node {
+	intptr_t data;
+	struct percpu_list_node *next;
+};
+
+struct percpu_list_entry {
+	struct percpu_list_node *head;
+} __attribute__((aligned(128)));
+
+struct percpu_list {
+	struct percpu_list_entry c[CPU_SETSIZE];
+};
+
+#define BUFFER_ITEM_PER_CPU	100
+
+struct percpu_buffer_node {
+	intptr_t data;
+};
+
+struct percpu_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_buffer_node **array;
+} __attribute__((aligned(128)));
+
+struct percpu_buffer {
+	struct percpu_buffer_entry c[CPU_SETSIZE];
+};
+
+#define MEMCPY_BUFFER_ITEM_PER_CPU	100
+
+struct percpu_memcpy_buffer_node {
+	intptr_t data1;
+	uint64_t data2;
+};
+
+struct percpu_memcpy_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_memcpy_buffer_node *array;
+} __attribute__((aligned(128)));
+
+struct percpu_memcpy_buffer {
+	struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
+};
+
+/* A simple percpu spinlock. Grabs lock on current cpu. */
+static int rseq_this_cpu_lock(struct percpu_lock *lock)
+{
+	int cpu;
+
+	for (;;) {
+		int ret;
+
+		cpu = rseq_cpu_start();
+		ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
+					 0, 1, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	/*
+	 * Acquire semantic when taking lock after control dependency.
+	 * Matches rseq_smp_store_release().
+	 */
+	rseq_smp_acquire__after_ctrl_dep();
+	return cpu;
+}
+
+static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+	assert(lock->c[cpu].v == 1);
+	/*
+	 * Release lock, with release semantic. Matches
+	 * rseq_smp_acquire__after_ctrl_dep().
+	 */
+	rseq_smp_store_release(&lock->c[cpu].v, 0);
+}
+
+void *test_percpu_spinlock_thread(void *arg)
+{
+	struct spinlock_thread_test_data *thread_data = arg;
+	struct spinlock_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int cpu = rseq_cpu_start();
+
+		cpu = rseq_this_cpu_lock(&data->lock);
+		data->c[cpu].count++;
+		rseq_percpu_unlock(&data->lock, cpu);
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+/*
+ * A simple test which implements a sharded counter using a per-cpu
+ * lock.  Obviously real applications might prefer to simply use a
+ * per-cpu increment; however, this is reasonable for a test and the
+ * lock can be extended to synchronize more complicated operations.
+ */
+void test_percpu_spinlock(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct spinlock_test_data data;
+	struct spinlock_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_spinlock_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void *test_percpu_inc_thread(void *arg)
+{
+	struct inc_thread_test_data *thread_data = arg;
+	struct inc_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int ret;
+
+		do {
+			int cpu;
+
+			cpu = rseq_cpu_start();
+			ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+		} while (rseq_unlikely(ret));
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+void test_percpu_inc(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct inc_test_data data;
+	struct inc_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_inc_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void this_cpu_list_push(struct percpu_list *list,
+			struct percpu_list_node *node,
+			int *_cpu)
+{
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval, expect;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load list->c[cpu].head with single-copy atomicity. */
+		expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+		newval = (intptr_t)node;
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		node->next = (struct percpu_list_node *)expect;
+		ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+}
+
+/*
+ * Unlike a traditional lock-less linked list; the availability of a
+ * rseq primitive allows us to implement pop without concerns over
+ * ABA-type races.
+ */
+struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+					   int *_cpu)
+{
+	struct percpu_list_node *node = NULL;
+	int cpu;
+
+	for (;;) {
+		struct percpu_list_node *head;
+		intptr_t *targetptr, expectnot, *load;
+		off_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		expectnot = (intptr_t)NULL;
+		offset = offsetof(struct percpu_list_node, next);
+		load = (intptr_t *)&head;
+		ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+						   offset, load, cpu);
+		if (rseq_likely(!ret)) {
+			node = head;
+			break;
+		}
+		if (ret > 0)
+			break;
+		/* Retry if rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return node;
+}
+
+/*
+ * __percpu_list_pop is not safe against concurrent accesses. Should
+ * only be used on lists that are not concurrently modified.
+ */
+struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+{
+	struct percpu_list_node *node;
+
+	node = list->c[cpu].head;
+	if (!node)
+		return NULL;
+	list->c[cpu].head = node->next;
+	return node;
+}
+
+void *test_percpu_list_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_list *list = (struct percpu_list *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_list_node *node;
+
+		node = this_cpu_list_pop(list, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node)
+			this_cpu_list_push(list, node, NULL);
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu linked list from many threads.  */
+void test_percpu_list(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_list list;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&list, 0, sizeof(list));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		for (j = 1; j <= 100; j++) {
+			struct percpu_list_node *node;
+
+			expected_sum += j;
+
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			node->next = list.c[i].head;
+			list.c[i].head = node;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_list_thread, &list);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_list_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_list_pop(&list, i))) {
+			sum += node->data;
+			free(node);
+		}
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_buffer_push(struct percpu_buffer *buffer,
+			  struct percpu_buffer_node *node,
+			  int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_spec, newval_spec;
+		intptr_t *targetptr_final, newval_final;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		newval_spec = (intptr_t)node;
+		targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trystorev_storev_release(
+				targetptr_final, offset, targetptr_spec,
+				newval_spec, newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
+				offset, targetptr_spec, newval_spec,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
+					       int *_cpu)
+{
+	struct percpu_buffer_node *head;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0) {
+			head = NULL;
+			break;
+		}
+		head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
+		newval = offset - 1;
+		targetptr = (intptr_t *)&buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
+			(intptr_t *)&buffer->c[cpu].array[offset - 1],
+			(intptr_t)head, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return head;
+}
+
+/*
+ * __percpu_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
+					       int cpu)
+{
+	struct percpu_buffer_node *head;
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return NULL;
+	head = buffer->c[cpu].array[offset - 1];
+	buffer->c[cpu].offset = offset - 1;
+	return head;
+}
+
+void *test_percpu_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_buffer *buffer = (struct percpu_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_buffer_node *node;
+
+		node = this_cpu_buffer_pop(buffer, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node) {
+			if (!this_cpu_buffer_push(buffer, node, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
+			struct percpu_buffer_node *node;
+
+			expected_sum += j;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			buffer.c[i].array[j - 1] = node;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_buffer_thread, &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_buffer_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_buffer_pop(&buffer, i))) {
+			sum += node->data;
+			free(node);
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
+				 struct percpu_memcpy_buffer_node item,
+				 int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		destptr = (char *)&buffer->c[cpu].array[offset];
+		srcptr = (char *)&item;
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(item);
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trymemcpy_storev_release(
+				targetptr_final, offset,
+				destptr, srcptr, copylen,
+				newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+				offset, destptr, srcptr, copylen,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0)
+			break;
+		destptr = (char *)item;
+		srcptr = (char *)&buffer->c[cpu].array[offset - 1];
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(*item);
+		newval_final = offset - 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+			offset, destptr, srcptr, copylen,
+			newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+/*
+ * __percpu_memcpy_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int cpu)
+{
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return false;
+	memcpy(item, &buffer->c[cpu].array[offset - 1], sizeof(*item));
+	buffer->c[cpu].offset = offset - 1;
+	return true;
+}
+
+void *test_percpu_memcpy_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_memcpy_buffer_node item;
+		bool result;
+
+		result = this_cpu_memcpy_buffer_pop(buffer, &item, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (result) {
+			if (!this_cpu_memcpy_buffer_push(buffer, item, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_memcpy_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_memcpy_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       MEMCPY_BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
+			expected_sum += 2 * j + 1;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			buffer.c[i].array[j - 1].data1 = j;
+			buffer.c[i].array[j - 1].data2 = j + 1;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_memcpy_buffer_thread,
+				     &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_memcpy_buffer_node item;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
+			sum += item.data1;
+			sum += item.data2;
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+static void test_signal_interrupt_handler(int signo)
+{
+	signals_delivered++;
+}
+
+static int set_signal_handler(void)
+{
+	int ret = 0;
+	struct sigaction sa;
+	sigset_t sigset;
+
+	ret = sigemptyset(&sigset);
+	if (ret < 0) {
+		perror("sigemptyset");
+		return ret;
+	}
+
+	sa.sa_handler = test_signal_interrupt_handler;
+	sa.sa_mask = sigset;
+	sa.sa_flags = 0;
+	ret = sigaction(SIGUSR1, &sa, NULL);
+	if (ret < 0) {
+		perror("sigaction");
+		return ret;
+	}
+
+	printf_verbose("Signal handler set for SIGUSR1\n");
+
+	return ret;
+}
+
+static void show_usage(int argc, char **argv)
+{
+	printf("Usage : %s <OPTIONS>\n",
+		argv[0]);
+	printf("OPTIONS:\n");
+	printf("	[-1 loops] Number of loops for delay injection 1\n");
+	printf("	[-2 loops] Number of loops for delay injection 2\n");
+	printf("	[-3 loops] Number of loops for delay injection 3\n");
+	printf("	[-4 loops] Number of loops for delay injection 4\n");
+	printf("	[-5 loops] Number of loops for delay injection 5\n");
+	printf("	[-6 loops] Number of loops for delay injection 6\n");
+	printf("	[-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n");
+	printf("	[-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n");
+	printf("	[-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n");
+	printf("	[-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n");
+	printf("	[-y] Yield\n");
+	printf("	[-k] Kill thread with signal\n");
+	printf("	[-s S] S: =0: disabled (default), >0: sleep time (ms)\n");
+	printf("	[-t N] Number of threads (default 200)\n");
+	printf("	[-r N] Number of repetitions per thread (default 5000)\n");
+	printf("	[-d] Disable rseq system call (no initialization)\n");
+	printf("	[-D M] Disable rseq for each M threads\n");
+	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+	printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
+	printf("	[-v] Verbose output.\n");
+	printf("	[-h] Show this help.\n");
+	printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+
+	for (i = 1; i < argc; i++) {
+		if (argv[i][0] != '-')
+			continue;
+		switch (argv[i][1]) {
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]);
+			i++;
+			break;
+		case 'm':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_modulo = atol(argv[i + 1]);
+			if (opt_modulo < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 's':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_sleep = atol(argv[i + 1]);
+			if (opt_sleep < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'y':
+			opt_yield = 1;
+			break;
+		case 'k':
+			opt_signal = 1;
+			break;
+		case 'd':
+			opt_disable_rseq = 1;
+			break;
+		case 'D':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_disable_mod = atol(argv[i + 1]);
+			if (opt_disable_mod < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 't':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_threads = atol(argv[i + 1]);
+			if (opt_threads < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'r':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_reps = atoll(argv[i + 1]);
+			if (opt_reps < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'h':
+			show_usage(argc, argv);
+			goto end;
+		case 'T':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_test = *argv[i + 1];
+			switch (opt_test) {
+			case 's':
+			case 'l':
+			case 'i':
+			case 'b':
+			case 'm':
+				break;
+			default:
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'v':
+			verbose = 1;
+			break;
+		case 'M':
+			opt_mb = 1;
+			break;
+		default:
+			show_usage(argc, argv);
+			goto error;
+		}
+	}
+
+	loop_cnt_1 = loop_cnt[1];
+	loop_cnt_2 = loop_cnt[2];
+	loop_cnt_3 = loop_cnt[3];
+	loop_cnt_4 = loop_cnt[4];
+	loop_cnt_5 = loop_cnt[5];
+	loop_cnt_6 = loop_cnt[6];
+
+	if (set_signal_handler())
+		goto error;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		goto error;
+	switch (opt_test) {
+	case 's':
+		printf_verbose("spinlock\n");
+		test_percpu_spinlock();
+		break;
+	case 'l':
+		printf_verbose("linked list\n");
+		test_percpu_list();
+		break;
+	case 'b':
+		printf_verbose("buffer\n");
+		test_percpu_buffer();
+		break;
+	case 'm':
+		printf_verbose("memcpy buffer\n");
+		test_percpu_memcpy_buffer();
+		break;
+	case 'i':
+		printf_verbose("counter increment\n");
+		test_percpu_inc();
+		break;
+	}
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+end:
+	return 0;
+
+error:
+	return -1;
+}
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kselftest" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

WARNING: multiple messages have this Message-ID (diff)
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
To: Peter Zijlstra <peterz@infradead.org>,
	"Paul E . McKenney" <paulmck@linux.vnet.ibm.com>,
	Boqun Feng <boqun.feng@gmail.com>,
	Andy Lutomirski <luto@amacapital.net>,
	Dave Watson <davejwatson@fb.com>
Cc: linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
	Paul Turner <pjt@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Russell King <linux@arm.linux.org.uk>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, "H . Peter Anvin" <hpa@zytor.com>,
	Andrew Hunter <ahh@google.com>, Andi Kleen <andi@firstfloor.org>,
	Chris Lameter <cl@linux.com>, Ben Maurer <bmaurer@fb.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Josh Triplett <josh@joshtriplett.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will.deacon@arm.com>,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Joel Fernandes <joelaf@google.com>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	Shuah Khan <shuahkh@osg.sams>
Subject: [RFC PATCH for 4.18 15/16] rseq: selftests: Provide parametrized tests (v2)
Date: Sat,  2 Jun 2018 08:44:07 -0400	[thread overview]
Message-ID: <20180602124408.8430-16-mathieu.desnoyers@efficios.com> (raw)
In-Reply-To: <20180602124408.8430-1-mathieu.desnoyers@efficios.com>

"param_test" is a parametrizable restartable sequences test. See
the "--help" output for usage.

"param_test_benchmark" is the same as "param_test", but it removes
testing book-keeping code to allow accurate benchmarks.

"param_test_compare_twice" is the same as "param_test", but it performs
each comparison within rseq critical section twice, thus validating
invariants. If any of the second comparisons fails, an error message
is printed and the test aborts.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Shuah Khan <shuahkh@osg.samsung.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Paul Turner <pjt@google.com>
CC: Andrew Hunter <ahh@google.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andi Kleen <andi@firstfloor.org>
CC: Dave Watson <davejwatson@fb.com>
CC: Chris Lameter <cl@linux.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Ben Maurer <bmaurer@fb.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: linux-kselftest@vger.kernel.org
CC: linux-api@vger.kernel.org
---
Changes since v1:
- Use only rseq, remove use of cpu_opv.
---
 tools/testing/selftests/rseq/param_test.c | 1260 +++++++++++++++++++++++++++++
 1 file changed, 1260 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/param_test.c

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
new file mode 100644
index 000000000000..6a9f602a8718
--- /dev/null
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -0,0 +1,1260 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
+#include <stddef.h>
+
+static inline pid_t gettid(void)
+{
+	return syscall(__NR_gettid);
+}
+
+#define NR_INJECT	9
+static int loop_cnt[NR_INJECT + 1];
+
+static int loop_cnt_1 asm("asm_loop_cnt_1") __attribute__((used));
+static int loop_cnt_2 asm("asm_loop_cnt_2") __attribute__((used));
+static int loop_cnt_3 asm("asm_loop_cnt_3") __attribute__((used));
+static int loop_cnt_4 asm("asm_loop_cnt_4") __attribute__((used));
+static int loop_cnt_5 asm("asm_loop_cnt_5") __attribute__((used));
+static int loop_cnt_6 asm("asm_loop_cnt_6") __attribute__((used));
+
+static int opt_modulo, verbose;
+
+static int opt_yield, opt_signal, opt_sleep,
+		opt_disable_rseq, opt_threads = 200,
+		opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+
+#ifndef RSEQ_SKIP_FASTPATH
+static long long opt_reps = 5000;
+#else
+static long long opt_reps = 100;
+#endif
+
+static __thread __attribute__((tls_model("initial-exec")))
+unsigned int signals_delivered;
+
+#ifndef BENCHMARK
+
+static __thread __attribute__((tls_model("initial-exec"), unused))
+unsigned int yield_mod_cnt, nr_abort;
+
+#define printf_verbose(fmt, ...)			\
+	do {						\
+		if (verbose)				\
+			printf(fmt, ## __VA_ARGS__);	\
+	} while (0)
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define INJECT_ASM_REG	"eax"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#ifdef __i386__
+
+#define RSEQ_INJECT_ASM(n) \
+	"mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#elif defined(__x86_64__)
+
+#define RSEQ_INJECT_ASM(n) \
+	"lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG "\n\t" \
+	"mov (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
+	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+	"jz 333f\n\t" \
+	"222:\n\t" \
+	"dec %%" INJECT_ASM_REG "\n\t" \
+	"jnz 222b\n\t" \
+	"333:\n\t"
+
+#else
+#error "Unsupported architecture"
+#endif
+
+#elif defined(__ARMEL__)
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r4"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmp " INJECT_ASM_REG ", #0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subs " INJECT_ASM_REG ", #1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+
+#elif __PPC__
+
+#define RSEQ_INJECT_INPUT \
+	, [loop_cnt_1]"m"(loop_cnt[1]) \
+	, [loop_cnt_2]"m"(loop_cnt[2]) \
+	, [loop_cnt_3]"m"(loop_cnt[3]) \
+	, [loop_cnt_4]"m"(loop_cnt[4]) \
+	, [loop_cnt_5]"m"(loop_cnt[5]) \
+	, [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG	"r18"
+
+#define RSEQ_INJECT_CLOBBER \
+	, INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+	"lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+	"cmpwi %%" INJECT_ASM_REG ", 0\n\t" \
+	"beq 333f\n\t" \
+	"222:\n\t" \
+	"subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \
+	"bne 222b\n\t" \
+	"333:\n\t"
+#else
+#error unsupported target
+#endif
+
+#define RSEQ_INJECT_FAILED \
+	nr_abort++;
+
+#define RSEQ_INJECT_C(n) \
+{ \
+	int loc_i, loc_nr_loops = loop_cnt[n]; \
+	\
+	for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \
+		rseq_barrier(); \
+	} \
+	if (loc_nr_loops == -1 && opt_modulo) { \
+		if (yield_mod_cnt == opt_modulo - 1) { \
+			if (opt_sleep > 0) \
+				poll(NULL, 0, opt_sleep); \
+			if (opt_yield) \
+				sched_yield(); \
+			if (opt_signal) \
+				raise(SIGUSR1); \
+			yield_mod_cnt = 0; \
+		} else { \
+			yield_mod_cnt++; \
+		} \
+	} \
+}
+
+#else
+
+#define printf_verbose(fmt, ...)
+
+#endif /* BENCHMARK */
+
+#include "rseq.h"
+
+struct percpu_lock_entry {
+	intptr_t v;
+} __attribute__((aligned(128)));
+
+struct percpu_lock {
+	struct percpu_lock_entry c[CPU_SETSIZE];
+};
+
+struct test_data_entry {
+	intptr_t count;
+} __attribute__((aligned(128)));
+
+struct spinlock_test_data {
+	struct percpu_lock lock;
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct spinlock_thread_test_data {
+	struct spinlock_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct inc_test_data {
+	struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct inc_thread_test_data {
+	struct inc_test_data *data;
+	long long reps;
+	int reg;
+};
+
+struct percpu_list_node {
+	intptr_t data;
+	struct percpu_list_node *next;
+};
+
+struct percpu_list_entry {
+	struct percpu_list_node *head;
+} __attribute__((aligned(128)));
+
+struct percpu_list {
+	struct percpu_list_entry c[CPU_SETSIZE];
+};
+
+#define BUFFER_ITEM_PER_CPU	100
+
+struct percpu_buffer_node {
+	intptr_t data;
+};
+
+struct percpu_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_buffer_node **array;
+} __attribute__((aligned(128)));
+
+struct percpu_buffer {
+	struct percpu_buffer_entry c[CPU_SETSIZE];
+};
+
+#define MEMCPY_BUFFER_ITEM_PER_CPU	100
+
+struct percpu_memcpy_buffer_node {
+	intptr_t data1;
+	uint64_t data2;
+};
+
+struct percpu_memcpy_buffer_entry {
+	intptr_t offset;
+	intptr_t buflen;
+	struct percpu_memcpy_buffer_node *array;
+} __attribute__((aligned(128)));
+
+struct percpu_memcpy_buffer {
+	struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
+};
+
+/* A simple percpu spinlock. Grabs lock on current cpu. */
+static int rseq_this_cpu_lock(struct percpu_lock *lock)
+{
+	int cpu;
+
+	for (;;) {
+		int ret;
+
+		cpu = rseq_cpu_start();
+		ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
+					 0, 1, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	/*
+	 * Acquire semantic when taking lock after control dependency.
+	 * Matches rseq_smp_store_release().
+	 */
+	rseq_smp_acquire__after_ctrl_dep();
+	return cpu;
+}
+
+static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+	assert(lock->c[cpu].v == 1);
+	/*
+	 * Release lock, with release semantic. Matches
+	 * rseq_smp_acquire__after_ctrl_dep().
+	 */
+	rseq_smp_store_release(&lock->c[cpu].v, 0);
+}
+
+void *test_percpu_spinlock_thread(void *arg)
+{
+	struct spinlock_thread_test_data *thread_data = arg;
+	struct spinlock_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int cpu = rseq_cpu_start();
+
+		cpu = rseq_this_cpu_lock(&data->lock);
+		data->c[cpu].count++;
+		rseq_percpu_unlock(&data->lock, cpu);
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+/*
+ * A simple test which implements a sharded counter using a per-cpu
+ * lock.  Obviously real applications might prefer to simply use a
+ * per-cpu increment; however, this is reasonable for a test and the
+ * lock can be extended to synchronize more complicated operations.
+ */
+void test_percpu_spinlock(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct spinlock_test_data data;
+	struct spinlock_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_spinlock_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void *test_percpu_inc_thread(void *arg)
+{
+	struct inc_thread_test_data *thread_data = arg;
+	struct inc_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int ret;
+
+		do {
+			int cpu;
+
+			cpu = rseq_cpu_start();
+			ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+		} while (rseq_unlikely(ret));
+#ifndef BENCHMARK
+		if (i != 0 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+	}
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+void test_percpu_inc(void)
+{
+	const int num_threads = opt_threads;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct inc_test_data data;
+	struct inc_thread_test_data thread_data[num_threads];
+
+	memset(&data, 0, sizeof(data));
+	for (i = 0; i < num_threads; i++) {
+		thread_data[i].reps = opt_reps;
+		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+			thread_data[i].reg = 1;
+		else
+			thread_data[i].reg = 0;
+		thread_data[i].data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_inc_thread,
+				     &thread_data[i]);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void this_cpu_list_push(struct percpu_list *list,
+			struct percpu_list_node *node,
+			int *_cpu)
+{
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval, expect;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load list->c[cpu].head with single-copy atomicity. */
+		expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+		newval = (intptr_t)node;
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		node->next = (struct percpu_list_node *)expect;
+		ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+}
+
+/*
+ * Unlike a traditional lock-less linked list; the availability of a
+ * rseq primitive allows us to implement pop without concerns over
+ * ABA-type races.
+ */
+struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+					   int *_cpu)
+{
+	struct percpu_list_node *node = NULL;
+	int cpu;
+
+	for (;;) {
+		struct percpu_list_node *head;
+		intptr_t *targetptr, expectnot, *load;
+		off_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		targetptr = (intptr_t *)&list->c[cpu].head;
+		expectnot = (intptr_t)NULL;
+		offset = offsetof(struct percpu_list_node, next);
+		load = (intptr_t *)&head;
+		ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+						   offset, load, cpu);
+		if (rseq_likely(!ret)) {
+			node = head;
+			break;
+		}
+		if (ret > 0)
+			break;
+		/* Retry if rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return node;
+}
+
+/*
+ * __percpu_list_pop is not safe against concurrent accesses. Should
+ * only be used on lists that are not concurrently modified.
+ */
+struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+{
+	struct percpu_list_node *node;
+
+	node = list->c[cpu].head;
+	if (!node)
+		return NULL;
+	list->c[cpu].head = node->next;
+	return node;
+}
+
+void *test_percpu_list_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_list *list = (struct percpu_list *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_list_node *node;
+
+		node = this_cpu_list_pop(list, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node)
+			this_cpu_list_push(list, node, NULL);
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu linked list from many threads.  */
+void test_percpu_list(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_list list;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&list, 0, sizeof(list));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		for (j = 1; j <= 100; j++) {
+			struct percpu_list_node *node;
+
+			expected_sum += j;
+
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			node->next = list.c[i].head;
+			list.c[i].head = node;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_list_thread, &list);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_list_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_list_pop(&list, i))) {
+			sum += node->data;
+			free(node);
+		}
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_buffer_push(struct percpu_buffer *buffer,
+			  struct percpu_buffer_node *node,
+			  int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_spec, newval_spec;
+		intptr_t *targetptr_final, newval_final;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		newval_spec = (intptr_t)node;
+		targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trystorev_storev_release(
+				targetptr_final, offset, targetptr_spec,
+				newval_spec, newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
+				offset, targetptr_spec, newval_spec,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
+					       int *_cpu)
+{
+	struct percpu_buffer_node *head;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr, newval;
+		intptr_t offset;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0) {
+			head = NULL;
+			break;
+		}
+		head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
+		newval = offset - 1;
+		targetptr = (intptr_t *)&buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
+			(intptr_t *)&buffer->c[cpu].array[offset - 1],
+			(intptr_t)head, newval, cpu);
+		if (rseq_likely(!ret))
+			break;
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return head;
+}
+
+/*
+ * __percpu_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
+					       int cpu)
+{
+	struct percpu_buffer_node *head;
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return NULL;
+	head = buffer->c[cpu].array[offset - 1];
+	buffer->c[cpu].offset = offset - 1;
+	return head;
+}
+
+void *test_percpu_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_buffer *buffer = (struct percpu_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_buffer_node *node;
+
+		node = this_cpu_buffer_pop(buffer, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (node) {
+			if (!this_cpu_buffer_push(buffer, node, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
+			struct percpu_buffer_node *node;
+
+			expected_sum += j;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			buffer.c[i].array[j - 1] = node;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_buffer_thread, &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_buffer_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_buffer_pop(&buffer, i))) {
+			sum += node->data;
+			free(node);
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
+				 struct percpu_memcpy_buffer_node item,
+				 int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == buffer->c[cpu].buflen)
+			break;
+		destptr = (char *)&buffer->c[cpu].array[offset];
+		srcptr = (char *)&item;
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(item);
+		newval_final = offset + 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		if (opt_mb)
+			ret = rseq_cmpeqv_trymemcpy_storev_release(
+				targetptr_final, offset,
+				destptr, srcptr, copylen,
+				newval_final, cpu);
+		else
+			ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+				offset, destptr, srcptr, copylen,
+				newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int *_cpu)
+{
+	bool result = false;
+	int cpu;
+
+	for (;;) {
+		intptr_t *targetptr_final, newval_final, offset;
+		char *destptr, *srcptr;
+		size_t copylen;
+		int ret;
+
+		cpu = rseq_cpu_start();
+		/* Load offset with single-copy atomicity. */
+		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+		if (offset == 0)
+			break;
+		destptr = (char *)item;
+		srcptr = (char *)&buffer->c[cpu].array[offset - 1];
+		/* copylen must be <= 4kB. */
+		copylen = sizeof(*item);
+		newval_final = offset - 1;
+		targetptr_final = &buffer->c[cpu].offset;
+		ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+			offset, destptr, srcptr, copylen,
+			newval_final, cpu);
+		if (rseq_likely(!ret)) {
+			result = true;
+			break;
+		}
+		/* Retry if comparison fails or rseq aborts. */
+	}
+	if (_cpu)
+		*_cpu = cpu;
+	return result;
+}
+
+/*
+ * __percpu_memcpy_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int cpu)
+{
+	intptr_t offset;
+
+	offset = buffer->c[cpu].offset;
+	if (offset == 0)
+		return false;
+	memcpy(item, &buffer->c[cpu].array[offset - 1], sizeof(*item));
+	buffer->c[cpu].offset = offset - 1;
+	return true;
+}
+
+void *test_percpu_memcpy_buffer_thread(void *arg)
+{
+	long long i, reps;
+	struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	reps = opt_reps;
+	for (i = 0; i < reps; i++) {
+		struct percpu_memcpy_buffer_node item;
+		bool result;
+
+		result = this_cpu_memcpy_buffer_pop(buffer, &item, NULL);
+		if (opt_yield)
+			sched_yield();  /* encourage shuffling */
+		if (result) {
+			if (!this_cpu_memcpy_buffer_push(buffer, item, NULL)) {
+				/* Should increase buffer size. */
+				abort();
+			}
+		}
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_memcpy_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_memcpy_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/* Generate list entries for every usable cpu. */
+	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worse-case is every item in same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       MEMCPY_BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
+			expected_sum += 2 * j + 1;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			buffer.c[i].array[j - 1].data1 = j;
+			buffer.c[i].array[j - 1].data2 = j + 1;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_memcpy_buffer_thread,
+				     &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_memcpy_buffer_node item;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
+			sum += item.data1;
+			sum += item.data2;
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+static void test_signal_interrupt_handler(int signo)
+{
+	signals_delivered++;
+}
+
+static int set_signal_handler(void)
+{
+	int ret = 0;
+	struct sigaction sa;
+	sigset_t sigset;
+
+	ret = sigemptyset(&sigset);
+	if (ret < 0) {
+		perror("sigemptyset");
+		return ret;
+	}
+
+	sa.sa_handler = test_signal_interrupt_handler;
+	sa.sa_mask = sigset;
+	sa.sa_flags = 0;
+	ret = sigaction(SIGUSR1, &sa, NULL);
+	if (ret < 0) {
+		perror("sigaction");
+		return ret;
+	}
+
+	printf_verbose("Signal handler set for SIGUSR1\n");
+
+	return ret;
+}
+
+static void show_usage(int argc, char **argv)
+{
+	printf("Usage : %s <OPTIONS>\n",
+		argv[0]);
+	printf("OPTIONS:\n");
+	printf("	[-1 loops] Number of loops for delay injection 1\n");
+	printf("	[-2 loops] Number of loops for delay injection 2\n");
+	printf("	[-3 loops] Number of loops for delay injection 3\n");
+	printf("	[-4 loops] Number of loops for delay injection 4\n");
+	printf("	[-5 loops] Number of loops for delay injection 5\n");
+	printf("	[-6 loops] Number of loops for delay injection 6\n");
+	printf("	[-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n");
+	printf("	[-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n");
+	printf("	[-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n");
+	printf("	[-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n");
+	printf("	[-y] Yield\n");
+	printf("	[-k] Kill thread with signal\n");
+	printf("	[-s S] S: =0: disabled (default), >0: sleep time (ms)\n");
+	printf("	[-t N] Number of threads (default 200)\n");
+	printf("	[-r N] Number of repetitions per thread (default 5000)\n");
+	printf("	[-d] Disable rseq system call (no initialization)\n");
+	printf("	[-D M] Disable rseq for each M threads\n");
+	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+	printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
+	printf("	[-v] Verbose output.\n");
+	printf("	[-h] Show this help.\n");
+	printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+
+	for (i = 1; i < argc; i++) {
+		if (argv[i][0] != '-')
+			continue;
+		switch (argv[i][1]) {
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]);
+			i++;
+			break;
+		case 'm':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_modulo = atol(argv[i + 1]);
+			if (opt_modulo < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 's':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_sleep = atol(argv[i + 1]);
+			if (opt_sleep < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'y':
+			opt_yield = 1;
+			break;
+		case 'k':
+			opt_signal = 1;
+			break;
+		case 'd':
+			opt_disable_rseq = 1;
+			break;
+		case 'D':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_disable_mod = atol(argv[i + 1]);
+			if (opt_disable_mod < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 't':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_threads = atol(argv[i + 1]);
+			if (opt_threads < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'r':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_reps = atoll(argv[i + 1]);
+			if (opt_reps < 0) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'h':
+			show_usage(argc, argv);
+			goto end;
+		case 'T':
+			if (argc < i + 2) {
+				show_usage(argc, argv);
+				goto error;
+			}
+			opt_test = *argv[i + 1];
+			switch (opt_test) {
+			case 's':
+			case 'l':
+			case 'i':
+			case 'b':
+			case 'm':
+				break;
+			default:
+				show_usage(argc, argv);
+				goto error;
+			}
+			i++;
+			break;
+		case 'v':
+			verbose = 1;
+			break;
+		case 'M':
+			opt_mb = 1;
+			break;
+		default:
+			show_usage(argc, argv);
+			goto error;
+		}
+	}
+
+	loop_cnt_1 = loop_cnt[1];
+	loop_cnt_2 = loop_cnt[2];
+	loop_cnt_3 = loop_cnt[3];
+	loop_cnt_4 = loop_cnt[4];
+	loop_cnt_5 = loop_cnt[5];
+	loop_cnt_6 = loop_cnt[6];
+
+	if (set_signal_handler())
+		goto error;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		goto error;
+	switch (opt_test) {
+	case 's':
+		printf_verbose("spinlock\n");
+		test_percpu_spinlock();
+		break;
+	case 'l':
+		printf_verbose("linked list\n");
+		test_percpu_list();
+		break;
+	case 'b':
+		printf_verbose("buffer\n");
+		test_percpu_buffer();
+		break;
+	case 'm':
+		printf_verbose("memcpy buffer\n");
+		test_percpu_memcpy_buffer();
+		break;
+	case 'i':
+		printf_verbose("counter increment\n");
+		test_percpu_inc();
+		break;
+	}
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+end:
+	return 0;
+
+error:
+	return -1;
+}
-- 
2.11.0

  parent reply	other threads:[~2018-06-02 12:46 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-02 12:43 [RFC PATCH for 4.18 00/16] Restartable Sequences Mathieu Desnoyers
2018-06-02 12:43 ` [RFC PATCH for 4.18 01/16] uapi headers: Provide types_32_64.h (v2) Mathieu Desnoyers
2018-06-02 12:43 ` [RFC PATCH for 4.18 02/16] rseq: Introduce restartable sequences system call (v13) Mathieu Desnoyers
2018-06-02 12:43   ` Mathieu Desnoyers
2018-06-02 12:43 ` [RFC PATCH for 4.18 03/16] arm: Add restartable sequences support Mathieu Desnoyers
2018-06-02 12:43 ` [RFC PATCH for 4.18 04/16] arm: Add syscall detection for restartable sequences Mathieu Desnoyers
2018-06-02 12:43 ` [RFC PATCH for 4.18 05/16] arm: Wire up restartable sequences system call Mathieu Desnoyers
2018-06-02 12:43 ` [RFC PATCH for 4.18 06/16] x86: Add support for restartable sequences (v2) Mathieu Desnoyers
2018-06-02 12:43 ` [RFC PATCH for 4.18 07/16] x86: Wire up restartable sequence system call Mathieu Desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 08/16] powerpc: Add support for restartable sequences Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 09/16] powerpc: Add syscall detection " Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-05  5:21   ` Michael Ellerman
2018-06-05  5:21     ` Michael Ellerman
2018-06-05 12:50     ` Mathieu Desnoyers
2018-06-05 12:50       ` Mathieu Desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 10/16] powerpc: Wire up restartable sequences system call Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-05  5:18   ` Michael Ellerman
2018-06-05  5:18     ` Michael Ellerman
2018-06-05 12:51     ` Mathieu Desnoyers
2018-06-05 12:51       ` Mathieu Desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 11/16] selftests: lib.mk: Introduce OVERRIDE_TARGETS Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` mathieu.desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 12/16] rseq: selftests: Provide rseq library (v5) Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` mathieu.desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 13/16] rseq: selftests: Provide basic test Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` mathieu.desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 14/16] rseq: selftests: Provide basic percpu ops test (v2) Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` mathieu.desnoyers
2018-06-02 12:44 ` Mathieu Desnoyers [this message]
2018-06-02 12:44   ` [RFC PATCH for 4.18 15/16] rseq: selftests: Provide parametrized tests (v2) Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` mathieu.desnoyers
2018-06-02 12:44 ` [RFC PATCH for 4.18 16/16] rseq: selftests: Provide Makefile, scripts, gitignore (v2) Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` Mathieu Desnoyers
2018-06-02 12:44   ` mathieu.desnoyers
2018-07-27 22:01 ` [RFC PATCH for 4.18 00/16] Restartable Sequences Pavel Machek
2018-07-27 22:01   ` Pavel Machek
2018-07-28 13:49   ` Mathieu Desnoyers
2018-07-28 13:49     ` Mathieu Desnoyers
2018-07-28 14:13     ` Pavel Machek
2018-07-28 14:13       ` Pavel Machek
2018-07-30 18:42       ` Mathieu Desnoyers
2018-07-30 18:42         ` Mathieu Desnoyers
2018-07-30 19:07         ` Pavel Machek
2018-07-30 19:07           ` Pavel Machek
2018-07-30 19:34           ` Mathieu Desnoyers
2018-07-30 19:34             ` Mathieu Desnoyers

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180602124408.8430-16-mathieu.desnoyers@efficios.com \
    --to=mathieu.desnoyers@efficios.com \
    --cc=ahh@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=andi@firstfloor.org \
    --cc=bmaurer@fb.com \
    --cc=boqun.feng@gmail.com \
    --cc=catalin.marinas@arm.com \
    --cc=cl@linux.com \
    --cc=davejwatson@fb.com \
    --cc=hpa@zytor.com \
    --cc=joelaf@google.com \
    --cc=josh@joshtriplett.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux@arm.linux.org.uk \
    --cc=luto@amacapital.net \
    --cc=mingo@redhat.com \
    --cc=mtk.manpages@gmail.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=rostedt@goodmis.org \
    --cc=shuahkh@osg.samsung.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=will.deacon@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.