[dpdk-dev] [PATCH v3 4/8] examples/performance-thread: use compiler atomics for sync

From: Dharmik Thakkar <dharmik.thakkar@arm.com>
To: John McNamara <john.mcnamara@intel.com>
Cc: dev@dpdk.org, nd@arm.com, honnappa.nagarahalli@arm.com,
	ruifeng.wang@arm.com, joyce.kong@arm.com,
	dharmik.thakkar@arm.com
Subject: [dpdk-dev] [PATCH v3 4/8] examples/performance-thread: use compiler atomics for sync
Date: Wed, 13 Oct 2021 13:54:03 -0500	[thread overview]
Message-ID: <20211013185407.2841183-5-dharmik.thakkar@arm.com> (raw)
In-Reply-To: <20211013185407.2841183-1-dharmik.thakkar@arm.com>

From: Joyce Kong <joyce.kong@arm.com>

Convert rte_atomic usages to compiler atomic built-ins
for thread sync.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 examples/performance-thread/common/lthread.c  | 10 +++---
 .../performance-thread/common/lthread_diag.h  | 10 +++---
 .../performance-thread/common/lthread_int.h   |  1 -
 .../performance-thread/common/lthread_mutex.c | 26 +++++++-------
 .../performance-thread/common/lthread_mutex.h |  2 +-
 .../performance-thread/common/lthread_sched.c | 34 ++++++++-----------
 .../performance-thread/common/lthread_tls.c   |  5 +--
 .../performance-thread/l3fwd-thread/main.c    | 22 +++++-------
 8 files changed, 53 insertions(+), 57 deletions(-)

diff --git a/examples/performance-thread/common/lthread.c b/examples/performance-thread/common/lthread.c
index 3f1f48db433e..98123f34f840 100644
--- a/examples/performance-thread/common/lthread.c
+++ b/examples/performance-thread/common/lthread.c
@@ -357,9 +357,10 @@ void lthread_exit(void *ptr)
 	 *  - if exit before join then we suspend and resume on join
 	 *  - if join before exit then we resume the joining thread
 	 */
+	uint64_t join_initial = LT_JOIN_INITIAL;
 	if ((lt->join == LT_JOIN_INITIAL)
-	    && rte_atomic64_cmpset(&lt->join, LT_JOIN_INITIAL,
-				   LT_JOIN_EXITING)) {
+	    && __atomic_compare_exchange_n(&lt->join, &join_initial,
+		LT_JOIN_EXITING, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
 
 		DIAG_EVENT(lt, LT_DIAG_LTHREAD_EXIT, 1, 0);
 		_suspend();
@@ -415,9 +416,10 @@ int lthread_join(struct lthread *lt, void **ptr)
 	 *  - if join before exit we suspend and will resume when exit is called
 	 *  - if exit before join we resume the exiting thread
 	 */
+	uint64_t join_initial = LT_JOIN_INITIAL;
 	if ((lt->join == LT_JOIN_INITIAL)
-	    && rte_atomic64_cmpset(&lt->join, LT_JOIN_INITIAL,
-				   LT_JOIN_THREAD_SET)) {
+	    && __atomic_compare_exchange_n(&lt->join, &join_initial,
+		LT_JOIN_THREAD_SET, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
 
 		DIAG_EVENT(current, LT_DIAG_LTHREAD_JOIN, lt, 1);
 		_suspend();
diff --git a/examples/performance-thread/common/lthread_diag.h b/examples/performance-thread/common/lthread_diag.h
index e876dda6da3a..7ee89eef388d 100644
--- a/examples/performance-thread/common/lthread_diag.h
+++ b/examples/performance-thread/common/lthread_diag.h
@@ -78,11 +78,11 @@ extern uint64_t diag_mask;
 	}								\
 } while (0)
 
-#define DIAG_COUNT_DEFINE(x) rte_atomic64_t count_##x
-#define DIAG_COUNT_INIT(o, x) rte_atomic64_init(&((o)->count_##x))
-#define DIAG_COUNT_INC(o, x) rte_atomic64_inc(&((o)->count_##x))
-#define DIAG_COUNT_DEC(o, x) rte_atomic64_dec(&((o)->count_##x))
-#define DIAG_COUNT(o, x) rte_atomic64_read(&((o)->count_##x))
+#define DIAG_COUNT_DEFINE(x) uint64_t count_##x
+#define DIAG_COUNT_INIT(o, x) __atomic_store_n(&((o)->count_##x), 0, __ATOMIC_RELAXED)
+#define DIAG_COUNT_INC(o, x) __atomic_fetch_add(&((o)->count_##x), 1, __ATOMIC_RELAXED)
+#define DIAG_COUNT_DEC(o, x) __atomic_fetch_sub(&((o)->count_##x), 1, __ATOMIC_RELAXED)
+#define DIAG_COUNT(o, x) __atomic_load_n(&((o)->count_##x), __ATOMIC_RELAXED)
 
 #define DIAG_USED
 
diff --git a/examples/performance-thread/common/lthread_int.h b/examples/performance-thread/common/lthread_int.h
index a352f13b7568..d010126f1681 100644
--- a/examples/performance-thread/common/lthread_int.h
+++ b/examples/performance-thread/common/lthread_int.h
@@ -21,7 +21,6 @@ extern "C" {
 #include <rte_cycles.h>
 #include <rte_per_lcore.h>
 #include <rte_timer.h>
-#include <rte_atomic_64.h>
 #include <rte_spinlock.h>
 #include <ctx.h>
 
diff --git a/examples/performance-thread/common/lthread_mutex.c b/examples/performance-thread/common/lthread_mutex.c
index 01da6cad4f61..061fc5c19a6b 100644
--- a/examples/performance-thread/common/lthread_mutex.c
+++ b/examples/performance-thread/common/lthread_mutex.c
@@ -60,7 +60,7 @@ lthread_mutex_init(char *name, struct lthread_mutex **mutex,
 	m->root_sched = THIS_SCHED;
 	m->owner = NULL;
 
-	rte_atomic64_init(&m->count);
+	__atomic_store_n(&m->count, 0, __ATOMIC_RELAXED);
 
 	DIAG_CREATE_EVENT(m, LT_DIAG_MUTEX_CREATE);
 	/* success */
@@ -115,10 +115,11 @@ int lthread_mutex_lock(struct lthread_mutex *m)
 	}
 
 	for (;;) {
-		rte_atomic64_inc(&m->count);
+		__atomic_fetch_add(&m->count, 1, __ATOMIC_RELAXED);
 		do {
-			if (rte_atomic64_cmpset
-			    ((uint64_t *) &m->owner, 0, (uint64_t) lt)) {
+			uint64_t lt_init = 0;
+			if (__atomic_compare_exchange_n((uint64_t *) &m->owner, &lt_init,
+				(uint64_t) lt, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
 				/* happy days, we got the lock */
 				DIAG_EVENT(m, LT_DIAG_MUTEX_LOCK, m, 0);
 				return 0;
@@ -126,7 +127,7 @@ int lthread_mutex_lock(struct lthread_mutex *m)
 			/* spin due to race with unlock when
 			* nothing was blocked
 			*/
-		} while ((rte_atomic64_read(&m->count) == 1) &&
+		} while ((__atomic_load_n(&m->count, __ATOMIC_RELAXED) == 1) &&
 				(m->owner == NULL));
 
 		/* queue the current thread in the blocked queue
@@ -160,16 +161,17 @@ int lthread_mutex_trylock(struct lthread_mutex *m)
 		return POSIX_ERRNO(EDEADLK);
 	}
 
-	rte_atomic64_inc(&m->count);
-	if (rte_atomic64_cmpset
-	    ((uint64_t *) &m->owner, (uint64_t) NULL, (uint64_t) lt)) {
+	__atomic_fetch_add(&m->count, 1, __ATOMIC_RELAXED);
+	uint64_t lt_init = 0;
+	if (__atomic_compare_exchange_n((uint64_t *) &m->owner, &lt_init,
+		(uint64_t) lt, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
 		/* got the lock */
 		DIAG_EVENT(m, LT_DIAG_MUTEX_TRYLOCK, m, 0);
 		return 0;
 	}
 
 	/* failed so return busy */
-	rte_atomic64_dec(&m->count);
+	__atomic_fetch_sub(&m->count, 1, __ATOMIC_RELAXED);
 	DIAG_EVENT(m, LT_DIAG_MUTEX_TRYLOCK, m, POSIX_ERRNO(EBUSY));
 	return POSIX_ERRNO(EBUSY);
 }
@@ -193,13 +195,13 @@ int lthread_mutex_unlock(struct lthread_mutex *m)
 		return POSIX_ERRNO(EPERM);
 	}
 
-	rte_atomic64_dec(&m->count);
+	__atomic_fetch_sub(&m->count, 1, __ATOMIC_RELAXED);
 	/* if there are blocked threads then make one ready */
-	while (rte_atomic64_read(&m->count) > 0) {
+	while (__atomic_load_n(&m->count, __ATOMIC_RELAXED) > 0) {
 		unblocked = _lthread_queue_remove(m->blocked);
 
 		if (unblocked != NULL) {
-			rte_atomic64_dec(&m->count);
+			__atomic_fetch_sub(&m->count, 1, __ATOMIC_RELAXED);
 			DIAG_EVENT(m, LT_DIAG_MUTEX_UNLOCKED, m, unblocked);
 			RTE_ASSERT(unblocked->sched != NULL);
 			_ready_queue_insert((struct lthread_sched *)
diff --git a/examples/performance-thread/common/lthread_mutex.h b/examples/performance-thread/common/lthread_mutex.h
index cd866f87b889..730092bdf8bb 100644
--- a/examples/performance-thread/common/lthread_mutex.h
+++ b/examples/performance-thread/common/lthread_mutex.h
@@ -17,7 +17,7 @@ extern "C" {
 
 struct lthread_mutex {
 	struct lthread *owner;
-	rte_atomic64_t	count;
+	uint64_t count;
 	struct lthread_queue *blocked __rte_cache_aligned;
 	struct lthread_sched *root_sched;
 	char			name[MAX_MUTEX_NAME_SIZE];
diff --git a/examples/performance-thread/common/lthread_sched.c b/examples/performance-thread/common/lthread_sched.c
index 38ca0c45cbf8..3784b010c221 100644
--- a/examples/performance-thread/common/lthread_sched.c
+++ b/examples/performance-thread/common/lthread_sched.c
@@ -22,8 +22,6 @@
 
 #include <rte_prefetch.h>
 #include <rte_per_lcore.h>
-#include <rte_atomic.h>
-#include <rte_atomic_64.h>
 #include <rte_log.h>
 #include <rte_common.h>
 #include <rte_branch_prediction.h>
@@ -47,8 +45,8 @@
  * When a scheduler shuts down it is assumed that the application is terminating
  */
 
-static rte_atomic16_t num_schedulers;
-static rte_atomic16_t active_schedulers;
+static uint16_t num_schedulers;
+static uint16_t active_schedulers;
 
 /* one scheduler per lcore */
 RTE_DEFINE_PER_LCORE(struct lthread_sched *, this_sched) = NULL;
@@ -64,10 +62,8 @@ uint64_t diag_mask;
 RTE_INIT(lthread_sched_ctor)
 {
 	memset(schedcore, 0, sizeof(schedcore));
-	rte_atomic16_init(&num_schedulers);
-	rte_atomic16_set(&num_schedulers, 1);
-	rte_atomic16_init(&active_schedulers);
-	rte_atomic16_set(&active_schedulers, 0);
+	__atomic_store_n(&num_schedulers, 1, __ATOMIC_RELAXED);
+	__atomic_store_n(&active_schedulers, 0, __ATOMIC_RELAXED);
 	diag_cb = NULL;
 }
 
@@ -260,8 +256,8 @@ struct lthread_sched *_lthread_sched_create(size_t stack_size)
  */
 int lthread_num_schedulers_set(int num)
 {
-	rte_atomic16_set(&num_schedulers, num);
-	return (int)rte_atomic16_read(&num_schedulers);
+	__atomic_store_n(&num_schedulers, num, __ATOMIC_RELAXED);
+	return (int)__atomic_load_n(&num_schedulers, __ATOMIC_RELAXED);
 }
 
 /*
@@ -269,7 +265,7 @@ int lthread_num_schedulers_set(int num)
  */
 int lthread_active_schedulers(void)
 {
-	return (int)rte_atomic16_read(&active_schedulers);
+	return (int)__atomic_load_n(&active_schedulers, __ATOMIC_RELAXED);
 }
 
 
@@ -299,8 +295,8 @@ void lthread_scheduler_shutdown_all(void)
 	 * for the possibility of a pthread wrapper on lthread_yield(),
 	 * something that is not possible unless the scheduler is running.
 	 */
-	while (rte_atomic16_read(&active_schedulers) <
-	       rte_atomic16_read(&num_schedulers))
+	while (__atomic_load_n(&active_schedulers, __ATOMIC_RELAXED) <
+	       __atomic_load_n(&num_schedulers, __ATOMIC_RELAXED))
 		sched_yield();
 
 	for (i = 0; i < LTHREAD_MAX_LCORES; i++) {
@@ -415,15 +411,15 @@ static inline int _lthread_sched_isdone(struct lthread_sched *sched)
  */
 static inline void _lthread_schedulers_sync_start(void)
 {
-	rte_atomic16_inc(&active_schedulers);
+	__atomic_fetch_add(&active_schedulers, 1, __ATOMIC_RELAXED);
 
 	/* wait for lthread schedulers
 	 * Note we use sched_yield() rather than pthread_yield() to allow
 	 * for the possibility of a pthread wrapper on lthread_yield(),
 	 * something that is not possible unless the scheduler is running.
 	 */
-	while (rte_atomic16_read(&active_schedulers) <
-	       rte_atomic16_read(&num_schedulers))
+	while (__atomic_load_n(&active_schedulers, __ATOMIC_RELAXED) <
+	       __atomic_load_n(&num_schedulers, __ATOMIC_RELAXED))
 		sched_yield();
 
 }
@@ -433,15 +429,15 @@ static inline void _lthread_schedulers_sync_start(void)
  */
 static inline void _lthread_schedulers_sync_stop(void)
 {
-	rte_atomic16_dec(&active_schedulers);
-	rte_atomic16_dec(&num_schedulers);
+	__atomic_fetch_sub(&active_schedulers, 1, __ATOMIC_RELAXED);
+	__atomic_fetch_sub(&num_schedulers, 1, __ATOMIC_RELAXED);
 
 	/* wait for schedulers
 	 * Note we use sched_yield() rather than pthread_yield() to allow
 	 * for the possibility of a pthread wrapper on lthread_yield(),
 	 * something that is not possible unless the scheduler is running.
 	 */
-	while (rte_atomic16_read(&active_schedulers) > 0)
+	while (__atomic_load_n(&active_schedulers, __ATOMIC_RELAXED) > 0)
 		sched_yield();
 
 }
diff --git a/examples/performance-thread/common/lthread_tls.c b/examples/performance-thread/common/lthread_tls.c
index 07de6cafabf5..4ab2e3558b1c 100644
--- a/examples/performance-thread/common/lthread_tls.c
+++ b/examples/performance-thread/common/lthread_tls.c
@@ -18,7 +18,6 @@
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_ring.h>
-#include <rte_atomic_64.h>
 
 #include "lthread_tls.h"
 #include "lthread_queue.h"
@@ -52,8 +51,10 @@ void _lthread_key_pool_init(void)
 
 	bzero(key_table, sizeof(key_table));
 
+	uint64_t pool_init = 0;
 	/* only one lcore should do this */
-	if (rte_atomic64_cmpset(&key_pool_init, 0, 1)) {
+	if (__atomic_compare_exchange_n(&key_pool_init, &pool_init, 1, 0,
+			__ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
 
 		snprintf(name,
 			MAX_LTHREAD_NAME_SIZE,
diff --git a/examples/performance-thread/l3fwd-thread/main.c b/examples/performance-thread/l3fwd-thread/main.c
index 2905199743a7..50ecc4e820f6 100644
--- a/examples/performance-thread/l3fwd-thread/main.c
+++ b/examples/performance-thread/l3fwd-thread/main.c
@@ -26,7 +26,6 @@
 #include <rte_memcpy.h>
 #include <rte_eal.h>
 #include <rte_launch.h>
-#include <rte_atomic.h>
 #include <rte_cycles.h>
 #include <rte_prefetch.h>
 #include <rte_lcore.h>
@@ -570,8 +569,8 @@ RTE_DEFINE_PER_LCORE(struct lcore_conf *, lcore_conf);
  */
 static int lthreads_on = 1; /**< Use lthreads for processing*/
 
-rte_atomic16_t rx_counter;  /**< Number of spawned rx threads */
-rte_atomic16_t tx_counter;  /**< Number of spawned tx threads */
+uint16_t rx_counter;  /**< Number of spawned rx threads */
+uint16_t tx_counter;  /**< Number of spawned tx threads */
 
 struct thread_conf {
 	uint16_t lcore_id;      /**< Initial lcore for rx thread */
@@ -1910,11 +1909,8 @@ cpu_load_collector(__rte_unused void *arg) {
 	printf("Waiting for %d rx threads and %d tx threads\n", n_rx_thread,
 			n_tx_thread);
 
-	while (rte_atomic16_read(&rx_counter) < n_rx_thread)
-		rte_pause();
-
-	while (rte_atomic16_read(&tx_counter) < n_tx_thread)
-		rte_pause();
+	rte_wait_until_equal_16(&rx_counter, n_rx_thread, __ATOMIC_RELAXED);
+	rte_wait_until_equal_16(&tx_counter, n_tx_thread, __ATOMIC_RELAXED);
 
 	for (i = 0; i < n_rx_thread; i++) {
 
@@ -2036,7 +2032,7 @@ lthread_tx_per_ring(void *dummy)
 	RTE_LOG(INFO, L3FWD, "entering main tx loop on lcore %u\n", rte_lcore_id());
 
 	nb_rx = 0;
-	rte_atomic16_inc(&tx_counter);
+	__atomic_fetch_add(&tx_counter, 1, __ATOMIC_RELAXED);
 	while (1) {
 
 		/*
@@ -2161,7 +2157,7 @@ lthread_rx(void *dummy)
 	worker_id = 0;
 
 	rx_conf->conf.cpu_id = sched_getcpu();
-	rte_atomic16_inc(&rx_counter);
+	__atomic_fetch_add(&rx_counter, 1, __ATOMIC_RELAXED);
 	while (1) {
 
 		/*
@@ -2243,7 +2239,7 @@ lthread_spawner(__rte_unused void *arg)
 	 * scheduler as this lthread, yielding is required to let them to run and
 	 * prevent deadlock here.
 	 */
-	while (rte_atomic16_read(&rx_counter) < n_rx_thread)
+	while (__atomic_load_n(&rx_counter, __ATOMIC_RELAXED) < n_rx_thread)
 		lthread_sleep(100000);
 
 	/*
@@ -2323,7 +2319,7 @@ pthread_tx(void *dummy)
 	RTE_LOG(INFO, L3FWD, "Entering main Tx loop on lcore %u\n", rte_lcore_id());
 
 	tx_conf->conf.cpu_id = sched_getcpu();
-	rte_atomic16_inc(&tx_counter);
+	__atomic_fetch_add(&tx_counter, 1, __ATOMIC_RELAXED);
 	while (1) {
 
 		cur_tsc = rte_rdtsc();
@@ -2406,7 +2402,7 @@ pthread_rx(void *dummy)
 
 	worker_id = 0;
 	rx_conf->conf.cpu_id = sched_getcpu();
-	rte_atomic16_inc(&rx_counter);
+	__atomic_fetch_add(&rx_counter, 1, __ATOMIC_RELAXED);
 	while (1) {
 
 		/*
-- 
2.25.1