All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/3] reimplement rwlock and add relevant perf test case
@ 2019-01-15 13:12 Joyce Kong
  2019-01-15 13:12 ` [PATCH v2 1/3] rwlock: reimplement with __atomic builtins Joyce Kong
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Joyce Kong @ 2019-01-15 13:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, bruce.richardson, chaozhu,
	honnappa.nagarahalli, nd

v2:
	Rebase and modify the rwlock test case to address the comments in v1.

v1: reimplement rwlock with __atomic builtins, and add a rwlock perf test
    on all available cores to benchmark the improvement.

	We tested the patches on three arm64 platforms.
	ThunderX2 gained 20% performance, Qualcomm gained 36% and
	the 4-Cortex-A72 Marvell MACCHIATObin gained 19.6%.
	
	Below is the detailed test result on ThunderX2:

	*** rwlock_autotest without __atomic builtins *** Rwlock Perf Test on 128 cores...
	Core [0] count = 281
	Core [1] count = 252
	Core [2] count = 290
	Core [3] count = 259
	Core [4] count = 287
	...
	Core [209] count = 3
	Core [210] count = 31
	Core [211] count = 120
	Total count = 18537

	*** rwlock_autotest with __atomic builtins *** Rwlock Perf Test on 128 cores...
	Core [0] count = 346
	Core [1] count = 355
	Core [2] count = 259
	Core [3] count = 285
	Core [4] count = 320
	...
	Core [209] count = 2
	Core [210] count = 23
	Core [211] count = 63
	Total count = 22194

Gavin Hu (1):
  rwlock: reimplement with __atomic builtins

Joyce Kong (2):
  test/rwlock: add perf test case
  test/rwlock: amortize the cost of getting time

 lib/librte_eal/common/include/generic/rte_rwlock.h | 16 ++---
 test/test/test_rwlock.c                            | 75 ++++++++++++++++++++++
 2 files changed, 83 insertions(+), 8 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2 1/3] rwlock: reimplement with __atomic builtins
  2019-01-15 13:12 [PATCH v2 0/3] reimplement rwlock and add relevant perf test case Joyce Kong
@ 2019-01-15 13:12 ` Joyce Kong
  2019-03-12 18:08   ` [EXT] " Jerin Jacob Kollanukkaran
  2019-01-15 13:12 ` [PATCH v2 2/3] test/rwlock: add perf test case Joyce Kong
  2019-01-15 13:12 ` [PATCH v2 3/3] test/rwlock: amortize the cost of getting time Joyce Kong
  2 siblings, 1 reply; 7+ messages in thread
From: Joyce Kong @ 2019-01-15 13:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, bruce.richardson, chaozhu,
	honnappa.nagarahalli, nd, Gavin Hu, stable

From: Gavin Hu <gavin.hu@arm.com>

The __sync builtin based implementation generates full memory barriers
('dmb ish') on Arm platforms. Use C11 atomic builtins instead to
generate one-way barriers.

Here is the assembly code generated for the __sync_bool_compare_and_swap builtin:
__sync_bool_compare_and_swap(dst, exp, src);
   0x000000000090f1b0 <+16>:    e0 07 40 f9 ldr x0, [sp, #8]
   0x000000000090f1b4 <+20>:    e1 0f 40 79 ldrh    w1, [sp, #6]
   0x000000000090f1b8 <+24>:    e2 0b 40 79 ldrh    w2, [sp, #4]
   0x000000000090f1bc <+28>:    21 3c 00 12 and w1, w1, #0xffff
   0x000000000090f1c0 <+32>:    03 7c 5f 48 ldxrh   w3, [x0]
   0x000000000090f1c4 <+36>:    7f 00 01 6b cmp w3, w1
   0x000000000090f1c8 <+40>:    61 00 00 54 b.ne    0x90f1d4
<rte_atomic16_cmpset+52>  // b.any
   0x000000000090f1cc <+44>:    02 fc 04 48 stlxrh  w4, w2, [x0]
   0x000000000090f1d0 <+48>:    84 ff ff 35 cbnz    w4, 0x90f1c0
<rte_atomic16_cmpset+32>
   0x000000000090f1d4 <+52>:    bf 3b 03 d5 dmb ish
   0x000000000090f1d8 <+56>:    e0 17 9f 1a cset    w0, eq  // eq = none

Fixes: af75078fece3 ("first public release")
Cc: stable@dpdk.org

Signed-off-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
Reviewed-by: Joyce Kong <joyce.kong@arm.com>
Tested-by: Joyce Kong <joyce.kong@arm.com>
---
 lib/librte_eal/common/include/generic/rte_rwlock.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/include/generic/rte_rwlock.h b/lib/librte_eal/common/include/generic/rte_rwlock.h
index b05d85a..3317a21 100644
--- a/lib/librte_eal/common/include/generic/rte_rwlock.h
+++ b/lib/librte_eal/common/include/generic/rte_rwlock.h
@@ -64,14 +64,14 @@ rte_rwlock_read_lock(rte_rwlock_t *rwl)
 	int success = 0;
 
 	while (success == 0) {
-		x = rwl->cnt;
+		x = __atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED);
 		/* write lock is held */
 		if (x < 0) {
 			rte_pause();
 			continue;
 		}
-		success = rte_atomic32_cmpset((volatile uint32_t *)&rwl->cnt,
-					      (uint32_t)x, (uint32_t)(x + 1));
+		success = __atomic_compare_exchange_n(&rwl->cnt, &x, x+1, 1,
+					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
 	}
 }
 
@@ -114,7 +114,7 @@ rte_rwlock_read_trylock(rte_rwlock_t *rwl)
 static inline void
 rte_rwlock_read_unlock(rte_rwlock_t *rwl)
 {
-	rte_atomic32_dec((rte_atomic32_t *)(intptr_t)&rwl->cnt);
+	__atomic_fetch_sub(&rwl->cnt, 1, __ATOMIC_RELEASE);
 }
 
 /**
@@ -156,14 +156,14 @@ rte_rwlock_write_lock(rte_rwlock_t *rwl)
 	int success = 0;
 
 	while (success == 0) {
-		x = rwl->cnt;
+		x = __atomic_load_n(&rwl->cnt, __ATOMIC_RELAXED);
 		/* a lock is held */
 		if (x != 0) {
 			rte_pause();
 			continue;
 		}
-		success = rte_atomic32_cmpset((volatile uint32_t *)&rwl->cnt,
-					      0, (uint32_t)-1);
+		success = __atomic_compare_exchange_n(&rwl->cnt, &x, -1, 1,
+					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
 	}
 }
 
@@ -176,7 +176,7 @@ rte_rwlock_write_lock(rte_rwlock_t *rwl)
 static inline void
 rte_rwlock_write_unlock(rte_rwlock_t *rwl)
 {
-	rte_atomic32_inc((rte_atomic32_t *)(intptr_t)&rwl->cnt);
+	__atomic_store_n(&rwl->cnt, 0, __ATOMIC_RELEASE);
 }
 
 /**
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 2/3] test/rwlock: add perf test case
  2019-01-15 13:12 [PATCH v2 0/3] reimplement rwlock and add relevant perf test case Joyce Kong
  2019-01-15 13:12 ` [PATCH v2 1/3] rwlock: reimplement with __atomic builtins Joyce Kong
@ 2019-01-15 13:12 ` Joyce Kong
  2019-01-15 13:12 ` [PATCH v2 3/3] test/rwlock: amortize the cost of getting time Joyce Kong
  2 siblings, 0 replies; 7+ messages in thread
From: Joyce Kong @ 2019-01-15 13:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, bruce.richardson, chaozhu,
	honnappa.nagarahalli, nd, stable

Add a performance test that runs on all available cores to benchmark
how rwlock performance scales.

Fixes: af75078fece3 ("first public release")
Cc: stable@dpdk.org

Suggested-by: Gavin Hu <gavin.hu@arm.com>
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 test/test/test_rwlock.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/test/test/test_rwlock.c b/test/test/test_rwlock.c
index 224f0de..250ea5a 100644
--- a/test/test/test_rwlock.c
+++ b/test/test/test_rwlock.c
@@ -36,6 +36,7 @@
 
 static rte_rwlock_t sl;
 static rte_rwlock_t sl_tab[RTE_MAX_LCORE];
+static rte_atomic32_t synchro;
 
 enum {
 	LC_TYPE_RDLOCK,
@@ -83,6 +84,75 @@ test_rwlock_per_core(__attribute__((unused)) void *arg)
 	return 0;
 }
 
+static rte_rwlock_t lk = RTE_RWLOCK_INITIALIZER;
+static volatile uint64_t rwlock_data;
+static uint64_t lock_count[RTE_MAX_LCORE] = {0};
+
+#define TIME_MS 100
+
+static int
+load_loop_fn(__attribute__((unused)) void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for slaves */
+	if (lcore != rte_get_master_lcore())
+		while (rte_atomic32_read(&synchro) == 0)
+			;
+
+	begin = rte_rdtsc_precise();
+	while (time_diff < hz * TIME_MS / 1000) {
+		rte_rwlock_write_lock(&lk);
+		++rwlock_data;
+		rte_rwlock_write_unlock(&lk);
+
+		rte_rwlock_read_lock(&lk);
+		printf("Core [%u] rwlock_data = %"PRIu64"\n",
+				lcore, rwlock_data);
+		rte_rwlock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+		time_diff = rte_rdtsc_precise() - begin;
+	}
+
+	lock_count[lcore] = lcount;
+	return 0;
+}
+
+static int
+test_rwlock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nRwlock Perf Test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start slaves */
+	rte_atomic32_set(&synchro, 0);
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MASTER) < 0)
+		return -1;
+
+	/* start synchro and launch test on master */
+	rte_atomic32_set(&synchro, 1);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] count = %"PRIu64"\n", i, lock_count[i]);
+		total += lock_count[i];
+	}
+
+	printf("Total count = %"PRIu64"\n", total);
+
+	return 0;
+}
+
 /*
  * - There is a global rwlock and a table of rwlocks (one per lcore).
  *
@@ -132,6 +202,9 @@ rwlock_test1(void)
 
 	rte_eal_mp_wait_lcore();
 
+	if (test_rwlock_perf() < 0)
+		return -1;
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 3/3] test/rwlock: amortize the cost of getting time
  2019-01-15 13:12 [PATCH v2 0/3] reimplement rwlock and add relevant perf test case Joyce Kong
  2019-01-15 13:12 ` [PATCH v2 1/3] rwlock: reimplement with __atomic builtins Joyce Kong
  2019-01-15 13:12 ` [PATCH v2 2/3] test/rwlock: add perf test case Joyce Kong
@ 2019-01-15 13:12 ` Joyce Kong
  2 siblings, 0 replies; 7+ messages in thread
From: Joyce Kong @ 2019-01-15 13:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, hemant.agrawal, bruce.richardson, chaozhu,
	honnappa.nagarahalli, nd, stable

Fixes: af75078fece3 ("first public release")
Cc: stable@dpdk.org

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 test/test/test_rwlock.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/test/test/test_rwlock.c b/test/test/test_rwlock.c
index 250ea5a..d42ac35 100644
--- a/test/test/test_rwlock.c
+++ b/test/test/test_rwlock.c
@@ -86,9 +86,9 @@ test_rwlock_per_core(__attribute__((unused)) void *arg)
 
 static rte_rwlock_t lk = RTE_RWLOCK_INITIALIZER;
 static volatile uint64_t rwlock_data;
-static uint64_t lock_count[RTE_MAX_LCORE] = {0};
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
 
-#define TIME_MS 100
+#define MAX_LOOP 10000
 
 static int
 load_loop_fn(__attribute__((unused)) void *arg)
@@ -104,7 +104,7 @@ load_loop_fn(__attribute__((unused)) void *arg)
 			;
 
 	begin = rte_rdtsc_precise();
-	while (time_diff < hz * TIME_MS / 1000) {
+	while (lcount < MAX_LOOP) {
 		rte_rwlock_write_lock(&lk);
 		++rwlock_data;
 		rte_rwlock_write_unlock(&lk);
@@ -117,10 +117,10 @@ load_loop_fn(__attribute__((unused)) void *arg)
 		lcount++;
 		/* delay to make lock duty cycle slightly realistic */
 		rte_pause();
-		time_diff = rte_rdtsc_precise() - begin;
 	}
 
-	lock_count[lcore] = lcount;
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
 	return 0;
 }
 
@@ -144,11 +144,13 @@ test_rwlock_perf(void)
 	rte_eal_mp_wait_lcore();
 
 	RTE_LCORE_FOREACH(i) {
-		printf("Core [%u] count = %"PRIu64"\n", i, lock_count[i]);
-		total += lock_count[i];
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+				i, time_count[i]);
+		total += time_count[i];
 	}
 
-	printf("Total count = %"PRIu64"\n", total);
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [EXT] [PATCH v2 1/3] rwlock: reimplement with __atomic builtins
  2019-01-15 13:12 ` [PATCH v2 1/3] rwlock: reimplement with __atomic builtins Joyce Kong
@ 2019-03-12 18:08   ` Jerin Jacob Kollanukkaran
  2019-03-14 13:17     ` Joyce Kong (Arm Technology China)
  2019-03-15  3:40     ` Gavin Hu (Arm Technology China)
  0 siblings, 2 replies; 7+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-03-12 18:08 UTC (permalink / raw)
  To: joyce.kong, dev
  Cc: chaozhu, nd, bruce.richardson, thomas, hemant.agrawal, stable,
	honnappa.nagarahalli, gavin.hu

On Tue, 2019-01-15 at 21:12 +0800, Joyce Kong wrote:
> From: Gavin Hu <gavin.hu@arm.com>
> 
> The __sync builtin based implementation generates full memory
> barriers
> ('dmb ish') on Arm platforms. Using C11 atomic builtins to generate
> one
> way barriers.
> 
> Here is the assembly code of __sync_compare_and_swap builtin.
> __sync_bool_compare_and_swap(dst, exp, src);
>    0x000000000090f1b0 <+16>:    e0 07 40 f9 ldr x0, [sp, #8]
>    0x000000000090f1b4 <+20>:    e1 0f 40 79 ldrh    w1, [sp, #6]
>    0x000000000090f1b8 <+24>:    e2 0b 40 79 ldrh    w2, [sp, #4]
>    0x000000000090f1bc <+28>:    21 3c 00 12 and w1, w1, #0xffff
>    0x000000000090f1c0 <+32>:    03 7c 5f 48 ldxrh   w3, [x0]
>    0x000000000090f1c4 <+36>:    7f 00 01 6b cmp w3, w1
>    0x000000000090f1c8 <+40>:    61 00 00 54 b.ne    0x90f1d4
> <rte_atomic16_cmpset+52>  // b.any
>    0x000000000090f1cc <+44>:    02 fc 04 48 stlxrh  w4, w2, [x0]
>    0x000000000090f1d0 <+48>:    84 ff ff 35 cbnz    w4, 0x90f1c0
> <rte_atomic16_cmpset+32>
>    0x000000000090f1d4 <+52>:    bf 3b 03 d5 dmb ish
>    0x000000000090f1d8 <+56>:    e0 17 9f 1a cset    w0, eq  // eq =
> none
> 
> Fixes: af75078fece3 ("first public release")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> Reviewed-by: Joyce Kong <joyce.kong@arm.com>
> Tested-by: Joyce Kong <joyce.kong@arm.com>
> ---

Nit:
Fix following git-checklog.sh issue

Wrong headline format:
	rwlock: reimplement with __atomic builtins


With above fix:
Acked-by: Jerin Jacob <jerinj@marvell.com>

Performance on octeontx2: +88% improvement with this patch.





^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [EXT] [PATCH v2 1/3] rwlock: reimplement with __atomic builtins
  2019-03-12 18:08   ` [EXT] " Jerin Jacob Kollanukkaran
@ 2019-03-14 13:17     ` Joyce Kong (Arm Technology China)
  2019-03-15  3:40     ` Gavin Hu (Arm Technology China)
  1 sibling, 0 replies; 7+ messages in thread
From: Joyce Kong (Arm Technology China) @ 2019-03-14 13:17 UTC (permalink / raw)
  To: jerinj, dev
  Cc: chaozhu, nd, bruce.richardson, thomas, hemant.agrawal, stable,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China)

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Wednesday, March 13, 2019 2:09 AM
> To: Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>;
> dev@dpdk.org
> Cc: chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> bruce.richardson@intel.com; thomas@monjalon.net;
> hemant.agrawal@nxp.com; stable@dpdk.org; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>
> Subject: Re: [EXT] [PATCH v2 1/3] rwlock: reimplement with __atomic builtins
> 
> On Tue, 2019-01-15 at 21:12 +0800, Joyce Kong wrote:
> > From: Gavin Hu <gavin.hu@arm.com>
> >
> > The __sync builtin based implementation generates full memory barriers
> > ('dmb ish') on Arm platforms. Using C11 atomic builtins to generate
> > one way barriers.
> >
> > Here is the assembly code of __sync_compare_and_swap builtin.
> > __sync_bool_compare_and_swap(dst, exp, src);
> >    0x000000000090f1b0 <+16>:    e0 07 40 f9 ldr x0, [sp, #8]
> >    0x000000000090f1b4 <+20>:    e1 0f 40 79 ldrh    w1, [sp, #6]
> >    0x000000000090f1b8 <+24>:    e2 0b 40 79 ldrh    w2, [sp, #4]
> >    0x000000000090f1bc <+28>:    21 3c 00 12 and w1, w1, #0xffff
> >    0x000000000090f1c0 <+32>:    03 7c 5f 48 ldxrh   w3, [x0]
> >    0x000000000090f1c4 <+36>:    7f 00 01 6b cmp w3, w1
> >    0x000000000090f1c8 <+40>:    61 00 00 54 b.ne    0x90f1d4
> > <rte_atomic16_cmpset+52>  // b.any
> >    0x000000000090f1cc <+44>:    02 fc 04 48 stlxrh  w4, w2, [x0]
> >    0x000000000090f1d0 <+48>:    84 ff ff 35 cbnz    w4, 0x90f1c0
> > <rte_atomic16_cmpset+32>
> >    0x000000000090f1d4 <+52>:    bf 3b 03 d5 dmb ish
> >    0x000000000090f1d8 <+56>:    e0 17 9f 1a cset    w0, eq  // eq =
> > none
> >
> > Fixes: af75078fece3 ("first public release")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > Reviewed-by: Joyce Kong <joyce.kong@arm.com>
> > Tested-by: Joyce Kong <joyce.kong@arm.com>
> > ---
> 
> Nit:
> Fix following git-checklog.sh issue
> 
> Wrong headline format:
> 	rwlock: reimplement with __atomic builtins
> 
The headline format error has been fixed in v3.

> With above fix:
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> 
> Performance on octeontx2: +88% improvement with this patch.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [EXT] [PATCH v2 1/3] rwlock: reimplement with __atomic builtins
  2019-03-12 18:08   ` [EXT] " Jerin Jacob Kollanukkaran
  2019-03-14 13:17     ` Joyce Kong (Arm Technology China)
@ 2019-03-15  3:40     ` Gavin Hu (Arm Technology China)
  1 sibling, 0 replies; 7+ messages in thread
From: Gavin Hu (Arm Technology China) @ 2019-03-15  3:40 UTC (permalink / raw)
  To: jerinj, Joyce Kong (Arm Technology China), dev
  Cc: chaozhu, nd, bruce.richardson, thomas, hemant.agrawal, stable,
	Honnappa Nagarahalli

Hi Jerin,

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Wednesday, March 13, 2019 2:09 AM
> To: Joyce Kong (Arm Technology China) <Joyce.Kong@arm.com>;
> dev@dpdk.org
> Cc: chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> bruce.richardson@intel.com; thomas@monjalon.net;
> hemant.agrawal@nxp.com; stable@dpdk.org; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>
> Subject: Re: [EXT] [PATCH v2 1/3] rwlock: reimplement with __atomic
> builtins
> 
> On Tue, 2019-01-15 at 21:12 +0800, Joyce Kong wrote:
> > From: Gavin Hu <gavin.hu@arm.com>
> >
> > The __sync builtin based implementation generates full memory
> > barriers
> > ('dmb ish') on Arm platforms. Using C11 atomic builtins to generate
> > one
> > way barriers.
> >
> > Here is the assembly code of __sync_compare_and_swap builtin.
> > __sync_bool_compare_and_swap(dst, exp, src);
> >    0x000000000090f1b0 <+16>:    e0 07 40 f9 ldr x0, [sp, #8]
> >    0x000000000090f1b4 <+20>:    e1 0f 40 79 ldrh    w1, [sp, #6]
> >    0x000000000090f1b8 <+24>:    e2 0b 40 79 ldrh    w2, [sp, #4]
> >    0x000000000090f1bc <+28>:    21 3c 00 12 and w1, w1, #0xffff
> >    0x000000000090f1c0 <+32>:    03 7c 5f 48 ldxrh   w3, [x0]
> >    0x000000000090f1c4 <+36>:    7f 00 01 6b cmp w3, w1
> >    0x000000000090f1c8 <+40>:    61 00 00 54 b.ne    0x90f1d4
> > <rte_atomic16_cmpset+52>  // b.any
> >    0x000000000090f1cc <+44>:    02 fc 04 48 stlxrh  w4, w2, [x0]
> >    0x000000000090f1d0 <+48>:    84 ff ff 35 cbnz    w4, 0x90f1c0
> > <rte_atomic16_cmpset+32>
> >    0x000000000090f1d4 <+52>:    bf 3b 03 d5 dmb ish
> >    0x000000000090f1d8 <+56>:    e0 17 9f 1a cset    w0, eq  // eq =
> > none
> >
> > Fixes: af75078fece3 ("first public release")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > Reviewed-by: Joyce Kong <joyce.kong@arm.com>
> > Tested-by: Joyce Kong <joyce.kong@arm.com>
> > ---
> 
> Nit:
> Fix following git-checklog.sh issue
> 
> Wrong headline format:
> 	rwlock: reimplement with __atomic builtins
> 
> 
> With above fix:
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> 
> Performance on octeontx2: +88% improvement with this patch.
> 
Given your test result, I assume you applied all three patches, not just this one.
Would you review/ack the other two patches?
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-03-15  3:40 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-15 13:12 [PATCH v2 0/3] reimplement rwlock and add relevant perf test case Joyce Kong
2019-01-15 13:12 ` [PATCH v2 1/3] rwlock: reimplement with __atomic builtins Joyce Kong
2019-03-12 18:08   ` [EXT] " Jerin Jacob Kollanukkaran
2019-03-14 13:17     ` Joyce Kong (Arm Technology China)
2019-03-15  3:40     ` Gavin Hu (Arm Technology China)
2019-01-15 13:12 ` [PATCH v2 2/3] test/rwlock: add perf test case Joyce Kong
2019-01-15 13:12 ` [PATCH v2 3/3] test/rwlock: amortize the cost of getting time Joyce Kong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.