* [PATCH 0/1] Add 128-bit compare and set
@ 2019-01-28 17:29 Gage Eads
  2019-01-28 17:29 ` [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only) Gage Eads
                   ` (2 more replies)
  0 siblings, 3 replies; 36+ messages in thread
From: Gage Eads @ 2019-01-28 17:29 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

This patch addresses x86-64 only; other architectures can and will be
supported in the future. The __atomic intrinsics were considered for the
implementation; however, libatomic was found[1] to use locks to implement
the 128-bit CAS on at least one architecture, so they are eschewed here.
The interface is modeled after __atomic_compare_exchange_16 (which is
itself based on the C++11 memory model) to best support weak-consistency
architectures.
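
As a rough usage illustration (not part of the patch), this kind of CAS is
typically used to update a pointer together with an ABA-protection counter
in a single operation. The loop below is only a sketch written against the
interface proposed here; the pointer/counter layout and the helper name are
hypothetical:

static inline void
head_update(rte_int128_t *head, void *new_top)
{
	rte_int128_t old, upd;

	do {
		/* Plain 64-bit reads are sufficient here; a concurrent
		 * modification is caught by the CAS below.
		 */
		old.val[0] = head->val[0];
		old.val[1] = head->val[1];

		upd.val[0] = (uintptr_t)new_top;
		upd.val[1] = old.val[1] + 1; /* bump the ABA counter */
	} while (!rte_atomic128_cmpset(head, &old, &upd, 0,
				       RTE_ATOMIC_RELEASE,
				       RTE_ATOMIC_RELAXED));
}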

This patch was originally part of a series that introduces a non-blocking
stack mempool handler[2], and is required by a non-blocking ring patchset.
It was spun off so that the NB ring depends only on this patch and not on
the entire non-blocking stack patchset.

[1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
[2] http://mails.dpdk.org/archives/dev/2019-January/123653.html

Gage Eads (1):
  eal: add 128-bit cmpset (x86-64 only)

 .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 65 ++++++++++++++++++++++
 2 files changed, 96 insertions(+)

-- 
2.13.6

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-01-28 17:29 [PATCH 0/1] Add 128-bit compare and set Gage Eads
@ 2019-01-28 17:29 ` Gage Eads
  2019-01-28 23:01   ` Ola Liljedahl
  2019-01-31  5:48   ` Honnappa Nagarahalli
  2019-02-22 15:46 ` [PATCH v2 0/1] Add 128-bit compare and set Gage Eads
  2019-04-03 19:35 ` [PATCH v5] eal/x86: add 128-bit atomic compare exchange Thomas Monjalon
  2 siblings, 2 replies; 36+ messages in thread
From: Gage Eads @ 2019-01-28 17:29 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

This operation can be used for non-blocking algorithms, such as a
non-blocking stack or ring.

Signed-off-by: Gage Eads <gage.eads@intel.com>
---
 .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 65 ++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index fd2ec9c53..b7b90b83e 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -34,6 +34,7 @@
 /*
  * Inspired from FreeBSD src/sys/amd64/include/atomic.h
  * Copyright (c) 1998 Doug Rabson
+ * Copyright (c) 2019 Intel Corporation
  * All rights reserved.
  */
 
@@ -46,6 +47,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 #include <rte_atomic.h>
 
 /*------------------------- 64 bit atomic operations -------------------------*/
@@ -208,4 +210,33 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+static inline int __rte_experimental
+rte_atomic128_cmpset(volatile rte_int128_t *dst,
+		     rte_int128_t *exp, rte_int128_t *src,
+		     unsigned int weak,
+		     enum rte_atomic_memmodel_t success,
+		     enum rte_atomic_memmodel_t failure)
+{
+	RTE_SET_USED(weak);
+	RTE_SET_USED(success);
+	RTE_SET_USED(failure);
+	uint8_t res;
+
+	asm volatile (
+		      MPLOCKED
+		      "cmpxchg16b %[dst];"
+		      " sete %[res]"
+		      : [dst] "=m" (dst->val[0]),
+			"=A" (exp->val[0]),
+			[res] "=r" (res)
+		      : "c" (src->val[1]),
+			"b" (src->val[0]),
+			"m" (dst->val[0]),
+			"d" (exp->val[1]),
+			"a" (exp->val[0])
+		      : "memory");
+
+	return res;
+}
+
 #endif /* _RTE_ATOMIC_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index b99ba4688..8d612d566 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -14,6 +14,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 
 #ifdef __DOXYGEN__
 
@@ -1082,4 +1083,68 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+/**
+ * 128-bit integer structure.
+ */
+typedef struct {
+	uint64_t val[2];
+} __rte_aligned(16) rte_int128_t;
+
+/**
+ * Memory consistency models used in atomic operations. These control the
+ * behavior of the operation with respect to memory barriers and
+ * thread synchronization.
+ *
+ * These directly match those in the C++11 standard; for details on their
+ * behavior, refer to the standard.
+ */
+enum rte_atomic_memmodel_t {
+	RTE_ATOMIC_RELAXED,
+	RTE_ATOMIC_CONSUME,
+	RTE_ATOMIC_ACQUIRE,
+	RTE_ATOMIC_RELEASE,
+	RTE_ATOMIC_ACQ_REL,
+	RTE_ATOMIC_SEQ_CST,
+};
+
+/* Only implemented on x86-64 currently. The ifdef prevents compilation from
+ * failing for architectures without a definition of this function.
+ */
+#if defined(RTE_ARCH_X86_64)
+/**
+ * An atomic compare and set function used by the mutex functions.
+ * (atomic) equivalent to:
+ *   if (*dst == exp)
+ *     *dst = src (all 128-bit words)
+ *
+ * @param dst
+ *   The destination into which the value will be written.
+ * @param exp
+ *   Pointer to the expected value. If the operation fails, this memory is
+ *   updated with the actual value.
+ * @param src
+ *   Pointer to the new value.
+ * @param weak
+ *   A value of true allows the comparison to spuriously fail. Implementations
+ *   may ignore this argument and only implement the strong variant.
+ * @param success
+ *   If successful, the operation's memory behavior conforms to this (or a
+ *   stronger) model.
+ * @param failure
+ *   If unsuccessful, the operation's memory behavior conforms to this (or a
+ *   stronger) model. This argument cannot be RTE_ATOMIC_RELEASE,
+ *   RTE_ATOMIC_ACQ_REL, or a stronger model than success.
+ * @return
+ *   Non-zero on success; 0 on failure.
+ */
+static inline int __rte_experimental
+rte_atomic128_cmpset(volatile rte_int128_t *dst,
+		     rte_int128_t *exp, rte_int128_t *src,
+		     unsigned int weak,
+		     enum rte_atomic_memmodel_t success,
+		     enum rte_atomic_memmodel_t failure);
+#endif
+
 #endif /* _RTE_ATOMIC_H_ */
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-01-28 17:29 ` [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only) Gage Eads
@ 2019-01-28 23:01   ` Ola Liljedahl
  2019-02-01 17:06     ` Eads, Gage
  2019-01-31  5:48   ` Honnappa Nagarahalli
  1 sibling, 1 reply; 36+ messages in thread
From: Ola Liljedahl @ 2019-01-28 23:01 UTC (permalink / raw)
  To: gage.eads, dev
  Cc: arybchenko, jerinj, chaozhu, nd, bruce.richardson,
	konstantin.ananyev, hemant.agrawal, olivier.matz,
	Honnappa Nagarahalli, Gavin Hu (Arm Technology China)

On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> This operation can be used for non-blocking algorithms, such as a
> non-blocking stack or ring.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> ---
>  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
>  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> ++++++++++++++++++++++
>  2 files changed, 96 insertions(+)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> index fd2ec9c53..b7b90b83e 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -34,6 +34,7 @@
>  /*
>   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
>   * Copyright (c) 1998 Doug Rabson
> + * Copyright (c) 2019 Intel Corporation
>   * All rights reserved.
>   */
>  
> @@ -46,6 +47,7 @@
>  
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include <rte_atomic.h>
>  
>  /*------------------------- 64 bit atomic operations ------------------------
> -*/
> @@ -208,4 +210,33 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
>  }
>  #endif
>  
> +static inline int __rte_experimental
__rte_always_inline?

> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
No need to declare the location volatile. Volatile doesn't do what you think it
does.
https://youtu.be/lkgszkPnV8g?t=1027


> +		     rte_int128_t *exp,
I would declare 'exp' const as well and document that 'exp' is not updated (with
the old value) on failure. The reason is that ARMv8.0/AArch64 cannot
atomically read the old value without also writing the location, and that is bad
for performance (unnecessary writes lead to unnecessary contention and worse
scalability). And the user must anyway read the location (at the start of the
critical section) using e.g. non-atomic 64-bit reads, so there isn't actually any
requirement for an atomic 128-bit read of the location.

>  rte_int128_t *src,
const rte_int128_t *src?

But why are we not passing 'exp' and 'src' by value? That works great, even with
structs. Passing by value simplifies the compiler's life, especially if the call
is inlined. Ask a compiler developer.
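
To make these suggestions concrete, the prototype with the suggested
qualifiers, and a pass-by-value alternative, would look roughly as follows
(sketches of the review feedback only, not a final proposal):

/* by reference; 'exp' and 'src' are never written to */
static inline int
rte_atomic128_cmpset(rte_int128_t *dst,
		     const rte_int128_t *exp, const rte_int128_t *src,
		     unsigned int weak,
		     enum rte_atomic_memmodel_t success,
		     enum rte_atomic_memmodel_t failure);

/* by value */
static inline int
rte_atomic128_cmpset(rte_int128_t *dst,
		     rte_int128_t exp, rte_int128_t src,
		     unsigned int weak,
		     enum rte_atomic_memmodel_t success,
		     enum rte_atomic_memmodel_t failure);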

> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure)
> +{
> +	RTE_SET_USED(weak);
> +	RTE_SET_USED(success);
> +	RTE_SET_USED(failure);
> +	uint8_t res;
> +
> +	asm volatile (
> +		      MPLOCKED
> +		      "cmpxchg16b %[dst];"
> +		      " sete %[res]"
> +		      : [dst] "=m" (dst->val[0]),
> +			"=A" (exp->val[0]),
> +			[res] "=r" (res)
> +		      : "c" (src->val[1]),
> +			"b" (src->val[0]),
> +			"m" (dst->val[0]),
> +			"d" (exp->val[1]),
> +			"a" (exp->val[0])
> +		      : "memory");
> +
> +	return res;
> +}
> +
>  #endif /* _RTE_ATOMIC_X86_64_H_ */
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index b99ba4688..8d612d566 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -14,6 +14,7 @@
>  
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  
>  #ifdef __DOXYGEN__
>  
> @@ -1082,4 +1083,68 @@ static inline void rte_atomic64_clear(rte_atomic64_t
> *v)
>  }
>  #endif
>  
> +/*------------------------ 128 bit atomic operations ------------------------
> -*/
> +
> +/**
> + * 128-bit integer structure.
> + */
> +typedef struct {
> +	uint64_t val[2];
> +} __rte_aligned(16) rte_int128_t;
So we can't use __int128?

> +
> +/**
> + * Memory consistency models used in atomic operations. These control the
> + * behavior of the operation with respect to memory barriers and
> + * thread synchronization.
> + *
> + * These directly match those in the C++11 standard; for details on their
> + * behavior, refer to the standard.
> + */
> +enum rte_atomic_memmodel_t {
> +	RTE_ATOMIC_RELAXED,
> +	RTE_ATOMIC_CONSUME,
> +	RTE_ATOMIC_ACQUIRE,
> +	RTE_ATOMIC_RELEASE,
> +	RTE_ATOMIC_ACQ_REL,
> +	RTE_ATOMIC_SEQ_CST,
> +};
> +
> +/* Only implemented on x86-64 currently. The ifdef prevents compilation from
> + * failing for architectures without a definition of this function.
> + */
> +#if defined(RTE_ARCH_X86_64)
> +/**
> + * An atomic compare and set function used by the mutex functions.
> + * (atomic) equivalent to:
> + *   if (*dst == exp)
> + *     *dst = src (all 128-bit words)
> + *
> + * @param dst
> + *   The destination into which the value will be written.
> + * @param exp
> + *   Pointer to the expected value. If the operation fails, this memory is
> + *   updated with the actual value.
> + * @param src
> + *   Pointer to the new value.
> + * @param weak
> + *   A value of true allows the comparison to spuriously fail.
> Implementations
> + *   may ignore this argument and only implement the strong variant.
> + * @param success
> + *   If successful, the operation's memory behavior conforms to this (or a
> + *   stronger) model.
> + * @param failure
> + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> + *   stronger) model. This argument cannot be RTE_ATOMIC_RELEASE,
> + *   RTE_ATOMIC_ACQ_REL, or a stronger model than success.
> + * @return
> + *   Non-zero on success; 0 on failure.
> + */
> +static inline int __rte_experimental
> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> +		     rte_int128_t *exp, rte_int128_t *src,
> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure);
> +#endif
> +
>  #endif /* _RTE_ATOMIC_H_ */
-- 
Ola Liljedahl, Networking System Architect, Arm
Phone +46706866373, Skype ola.liljedahl


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-01-28 17:29 ` [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only) Gage Eads
  2019-01-28 23:01   ` Ola Liljedahl
@ 2019-01-31  5:48   ` Honnappa Nagarahalli
  2019-02-01 17:11     ` Eads, Gage
  1 sibling, 1 reply; 36+ messages in thread
From: Honnappa Nagarahalli @ 2019-01-31  5:48 UTC (permalink / raw)
  To: Gage Eads, dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	Gavin Hu (Arm Technology China),
	nd, chaozhu, jerinj, hemant.agrawal, nd

> 
> This operation can be used for non-blocking algorithms, such as a non-
> blocking stack or ring.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> ---
>  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
>  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> ++++++++++++++++++++++
>  2 files changed, 96 insertions(+)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> index fd2ec9c53..b7b90b83e 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -34,6 +34,7 @@
>  /*
>   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
>   * Copyright (c) 1998 Doug Rabson
> + * Copyright (c) 2019 Intel Corporation
>   * All rights reserved.
>   */
> 
> @@ -46,6 +47,7 @@
> 
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include <rte_atomic.h>
> 
>  /*------------------------- 64 bit atomic operations -------------------------*/ @@ -
> 208,4 +210,33 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)  }
> #endif
> 
> +static inline int __rte_experimental
> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
Does it make sense to call it rte_atomic128_compare_exchange (or ..._cmp_xchg) to indicate it is a compare-exchange operation?

> +		     rte_int128_t *exp, rte_int128_t *src,
> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure) {
> +	RTE_SET_USED(weak);
> +	RTE_SET_USED(success);
> +	RTE_SET_USED(failure);
> +	uint8_t res;
> +
> +	asm volatile (
> +		      MPLOCKED
> +		      "cmpxchg16b %[dst];"
> +		      " sete %[res]"
> +		      : [dst] "=m" (dst->val[0]),
> +			"=A" (exp->val[0]),
> +			[res] "=r" (res)
> +		      : "c" (src->val[1]),
> +			"b" (src->val[0]),
> +			"m" (dst->val[0]),
> +			"d" (exp->val[1]),
> +			"a" (exp->val[0])
> +		      : "memory");
> +
> +	return res;
> +}
> +
>  #endif /* _RTE_ATOMIC_X86_64_H_ */
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index b99ba4688..8d612d566 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -14,6 +14,7 @@
> 
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
> 
>  #ifdef __DOXYGEN__
> 
> @@ -1082,4 +1083,68 @@ static inline void
> rte_atomic64_clear(rte_atomic64_t *v)  }  #endif
> 
> +/*------------------------ 128 bit atomic operations
> +-------------------------*/
> +
> +/**
> + * 128-bit integer structure.
> + */
> +typedef struct {
> +	uint64_t val[2];
> +} __rte_aligned(16) rte_int128_t;
It looks like '__int128' is available from gcc 4.6. I think we should use '__int128'. We can have it as an internal structure for ease of programming.

> +
> +/**
> + * Memory consistency models used in atomic operations. These control
> +the
> + * behavior of the operation with respect to memory barriers and
> + * thread synchronization.
> + *
> + * These directly match those in the C++11 standard; for details on
> +their
> + * behavior, refer to the standard.
> + */
> +enum rte_atomic_memmodel_t {
> +	RTE_ATOMIC_RELAXED,
> +	RTE_ATOMIC_CONSUME,
> +	RTE_ATOMIC_ACQUIRE,
> +	RTE_ATOMIC_RELEASE,
> +	RTE_ATOMIC_ACQ_REL,
> +	RTE_ATOMIC_SEQ_CST,
> +};
IMO, we can use the GCC provided names. I do not see any advantage to defining our own.
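
In other words (a sketch of this suggestion only; the eventual v2 signature
may differ), the memory order parameters would become plain ints that take
the GCC-provided __ATOMIC_* constants:

static inline int
rte_atomic128_cmpset(volatile rte_int128_t *dst,
		     rte_int128_t *exp, rte_int128_t *src,
		     unsigned int weak,
		     int success, int failure);

/* callers would then pass e.g. __ATOMIC_ACQUIRE / __ATOMIC_RELAXED */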

> +
> +/* Only implemented on x86-64 currently. The ifdef prevents compilation
> +from
> + * failing for architectures without a definition of this function.
> + */
Minor comment. We can skip the above comments, the #if below is pretty obvious.

> +#if defined(RTE_ARCH_X86_64)
> +/**
> + * An atomic compare and set function used by the mutex functions.
> + * (atomic) equivalent to:
> + *   if (*dst == exp)
> + *     *dst = src (all 128-bit words)
> + *
> + * @param dst
> + *   The destination into which the value will be written.
> + * @param exp
> + *   Pointer to the expected value. If the operation fails, this memory is
> + *   updated with the actual value.
> + * @param src
> + *   Pointer to the new value.
> + * @param weak
> + *   A value of true allows the comparison to spuriously fail.
> Implementations
> + *   may ignore this argument and only implement the strong variant.
> + * @param success
> + *   If successful, the operation's memory behavior conforms to this (or a
> + *   stronger) model.
> + * @param failure
> + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> + *   stronger) model. This argument cannot be RTE_ATOMIC_RELEASE,
> + *   RTE_ATOMIC_ACQ_REL, or a stronger model than success.
> + * @return
> + *   Non-zero on success; 0 on failure.
> + */
> +static inline int __rte_experimental
> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> +		     rte_int128_t *exp, rte_int128_t *src,
> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure); #endif
> +
>  #endif /* _RTE_ATOMIC_H_ */
> --
> 2.13.6

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-01-28 23:01   ` Ola Liljedahl
@ 2019-02-01 17:06     ` Eads, Gage
  2019-02-01 19:01       ` Ola Liljedahl
  2019-02-04 18:33       ` Honnappa Nagarahalli
  0 siblings, 2 replies; 36+ messages in thread
From: Eads, Gage @ 2019-02-01 17:06 UTC (permalink / raw)
  To: Ola Liljedahl, dev
  Cc: arybchenko, jerinj, chaozhu, nd, Richardson, Bruce, Ananyev,
	Konstantin, hemant.agrawal, olivier.matz, Honnappa Nagarahalli,
	Gavin Hu (Arm Technology China)



> -----Original Message-----
> From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> Sent: Monday, January 28, 2019 5:02 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> olivier.matz@6wind.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > This operation can be used for non-blocking algorithms, such as a
> > non-blocking stack or ring.
> >
> > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > ---
> >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > ++++++++++++++++++++++
> >  2 files changed, 96 insertions(+)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > index fd2ec9c53..b7b90b83e 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > @@ -34,6 +34,7 @@
> >  /*
> >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> >   * Copyright (c) 1998 Doug Rabson
> > + * Copyright (c) 2019 Intel Corporation
> >   * All rights reserved.
> >   */
> >
> > @@ -46,6 +47,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >  #include <rte_atomic.h>
> >
> >  /*------------------------- 64 bit atomic operations
> > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t *v)
> >  }
> >  #endif
> >
> > +static inline int __rte_experimental
> __rte_always_inline?
> 
> > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> No need to declare the location volatile. Volatile doesn't do what you think it
> does.
> https://youtu.be/lkgszkPnV8g?t=1027
> 

I made this volatile to match the existing rte_atomicN_cmpset definitions, which presumably have a good reason for using the keyword. Maintainers, any input here?

> 
> > +		     rte_int128_t *exp,
> I would declare 'exp' const as well and document that 'exp' is not updated (with
> the old value) for a failure. The reason being that ARMv8.0/AArch64 cannot
> atomically read the old value without also writing the location and that is bad
> for performance (unnecessary writes leads to unnecessary contention and
> worse scalability). And the user must anyway read the location (in the start of
> the critical section) using e.g. non-atomic 64-bit reads so there isn't actually any
> requirement for an atomic 128-bit read of the location.
> 

Will change in v2.

> >  rte_int128_t *src,
> const rte_int128_t *src?

Sure, I don't see any harm in using const.

> 
> But why are we not passing 'exp' and 'src' by value? That works great, even with
> structs. Passing by value simplifies the compiler's life, especially if the call is
> inlined. Ask a compiler developer.

I ran objdump on the nb_stack code with both approaches, and pass-by-reference resulted in fewer overall x86_64 assembly ops.
PBV: 100 ops for push, 97 ops for pop
PBR: 92 ops for push, 84 ops for pop

(Using the in-progress v5 nb_stack code)

Another factor -- though much less compelling -- is that with pass-by-reference, the user can create a 16B structure and cast it to rte_int128_t when they call rte_atomic128_cmpset, whereas with pass-by-value they need to put that struct in a union with rte_int128_t.
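
For example (hypothetical user-side code, only to illustrate the point):

/* the user's own 16B type, used via pass-by-reference plus a cast */
struct stack_head {
	void *top;
	uint64_t cnt; /* ABA counter */
} __rte_aligned(16);

static inline int
stack_head_cas(struct stack_head *h, struct stack_head *exp,
	       struct stack_head *upd)
{
	return rte_atomic128_cmpset((volatile rte_int128_t *)h,
				    (rte_int128_t *)exp,
				    (rte_int128_t *)upd, 0,
				    RTE_ATOMIC_RELEASE,
				    RTE_ATOMIC_RELAXED);
}

With pass-by-value, 'struct stack_head' would instead need to live in a
union with rte_int128_t so an rte_int128_t value can be handed to the call.
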

> 
> > +		     unsigned int weak,
> > +		     enum rte_atomic_memmodel_t success,
> > +		     enum rte_atomic_memmodel_t failure) {
> > +	RTE_SET_USED(weak);
> > +	RTE_SET_USED(success);
> > +	RTE_SET_USED(failure);
> > +	uint8_t res;
> > +
> > +	asm volatile (
> > +		      MPLOCKED
> > +		      "cmpxchg16b %[dst];"
> > +		      " sete %[res]"
> > +		      : [dst] "=m" (dst->val[0]),
> > +			"=A" (exp->val[0]),
> > +			[res] "=r" (res)
> > +		      : "c" (src->val[1]),
> > +			"b" (src->val[0]),
> > +			"m" (dst->val[0]),
> > +			"d" (exp->val[1]),
> > +			"a" (exp->val[0])
> > +		      : "memory");
> > +
> > +	return res;
> > +}
> > +
> >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index b99ba4688..8d612d566 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -14,6 +14,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >
> >  #ifdef __DOXYGEN__
> >
> > @@ -1082,4 +1083,68 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t
> > *v)
> >  }
> >  #endif
> >
> > +/*------------------------ 128 bit atomic operations
> > +------------------------
> > -*/
> > +
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +typedef struct {
> > +	uint64_t val[2];
> > +} __rte_aligned(16) rte_int128_t;
> So we can't use __int128?
> 

I'll put it in a union with val[2], in case any implementations want to use it.
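
Roughly along these lines (a sketch of the planned v2 type; the exact
conditionals may differ):

typedef union {
	uint64_t val[2];
#ifdef RTE_ARCH_64
	__extension__ __int128 int128;
#endif
} __rte_aligned(16) rte_int128_t;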

Thanks,
Gage

[snip]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-01-31  5:48   ` Honnappa Nagarahalli
@ 2019-02-01 17:11     ` Eads, Gage
  0 siblings, 0 replies; 36+ messages in thread
From: Eads, Gage @ 2019-02-01 17:11 UTC (permalink / raw)
  To: Honnappa Nagarahalli, dev
  Cc: olivier.matz, arybchenko, Richardson, Bruce, Ananyev, Konstantin,
	Gavin Hu (Arm Technology China),
	nd, chaozhu, jerinj, hemant.agrawal, nd



> -----Original Message-----
> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Wednesday, January 30, 2019 11:48 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>; chaozhu@linux.vnet.ibm.com;
> jerinj@marvell.com; hemant.agrawal@nxp.com; nd <nd@arm.com>
> Subject: RE: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> >
> > This operation can be used for non-blocking algorithms, such as a non-
> > blocking stack or ring.
> >
> > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > ---
> >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > ++++++++++++++++++++++
> >  2 files changed, 96 insertions(+)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > index fd2ec9c53..b7b90b83e 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > @@ -34,6 +34,7 @@
> >  /*
> >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> >   * Copyright (c) 1998 Doug Rabson
> > + * Copyright (c) 2019 Intel Corporation
> >   * All rights reserved.
> >   */
> >
> > @@ -46,6 +47,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >  #include <rte_atomic.h>
> >
> >  /*------------------------- 64 bit atomic operations
> > -------------------------*/ @@ -
> > 208,4 +210,33 @@ static inline void rte_atomic64_clear(rte_atomic64_t
> > *v)  } #endif
> >
> > +static inline int __rte_experimental
> > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> Does it make sense to call is rte_atomic128_compare_exchange (or
> ..._cmp_xchg) to indicate it is a compare-exchange operation?
> 

Good point, though for v2 I'm planning to change this to true cmpset semantics (no update to exp on failure). See Ola's reply for the justification.

> > +		     rte_int128_t *exp, rte_int128_t *src,
> > +		     unsigned int weak,
> > +		     enum rte_atomic_memmodel_t success,
> > +		     enum rte_atomic_memmodel_t failure) {
> > +	RTE_SET_USED(weak);
> > +	RTE_SET_USED(success);
> > +	RTE_SET_USED(failure);
> > +	uint8_t res;
> > +
> > +	asm volatile (
> > +		      MPLOCKED
> > +		      "cmpxchg16b %[dst];"
> > +		      " sete %[res]"
> > +		      : [dst] "=m" (dst->val[0]),
> > +			"=A" (exp->val[0]),
> > +			[res] "=r" (res)
> > +		      : "c" (src->val[1]),
> > +			"b" (src->val[0]),
> > +			"m" (dst->val[0]),
> > +			"d" (exp->val[1]),
> > +			"a" (exp->val[0])
> > +		      : "memory");
> > +
> > +	return res;
> > +}
> > +
> >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index b99ba4688..8d612d566 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -14,6 +14,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >
> >  #ifdef __DOXYGEN__
> >
> > @@ -1082,4 +1083,68 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t *v)  }  #endif
> >
> > +/*------------------------ 128 bit atomic operations
> > +-------------------------*/
> > +
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +typedef struct {
> > +	uint64_t val[2];
> > +} __rte_aligned(16) rte_int128_t;
> It looks like '__int128' is available from gcc 4.6. I think we should use '__int128'.
> We can have it as an internal structure for ease of programming.
> 

Will add in v2.

> > +
> > +/**
> > + * Memory consistency models used in atomic operations. These control
> > +the
> > + * behavior of the operation with respect to memory barriers and
> > + * thread synchronization.
> > + *
> > + * These directly match those in the C++11 standard; for details on
> > +their
> > + * behavior, refer to the standard.
> > + */
> > +enum rte_atomic_memmodel_t {
> > +	RTE_ATOMIC_RELAXED,
> > +	RTE_ATOMIC_CONSUME,
> > +	RTE_ATOMIC_ACQUIRE,
> > +	RTE_ATOMIC_RELEASE,
> > +	RTE_ATOMIC_ACQ_REL,
> > +	RTE_ATOMIC_SEQ_CST,
> > +};
> IMO, we can use the GCC provided names. I do not see any advantage to
> defining our own.
> 

Will change in v2. I was trying to avoid issues with GCC versions that don't have C++11 support, but DPDK's recommended minimum version (4.9) is later than the version that added __atomic builtins (4.7).

> > +
> > +/* Only implemented on x86-64 currently. The ifdef prevents
> > +compilation from
> > + * failing for architectures without a definition of this function.
> > + */
> Minor comment. We can skip the above comments, the #if below is pretty
> obvious.
> 

Sure.

Thanks,
Gage

<snip>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-02-01 17:06     ` Eads, Gage
@ 2019-02-01 19:01       ` Ola Liljedahl
  2019-02-01 19:28         ` Eads, Gage
  2019-02-04 18:33       ` Honnappa Nagarahalli
  1 sibling, 1 reply; 36+ messages in thread
From: Ola Liljedahl @ 2019-02-01 19:01 UTC (permalink / raw)
  To: gage.eads, dev
  Cc: jerinj, chaozhu, nd, bruce.richardson, konstantin.ananyev,
	hemant.agrawal, olivier.matz, arybchenko,
	Gavin Hu (Arm Technology China),
	Honnappa Nagarahalli

On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> 
> > 
> > -----Original Message-----
> > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > Sent: Monday, January 28, 2019 5:02 PM
> > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > olivier.matz@6wind.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > <Gavin.Hu@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> > 
> > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > 
> > > This operation can be used for non-blocking algorithms, such as a
> > > non-blocking stack or ring.
> > > 
> > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > ---
> > >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > ++++++++++++++++++++++
> > >  2 files changed, 96 insertions(+)
> > > 
> > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > index fd2ec9c53..b7b90b83e 100644
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > @@ -34,6 +34,7 @@
> > >  /*
> > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > >   * Copyright (c) 1998 Doug Rabson
> > > + * Copyright (c) 2019 Intel Corporation
> > >   * All rights reserved.
> > >   */
> > > 
> > > @@ -46,6 +47,7 @@
> > > 
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > >  #include <rte_atomic.h>
> > > 
> > >  /*------------------------- 64 bit atomic operations
> > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t *v)
> > >  }
> > >  #endif
> > > 
> > > +static inline int __rte_experimental
> > __rte_always_inline?
> > 
> > > 
> > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > No need to declare the location volatile. Volatile doesn't do what you think
> > it
> > does.
> > https://youtu.be/lkgszkPnV8g?t=1027
> > 
> I made this volatile to match the existing rte_atomicN_cmpset definitions,
> which presumably have a good reason for using the keyword. Maintainers, any
> input here?
> 
> > 
> > 
> > > 
> > > +		     rte_int128_t *exp,
> > I would declare 'exp' const as well and document that 'exp' is not updated
> > (with
> > the old value) for a failure. The reason being that ARMv8.0/AArch64 cannot
> > atomically read the old value without also writing the location and that is
> > bad
> > for performance (unnecessary writes leads to unnecessary contention and
> > worse scalability). And the user must anyway read the location (in the start
> > of
> > the critical section) using e.g. non-atomic 64-bit reads so there isn't
> > actually any
> > requirement for an atomic 128-bit read of the location.
> > 
> Will change in v2.
> 
> > 
> > > 
> > >  rte_int128_t *src,
> > const rte_int128_t *src?
> Sure, I don't see any harm in using const.
> 
> > 
> > 
> > But why are we not passing 'exp' and 'src' by value? That works great, even
> > with
> > structs. Passing by value simplifies the compiler's life, especially if the
> > call is
> > inlined. Ask a compiler developer.
> I ran objdump on the nb_stack code with both approaches, and pass-by-reference 
> resulted in fewer overall x86_64 assembly ops.
> PBV: 100 ops for push, 97 ops for pop
> PBR: 92 ops for push, 84 ops for pop
OK I have never checked x86_64 code generation... I have good experiences with
ARM/AArch64, everything seems to be done using registers. I am surprised there
is a difference.

Did a quick check with lfring, passing 'src' (third param) by reference and by
value. No difference in code generation on x86_64.

But if you insist let's go with PBR.

> 
> (Using the in-progress v5 nb_stack code)
> 
> Another factor -- though much less compelling -- is that with pass-by-
> reference, the user can create a 16B structure and cast it to rte_int128_t
> when they call rte_atomic128_cmpset, whereas with pass-by-value they need to
> put that struct in a union with rte_int128_t.
Which is what I always do nowadays... Trying to use as few casts as possible and
lie to the compiler as seldom as possible. But I can see the freedom provided by
taking a pointer to something and casting it to an rte_int128_t pointer in the call
to rte_atomic128_cmpset().

Would prefer a name that is more similar to __atomic_compare_exchange(). E.g.
rte_atomic128_compare_exchange() (or perhaps just rte_atomic128_cmpxchg)? None
of the rte_atomicXX_cmpset() functions take any memory order parameters.
From an Arm perspective, we are not happy with that.

> 
> > 
> > 
> > > 
> > > +		     unsigned int weak,
> > > +		     enum rte_atomic_memmodel_t success,
> > > +		     enum rte_atomic_memmodel_t failure) {
> > > +	RTE_SET_USED(weak);
> > > +	RTE_SET_USED(success);
> > > +	RTE_SET_USED(failure);
> > > +	uint8_t res;
> > > +
> > > +	asm volatile (
> > > +		      MPLOCKED
> > > +		      "cmpxchg16b %[dst];"
> > > +		      " sete %[res]"
> > > +		      : [dst] "=m" (dst->val[0]),
> > > +			"=A" (exp->val[0]),
> > > +			[res] "=r" (res)
> > > +		      : "c" (src->val[1]),
> > > +			"b" (src->val[0]),
> > > +			"m" (dst->val[0]),
> > > +			"d" (exp->val[1]),
> > > +			"a" (exp->val[0])
> > > +		      : "memory");
> > > +
> > > +	return res;
> > > +}
> > > +
> > >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > index b99ba4688..8d612d566 100644
> > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > @@ -14,6 +14,7 @@
> > > 
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > > 
> > >  #ifdef __DOXYGEN__
> > > 
> > > @@ -1082,4 +1083,68 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t
> > > *v)
> > >  }
> > >  #endif
> > > 
> > > +/*------------------------ 128 bit atomic operations
> > > +------------------------
> > > -*/
> > > +
> > > +/**
> > > + * 128-bit integer structure.
> > > + */
> > > +typedef struct {
> > > +	uint64_t val[2];
> > > +} __rte_aligned(16) rte_int128_t;
> > So we can't use __int128?
> > 
> I'll put it in a union with val[2], in case any implementations want to use
> it.
Thinking on this one more time, since the inline asm functions (e.g. for x86_64
cmpxchg16b and for AArch64 LDXP/STXP) anyway will use 64-bit registers, it makes
most sense to make rte_int128_t a struct of 2x64b. The question is whether to
use an array like above or a struct with two elements (which I normally do
internally). Can you compare code generation with the following definition?
typedef struct {
        uint64_t lo, hi;
} __rte_aligned(16) rte_int128_t;

> 
> Thanks,
> Gage
> 
> [snip]
-- 
Ola Liljedahl, Networking System Architect, Arm
Phone +46706866373, Skype ola.liljedahl


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-02-01 19:01       ` Ola Liljedahl
@ 2019-02-01 19:28         ` Eads, Gage
  2019-02-01 19:43           ` Ola Liljedahl
  0 siblings, 1 reply; 36+ messages in thread
From: Eads, Gage @ 2019-02-01 19:28 UTC (permalink / raw)
  To: Ola Liljedahl, dev
  Cc: jerinj, chaozhu, nd, Richardson, Bruce, Ananyev, Konstantin,
	hemant.agrawal, olivier.matz, arybchenko,
	Gavin Hu (Arm Technology China),
	Honnappa Nagarahalli



> -----Original Message-----
> From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> Sent: Friday, February 1, 2019 1:02 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> >
> > >
> > > -----Original Message-----
> > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > Sent: Monday, January 28, 2019 5:02 PM
> > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > <Gavin.Hu@arm.com>
> > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > only)
> > >
> > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > >
> > > > This operation can be used for non-blocking algorithms, such as a
> > > > non-blocking stack or ring.
> > > >
> > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > ---
> > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > +++++++++++
> > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > ++++++++++++++++++++++
> > > >  2 files changed, 96 insertions(+)
> > > >
> > > > diff --git
> > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > index fd2ec9c53..b7b90b83e 100644
> > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > @@ -34,6 +34,7 @@
> > > >  /*
> > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > >   * Copyright (c) 1998 Doug Rabson
> > > > + * Copyright (c) 2019 Intel Corporation
> > > >   * All rights reserved.
> > > >   */
> > > >
> > > > @@ -46,6 +47,7 @@
> > > >
> > > >  #include <stdint.h>
> > > >  #include <rte_common.h>
> > > > +#include <rte_compat.h>
> > > >  #include <rte_atomic.h>
> > > >
> > > >  /*------------------------- 64 bit atomic operations
> > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline
> > > > void rte_atomic64_clear(rte_atomic64_t *v)
> > > >  }
> > > >  #endif
> > > >
> > > > +static inline int __rte_experimental
> > > __rte_always_inline?
> > >
> > > >
> > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > No need to declare the location volatile. Volatile doesn't do what
> > > you think it does.
> > > https://youtu.be/lkgszkPnV8g?t=1027
> > >
> > I made this volatile to match the existing rte_atomicN_cmpset
> > definitions, which presumably have a good reason for using the
> > keyword. Maintainers, any input here?
> >
> > >
> > >
> > > >
> > > > +		     rte_int128_t *exp,
> > > I would declare 'exp' const as well and document that 'exp' is not
> > > updated (with the old value) for a failure. The reason being that
> > > ARMv8.0/AArch64 cannot atomically read the old value without also
> > > writing the location and that is bad for performance (unnecessary
> > > writes leads to unnecessary contention and worse scalability). And
> > > the user must anyway read the location (in the start of the critical
> > > section) using e.g. non-atomic 64-bit reads so there isn't actually
> > > any requirement for an atomic 128-bit read of the location.
> > >
> > Will change in v2.
> >
> > >
> > > >
> > > >  rte_int128_t *src,
> > > const rte_int128_t *src?
> > Sure, I don't see any harm in using const.
> >
> > >
> > >
> > > But why are we not passing 'exp' and 'src' by value? That works
> > > great, even with structs. Passing by value simplifies the compiler's
> > > life, especially if the call is inlined. Ask a compiler developer.
> > I ran objdump on the nb_stack code with both approaches, and
> > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > PBV: 100 ops for push, 97 ops for pop
> > PBR: 92 ops for push, 84 ops for pop
> OK I have never checked x86_64 code generation... I have good experiences
> with ARM/AArch64, everything seems to be done using registers. I am surprised
> there is a difference.
> 
> Did a quick check with lfring, passing 'src' (third param) by reference and by
> value. No difference in code generation on x86_64.
> 
> But if you insist let's go with PBR.
> 
> >
> > (Using the in-progress v5 nb_stack code)
> >
> > Another factor -- though much less compelling -- is that with pass-by-
> > reference, the user can create a 16B structure and cast it to
> > rte_int128_t when they call rte_atomic128_cmpset, whereas with
> > pass-by-value they need to put that struct in a union with rte_int128_t.
> Which is what I always do nowadays... Trying to use as few casts as possible and
> lie to the compiler as seldom as possible. But I can see the freedom provided by
> taking a pointer to something and cast it it rte_int128_t ptr in the call
> to rte_atomic128_cmpset().
> 
> Would prefer a name that is more similar to __atomic_compare_exchange().
> E.g.
> rte_atomic128_compare_exchange() (or perhaps just rte_atomic128_cmpxchg)?
> All the rte_atomicXX_cmpset() functions do not take any memory order
> parameters.
> From an Arm perspective, we are not happy with that.

Since the function returns a boolean success value, isn't compare-and-set the appropriate term?
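
To pin down the terminology (a non-atomic sketch only; the helper names
here are made up):

/* compare-and-set: '*exp' is left untouched on failure */
static inline int
cmpset_semantics(rte_int128_t *dst, const rte_int128_t *exp,
		 const rte_int128_t *src)
{
	int ok = dst->val[0] == exp->val[0] && dst->val[1] == exp->val[1];

	if (ok)
		*dst = *src;
	return ok;
}

/* compare-exchange: on failure, '*exp' is updated with the observed value */
static inline int
cmpxchg_semantics(rte_int128_t *dst, rte_int128_t *exp,
		  const rte_int128_t *src)
{
	int ok = dst->val[0] == exp->val[0] && dst->val[1] == exp->val[1];

	if (ok)
		*dst = *src;
	else
		*exp = *dst;
	return ok;
}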

> 
> >
> > >
> > >
> > > >
> > > > +		     unsigned int weak,
> > > > +		     enum rte_atomic_memmodel_t success,
> > > > +		     enum rte_atomic_memmodel_t failure) {
> > > > +	RTE_SET_USED(weak);
> > > > +	RTE_SET_USED(success);
> > > > +	RTE_SET_USED(failure);
> > > > +	uint8_t res;
> > > > +
> > > > +	asm volatile (
> > > > +		      MPLOCKED
> > > > +		      "cmpxchg16b %[dst];"
> > > > +		      " sete %[res]"
> > > > +		      : [dst] "=m" (dst->val[0]),
> > > > +			"=A" (exp->val[0]),
> > > > +			[res] "=r" (res)
> > > > +		      : "c" (src->val[1]),
> > > > +			"b" (src->val[0]),
> > > > +			"m" (dst->val[0]),
> > > > +			"d" (exp->val[1]),
> > > > +			"a" (exp->val[0])
> > > > +		      : "memory");
> > > > +
> > > > +	return res;
> > > > +}
> > > > +
> > > >  #endif /* _RTE_ATOMIC_X86_64_H_ */ diff --git
> > > > a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > index b99ba4688..8d612d566 100644
> > > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > @@ -14,6 +14,7 @@
> > > >
> > > >  #include <stdint.h>
> > > >  #include <rte_common.h>
> > > > +#include <rte_compat.h>
> > > >
> > > >  #ifdef __DOXYGEN__
> > > >
> > > > @@ -1082,4 +1083,68 @@ static inline void
> > > > rte_atomic64_clear(rte_atomic64_t
> > > > *v)
> > > >  }
> > > >  #endif
> > > >
> > > > +/*------------------------ 128 bit atomic operations
> > > > +------------------------
> > > > -*/
> > > > +
> > > > +/**
> > > > + * 128-bit integer structure.
> > > > + */
> > > > +typedef struct {
> > > > +	uint64_t val[2];
> > > > +} __rte_aligned(16) rte_int128_t;
> > > So we can't use __int128?
> > >
> > I'll put it in a union with val[2], in case any implementations want
> > to use it.
> Thinking on this one more time, since the inline asm functions (e.g. for x86_64
> cmpxchg16b and for AArch64 LDXP/STXP) anyway will use 64-bit registers, it
> makes most sense to make rte_int128_t a struct of 2x64b. The question is
> whether to use an array like above or a struct with two elements (which I
> normally do internally). Can you compare code generation with the following
> definition?
> typedef struct {
>         uint64_t lo, hi;
> } __rte_aligned(16) rte_int128_t;
> 

Interestingly, that made no difference in the PBV code but added more instructions overall to PBR:
PBV: 100 insts for push, 97 insts for pop
PBR: 100 insts for push, 83 insts for pop

> >
> > Thanks,
> > Gage
> >
> > [snip]
> --
> Ola Liljedahl, Networking System Architect, Arm Phone +46706866373, Skype
> ola.liljedahl


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-02-01 19:28         ` Eads, Gage
@ 2019-02-01 19:43           ` Ola Liljedahl
  2019-02-01 21:05             ` Eads, Gage
  0 siblings, 1 reply; 36+ messages in thread
From: Ola Liljedahl @ 2019-02-01 19:43 UTC (permalink / raw)
  To: gage.eads, dev
  Cc: jerinj, chaozhu, nd, bruce.richardson, konstantin.ananyev,
	hemant.agrawal, olivier.matz, arybchenko,
	Gavin Hu (Arm Technology China),
	Honnappa Nagarahalli

On Fri, 2019-02-01 at 19:28 +0000, Eads, Gage wrote:
> 
> > 
> > -----Original Message-----
> > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > Sent: Friday, February 1, 2019 1:02 PM
> > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> > 
> > On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> > > 
> > > 
> > > > 
> > > > 
> > > > -----Original Message-----
> > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > Sent: Monday, January 28, 2019 5:02 PM
> > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > <Gavin.Hu@arm.com>
> > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > > only)
> > > > 
> > > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > > > 
> > > > > 
> > > > > This operation can be used for non-blocking algorithms, such as a
> > > > > non-blocking stack or ring.
> > > > > 
> > > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > > ---
> > > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > > +++++++++++
> > > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > > ++++++++++++++++++++++
> > > > >  2 files changed, 96 insertions(+)
> > > > > 
> > > > > diff --git
> > > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > index fd2ec9c53..b7b90b83e 100644
> > > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > @@ -34,6 +34,7 @@
> > > > >  /*
> > > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > > >   * Copyright (c) 1998 Doug Rabson
> > > > > + * Copyright (c) 2019 Intel Corporation
> > > > >   * All rights reserved.
> > > > >   */
> > > > > 
> > > > > @@ -46,6 +47,7 @@
> > > > > 
> > > > >  #include <stdint.h>
> > > > >  #include <rte_common.h>
> > > > > +#include <rte_compat.h>
> > > > >  #include <rte_atomic.h>
> > > > > 
> > > > >  /*------------------------- 64 bit atomic operations
> > > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline
> > > > > void rte_atomic64_clear(rte_atomic64_t *v)
> > > > >  }
> > > > >  #endif
> > > > > 
> > > > > +static inline int __rte_experimental
> > > > __rte_always_inline?
> > > > 
> > > > > 
> > > > > 
> > > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > > No need to declare the location volatile. Volatile doesn't do what
> > > > you think it does.
> > > > https://youtu.be/lkgszkPnV8g?t=1027
> > > > 
> > > I made this volatile to match the existing rte_atomicN_cmpset
> > > definitions, which presumably have a good reason for using the
> > > keyword. Maintainers, any input here?
> > > 
> > > > 
> > > > 
> > > > 
> > > > > 
> > > > > 
> > > > > +		     rte_int128_t *exp,
> > > > I would declare 'exp' const as well and document that 'exp' is not
> > > > updated (with the old value) for a failure. The reason being that
> > > > ARMv8.0/AArch64 cannot atomically read the old value without also
> > > > writing the location and that is bad for performance (unnecessary
> > > > writes leads to unnecessary contention and worse scalability). And
> > > > the user must anyway read the location (in the start of the critical
> > > > section) using e.g. non-atomic 64-bit reads so there isn't actually
> > > > any requirement for an atomic 128-bit read of the location.
> > > > 
> > > Will change in v2.
> > > 
> > > > 
> > > > 
> > > > > 
> > > > > 
> > > > >  rte_int128_t *src,
> > > > const rte_int128_t *src?
> > > Sure, I don't see any harm in using const.
> > > 
> > > > 
> > > > 
> > > > 
> > > > But why are we not passing 'exp' and 'src' by value? That works
> > > > great, even with structs. Passing by value simplifies the compiler's
> > > > life, especially if the call is inlined. Ask a compiler developer.
> > > I ran objdump on the nb_stack code with both approaches, and
> > > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > > PBV: 100 ops for push, 97 ops for pop
> > > PBR: 92 ops for push, 84 ops for pop
> > OK I have never checked x86_64 code generation... I have good experiences
> > with ARM/AArch64, everything seems to be done using registers. I am
> > surprised
> > there is a difference.
> > 
> > Did a quick check with lfring, passing 'src' (third param) by reference and
> > by
> > value. No difference in code generation on x86_64.
> > 
> > But if you insist let's go with PBR.
> > 
> > > 
> > > 
> > > (Using the in-progress v5 nb_stack code)
> > > 
> > > Another factor -- though much less compelling -- is that with pass-by-
> > > reference, the user can create a 16B structure and cast it to
> > > rte_int128_t when they call rte_atomic128_cmpset, whereas with
> > > pass-by-value they need to put that struct in a union with rte_int128_t.
> > Which is what I always do nowadays... Trying to use as few casts as possible
> > and
> > lie to the compiler as seldom as possible. But I can see the freedom
> > provided by
> > taking a pointer to something and cast it it rte_int128_t ptr in the call
> > to rte_atomic128_cmpset().
> > 
> > Would prefer a name that is more similar to __atomic_compare_exchange().
> > E.g.
> > rte_atomic128_compare_exchange() (or perhaps just rte_atomic128_cmpxchg)?
> > All the rte_atomicXX_cmpset() functions do not take any memory order
> > parameters.
> > From an Arm perspective, we are not happy with that.
> Since the function returns a boolean success value, isn't compare-and-set the
> appropriate term?
I was thinking of the memory ordering parameters that __atomic_xxx builtins have
and we want (from an Arm perspective).

GCC __atomic_compare_exchange also returns a boolean.
https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
bool __atomic_compare_exchange_n (type *ptr, type *expected, type desired, bool
weak, int success_memorder, int failure_memorder)
bool __atomic_compare_exchange (type *ptr, type *expected, type *desired, bool
weak, int success_memorder, int failure_memorder)

rte_atomic128_compare_exchange(rte_int128_t *dst, const rte_int128_t *exp, const
rte_int128_t *src, bool weak, int mo_success, int mo_failure);

> 
> > 
> > 
> > > 
> > > 
> > > > 
> > > > 
> > > > 
> > > > > 
> > > > > 
> > > > > +		     unsigned int weak,
> > > > > +		     enum rte_atomic_memmodel_t success,
> > > > > +		     enum rte_atomic_memmodel_t failure) {
> > > > > +	RTE_SET_USED(weak);
> > > > > +	RTE_SET_USED(success);
> > > > > +	RTE_SET_USED(failure);
> > > > > +	uint8_t res;
> > > > > +
> > > > > +	asm volatile (
> > > > > +		      MPLOCKED
> > > > > +		      "cmpxchg16b %[dst];"
> > > > > +		      " sete %[res]"
> > > > > +		      : [dst] "=m" (dst->val[0]),
> > > > > +			"=A" (exp->val[0]),
> > > > > +			[res] "=r" (res)
> > > > > +		      : "c" (src->val[1]),
> > > > > +			"b" (src->val[0]),
> > > > > +			"m" (dst->val[0]),
> > > > > +			"d" (exp->val[1]),
> > > > > +			"a" (exp->val[0])
> > > > > +		      : "memory");
> > > > > +
> > > > > +	return res;
> > > > > +}
> > > > > +
> > > > >  #endif /* _RTE_ATOMIC_X86_64_H_ */ diff --git
> > > > > a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > index b99ba4688..8d612d566 100644
> > > > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > @@ -14,6 +14,7 @@
> > > > > 
> > > > >  #include <stdint.h>
> > > > >  #include <rte_common.h>
> > > > > +#include <rte_compat.h>
> > > > > 
> > > > >  #ifdef __DOXYGEN__
> > > > > 
> > > > > @@ -1082,4 +1083,68 @@ static inline void
> > > > > rte_atomic64_clear(rte_atomic64_t
> > > > > *v)
> > > > >  }
> > > > >  #endif
> > > > > 
> > > > > +/*------------------------ 128 bit atomic operations
> > > > > +------------------------
> > > > > -*/
> > > > > +
> > > > > +/**
> > > > > + * 128-bit integer structure.
> > > > > + */
> > > > > +typedef struct {
> > > > > +	uint64_t val[2];
> > > > > +} __rte_aligned(16) rte_int128_t;
> > > > So we can't use __int128?
> > > > 
> > > I'll put it in a union with val[2], in case any implementations want
> > > to use it.
> > Thinking on this one more time, since the inline asm functions (e.g. for
> > x86_64
> > cmpxchg16b and for AArch64 LDXP/STXP) anyway will use 64-bit registers, it
> > makes most sense to make rte_int128_t a struct of 2x64b. The question is
> > whether to use an array like above or a struct with two elements (which I
> > normally do internally). Can you compare code generation with the following
> > definition?
> > typedef struct {
> >         uint64_t lo, hi;
> > } __rte_aligned(16) rte_int128_t;
> > 
> Interestingly, that made no difference in the PBV code but added more
> instructions overall to PBR:
> PBV: 100 insts for push, 97 insts for pop
> PBR: 100 insts for push, 83 insts for pop
I think we learned something here... Trying to understand exactly what. But I
think this result settles it.

I should test the different alternatives on Arm to see whether code generation
behaves the same as it does for x86_64.

> 
> > 
> > > 
> > > 
> > > Thanks,
> > > Gage
> > > 
> > > [snip]
> > --
> > Ola Liljedahl, Networking System Architect, Arm Phone +46706866373, Skype
> > ola.liljedahl
-- 
Ola Liljedahl, Networking System Architect, Arm
Phone +46706866373, Skype ola.liljedahl


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-02-01 19:43           ` Ola Liljedahl
@ 2019-02-01 21:05             ` Eads, Gage
  2019-02-01 23:11               ` Ola Liljedahl
  0 siblings, 1 reply; 36+ messages in thread
From: Eads, Gage @ 2019-02-01 21:05 UTC (permalink / raw)
  To: Ola Liljedahl, dev
  Cc: jerinj, chaozhu, nd, Richardson, Bruce, Ananyev, Konstantin,
	hemant.agrawal, olivier.matz, arybchenko,
	Gavin Hu (Arm Technology China),
	Honnappa Nagarahalli



> -----Original Message-----
> From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> Sent: Friday, February 1, 2019 1:44 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> On Fri, 2019-02-01 at 19:28 +0000, Eads, Gage wrote:
> >
> > >
> > > -----Original Message-----
> > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > Sent: Friday, February 1, 2019 1:02 PM
> > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>
> > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > only)
> > >
> > > On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> > > >
> > > >
> > > > >
> > > > >
> > > > > -----Original Message-----
> > > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > > Sent: Monday, January 28, 2019 5:02 PM
> > > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > > <Gavin.Hu@arm.com>
> > > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset
> > > > > (x86-64
> > > > > only)
> > > > >
> > > > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > > > >
> > > > > >
> > > > > > This operation can be used for non-blocking algorithms, such
> > > > > > as a non-blocking stack or ring.
> > > > > >
> > > > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > > > ---
> > > > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > > > +++++++++++
> > > > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > > > ++++++++++++++++++++++
> > > > > >  2 files changed, 96 insertions(+)
> > > > > >
> > > > > > diff --git
> > > > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > index fd2ec9c53..b7b90b83e 100644
> > > > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > @@ -34,6 +34,7 @@
> > > > > >  /*
> > > > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > > > >   * Copyright (c) 1998 Doug Rabson
> > > > > > + * Copyright (c) 2019 Intel Corporation
> > > > > >   * All rights reserved.
> > > > > >   */
> > > > > >
> > > > > > @@ -46,6 +47,7 @@
> > > > > >
> > > > > >  #include <stdint.h>
> > > > > >  #include <rte_common.h>
> > > > > > +#include <rte_compat.h>
> > > > > >  #include <rte_atomic.h>
> > > > > >
> > > > > >  /*------------------------- 64 bit atomic operations
> > > > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static
> > > > > > inline void rte_atomic64_clear(rte_atomic64_t *v)
> > > > > >  }
> > > > > >  #endif
> > > > > >
> > > > > > +static inline int __rte_experimental
> > > > > __rte_always_inline?
> > > > >
> > > > > >
> > > > > >
> > > > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > > > No need to declare the location volatile. Volatile doesn't do
> > > > > what you think it does.
> > > > > https://youtu.be/lkgszkPnV8g?t=1027
> > > > >
> > > > I made this volatile to match the existing rte_atomicN_cmpset
> > > > definitions, which presumably have a good reason for using the
> > > > keyword. Maintainers, any input here?
> > > >
> > > > >
> > > > >
> > > > >
> > > > > >
> > > > > >
> > > > > > +		     rte_int128_t *exp,
> > > > > I would declare 'exp' const as well and document that 'exp' is
> > > > > not updated (with the old value) for a failure. The reason being
> > > > > that
> > > > > ARMv8.0/AArch64 cannot atomically read the old value without
> > > > > also writing the location and that is bad for performance
> > > > > (unnecessary writes leads to unnecessary contention and worse
> > > > > scalability). And the user must anyway read the location (in the
> > > > > start of the critical
> > > > > section) using e.g. non-atomic 64-bit reads so there isn't
> > > > > actually any requirement for an atomic 128-bit read of the location.
> > > > >
> > > > Will change in v2.
> > > >
> > > > >
> > > > >
> > > > > >
> > > > > >
> > > > > >  rte_int128_t *src,
> > > > > const rte_int128_t *src?
> > > > Sure, I don't see any harm in using const.
> > > >
> > > > >
> > > > >
> > > > >
> > > > > But why are we not passing 'exp' and 'src' by value? That works
> > > > > great, even with structs. Passing by value simplifies the
> > > > > compiler's life, especially if the call is inlined. Ask a compiler developer.
> > > > I ran objdump on the nb_stack code with both approaches, and
> > > > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > > > PBV: 100 ops for push, 97 ops for pop
> > > > PBR: 92 ops for push, 84 ops for pop
> > > OK I have never checked x86_64 code generation... I have good
> > > experiences with ARM/AArch64, everything seems to be done using
> > > registers. I am surprised there is a difference.
> > >
> > > Did a quick check with lfring, passing 'src' (third param) by
> > > reference and by value. No difference in code generation on x86_64.
> > >
> > > But if you insist let's go with PBR.
> > >
> > > >
> > > >
> > > > (Using the in-progress v5 nb_stack code)
> > > >
> > > > Another factor -- though much less compelling -- is that with
> > > > pass-by- reference, the user can create a 16B structure and cast
> > > > it to rte_int128_t when they call rte_atomic128_cmpset, whereas
> > > > with pass-by-value they need to put that struct in a union with
> rte_int128_t.
> > > Which is what I always do nowadays... Trying to use as few casts as
> > > possible and lie to the compiler as seldom as possible. But I can
> > > see the freedom provided by taking a pointer to something and cast
> > > it to rte_int128_t ptr in the call to rte_atomic128_cmpset().
> > >
> > > Would prefer a name that is more similar to __atomic_compare_exchange().
> > > E.g.
> > > rte_atomic128_compare_exchange() (or perhaps just
> rte_atomic128_cmpxchg)?
> > > All the rte_atomicXX_cmpset() functions do not take any memory order
> > > parameters.
> > > From an Arm perspective, we are not happy with that.
> > Since the function returns a boolean success value, isn't
> > compare-and-set the appropriate term?
> I was thinking of the memory ordering parameters that __atomic_xxx builtins
> have and we want (from an Arm perspective).
> 
> GCC __atomic_compare_exchange also returns a boolean.
> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> bool __atomic_compare_exchange_n (type *ptr, type *expected, type desired,
> bool weak, int success_memorder, int failure_memorder) bool
> __atomic_compare_exchange (type *ptr, type *expected, type *desired, bool
> weak, int success_memorder, int failure_memorder)
> 
> rte_atomic128_compare_exchange(rte_int128_t *dst, const rte_int128_t *exp,
> const rte_int128_t *src, bool weak, int mo_success, int mo_failure);
> 

I assumed GCC chose the name "exchange" because the builtin modifies 'exp' on failure, but I could be wrong. I'm fine with either name. Anyone else have a preference?

<snip>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-02-01 21:05             ` Eads, Gage
@ 2019-02-01 23:11               ` Ola Liljedahl
  0 siblings, 0 replies; 36+ messages in thread
From: Ola Liljedahl @ 2019-02-01 23:11 UTC (permalink / raw)
  To: gage.eads, dev
  Cc: jerinj, chaozhu, nd, bruce.richardson, konstantin.ananyev,
	hemant.agrawal, olivier.matz, arybchenko,
	Gavin Hu (Arm Technology China),
	Honnappa Nagarahalli

On Fri, 2019-02-01 at 21:05 +0000, Eads, Gage wrote:
> 
> > 
> > -----Original Message-----
> > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > Sent: Friday, February 1, 2019 1:44 PM
> > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> > 
> > On Fri, 2019-02-01 at 19:28 +0000, Eads, Gage wrote:
> > > 
> > > 
> > > > 
> > > > 
> > > > -----Original Message-----
> > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > Sent: Friday, February 1, 2019 1:02 PM
> > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > > > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > > > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>
> > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > > only)
> > > > 
> > > > On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> > > > > 
> > > > > 
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > -----Original Message-----
> > > > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > > > Sent: Monday, January 28, 2019 5:02 PM
> > > > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > > > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > > > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > > > <Gavin.Hu@arm.com>
> > > > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset
> > > > > > (x86-64
> > > > > > only)
> > > > > > 
> > > > > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > This operation can be used for non-blocking algorithms, such
> > > > > > > as a non-blocking stack or ring.
> > > > > > > 
> > > > > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > > > > ---
> > > > > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > > > > +++++++++++
> > > > > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > > > > ++++++++++++++++++++++
> > > > > > >  2 files changed, 96 insertions(+)
> > > > > > > 
> > > > > > > diff --git
> > > > > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > index fd2ec9c53..b7b90b83e 100644
> > > > > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > @@ -34,6 +34,7 @@
> > > > > > >  /*
> > > > > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > > > > >   * Copyright (c) 1998 Doug Rabson
> > > > > > > + * Copyright (c) 2019 Intel Corporation
> > > > > > >   * All rights reserved.
> > > > > > >   */
> > > > > > > 
> > > > > > > @@ -46,6 +47,7 @@
> > > > > > > 
> > > > > > >  #include <stdint.h>
> > > > > > >  #include <rte_common.h>
> > > > > > > +#include <rte_compat.h>
> > > > > > >  #include <rte_atomic.h>
> > > > > > > 
> > > > > > >  /*------------------------- 64 bit atomic operations
> > > > > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static
> > > > > > > inline void rte_atomic64_clear(rte_atomic64_t *v)
> > > > > > >  }
> > > > > > >  #endif
> > > > > > > 
> > > > > > > +static inline int __rte_experimental
> > > > > > __rte_always_inline?
> > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > > > > No need to declare the location volatile. Volatile doesn't do
> > > > > > what you think it does.
> > > > > > https://youtu.be/lkgszkPnV8g?t=1027
> > > > > > 
> > > > > I made this volatile to match the existing rte_atomicN_cmpset
> > > > > definitions, which presumably have a good reason for using the
> > > > > keyword. Maintainers, any input here?
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > +		     rte_int128_t *exp,
> > > > > > I would declare 'exp' const as well and document that 'exp' is
> > > > > > not updated (with the old value) for a failure. The reason being
> > > > > > that
> > > > > > ARMv8.0/AArch64 cannot atomically read the old value without
> > > > > > also writing the location and that is bad for performance
> > > > > > (unnecessary writes leads to unnecessary contention and worse
> > > > > > scalability). And the user must anyway read the location (in the
> > > > > > start of the critical
> > > > > > section) using e.g. non-atomic 64-bit reads so there isn't
> > > > > > actually any requirement for an atomic 128-bit read of the location.
> > > > > > 
> > > > > Will change in v2.
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > >  rte_int128_t *src,
> > > > > > const rte_int128_t *src?
> > > > > Sure, I don't see any harm in using const.
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > But why are we not passing 'exp' and 'src' by value? That works
> > > > > > great, even with structs. Passing by value simplifies the
> > > > > > compiler's life, especially if the call is inlined. Ask a compiler
> > > > > > developer.
> > > > > I ran objdump on the nb_stack code with both approaches, and
> > > > > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > > > > PBV: 100 ops for push, 97 ops for pop
> > > > > PBR: 92 ops for push, 84 ops for pop
> > > > OK I have never checked x86_64 code generation... I have good
> > > > experiences with ARM/AArch64, everything seems to be done using
> > > > registers. I am surprised there is a difference.
> > > > 
> > > > Did a quick check with lfring, passing 'src' (third param) by
> > > > reference and by value. No difference in code generation on x86_64.
> > > > 
> > > > But if you insist let's go with PBR.
> > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > (Using the in-progress v5 nb_stack code)
> > > > > 
> > > > > Another factor -- though much less compelling -- is that with
> > > > > pass-by- reference, the user can create a 16B structure and cast
> > > > > it to rte_int128_t when they call rte_atomic128_cmpset, whereas
> > > > > with pass-by-value they need to put that struct in a union with
> > rte_int128_t.
> > > 
> > > > 
> > > > Which is what I always do nowadays... Trying to use as few casts as
> > > > possible and lie to the compiler as seldom as possible. But I can
> > > > see the freedom provided by taking a pointer to something and cast
> > > > it to rte_int128_t ptr in the call to rte_atomic128_cmpset().
> > > > 
> > > > Would prefer a name that is more similar to __atomic_compare_exchange().
> > > > E.g.
> > > > rte_atomic128_compare_exchange() (or perhaps just
> > rte_atomic128_cmpxchg)?
> > > 
> > > > 
> > > > All the rte_atomicXX_cmpset() functions do not take any memory order
> > > > parameters.
> > > > From an Arm perspective, we are not happy with that.
> > > Since the function returns a boolean success value, isn't
> > > compare-and-set the appropriate term?
> > I was thinking of the memory ordering parameters that __atomic_xxx builtins
> > have and we want (from an Arm perspective).
> > 
> > GCC __atomic_compare_exchange also returns a boolean.
> > https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> > bool __atomic_compare_exchange_n (type *ptr, type *expected, type desired,
> > bool weak, int success_memorder, int failure_memorder) bool
> > __atomic_compare_exchange (type *ptr, type *expected, type *desired, bool
> > weak, int success_memorder, int failure_memorder)
> > 
> > rte_atomic128_compare_exchange(rte_int128_t *dst, const rte_int128_t *exp,
> > const rte_int128_t *src, bool weak, int mo_success, int mo_failure);
> > 
> I assumed GCC chose the name "exchange" because the builtin modifies 'exp' on
> failure, but I could be wrong.
You are probably right. But exchange doesn't necessarily mean that the old
value is returned, just that it is replaced with a different value?
However, __atomic_exchange() (unconditionally) returns the old value, so there
is some precedent for returning the old value. And it can be done, just not
guaranteed to be atomic. Perhaps there is a way to both eat and have the cake?

Re-use the prototype of __atomic_compare_exchange_16(), so we must return a
value if the comparison fails. But depending on the 'weak' flag: for weak=false
we require an atomic read, which on ARMv8.0 must be accomplished by writing
back the old value (if the comparison fails); for weak=true we allow both
spurious LL/SC failures (this is what the weak parameter is for) and a
non-atomic read of the old value (this is my 'frail' atomic compare and
exchange in lfring).

The GCC documentation says this:
"If they are not equal, the operation is a read and the current contents of ptr
are written into expected."
We follow this but just don't guarantee an atomic read (for weak=true).

bool
rte_atomic128_compare_exchange(rte_int128_t *dst,
                               rte_int128_t *exp, //Note no const now
                               const rte_int128_t *src,
                               bool weak,
                               int mo_success,
                               int mo_failure);
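
For illustration, a minimal retry loop against that prototype (the helper
below is hypothetical, not from the patch; assumes <stdint.h> and the
rte_int128_t type). Because 'exp' is refreshed by the call on failure, the
loop does not re-read the location itself; with weak=true the refresh may be
torn, which is harmless because a mismatching compare simply leads to another
retry:

static inline void
seq128_add(rte_int128_t *loc, uint64_t delta)
{
        rte_int128_t exp, des;

        /* Initial snapshot with plain 64-bit reads. */
        exp.val[0] = loc->val[0];
        exp.val[1] = loc->val[1];
        do {
                des.val[0] = exp.val[0] + delta;
                /* Propagate the carry into the upper 64 bits. */
                des.val[1] = exp.val[1] + (des.val[0] < exp.val[0] ? 1 : 0);
        } while (!rte_atomic128_compare_exchange(loc, &exp, &des, true,
                                                 __ATOMIC_RELAXED,
                                                 __ATOMIC_RELAXED));
}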


>  I'm fine with either name. Anyone else have a preference?

> 
> <snip>
-- 
Ola Liljedahl, Networking System Architect, Arm
Phone +46706866373, Skype ola.liljedahl


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
  2019-02-01 17:06     ` Eads, Gage
  2019-02-01 19:01       ` Ola Liljedahl
@ 2019-02-04 18:33       ` Honnappa Nagarahalli
  1 sibling, 0 replies; 36+ messages in thread
From: Honnappa Nagarahalli @ 2019-02-04 18:33 UTC (permalink / raw)
  To: Eads, Gage, Ola Liljedahl, dev
  Cc: arybchenko, jerinj, chaozhu, nd, Richardson, Bruce, Ananyev,
	Konstantin, hemant.agrawal, olivier.matz,
	Gavin Hu (Arm Technology China),
	nd

> >
> > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > This operation can be used for non-blocking algorithms, such as a
> > > non-blocking stack or ring.
> > >
> > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > ---
> > >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > ++++++++++++++++++++++
> > >  2 files changed, 96 insertions(+)
> > >
> > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > index fd2ec9c53..b7b90b83e 100644
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > @@ -34,6 +34,7 @@
> > >  /*
> > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > >   * Copyright (c) 1998 Doug Rabson
> > > + * Copyright (c) 2019 Intel Corporation
> > >   * All rights reserved.
> > >   */
> > >
> > > @@ -46,6 +47,7 @@
> > >
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > >  #include <rte_atomic.h>
> > >
> > >  /*------------------------- 64 bit atomic operations
> > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t *v)
> > >  }
> > >  #endif
> > >
> > > +static inline int __rte_experimental
> > __rte_always_inline?
> >
> > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > No need to declare the location volatile. Volatile doesn't do what you
> > think it does.
> > https://youtu.be/lkgszkPnV8g?t=1027
> >
> 
> I made this volatile to match the existing rte_atomicN_cmpset definitions,
> which presumably have a good reason for using the keyword. Maintainers, any
> input here?
> 
> >
> > > +		     rte_int128_t *exp,
> > I would declare 'exp' const as well and document that 'exp' is not
> > updated (with the old value) for a failure. The reason being that
> > ARMv8.0/AArch64 cannot atomically read the old value without also
> > writing the location and that is bad for performance (unnecessary
> > writes leads to unnecessary contention and worse scalability). And the
> > user must anyway read the location (in the start of the critical
> > section) using e.g. non-atomic 64-bit reads so there isn't actually any
> requirement for an atomic 128-bit read of the location.
> >
> 
> Will change in v2.
> 
IMO, we should not change the definition of this API, because
1) This API will differ from the __atomic_compare_exchange_n API; it will be a new API for users to learn.
2) The definition in this patch will make it easy to replace this API call with the __atomic_xxx API (whenever it supports 128b natively on all the platforms); see the sketch after [1] below.
3) I do not see any documentation in [1] indicating that the 'read on failure' will be an atomic read.

[1] https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
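
Regarding 2), a minimal sketch (hypothetical, not part of this patch) of what
that replacement could look like once rte_int128_t carries a __int128 union
member and the API takes __ATOMIC_* memory-order values; on x86-64 this needs
-mcx16 and may otherwise be turned into a libatomic call:

static inline int
rte_atomic128_cmpset_builtin(rte_int128_t *dst, rte_int128_t *exp,
                             const rte_int128_t *src, unsigned int weak,
                             int success, int failure)
{
        /* Maps 1:1 onto the compiler builtin. */
        return __atomic_compare_exchange_n(&dst->int128, &exp->int128,
                                           src->int128, weak,
                                           success, failure);
}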

> > >  rte_int128_t *src,
> > const rte_int128_t *src?
> 
> Sure, I don't see any harm in using const.
> 
> >
> > But why are we not passing 'exp' and 'src' by value? That works great,
> > even with structs. Passing by value simplifies the compiler's life,
> > especially if the call is inlined. Ask a compiler developer.
> 
> I ran objdump on the nb_stack code with both approaches, and pass-by-
> reference resulted in fewer overall x86_64 assembly ops.
> PBV: 100 ops for push, 97 ops for pop
> PBR: 92 ops for push, 84 ops for pop
> 
> (Using the in-progress v5 nb_stack code)
> 
> Another factor -- though much less compelling -- is that with pass-by-reference,
> the user can create a 16B structure and cast it to rte_int128_t when they call
> rte_atomic128_cmpset, whereas with pass-by-value they need to put that
> struct in a union with rte_int128_t.
> 
> >
> > > +		     unsigned int weak,
> > > +		     enum rte_atomic_memmodel_t success,
> > > +		     enum rte_atomic_memmodel_t failure) {
> > > +	RTE_SET_USED(weak);
> > > +	RTE_SET_USED(success);
> > > +	RTE_SET_USED(failure);
> > > +	uint8_t res;
> > > +
> > > +	asm volatile (
> > > +		      MPLOCKED
> > > +		      "cmpxchg16b %[dst];"
> > > +		      " sete %[res]"
> > > +		      : [dst] "=m" (dst->val[0]),
> > > +			"=A" (exp->val[0]),
> > > +			[res] "=r" (res)
> > > +		      : "c" (src->val[1]),
> > > +			"b" (src->val[0]),
> > > +			"m" (dst->val[0]),
> > > +			"d" (exp->val[1]),
> > > +			"a" (exp->val[0])
> > > +		      : "memory");
> > > +
> > > +	return res;
> > > +}
> > > +
> > >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > index b99ba4688..8d612d566 100644
> > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > @@ -14,6 +14,7 @@
> > >
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > >
> > >  #ifdef __DOXYGEN__
> > >
> > > @@ -1082,4 +1083,68 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t
> > > *v)
> > >  }
> > >  #endif
> > >
> > > +/*------------------------ 128 bit atomic operations
> > > +------------------------
> > > -*/
> > > +
> > > +/**
> > > + * 128-bit integer structure.
> > > + */
> > > +typedef struct {
> > > +	uint64_t val[2];
> > > +} __rte_aligned(16) rte_int128_t;
> > So we can't use __int128?
> >
> 
> I'll put it in a union with val[2], in case any implementations want to use it.
> 
> Thanks,
> Gage
> 
> [snip]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2 0/1] Add 128-bit compare and set
  2019-01-28 17:29 [PATCH 0/1] Add 128-bit compare and set Gage Eads
  2019-01-28 17:29 ` [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only) Gage Eads
@ 2019-02-22 15:46 ` Gage Eads
  2019-02-22 15:46   ` [PATCH v2 1/1] eal: add 128-bit cmpxchg (x86-64 only) Gage Eads
  2019-03-04 20:51   ` [PATCH v3 0/1] Add 128-bit compare and set Gage Eads
  2019-04-03 19:35 ` [PATCH v5] eal/x86: add 128-bit atomic compare exchange Thomas Monjalon
  2 siblings, 2 replies; 36+ messages in thread
From: Gage Eads @ 2019-02-22 15:46 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

This patch addresses x86-64 only; other architectures can/will be supported
in the future. The __atomic intrinsic was considered for the implementation,
however libatomic was found[1] to use locks to implement the 128-bit CAS on at
least one architecture and so is eschewed here. The interface is modeled after
the __atomic_compare_exchange_16 (which itself is based on the C++11 memory
model) to best support weak consistency architectures.

This patch was originally part of a series that introduces a non-blocking stack
mempool handler[2], and is required by a non-blocking ring patchset. This
patch was spun off so that the NB ring depends only on this patch
and not on the entire non-blocking stack patchset.

[1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
[2] http://mails.dpdk.org/archives/dev/2019-January/123653.html

v2:
 - Rename function to rte_atomic128_cmpxchg()
 - Replace "=A" output constraint with "=a" and "=d" to prevent GCC from
   using the al register as the sete destination
 - Extend 'weak' definition to explicitly allow non-atomic 'exp' updates.
 - Add const keyword to 'src' and remove volatile keyword from 'dst'
 - Put __int128 in a union in rte_int128_t and move the structure
   definition inside the RTE_ARCH_x86_64 ifdef
 - Drop enum rte_atomic_memmodel_t in favor of compiler-defined __ATOMIC_*
 - Drop unnecessary comment relating to X86_64
 - Tweak the pseudocode to reflect the 'exp' update on failure.

Gage Eads (1):
  eal: add 128-bit cmpxchg (x86-64 only)

 .../common/include/arch/x86/rte_atomic_64.h        | 33 ++++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 59 ++++++++++++++++++++++
 2 files changed, 92 insertions(+)

-- 
2.13.6

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2 1/1] eal: add 128-bit cmpxchg (x86-64 only)
  2019-02-22 15:46 ` [PATCH v2 0/1] Add 128-bit compare and set Gage Eads
@ 2019-02-22 15:46   ` Gage Eads
  2019-03-04 20:19     ` Honnappa Nagarahalli
  2019-03-04 20:51   ` [PATCH v3 0/1] Add 128-bit compare and set Gage Eads
  1 sibling, 1 reply; 36+ messages in thread
From: Gage Eads @ 2019-02-22 15:46 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

This operation can be used for non-blocking algorithms, such as a
non-blocking stack or ring.

Signed-off-by: Gage Eads <gage.eads@intel.com>
---
 .../common/include/arch/x86/rte_atomic_64.h        | 33 ++++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 59 ++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index fd2ec9c53..413e5361b 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -34,6 +34,7 @@
 /*
  * Inspired from FreeBSD src/sys/amd64/include/atomic.h
  * Copyright (c) 1998 Doug Rabson
+ * Copyright (c) 2019 Intel Corporation
  * All rights reserved.
  */
 
@@ -46,6 +47,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 #include <rte_atomic.h>
 
 /*------------------------- 64 bit atomic operations -------------------------*/
@@ -208,4 +210,35 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+static inline int __rte_experimental
+rte_atomic128_cmpxchg(rte_int128_t *dst,
+		      rte_int128_t *exp,
+		      const rte_int128_t *src,
+		      unsigned int weak,
+		      int success,
+		      int failure)
+{
+	RTE_SET_USED(weak);
+	RTE_SET_USED(success);
+	RTE_SET_USED(failure);
+	uint8_t res;
+
+	asm volatile (
+		      MPLOCKED
+		      "cmpxchg16b %[dst];"
+		      " sete %[res]"
+		      : [dst] "=m" (dst->val[0]),
+			"=a" (exp->val[0]),
+			"=d" (exp->val[1]),
+			[res] "=r" (res)
+		      : "b" (src->val[0]),
+			"c" (src->val[1]),
+			"a" (exp->val[0]),
+			"d" (exp->val[1]),
+			"m" (dst->val[0])
+		      : "memory");
+
+	return res;
+}
+
 #endif /* _RTE_ATOMIC_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 4afd1acc3..7c43896e9 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -14,6 +14,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 
 #ifdef __DOXYGEN__
 
@@ -1082,4 +1083,62 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#if defined(RTE_ARCH_X86_64)
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+
+/**
+ * An atomic compare and set function used by the mutex functions.
+ * (Atomically) Equivalent to:
+ *   if (*dst == exp)
+ *     *dst = src
+ *   else
+ *     *exp = *dst
+ *
+ * @note The success and failure arguments must be one of the __ATOMIC_* values
+ * defined in the C++11 standard. For details on their behavior, refer to the
+ * standard.
+ *
+ * @param dst
+ *   The destination into which the value will be written.
+ * @param exp
+ *   Pointer to the expected value. If the operation fails, this memory is
+ *   updated with the actual value.
+ * @param src
+ *   Pointer to the new value.
+ * @param weak
+ *   A value of true allows the comparison to spuriously fail and allows the
+ *   'exp' update to occur non-atomically (i.e. a torn read may occur).
+ *   Implementations may ignore this argument and only implement the strong
+ *   variant.
+ * @param success
+ *   If successful, the operation's memory behavior conforms to this (or a
+ *   stronger) model.
+ * @param failure
+ *   If unsuccessful, the operation's memory behavior conforms to this (or a
+ *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
+ *   __ATOMIC_ACQ_REL, or a stronger model than success.
+ * @return
+ *   Non-zero on success; 0 on failure.
+ */
+static inline int __rte_experimental
+rte_atomic128_cmpxchg(rte_int128_t *dst,
+		      rte_int128_t *exp,
+		      const rte_int128_t *src,
+		      unsigned int weak,
+		      int success,
+		      int failure);
+#endif
+
 #endif /* _RTE_ATOMIC_H_ */
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 1/1] eal: add 128-bit cmpxchg (x86-64 only)
  2019-02-22 15:46   ` [PATCH v2 1/1] eal: add 128-bit cmpxchg (x86-64 only) Gage Eads
@ 2019-03-04 20:19     ` Honnappa Nagarahalli
  2019-03-04 20:47       ` Eads, Gage
  0 siblings, 1 reply; 36+ messages in thread
From: Honnappa Nagarahalli @ 2019-03-04 20:19 UTC (permalink / raw)
  To: Gage Eads, dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	Gavin Hu (Arm Technology China),
	nd, chaozhu, jerinj, hemant.agrawal, Honnappa Nagarahalli, nd

> This operation can be used for non-blocking algorithms, such as a non-blocking
> stack or ring.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> ---
>  .../common/include/arch/x86/rte_atomic_64.h        | 33 ++++++++++++
>  lib/librte_eal/common/include/generic/rte_atomic.h | 59
> ++++++++++++++++++++++
>  2 files changed, 92 insertions(+)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> index fd2ec9c53..413e5361b 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -34,6 +34,7 @@
>  /*
>   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
>   * Copyright (c) 1998 Doug Rabson
> + * Copyright (c) 2019 Intel Corporation
>   * All rights reserved.
>   */
> 
> @@ -46,6 +47,7 @@
> 
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include <rte_atomic.h>
> 
>  /*------------------------- 64 bit atomic operations -------------------------*/ @@ -
> 208,4 +210,35 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)  }
> #endif
> 
> +static inline int __rte_experimental
> +rte_atomic128_cmpxchg(rte_int128_t *dst,
> +		      rte_int128_t *exp,
> +		      const rte_int128_t *src,
> +		      unsigned int weak,
> +		      int success,
> +		      int failure)
> +{
> +	RTE_SET_USED(weak);
> +	RTE_SET_USED(success);
> +	RTE_SET_USED(failure);
> +	uint8_t res;
> +
> +	asm volatile (
> +		      MPLOCKED
> +		      "cmpxchg16b %[dst];"
> +		      " sete %[res]"
> +		      : [dst] "=m" (dst->val[0]),
> +			"=a" (exp->val[0]),
> +			"=d" (exp->val[1]),
> +			[res] "=r" (res)
> +		      : "b" (src->val[0]),
> +			"c" (src->val[1]),
> +			"a" (exp->val[0]),
> +			"d" (exp->val[1]),
> +			"m" (dst->val[0])
> +		      : "memory");
> +
> +	return res;
> +}
> +
>  #endif /* _RTE_ATOMIC_X86_64_H_ */
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index 4afd1acc3..7c43896e9 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -14,6 +14,7 @@
> 
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
> 
>  #ifdef __DOXYGEN__
> 
> @@ -1082,4 +1083,62 @@ static inline void
> rte_atomic64_clear(rte_atomic64_t *v)  }  #endif
> 
> +/*------------------------ 128 bit atomic operations
> +-------------------------*/
> +
> +#if defined(RTE_ARCH_X86_64)
> +/**
> + * 128-bit integer structure.
> + */
> +RTE_STD_C11
> +typedef struct {
> +	RTE_STD_C11
> +	union {
> +		uint64_t val[2];
> +		__int128 int128;
> +	};
> +} __rte_aligned(16) rte_int128_t;
> +
> +/**
> + * An atomic compare and set function used by the mutex functions.
> + * (Atomically) Equivalent to:
> + *   if (*dst == exp)
Should be, "if (*dst == *exp)"

> + *     *dst = src
Should be "*dst = *src"

> + *   else
> + *     *exp = *dst
> + *
> + * @note The success and failure arguments must be one of the
> +__ATOMIC_* values
> + * defined in the C++11 standard. For details on their behavior, refer
> +to the
> + * standard.
> + *
> + * @param dst
> + *   The destination into which the value will be written.
> + * @param exp
> + *   Pointer to the expected value. If the operation fails, this memory is
> + *   updated with the actual value.
> + * @param src
> + *   Pointer to the new value.
> + * @param weak
> + *   A value of true allows the comparison to spuriously fail and allows the
> + *   'exp' update to occur non-atomically (i.e. a torn read may occur).
> + *   Implementations may ignore this argument and only implement the strong
> + *   variant.
> + * @param success
> + *   If successful, the operation's memory behavior conforms to this (or a
> + *   stronger) model.
> + * @param failure
> + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> + *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
> + *   __ATOMIC_ACQ_REL, or a stronger model than success.
> + * @return
> + *   Non-zero on success; 0 on failure.
> + */
> +static inline int __rte_experimental
> +rte_atomic128_cmpxchg(rte_int128_t *dst,
The name could be more neutral. Maybe rte_atomic128_compare_exchange or rte_atomic128_cmp_exchange?

> +		      rte_int128_t *exp,
> +		      const rte_int128_t *src,
> +		      unsigned int weak,
> +		      int success,
> +		      int failure);
> +#endif
> +
>  #endif /* _RTE_ATOMIC_H_ */
> --
> 2.13.6

A few minor comments.
I have not reviewed the x86 implementation.
Otherwise,
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 1/1] eal: add 128-bit cmpxchg (x86-64 only)
  2019-03-04 20:19     ` Honnappa Nagarahalli
@ 2019-03-04 20:47       ` Eads, Gage
  0 siblings, 0 replies; 36+ messages in thread
From: Eads, Gage @ 2019-03-04 20:47 UTC (permalink / raw)
  To: Honnappa Nagarahalli, dev
  Cc: olivier.matz, arybchenko, Richardson, Bruce, Ananyev, Konstantin,
	Gavin Hu (Arm Technology China),
	nd, chaozhu, jerinj, hemant.agrawal, nd

[snip]

> > +/**
> > + * An atomic compare and set function used by the mutex functions.
> > + * (Atomically) Equivalent to:
> > + *   if (*dst == exp)
> Should be, "if (*dst == *exp)"
> 
> > + *     *dst = src
> Should be "*dst = *src"
> 

Good catches, will fix both.

> > + *   else
> > + *     *exp = *dst
> > + *
> > + * @note The success and failure arguments must be one of the
> > +__ATOMIC_* values
> > + * defined in the C++11 standard. For details on their behavior,
> > +refer to the
> > + * standard.
> > + *
> > + * @param dst
> > + *   The destination into which the value will be written.
> > + * @param exp
> > + *   Pointer to the expected value. If the operation fails, this memory is
> > + *   updated with the actual value.
> > + * @param src
> > + *   Pointer to the new value.
> > + * @param weak
> > + *   A value of true allows the comparison to spuriously fail and allows the
> > + *   'exp' update to occur non-atomically (i.e. a torn read may occur).
> > + *   Implementations may ignore this argument and only implement the
> strong
> > + *   variant.
> > + * @param success
> > + *   If successful, the operation's memory behavior conforms to this (or a
> > + *   stronger) model.
> > + * @param failure
> > + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> > + *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
> > + *   __ATOMIC_ACQ_REL, or a stronger model than success.
> > + * @return
> > + *   Non-zero on success; 0 on failure.
> > + */
> > +static inline int __rte_experimental
> > +rte_atomic128_cmpxchg(rte_int128_t *dst,
> The name could be more neutral. Maybe rte_atomic128_compare_exchange or rte_atomic128_cmp_exchange?
> 
> > +		      rte_int128_t *exp,
> > +		      const rte_int128_t *src,
> > +		      unsigned int weak,
> > +		      int success,
> > +		      int failure);
> > +#endif
> > +
> >  #endif /* _RTE_ATOMIC_H_ */
> > --
> > 2.13.6
> 

Sure -- I'll rename it rte_atomic128_cmp_exchange in the next version.

> Few minor comments.
> I have not reviewed the x86 implementation.
> Otherwise,
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

Thanks,
Gage

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v3 0/1] Add 128-bit compare and set
  2019-02-22 15:46 ` [PATCH v2 0/1] Add 128-bit compare and set Gage Eads
  2019-02-22 15:46   ` [PATCH v2 1/1] eal: add 128-bit cmpxchg (x86-64 only) Gage Eads
@ 2019-03-04 20:51   ` Gage Eads
  2019-03-04 20:51     ` [PATCH v3 1/1] eal: add 128-bit compare exchange (x86-64 only) Gage Eads
  2019-04-03 17:34     ` [PATCH v4 0/1] Add 128-bit compare and set Gage Eads
  1 sibling, 2 replies; 36+ messages in thread
From: Gage Eads @ 2019-03-04 20:51 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

This patch addresses x86-64 only; other architectures can/will be supported
in the future. The __atomic intrinsic was considered for the implementation,
however libatomic was found[1] to use locks to implement the 128-bit CAS on at
least one architecture and so is eschewed here. The interface is modeled after
the __atomic_compare_exchange_16 (which itself is based on the C++11 memory
model) to best support weak consistency architectures.

This patch was originally part of a series that introduces a non-blocking stack
mempool handler[2], and is required by a non-blocking ring patchset. This
patch was spun off so that the NB ring depends only on this patch
and not on the entire non-blocking stack patchset.

[1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
[2] http://mails.dpdk.org/archives/dev/2019-January/123653.html

v3:
 - Rename function to ISA-neutral rte_atomic128_cmp_exchange()
 - Fix two pseudocode bugs in function documentation

v2:
 - Rename function to rte_atomic128_cmpxchg()
 - Replace "=A" output constraint with "=a" and "=d" to prevent GCC from using
   the al register for the sete destination
 - Extend 'weak' definition to allow non-atomic 'exp' updates.
 - Add const keyword to 'src' and remove volatile keyword from 'dst'
 - Put __int128 in a union in rte_int128_t and move the structure definition
   inside the RTE_ARCH_x86_64 ifdef
 - Drop enum rte_atomic_memmodel_t in favor of compiler-defined __ATOMIC_*
 - Drop unnecessary comment relating to X86_64
 - Tweak the pseudocode to reflect the 'exp' update on failure.

Gage Eads (1):
  eal: add 128-bit compare exchange (x86-64 only)

 .../common/include/arch/x86/rte_atomic_64.h        | 33 ++++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 59 ++++++++++++++++++++++
 2 files changed, 92 insertions(+)

-- 
2.13.6

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v3 1/1] eal: add 128-bit compare exchange (x86-64 only)
  2019-03-04 20:51   ` [PATCH v3 0/1] Add 128-bit compare and set Gage Eads
@ 2019-03-04 20:51     ` Gage Eads
  2019-03-27 23:12       ` Thomas Monjalon
  2019-04-03 17:34     ` [PATCH v4 0/1] Add 128-bit compare and set Gage Eads
  1 sibling, 1 reply; 36+ messages in thread
From: Gage Eads @ 2019-03-04 20:51 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

This operation can be used for non-blocking algorithms, such as a
non-blocking stack or ring.

Signed-off-by: Gage Eads <gage.eads@intel.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 .../common/include/arch/x86/rte_atomic_64.h        | 33 ++++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 59 ++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index fd2ec9c53..3b061ff7c 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -34,6 +34,7 @@
 /*
  * Inspired from FreeBSD src/sys/amd64/include/atomic.h
  * Copyright (c) 1998 Doug Rabson
+ * Copyright (c) 2019 Intel Corporation
  * All rights reserved.
  */
 
@@ -46,6 +47,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 #include <rte_atomic.h>
 
 /*------------------------- 64 bit atomic operations -------------------------*/
@@ -208,4 +210,35 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+			   rte_int128_t *exp,
+			   const rte_int128_t *src,
+			   unsigned int weak,
+			   int success,
+			   int failure)
+{
+	RTE_SET_USED(weak);
+	RTE_SET_USED(success);
+	RTE_SET_USED(failure);
+	uint8_t res;
+
+	asm volatile (
+		      MPLOCKED
+		      "cmpxchg16b %[dst];"
+		      " sete %[res]"
+		      : [dst] "=m" (dst->val[0]),
+			"=a" (exp->val[0]),
+			"=d" (exp->val[1]),
+			[res] "=r" (res)
+		      : "b" (src->val[0]),
+			"c" (src->val[1]),
+			"a" (exp->val[0]),
+			"d" (exp->val[1]),
+			"m" (dst->val[0])
+		      : "memory");
+
+	return res;
+}
+
 #endif /* _RTE_ATOMIC_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 4afd1acc3..a7d58c190 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -14,6 +14,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 
 #ifdef __DOXYGEN__
 
@@ -1082,4 +1083,62 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#if defined(RTE_ARCH_X86_64)
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+
+/**
+ * An atomic compare and set function used by the mutex functions.
+ * (Atomically) Equivalent to:
+ *   if (*dst == *exp)
+ *     *dst = *src
+ *   else
+ *     *exp = *dst
+ *
+ * @note The success and failure arguments must be one of the __ATOMIC_* values
+ * defined in the C++11 standard. For details on their behavior, refer to the
+ * standard.
+ *
+ * @param dst
+ *   The destination into which the value will be written.
+ * @param exp
+ *   Pointer to the expected value. If the operation fails, this memory is
+ *   updated with the actual value.
+ * @param src
+ *   Pointer to the new value.
+ * @param weak
+ *   A value of true allows the comparison to spuriously fail and allows the
+ *   'exp' update to occur non-atomically (i.e. a torn read may occur).
+ *   Implementations may ignore this argument and only implement the strong
+ *   variant.
+ * @param success
+ *   If successful, the operation's memory behavior conforms to this (or a
+ *   stronger) model.
+ * @param failure
+ *   If unsuccessful, the operation's memory behavior conforms to this (or a
+ *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
+ *   __ATOMIC_ACQ_REL, or a stronger model than success.
+ * @return
+ *   Non-zero on success; 0 on failure.
+ */
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+			   rte_int128_t *exp,
+			   const rte_int128_t *src,
+			   unsigned int weak,
+			   int success,
+			   int failure);
+#endif
+
 #endif /* _RTE_ATOMIC_H_ */
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/1] eal: add 128-bit compare exchange (x86-64 only)
  2019-03-04 20:51     ` [PATCH v3 1/1] eal: add 128-bit compare exchange (x86-64 only) Gage Eads
@ 2019-03-27 23:12       ` Thomas Monjalon
  2019-03-28 16:22         ` Eads, Gage
  0 siblings, 1 reply; 36+ messages in thread
From: Thomas Monjalon @ 2019-03-27 23:12 UTC (permalink / raw)
  To: Gage Eads
  Cc: dev, olivier.matz, arybchenko, bruce.richardson,
	konstantin.ananyev, gavin.hu, Honnappa.Nagarahalli, nd, chaozhu,
	jerinj, hemant.agrawal, david.marchand

04/03/2019 21:51, Gage Eads:
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> +#if defined(RTE_ARCH_X86_64)
> +/**
> + * 128-bit integer structure.
> + */
> +RTE_STD_C11
> +typedef struct {
> +       RTE_STD_C11
> +       union {
> +               uint64_t val[2];
> +               __int128 int128;
> +       };
> +} __rte_aligned(16) rte_int128_t;

Why adding an arch-specific definition in a generic file?
Can we move it to the x86_64 file?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/1] eal: add 128-bit compare exchange (x86-64 only)
  2019-03-27 23:12       ` Thomas Monjalon
@ 2019-03-28 16:22         ` Eads, Gage
  0 siblings, 0 replies; 36+ messages in thread
From: Eads, Gage @ 2019-03-28 16:22 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, olivier.matz, arybchenko, Richardson, Bruce, Ananyev,
	Konstantin, gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal, david.marchand

> 04/03/2019 21:51, Gage Eads:
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > +#if defined(RTE_ARCH_X86_64)
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +RTE_STD_C11
> > +typedef struct {
> > +       RTE_STD_C11
> > +       union {
> > +               uint64_t val[2];
> > +               __int128 int128;
> > +       };
> > +} __rte_aligned(16) rte_int128_t;
> 
> Why adding an arch-specific definition in a generic file?
> Can we move it to the x86_64 file?
> 

We can. I put it in the generic header in anticipation of other ISA implementations coming later, and since the atomic operations are documented in the generic header (although that ifdef appears to prevent it from being included in doxygen output).

I'll move it to x86/rte_atomic_64.h if that's preferred, and we can consider moving it back to generic as other implementations are added.

Thanks,
Gage

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v4 0/1] Add 128-bit compare and set
  2019-03-04 20:51   ` [PATCH v3 0/1] Add 128-bit compare and set Gage Eads
  2019-03-04 20:51     ` [PATCH v3 1/1] eal: add 128-bit compare exchange (x86-64 only) Gage Eads
@ 2019-04-03 17:34     ` Gage Eads
  2019-04-03 17:34       ` [PATCH v4 1/1] eal: add 128-bit compare exchange (x86-64 only) Gage Eads
  1 sibling, 1 reply; 36+ messages in thread
From: Gage Eads @ 2019-04-03 17:34 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal, thomas

This patch addresses x86-64 only; other architectures can/will be supported
in the future. The __atomic intrinsic was considered for the implementation,
however libatomic was found[1] to use locks to implement the 128-bit CAS on at
least one architecture and so is eschewed here. The interface is modeled after
the __atomic_compare_exchange_16 (which itself is based on the C++11 memory
model) to best support weak consistency architectures.

This patch was originally part of a series that introduces a non-blocking stack
mempool handler[2], and is required by a non-blocking ring patchset. This
patch was spun off so that the NB ring depends only on this patch
and not on the entire non-blocking stack patchset.

[1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
[2] http://mails.dpdk.org/archives/dev/2019-January/123653.html

v4:
 - Move function declaration from generic/rte_atomic.h to x86-64 header file

v3:
 - Rename function to ISA-neutral rte_atomic128_cmp_exchange()
 - Fix two pseudocode bugs in function documentation

v2:
 - Rename function to rte_atomic128_cmpxchg()
 - Replace "=A" output constraint with "=a" and "=d" to prevent GCC from using
   the al register for the sete destination
 - Extend 'weak' definition to allow non-atomic 'exp' updates.
 - Add const keyword to 'src' and remove volatile keyword from 'dst'
 - Put __int128 in a union in rte_int128_t and move the structure definition
   inside the RTE_ARCH_x86_64 ifdef
 - Drop enum rte_atomic_memmodel_t in favor of compiler-defined __ATOMIC_*
 - Drop unnecessary comment relating to X86_64
 - Tweak the pseudocode to reflect the 'exp' update on failure.

Gage Eads (1):
  eal: add 128-bit compare exchange (x86-64 only)

 .../common/include/arch/x86/rte_atomic_64.h        | 81 ++++++++++++++++++++++
 1 file changed, 81 insertions(+)

-- 
2.13.6

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v4 1/1] eal: add 128-bit compare exchange (x86-64 only)
  2019-04-03 17:34     ` [PATCH v4 0/1] Add 128-bit compare and set Gage Eads
@ 2019-04-03 17:34       ` Gage Eads
  2019-04-03 19:04         ` Thomas Monjalon
  0 siblings, 1 reply; 36+ messages in thread
From: Gage Eads @ 2019-04-03 17:34 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal, thomas

This operation can be used for non-blocking algorithms, such as a
non-blocking stack or ring.

Signed-off-by: Gage Eads <gage.eads@intel.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 .../common/include/arch/x86/rte_atomic_64.h        | 81 ++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index fd2ec9c53..49bce32c7 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -34,6 +34,7 @@
 /*
  * Inspired from FreeBSD src/sys/amd64/include/atomic.h
  * Copyright (c) 1998 Doug Rabson
+ * Copyright (c) 2019 Intel Corporation
  * All rights reserved.
  */
 
@@ -46,6 +47,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 #include <rte_atomic.h>
 
 /*------------------------- 64 bit atomic operations -------------------------*/
@@ -208,4 +210,83 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+
+/**
+ * An atomic compare and set function used by the mutex functions.
+ * (Atomically) Equivalent to:
+ *   if (*dst == *exp)
+ *     *dst = *src
+ *   else
+ *     *exp = *dst
+ *
+ * @note The success and failure arguments must be one of the __ATOMIC_* values
+ * defined in the C++11 standard. For details on their behavior, refer to the
+ * standard.
+ *
+ * @param dst
+ *   The destination into which the value will be written.
+ * @param exp
+ *   Pointer to the expected value. If the operation fails, this memory is
+ *   updated with the actual value.
+ * @param src
+ *   Pointer to the new value.
+ * @param weak
+ *   A value of true allows the comparison to spuriously fail and allows the
+ *   'exp' update to occur non-atomically (i.e. a torn read may occur).
+ *   Implementations may ignore this argument and only implement the strong
+ *   variant.
+ * @param success
+ *   If successful, the operation's memory behavior conforms to this (or a
+ *   stronger) model.
+ * @param failure
+ *   If unsuccessful, the operation's memory behavior conforms to this (or a
+ *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
+ *   __ATOMIC_ACQ_REL, or a stronger model than success.
+ * @return
+ *   Non-zero on success; 0 on failure.
+ */
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+			   rte_int128_t *exp,
+			   const rte_int128_t *src,
+			   unsigned int weak,
+			   int success,
+			   int failure)
+{
+	RTE_SET_USED(weak);
+	RTE_SET_USED(success);
+	RTE_SET_USED(failure);
+	uint8_t res;
+
+	asm volatile (
+		      MPLOCKED
+		      "cmpxchg16b %[dst];"
+		      " sete %[res]"
+		      : [dst] "=m" (dst->val[0]),
+			"=a" (exp->val[0]),
+			"=d" (exp->val[1]),
+			[res] "=r" (res)
+		      : "b" (src->val[0]),
+			"c" (src->val[1]),
+			"a" (exp->val[0]),
+			"d" (exp->val[1]),
+			"m" (dst->val[0])
+		      : "memory");
+
+	return res;
+}
+
 #endif /* _RTE_ATOMIC_X86_64_H_ */
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 36+ messages in thread
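A minimal usage sketch of the API above (not part of the patch): pushing onto
a lock-free stack whose head is a {pointer, tag} pair packed into an
rte_int128_t. Only rte_int128_t and rte_atomic128_cmp_exchange come from the
patch; my_stack, my_node and my_stack_push are hypothetical names.

#include <stdint.h>
#include <rte_atomic.h>

struct my_node {
	struct my_node *next;
};

struct my_stack {
	/* val[0] holds the top-of-stack pointer, val[1] an ABA-avoidance tag */
	rte_int128_t head;
};

static inline void
my_stack_push(struct my_stack *s, struct my_node *n)
{
	rte_int128_t old_head, new_head;

	/* A plain (possibly torn) read is acceptable here: a stale snapshot
	 * only makes the first attempt fail, and the failed call reloads
	 * old_head with the actual head, per the documented semantics. */
	old_head = s->head;

	do {
		n->next = (struct my_node *)(uintptr_t)old_head.val[0];
		new_head.val[0] = (uintptr_t)n;
		new_head.val[1] = old_head.val[1] + 1;
	} while (!rte_atomic128_cmp_exchange(&s->head, &old_head, &new_head,
					     1, __ATOMIC_RELEASE,
					     __ATOMIC_RELAXED));
}

The weak flag is set because the loop retries regardless; the x86-64
implementation ignores it and the memory-order arguments, but passing
meaningful orders keeps the call portable to the weakly ordered
implementations the cover letter anticipates.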

* Re: [PATCH v4 1/1] eal: add 128-bit compare exchange (x86-64 only)
  2019-04-03 17:34       ` [PATCH v4 1/1] eal: add 128-bit compare exchange (x86-64 only) Gage Eads
@ 2019-04-03 19:04         ` Thomas Monjalon
  2019-04-03 19:21           ` Eads, Gage
  0 siblings, 1 reply; 36+ messages in thread
From: Thomas Monjalon @ 2019-04-03 19:04 UTC (permalink / raw)
  To: Gage Eads
  Cc: dev, olivier.matz, arybchenko, bruce.richardson,
	konstantin.ananyev, gavin.hu, Honnappa.Nagarahalli, nd, chaozhu,
	jerinj, hemant.agrawal

03/04/2019 19:34, Gage Eads:
> This operation can be used for non-blocking algorithms, such as a
> non-blocking stack or ring.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +/**
> + * An atomic compare and set function used by the mutex functions.
> + * (Atomically) Equivalent to:
> + *   if (*dst == *exp)
> + *     *dst = *src
> + *   else
> + *     *exp = *dst
> + *
> + * @note The success and failure arguments must be one of the __ATOMIC_* values
> + * defined in the C++11 standard. For details on their behavior, refer to the
> + * standard.
> + *
> + * @param dst
> + *   The destination into which the value will be written.
> + * @param exp
> + *   Pointer to the expected value. If the operation fails, this memory is
> + *   updated with the actual value.
> + * @param src
> + *   Pointer to the new value.
> + * @param weak
> + *   A value of true allows the comparison to spuriously fail and allows the
> + *   'exp' update to occur non-atomically (i.e. a torn read may occur).
> + *   Implementations may ignore this argument and only implement the strong
> + *   variant.
> + * @param success
> + *   If successful, the operation's memory behavior conforms to this (or a
> + *   stronger) model.
> + * @param failure
> + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> + *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
> + *   __ATOMIC_ACQ_REL, or a stronger model than success.
> + * @return
> + *   Non-zero on success; 0 on failure.
> + */
> +static inline int __rte_experimental
> +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> +			   rte_int128_t *exp,
> +			   const rte_int128_t *src,
> +			   unsigned int weak,
> +			   int success,
> +			   int failure)

I was thinking about keeping the doxygen in the generic file.
Is it possible?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v4 1/1] eal: add 128-bit compare exchange (x86-64 only)
  2019-04-03 19:04         ` Thomas Monjalon
@ 2019-04-03 19:21           ` Eads, Gage
  2019-04-03 19:27             ` Thomas Monjalon
  0 siblings, 1 reply; 36+ messages in thread
From: Eads, Gage @ 2019-04-03 19:21 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, olivier.matz, arybchenko, Richardson, Bruce, Ananyev,
	Konstantin, gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

> 03/04/2019 19:34, Gage Eads:
> > This operation can be used for non-blocking algorithms, such as a
> > non-blocking stack or ring.
> >
> > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > ---
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +/**
> > + * An atomic compare and set function used by the mutex functions.
> > + * (Atomically) Equivalent to:
> > + *   if (*dst == *exp)
> > + *     *dst = *src
> > + *   else
> > + *     *exp = *dst
> > + *
> > + * @note The success and failure arguments must be one of the __ATOMIC_* values
> > + * defined in the C++11 standard. For details on their behavior, refer to the
> > + * standard.
> > + *
> > + * @param dst
> > + *   The destination into which the value will be written.
> > + * @param exp
> > + *   Pointer to the expected value. If the operation fails, this memory is
> > + *   updated with the actual value.
> > + * @param src
> > + *   Pointer to the new value.
> > + * @param weak
> > + *   A value of true allows the comparison to spuriously fail and allows the
> > + *   'exp' update to occur non-atomically (i.e. a torn read may occur).
> > + *   Implementations may ignore this argument and only implement the strong
> > + *   variant.
> > + * @param success
> > + *   If successful, the operation's memory behavior conforms to this (or a
> > + *   stronger) model.
> > + * @param failure
> > + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> > + *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
> > + *   __ATOMIC_ACQ_REL, or a stronger model than success.
> > + * @return
> > + *   Non-zero on success; 0 on failure.
> > + */
> > +static inline int __rte_experimental
> > +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> > +			   rte_int128_t *exp,
> > +			   const rte_int128_t *src,
> > +			   unsigned int weak,
> > +			   int success,
> > +			   int failure)
> 
> I was thinking about keeping the doxygen in the generic file.
> Is it possible?
> 

We'd need to include the definition of rte_int128_t, so we'd also need an ifdef on either RTE_ARCH_64 or RTE_ARCH_X86_64 to protect 32-bit builds. That macro would prevent doxygen from parsing that section, unless we add a workaround such as:

#if defined(RTE_ARCH_64) || defined(__DOXYGEN__)

So the patch would look like the v3, with the declaration in generic/rte_atomic.h, but with that preprocessor change. If we change RTE_ARCH_X86_64 to RTE_ARCH_64, I'd add a note clarifying that it's only implemented for x86-64. What do you think?

^ permalink raw reply	[flat|nested] 36+ messages in thread
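Concretely, the guard proposed above for generic/rte_atomic.h would look
roughly like this (a sketch of the suggestion, not the code that was finally
merged; RTE_STD_C11, __rte_aligned and __rte_experimental come from
rte_common.h/rte_compat.h as in the patch, and the full doxygen block is
elided):

#if defined(RTE_ARCH_64) || defined(__DOXYGEN__)

/** 128-bit integer structure. */
RTE_STD_C11
typedef struct {
	RTE_STD_C11
	union {
		uint64_t val[2];
		__int128 int128;
	};
} __rte_aligned(16) rte_int128_t;

/**
 * An atomic 128-bit compare and exchange.
 * @note Currently only implemented for x86-64; see the doxygen in the patch
 * for the full parameter description.
 */
static inline int __rte_experimental
rte_atomic128_cmp_exchange(rte_int128_t *dst, rte_int128_t *exp,
			   const rte_int128_t *src, unsigned int weak,
			   int success, int failure);

#endif /* RTE_ARCH_64 || __DOXYGEN__ */

With this arrangement the x86-64 header keeps the actual cmpxchg16b-based
definition, and doxygen still sees the declaration, assuming the
documentation build predefines __DOXYGEN__, which is what this workaround
relies on.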

* Re: [PATCH v4 1/1] eal: add 128-bit compare exchange (x86-64 only)
  2019-04-03 19:21           ` Eads, Gage
@ 2019-04-03 19:27             ` Thomas Monjalon
  0 siblings, 0 replies; 36+ messages in thread
From: Thomas Monjalon @ 2019-04-03 19:27 UTC (permalink / raw)
  To: Eads, Gage
  Cc: dev, olivier.matz, arybchenko, Richardson, Bruce, Ananyev,
	Konstantin, gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

03/04/2019 21:21, Eads, Gage:
> > 03/04/2019 19:34, Gage Eads:
> > > This operation can be used for non-blocking algorithms, such as a
> > > non-blocking stack or ring.
> > >
> > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > > ---
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > +/**
> > > + * An atomic compare and set function used by the mutex functions.
> > > + * (Atomically) Equivalent to:
> > > + *   if (*dst == *exp)
> > > + *     *dst = *src
> > > + *   else
> > > + *     *exp = *dst
> > > + *
> > > + * @note The success and failure arguments must be one of the __ATOMIC_* values
> > > + * defined in the C++11 standard. For details on their behavior, refer to the
> > > + * standard.
> > > + *
> > > + * @param dst
> > > + *   The destination into which the value will be written.
> > > + * @param exp
> > > + *   Pointer to the expected value. If the operation fails, this memory is
> > > + *   updated with the actual value.
> > > + * @param src
> > > + *   Pointer to the new value.
> > > + * @param weak
> > > + *   A value of true allows the comparison to spuriously fail and allows the
> > > + *   'exp' update to occur non-atomically (i.e. a torn read may occur).
> > > + *   Implementations may ignore this argument and only implement the strong
> > > + *   variant.
> > > + * @param success
> > > + *   If successful, the operation's memory behavior conforms to this (or a
> > > + *   stronger) model.
> > > + * @param failure
> > > + *   If unsuccessful, the operation's memory behavior conforms to this (or
> > a
> > > + *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
> > > + *   __ATOMIC_ACQ_REL, or a stronger model than success.
> > > + * @return
> > > + *   Non-zero on success; 0 on failure.
> > > + */
> > > +static inline int __rte_experimental
> > > +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> > > +			   rte_int128_t *exp,
> > > +			   const rte_int128_t *src,
> > > +			   unsigned int weak,
> > > +			   int success,
> > > +			   int failure)
> > 
> > I was thinking about keeping the doxygen in the generic file.
> > Is it possible?
> > 
> 
> We'd need to include the definition of rte_int128_t, so we'd also need either an ifdef on RTE_ARCH_64 or RTE_ARCH_X86_64 to protect 32-bit builds. That macro would prevent doxygen from parsing that section, unless we add a workaround like, for example:
> 
> #if defined(RTE_ARCH_64) || defined(__DOXYGEN__)
> 
> So the patch would look like the v3, with the declaration in generic/rte_atomic.h, but with that preprocessor change. If we change RTE_ARCH_X86_64 to RTE_ARCH_64, I'd add a note clarifying that it's only implemented for x86-64. What do you think?

I would like to see the doc in the generic file
and the implementation in the x86 file.

I tried a forward declaration of the typedef:
	struct rte_int128;
	typedef struct rte_int128 rte_int128_t;
I don't know why it does not work.
So I'm trying to protect the declaration with
	#ifdef __DOXYGEN__

^ permalink raw reply	[flat|nested] 36+ messages in thread
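The forward declaration most likely fails because the x86-64 header defines
rte_int128_t as a typedef of an anonymous struct, so the generic typedef and
the arch typedef end up naming two different types and the compiler reports
conflicting declarations. A sketch of the variant that would compile, i.e.
giving the struct a tag so both files refer to the same type (hypothetical,
not what was merged):

#include <stdint.h>

/* "generic" side: an incomplete type is enough to declare the API */
struct rte_int128;
typedef struct rte_int128 rte_int128_t;

int rte_atomic128_cmp_exchange(rte_int128_t *dst, rte_int128_t *exp,
			       const rte_int128_t *src, unsigned int weak,
			       int success, int failure);

/* "arch" side: completing the same 'struct rte_int128' tag keeps the
 * typedef consistent with the forward declaration above */
struct __attribute__((aligned(16))) rte_int128 {
	union {
		uint64_t val[2];
		__extension__ __int128 int128;
	};
};

The __DOXYGEN__ guard avoids the problem entirely by keeping the generic
declaration out of real compilations, which is the route the next version
takes.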

* [PATCH v5] eal/x86: add 128-bit atomic compare exchange
  2019-01-28 17:29 [PATCH 0/1] Add 128-bit compare and set Gage Eads
  2019-01-28 17:29 ` [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only) Gage Eads
  2019-02-22 15:46 ` [PATCH v2 0/1] Add 128-bit compare and set Gage Eads
@ 2019-04-03 19:35 ` Thomas Monjalon
  2019-04-03 19:44   ` [PATCH v6] " Gage Eads
  2 siblings, 1 reply; 36+ messages in thread
From: Thomas Monjalon @ 2019-04-03 19:35 UTC (permalink / raw)
  To: gage.eads, Bruce Richardson, Konstantin Ananyev; +Cc: dev, Honnappa Nagarahalli

From: Gage Eads <gage.eads@intel.com>

This operation can be used for non-blocking algorithms, such as a
non-blocking stack or ring.

It is available only for x86_64.

Signed-off-by: Gage Eads <gage.eads@intel.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---

This patch addresses x86-64 only; other architectures can/will be supported
in the future. The __atomic intrinsic was considered for the implementation,
however libatomic was found[1] to use locks to implement the 128-bit CAS on at
least one architecture and so is eschewed here. The interface is modeled after
the __atomic_compare_exchange_16 (which itself is based on the C++11 memory
model) to best support weak consistency architectures.

This patch was originally part of a series that introduces a non-blocking stack
mempool handler[2], and is required by a non-blocking ring patchset. This
patch was spun off so that the NB ring depends only on this patch
and not on the entire non-blocking stack patchset.

[1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
[2] http://mails.dpdk.org/archives/dev/2019-January/123653.html

v5:
 - Move declaration in the generic file for doxygen only (Thomas)

v4:
 - Move function declaration from generic/rte_atomic.h to x86-64 header file

v3:
 - Rename function to ISA-neutral rte_atomic128_cmp_exchange()
 - Fix two pseudocode bugs in function documentation

v2:
 - Rename function to rte_atomic128_cmpxchg()
 - Replace "=A" output constraint with "=a" and "=d" to prevent GCC from using
   the al register for the sete destination
 - Extend 'weak' definition to allow non-atomic 'exp' updates.
 - Add const keyword to 'src' and remove volatile keyword from 'dst'
 - Put __int128 in a union in rte_int128_t and move the structure definition
   inside the RTE_ARCH_X86_64 ifdef
 - Drop enum rte_atomic_memmodel_t in favor of compiler-defined __ATOMIC_*
 - Drop unnecessary comment relating to X86_64
 - Tweak the pseudocode to reflect the 'exp' update on failure.

---
 .../common/include/arch/x86/rte_atomic_64.h   | 47 ++++++++++++++++++
 .../common/include/generic/rte_atomic.h       | 48 +++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index fd2ec9c53..4b8315926 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -34,6 +34,7 @@
 /*
  * Inspired from FreeBSD src/sys/amd64/include/atomic.h
  * Copyright (c) 1998 Doug Rabson
+ * Copyright (c) 2019 Intel Corporation
  * All rights reserved.
  */
 
@@ -46,6 +47,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 #include <rte_atomic.h>
 
 /*------------------------- 64 bit atomic operations -------------------------*/
@@ -208,4 +210,49 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+			   rte_int128_t *exp,
+			   const rte_int128_t *src,
+			   unsigned int weak,
+			   int success,
+			   int failure)
+{
+	RTE_SET_USED(weak);
+	RTE_SET_USED(success);
+	RTE_SET_USED(failure);
+	uint8_t res;
+
+	asm volatile (
+		      MPLOCKED
+		      "cmpxchg16b %[dst];"
+		      " sete %[res]"
+		      : [dst] "=m" (dst->val[0]),
+			"=a" (exp->val[0]),
+			"=d" (exp->val[1]),
+			[res] "=r" (res)
+		      : "b" (src->val[0]),
+			"c" (src->val[1]),
+			"a" (exp->val[0]),
+			"d" (exp->val[1]),
+			"m" (dst->val[0])
+		      : "memory");
+
+	return res;
+}
+
 #endif /* _RTE_ATOMIC_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index e91742702..f057072fc 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1079,4 +1079,52 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#ifdef __DOXYGEN__
+
+/**
+ * An atomic compare and set function used by the mutex functions.
+ * (Atomically) Equivalent to:
+ *   if (*dst == *exp)
+ *     *dst = *src
+ *   else
+ *     *exp = *dst
+ *
+ * @note The success and failure arguments must be one of the __ATOMIC_* values
+ * defined in the C++11 standard. For details on their behavior, refer to the
+ * standard.
+ *
+ * @param dst
+ *   The destination into which the value will be written.
+ * @param exp
+ *   Pointer to the expected value. If the operation fails, this memory is
+ *   updated with the actual value.
+ * @param src
+ *   Pointer to the new value.
+ * @param weak
+ *   A value of true allows the comparison to spuriously fail and allows the
+ *   'exp' update to occur non-atomically (i.e. a torn read may occur).
+ *   Implementations may ignore this argument and only implement the strong
+ *   variant.
+ * @param success
+ *   If successful, the operation's memory behavior conforms to this (or a
+ *   stronger) model.
+ * @param failure
+ *   If unsuccessful, the operation's memory behavior conforms to this (or a
+ *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
+ *   __ATOMIC_ACQ_REL, or a stronger model than success.
+ * @return
+ *   Non-zero on success; 0 on failure.
+ */
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+			   rte_int128_t *exp,
+			   const rte_int128_t *src,
+			   unsigned int weak,
+			   int success,
+			   int failure);
+
+#endif /* __DOXYGEN__ */
+
 #endif /* _RTE_ATOMIC_H_ */
-- 
2.21.0

^ permalink raw reply related	[flat|nested] 36+ messages in thread
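For context on why the implementation uses inline assembly, this is roughly
what the __atomic-based alternative mentioned in the cover letter looks like
(a sketch; the typedef and function names are hypothetical). Depending on the
compiler version and flags such as -mcx16, GCC may inline cmpxchg16b or emit
a call to __atomic_compare_exchange_16 in libatomic, and it is that libatomic
path which was found to fall back to locks on at least one architecture:

#include <stdbool.h>

__extension__ typedef __int128 int128_alias_t;

/* May require linking with -latomic if the compiler does not inline it. */
static inline bool
cmp_exchange_128_builtin(int128_alias_t *dst, int128_alias_t *exp,
			 int128_alias_t desired, bool weak,
			 int success, int failure)
{
	return __atomic_compare_exchange_n(dst, exp, desired, weak,
					   success, failure);
}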

* [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-03 19:35 ` [PATCH v5] eal/x86: add 128-bit atomic compare exchange Thomas Monjalon
@ 2019-04-03 19:44   ` Gage Eads
  2019-04-03 20:01     ` Thomas Monjalon
  2019-04-04 11:47     ` Ferruh Yigit
  0 siblings, 2 replies; 36+ messages in thread
From: Gage Eads @ 2019-04-03 19:44 UTC (permalink / raw)
  To: dev
  Cc: olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

This operation can be used for non-blocking algorithms, such as a
non-blocking stack or ring.

It is available only for x86_64.

Signed-off-by: Gage Eads <gage.eads@intel.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
This patch addresses x86-64 only; other architectures can/will be supported
in the future. The __atomic intrinsic was considered for the implementation,
however libatomic was found[1] to use locks to implement the 128-bit CAS on at
least one architecture and so is eschewed here. The interface is modeled after
the __atomic_compare_exchange_16 (which itself is based on the C++11 memory
model) to best support weak consistency architectures.

This patch was originally part of a series that introduces a non-blocking stack
mempool handler[2], and is required by a non-blocking ring patchset. This
patch was spun off so that the NB ring depends only on this patch
and not on the entire non-blocking stack patchset.

[1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
[2] http://mails.dpdk.org/archives/dev/2019-January/123653.html

v6:
- Use @code and @endcode to clean up pseudo-code generation
- Add note that the function is currently limited to x86-64

v5:
 - Move declaration in the generic file for doxygen only (Thomas)

v4:
 - Move function declaration from generic/rte_atomic.h to x86-64 header file

v3:
 - Rename function to ISA-neutral rte_atomic128_cmp_exchange()
 - Fix two pseudocode bugs in function documentation

v2:
 - Rename function to rte_atomic128_cmpxchg()
 - Replace "=A" output constraint with "=a" and "=d" to prevent GCC from using
   the al register for the sete destination
 - Extend 'weak' definition to allow non-atomic 'exp' updates.
 - Add const keyword to 'src' and remove volatile keyword from 'dst'
 - Put __int128 in a union in rte_int128_t and move the structure definition
   inside the RTE_ARCH_X86_64 ifdef
 - Drop enum rte_atomic_memmodel_t in favor of compiler-defined __ATOMIC_*
 - Drop unnecessary comment relating to X86_64
 - Tweak the pseudocode to reflect the 'exp' update on failure.

 .../common/include/arch/x86/rte_atomic_64.h        | 47 +++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 52 ++++++++++++++++++++++
 2 files changed, 99 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index fd2ec9c53..4b8315926 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -34,6 +34,7 @@
 /*
  * Inspired from FreeBSD src/sys/amd64/include/atomic.h
  * Copyright (c) 1998 Doug Rabson
+ * Copyright (c) 2019 Intel Corporation
  * All rights reserved.
  */
 
@@ -46,6 +47,7 @@
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 #include <rte_atomic.h>
 
 /*------------------------- 64 bit atomic operations -------------------------*/
@@ -208,4 +210,49 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+	RTE_STD_C11
+	union {
+		uint64_t val[2];
+		__int128 int128;
+	};
+} __rte_aligned(16) rte_int128_t;
+
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+			   rte_int128_t *exp,
+			   const rte_int128_t *src,
+			   unsigned int weak,
+			   int success,
+			   int failure)
+{
+	RTE_SET_USED(weak);
+	RTE_SET_USED(success);
+	RTE_SET_USED(failure);
+	uint8_t res;
+
+	asm volatile (
+		      MPLOCKED
+		      "cmpxchg16b %[dst];"
+		      " sete %[res]"
+		      : [dst] "=m" (dst->val[0]),
+			"=a" (exp->val[0]),
+			"=d" (exp->val[1]),
+			[res] "=r" (res)
+		      : "b" (src->val[0]),
+			"c" (src->val[1]),
+			"a" (exp->val[0]),
+			"d" (exp->val[1]),
+			"m" (dst->val[0])
+		      : "memory");
+
+	return res;
+}
+
 #endif /* _RTE_ATOMIC_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index e91742702..99585436c 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -1079,4 +1079,56 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#ifdef __DOXYGEN__
+
+/**
+ * An atomic compare and set function used by the mutex functions.
+ * (Atomically) Equivalent to:
+ * @code
+ *   if (*dst == *exp)
+ *     *dst = *src
+ *   else
+ *     *exp = *dst
+ * @endcode
+ *
+ * @note This function is currently only available for the x86-64 platform.
+ *
+ * @note The success and failure arguments must be one of the __ATOMIC_* values
+ * defined in the C++11 standard. For details on their behavior, refer to the
+ * standard.
+ *
+ * @param dst
+ *   The destination into which the value will be written.
+ * @param exp
+ *   Pointer to the expected value. If the operation fails, this memory is
+ *   updated with the actual value.
+ * @param src
+ *   Pointer to the new value.
+ * @param weak
+ *   A value of true allows the comparison to spuriously fail and allows the
+ *   'exp' update to occur non-atomically (i.e. a torn read may occur).
+ *   Implementations may ignore this argument and only implement the strong
+ *   variant.
+ * @param success
+ *   If successful, the operation's memory behavior conforms to this (or a
+ *   stronger) model.
+ * @param failure
+ *   If unsuccessful, the operation's memory behavior conforms to this (or a
+ *   stronger) model. This argument cannot be __ATOMIC_RELEASE,
+ *   __ATOMIC_ACQ_REL, or a stronger model than success.
+ * @return
+ *   Non-zero on success; 0 on failure.
+ */
+static inline int __rte_experimental
+rte_atomic128_cmp_exchange(rte_int128_t *dst,
+			   rte_int128_t *exp,
+			   const rte_int128_t *src,
+			   unsigned int weak,
+			   int success,
+			   int failure);
+
+#endif /* __DOXYGEN__ */
+
 #endif /* _RTE_ATOMIC_H_ */
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 36+ messages in thread
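A quick illustrative check of the documented semantics (success returns
non-zero; on failure 'exp' is written back with the actual value). This is a
sketch for readers of the thread, not part of the patch or of DPDK's test
suite:

#include <assert.h>
#include <rte_atomic.h>

static void
check_cmp_exchange(void)
{
	rte_int128_t dst = { .val = {1, 2} };
	rte_int128_t exp = { .val = {1, 2} };
	rte_int128_t src = { .val = {3, 4} };

	/* Matching 'exp': the swap happens and non-zero is returned. */
	assert(rte_atomic128_cmp_exchange(&dst, &exp, &src, 0,
					  __ATOMIC_ACQ_REL,
					  __ATOMIC_ACQUIRE));
	assert(dst.val[0] == 3 && dst.val[1] == 4);

	/* Stale 'exp': the call fails and 'exp' is reloaded from 'dst'. */
	exp.val[0] = 1;
	exp.val[1] = 2;
	assert(!rte_atomic128_cmp_exchange(&dst, &exp, &src, 0,
					   __ATOMIC_ACQ_REL,
					   __ATOMIC_ACQUIRE));
	assert(exp.val[0] == 3 && exp.val[1] == 4);
}

On x86-64 the success/failure arguments are ignored, since the lock-prefixed
cmpxchg16b already acts as a full barrier, so any valid pair behaves the same
here.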

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-03 19:44   ` [PATCH v6] " Gage Eads
@ 2019-04-03 20:01     ` Thomas Monjalon
  2019-04-04 11:47     ` Ferruh Yigit
  1 sibling, 0 replies; 36+ messages in thread
From: Thomas Monjalon @ 2019-04-03 20:01 UTC (permalink / raw)
  To: Gage Eads
  Cc: dev, olivier.matz, arybchenko, bruce.richardson,
	konstantin.ananyev, gavin.hu, Honnappa.Nagarahalli, nd, chaozhu,
	jerinj, hemant.agrawal

03/04/2019 21:44, Gage Eads:
> This operation can be used for non-blocking algorithms, such as a
> non-blocking stack or ring.
> 
> It is available only for x86_64.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
> This patch addresses x86-64 only; other architectures can/will be supported
> in the future. The __atomic intrinsic was considered for the implementation,
> however libatomic was found[1] to use locks to implement the 128-bit CAS on at
> least one architecture and so is eschewed here. The interface is modeled after
> the __atomic_compare_exchange_16 (which itself is based on the C++11 memory
> model) to best support weak consistency architectures.
> 
> This patch was originally part of a series that introduces a non-blocking stack
> mempool handler[2], and is required by a non-blocking ring patchset. This
> patch was spun off so that the NB ring depends only on this patch
> and not on the entire non-blocking stack patchset.
> 
> [1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
> [2] http://mails.dpdk.org/archives/dev/2019-January/123653.html
> 
> v6:
> - Use @code and @endcode to clean up pseudo-code generation
> - Add note that the function is currently limited to x86-64
> 
> v5:
>  - Move declaration in the generic file for doxygen only (Thomas)
> 
> v4:
>  - Move function declaration from generic/rte_atomic.h to x86-64 header file
> 
> v3:
>  - Rename function to ISA-neutral rte_atomic128_cmp_exchange()
>  - Fix two pseudocode bugs in function documentation
> 
> v2:
>  - Rename function to rte_atomic128_cmpxchg()
>  - Replace "=A" output constraint with "=a" and "=d" to prevent GCC from using
>    the al register for the sete destination
>  - Extend 'weak' definition to allow non-atomic 'exp' updates.
>  - Add const keyword to 'src' and remove volatile keyword from 'dst'
>  - Put __int128 in a union in rte_int128_t and move the structure definition
>    inside the RTE_ARCH_X86_64 ifdef
>  - Drop enum rte_atomic_memmodel_t in favor of compiler-defined __ATOMIC_*
>  - Drop unnecessary comment relating to X86_64
>  - Tweak the pseudocode to reflect the 'exp' update on failure.
> 
>  .../common/include/arch/x86/rte_atomic_64.h        | 47 +++++++++++++++++++
>  lib/librte_eal/common/include/generic/rte_atomic.h | 52 ++++++++++++++++++++++
>  2 files changed, 99 insertions(+)

Applied, thanks

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-03 19:44   ` [PATCH v6] " Gage Eads
  2019-04-03 20:01     ` Thomas Monjalon
@ 2019-04-04 11:47     ` Ferruh Yigit
  2019-04-04 12:08       ` Thomas Monjalon
  1 sibling, 1 reply; 36+ messages in thread
From: Ferruh Yigit @ 2019-04-04 11:47 UTC (permalink / raw)
  To: Gage Eads, Shahaf Shuler, Matan Azrad, Yongseok Koh
  Cc: dev, olivier.matz, arybchenko, bruce.richardson,
	konstantin.ananyev, gavin.hu, Honnappa.Nagarahalli, nd, chaozhu,
	jerinj, hemant.agrawal, Thomas Monjalon

On 4/3/2019 8:44 PM, Gage Eads wrote:
> This operation can be used for non-blocking algorithms, such as a
> non-blocking stack or ring.
> 
> It is available only for x86_64.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
> This patch addresses x86-64 only; other architectures can/will be supported
> in the future. The __atomic intrinsic was considered for the implementation,
> however libatomic was found[1] to use locks to implement the 128-bit CAS on at
> least one architecture and so is eschewed here. The interface is modeled after
> the __atomic_compare_exchange_16 (which itself is based on the C++11 memory
> model) to best support weak consistency architectures.
> 
> This patch was originally part of a series that introduces a non-blocking stack
> mempool handler[2], and is required by a non-blocking ring patchset. This
> patch was spun off so that the NB ring depends only on this patch
> and not on the entire non-blocking stack patchset.
> 
> [1] http://mails.dpdk.org/archives/dev/2019-January/124002.html
> [2] http://mails.dpdk.org/archives/dev/2019-January/123653.html
> 
> v6:
> - Use @code and @endcode to clean up pseudo-code generation
> - Add note that the function is currently limited to x86-64
> 
> v5:
>  - Move declaration in the generic file for doxygen only (Thomas)
> 
> v4:
>  - Move function declaration from generic/rte_atomic.h to x86-64 header file
> 
> v3:
>  - Rename function to ISA-neutral rte_atomic128_cmp_exchange()
>  - Fix two pseudocode bugs in function documentation
> 
> v2:
>  - Rename function to rte_atomic128_cmpxchg()
>  - Replace "=A" output constraint with "=a" and "=d" to prevent GCC from using
>    the al register for the sete destination
>  - Extend 'weak' definition to allow non-atomic 'exp' updates.
>  - Add const keyword to 'src' and remove volatile keyword from 'dst'
>  - Put __int128 in a union in rte_int128_t and move the structure definition
>    inside the RTE_ARCH_X86_64 ifdef
>  - Drop enum rte_atomic_memmodel_t in favor of compiler-defined __ATOMIC_*
>  - Drop unnecessary comment relating to X86_64
>  - Tweak the pseudocode to reflect the 'exp' update on failure.
> 

<...>

I am getting a build error, which I guess is introduced by this patch, while
building the mlx drivers when MLX._DEBUG is enabled [1].

This looks to be because of the "-pedantic" parameter that the mlx drivers use [2].


Gage, Shahaf, Matan, Yongseok,

Can you please check this?


[1]
.../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3: error: ISO C does not support ‘__int128’ types [-Werror=pedantic]
   __int128 int128;
   ^~~~~~~~
cc1: all warnings being treated as errors

[2]
Indeed I would like to discuss this distinct behavior of the mlx drivers;
"-pedantic" has caused problems many times, and I will bring the discussion
up in a separate thread.

^ permalink raw reply	[flat|nested] 36+ messages in thread
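For anyone reproducing this outside the mlx build (a sketch; the exact mlx
CFLAGS are not shown in this thread beyond "-pedantic"): compiling the
snippet below with 'gcc -std=c11 -Wall -pedantic -Werror -c repro.c'
produces the same "ISO C does not support '__int128' types" error, because
-pedantic diagnoses any use of the GCC-specific __int128 extension that is
not explicitly marked as such.

/* repro.c -- mirrors the union added to rte_atomic_64.h */
#include <stdint.h>

typedef struct {
	union {
		uint64_t val[2];
		__int128 int128;	/* the line -pedantic complains about */
	};
} __attribute__((aligned(16))) my_int128_t;

my_int128_t dummy;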

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-04 11:47     ` Ferruh Yigit
@ 2019-04-04 12:08       ` Thomas Monjalon
  2019-04-04 12:12         ` Thomas Monjalon
  0 siblings, 1 reply; 36+ messages in thread
From: Thomas Monjalon @ 2019-04-04 12:08 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Gage Eads, Shahaf Shuler, Matan Azrad, Yongseok Koh, dev,
	olivier.matz, arybchenko, bruce.richardson, konstantin.ananyev,
	gavin.hu, Honnappa.Nagarahalli, nd, chaozhu, jerinj,
	hemant.agrawal

04/04/2019 13:47, Ferruh Yigit:
> .../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3: error: ISO C
> does not support ‘__int128’ types [-Werror=pedantic]

We can try this kind of workaround (disable pedantic locally):
https://github.com/HowardHinnant/date/pull/38/commits/177032852d5b4614112ca1ab3ef42d6b41824816

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-04 12:08       ` Thomas Monjalon
@ 2019-04-04 12:12         ` Thomas Monjalon
  2019-04-04 12:14           ` Eads, Gage
  0 siblings, 1 reply; 36+ messages in thread
From: Thomas Monjalon @ 2019-04-04 12:12 UTC (permalink / raw)
  To: Ferruh Yigit, Gage Eads
  Cc: Shahaf Shuler, Matan Azrad, Yongseok Koh, dev, olivier.matz,
	arybchenko, bruce.richardson, konstantin.ananyev, gavin.hu,
	Honnappa.Nagarahalli, nd, chaozhu, jerinj, hemant.agrawal

04/04/2019 14:08, Thomas Monjalon:
> 04/04/2019 13:47, Ferruh Yigit:
> > .../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3: error: ISO C
> > does not support ‘__int128’ types [-Werror=pedantic]
> 
> We can try this kind of workaround (disable pedantic locally):
> https://github.com/HowardHinnant/date/pull/38/commits/177032852d5b4614112ca1ab3ef42d6b41824816

Or better:
__extension__ typedef __int128 int128;

^ permalink raw reply	[flat|nested] 36+ messages in thread
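Both workarounds aim at the same thing: telling the compiler that the
extension use is intentional. The linked commit does it with
'#pragma GCC diagnostic push/ignored "-Wpedantic"/pop' around the
declaration, while __extension__ marks the single declaration that uses the
extension, which is the simpler change suggested here and effectively the
patch posted later in the thread. A sketch of the marker applied to the
union (this compiles cleanly with 'gcc -std=c11 -Wall -pedantic -Werror -c'):

#include <stdint.h>

typedef struct {
	union {
		uint64_t val[2];
		/* __extension__ suppresses the pedantic diagnostic for the
		 * GCC-specific __int128 type on this declaration only. */
		__extension__ __int128 int128;
	};
} __attribute__((aligned(16))) my_int128_t;

my_int128_t dummy;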

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-04 12:12         ` Thomas Monjalon
@ 2019-04-04 12:14           ` Eads, Gage
  2019-04-04 12:18             ` Thomas Monjalon
  0 siblings, 1 reply; 36+ messages in thread
From: Eads, Gage @ 2019-04-04 12:14 UTC (permalink / raw)
  To: Thomas Monjalon, Yigit, Ferruh
  Cc: Shahaf Shuler, Matan Azrad, Yongseok Koh, dev, olivier.matz,
	arybchenko, Richardson, Bruce, Ananyev, Konstantin, gavin.hu,
	Honnappa.Nagarahalli, nd, chaozhu, jerinj, hemant.agrawal



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Thursday, April 4, 2019 7:13 AM
> To: Yigit, Ferruh <ferruh.yigit@intel.com>; Eads, Gage
> <gage.eads@intel.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; Matan Azrad
> <matan@mellanox.com>; Yongseok Koh <yskoh@mellanox.com>;
> dev@dpdk.org; olivier.matz@6wind.com; arybchenko@solarflare.com;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; gavin.hu@arm.com;
> Honnappa.Nagarahalli@arm.com; nd@arm.com;
> chaozhu@linux.vnet.ibm.com; jerinj@marvell.com;
> hemant.agrawal@nxp.com
> Subject: Re: [dpdk-dev] [PATCH v6] eal/x86: add 128-bit atomic compare
> exchange
> 
> 04/04/2019 14:08, Thomas Monjalon:
> > 04/04/2019 13:47, Ferruh Yigit:
> > > .../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3:
> > > error: ISO C does not support ‘__int128’ types [-Werror=pedantic]
> >
> > We can try this kind of workaround (disable pedantic locally):
> >
> > https://github.com/HowardHinnant/date/pull/38/commits/177032852d5b4614112ca1ab3ef42d6b41824816
> 
> Or better:
> __extension__ typedef __int128 int128;
> 

Taking that one step further -- RTE_STD_C11 evaluates to __extension__ (when the STD C version is sufficiently old).

^ permalink raw reply	[flat|nested] 36+ messages in thread
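For reference, RTE_STD_C11 in rte_common.h is defined approximately as below
(paraphrased, not a verbatim quote). It exists to mark C11 constructs such as
anonymous unions so they do not trip -pedantic under older standards; under a
C11 compiler it expands to nothing, and since __int128 is a GCC extension
rather than a C11 feature, the pedantic warning would remain -- which is the
point raised in the next reply.

/* approximate definition, for illustration */
#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L)
#define RTE_STD_C11 __extension__
#else
#define RTE_STD_C11
#endif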

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-04 12:14           ` Eads, Gage
@ 2019-04-04 12:18             ` Thomas Monjalon
  2019-04-04 12:22               ` Eads, Gage
                                 ` (2 more replies)
  0 siblings, 3 replies; 36+ messages in thread
From: Thomas Monjalon @ 2019-04-04 12:18 UTC (permalink / raw)
  To: Eads, Gage, Yigit, Ferruh
  Cc: Shahaf Shuler, Matan Azrad, Yongseok Koh, dev, olivier.matz,
	arybchenko, Richardson, Bruce, Ananyev, Konstantin, gavin.hu,
	Honnappa.Nagarahalli, nd, chaozhu, jerinj, hemant.agrawal

04/04/2019 14:14, Eads, Gage:
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > 04/04/2019 14:08, Thomas Monjalon:
> > > 04/04/2019 13:47, Ferruh Yigit:
> > > > .../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3:
> > > > error: ISO C does not support ‘__int128’ types [-Werror=pedantic]
> > >
> > > We can try this kind of workaround (disable pedantic locally):
> > >
> > > https://github.com/HowardHinnant/date/pull/38/commits/177032852d5b4614112ca1ab3ef42d6b41824816
> > 
> > Or better:
> > __extension__ typedef __int128 int128;
> > 
> 
> Taking that one step further -- RTE_STD_C11 evaluates to __extension__ (when the STD C version is sufficiently old).

I don't think __int128 is part of C11. Is it?

Ferruh, I cannot reproduce the compiler error. What is your compiler?
Please, could you test this patch?

--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -220,7 +220,7 @@ typedef struct {
        RTE_STD_C11
        union {
                uint64_t val[2];
-               __int128 int128;
+               __extension__ __int128 int128;
        };
 } __rte_aligned(16) rte_int128_t;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-04 12:18             ` Thomas Monjalon
@ 2019-04-04 12:22               ` Eads, Gage
  2019-04-04 12:24               ` Eads, Gage
  2019-04-04 12:52               ` Ferruh Yigit
  2 siblings, 0 replies; 36+ messages in thread
From: Eads, Gage @ 2019-04-04 12:22 UTC (permalink / raw)
  To: Thomas Monjalon, Yigit, Ferruh
  Cc: Shahaf Shuler, Matan Azrad, Yongseok Koh, dev, olivier.matz,
	arybchenko, Richardson, Bruce, Ananyev, Konstantin, gavin.hu,
	Honnappa.Nagarahalli, nd, chaozhu, jerinj, hemant.agrawal

> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Thursday, April 4, 2019 7:18 AM
> To: Eads, Gage <gage.eads@intel.com>; Yigit, Ferruh
> <ferruh.yigit@intel.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; Matan Azrad
> <matan@mellanox.com>; Yongseok Koh <yskoh@mellanox.com>;
> dev@dpdk.org; olivier.matz@6wind.com; arybchenko@solarflare.com;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; gavin.hu@arm.com;
> Honnappa.Nagarahalli@arm.com; nd@arm.com;
> chaozhu@linux.vnet.ibm.com; jerinj@marvell.com;
> hemant.agrawal@nxp.com
> Subject: Re: [dpdk-dev] [PATCH v6] eal/x86: add 128-bit atomic compare
> exchange
> 
> 04/04/2019 14:14, Eads, Gage:
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > 04/04/2019 14:08, Thomas Monjalon:
> > > > 04/04/2019 13:47, Ferruh Yigit:
> > > > > .../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3:
> > > > > error: ISO C does not support ‘__int128’ types [-Werror=pedantic]
> > > >
> > > > We can try this kind of workaround (disable pedantic locally):
> > > >
> > >
> > > > https://github.com/HowardHinnant/date/pull/38/commits/177032852d5b4614112ca1ab3ef42d6b41824816
> > >
> > > Or better:
> > > __extension__ typedef __int128 int128;
> > >
> >
> > Taking that one step further -- RTE_STD_C11 evaluates to __extension__
> (when the STD C version is sufficiently old).
> 
> I don't think __int128 is part of C11. Is it?

You're right, this is a compiler extension. Using '__extension__' makes more sense.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-04 12:18             ` Thomas Monjalon
  2019-04-04 12:22               ` Eads, Gage
@ 2019-04-04 12:24               ` Eads, Gage
  2019-04-04 12:52               ` Ferruh Yigit
  2 siblings, 0 replies; 36+ messages in thread
From: Eads, Gage @ 2019-04-04 12:24 UTC (permalink / raw)
  To: Thomas Monjalon, Yigit, Ferruh
  Cc: Shahaf Shuler, Matan Azrad, Yongseok Koh, dev, olivier.matz,
	arybchenko, Richardson, Bruce, Ananyev, Konstantin, gavin.hu,
	Honnappa.Nagarahalli, nd, chaozhu, jerinj, hemant.agrawal



> -----Original Message-----
> From: Eads, Gage
> Sent: Thursday, April 4, 2019 7:23 AM
> To: 'Thomas Monjalon' <thomas@monjalon.net>; Yigit, Ferruh
> <ferruh.yigit@intel.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; Matan Azrad
> <matan@mellanox.com>; Yongseok Koh <yskoh@mellanox.com>;
> dev@dpdk.org; olivier.matz@6wind.com; arybchenko@solarflare.com;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; gavin.hu@arm.com;
> Honnappa.Nagarahalli@arm.com; nd@arm.com;
> chaozhu@linux.vnet.ibm.com; jerinj@marvell.com;
> hemant.agrawal@nxp.com
> Subject: RE: [dpdk-dev] [PATCH v6] eal/x86: add 128-bit atomic compare
> exchange
> 
> > -----Original Message-----
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Thursday, April 4, 2019 7:18 AM
> > To: Eads, Gage <gage.eads@intel.com>; Yigit, Ferruh
> > <ferruh.yigit@intel.com>
> > Cc: Shahaf Shuler <shahafs@mellanox.com>; Matan Azrad
> > <matan@mellanox.com>; Yongseok Koh <yskoh@mellanox.com>;
> dev@dpdk.org;
> > olivier.matz@6wind.com; arybchenko@solarflare.com; Richardson, Bruce
> > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; gavin.hu@arm.com;
> > Honnappa.Nagarahalli@arm.com; nd@arm.com;
> chaozhu@linux.vnet.ibm.com;
> > jerinj@marvell.com; hemant.agrawal@nxp.com
> > Subject: Re: [dpdk-dev] [PATCH v6] eal/x86: add 128-bit atomic compare
> > exchange
> >
> > 04/04/2019 14:14, Eads, Gage:
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > 04/04/2019 14:08, Thomas Monjalon:
> > > > > 04/04/2019 13:47, Ferruh Yigit:
> > > > > > .../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3:
> > > > > > error: ISO C does not support ‘__int128’ types [-Werror=pedantic]
> > > > >
> > > > > We can try this kind of workaround (disable pedantic locally):
> > > > >
> > > >
> >
> > > > > https://github.com/HowardHinnant/date/pull/38/commits/177032852d5b4614112ca1ab3ef42d6b41824816
> > > >
> > > > Or better:
> > > > __extension__ typedef __int128 int128;
> > > >
> > >
> > > Taking that one step further -- RTE_STD_C11 evaluates to
> > > __extension__
> > (when the STD C version is sufficiently old).
> >
> > I don't think __int128 is part of C11. Is it?
> 
> You're right, this is a compiler extension. Using '__extension__' makes more
> sense.

FYI, __int128 was added in GCC 4.6 (and DPDK's minimum supported GCC version is currently 4.9).

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6] eal/x86: add 128-bit atomic compare exchange
  2019-04-04 12:18             ` Thomas Monjalon
  2019-04-04 12:22               ` Eads, Gage
  2019-04-04 12:24               ` Eads, Gage
@ 2019-04-04 12:52               ` Ferruh Yigit
  2 siblings, 0 replies; 36+ messages in thread
From: Ferruh Yigit @ 2019-04-04 12:52 UTC (permalink / raw)
  To: Thomas Monjalon, Eads, Gage
  Cc: Shahaf Shuler, Matan Azrad, Yongseok Koh, dev, olivier.matz,
	arybchenko, Richardson, Bruce, Ananyev, Konstantin, gavin.hu,
	Honnappa.Nagarahalli, nd, chaozhu, jerinj, hemant.agrawal

On 4/4/2019 1:18 PM, Thomas Monjalon wrote:
> 04/04/2019 14:14, Eads, Gage:
>> From: Thomas Monjalon [mailto:thomas@monjalon.net]
>>> 04/04/2019 14:08, Thomas Monjalon:
>>>> 04/04/2019 13:47, Ferruh Yigit:
>>>>> .../dpdk/x86_64-native-linuxapp-gcc/include/rte_atomic_64.h:223:3:
>>>>> error: ISO C does not support ‘__int128’ types [-Werror=pedantic]
>>>>
>>>> We can try this kind of workaround (disable pedantic locally):
>>>>
>>>> https://github.com/HowardHinnant/date/pull/38/commits/177032852d5b4614112ca1ab3ef42d6b41824816
>>>
>>> Or better:
>>> __extension__ typedef __int128 int128;
>>>
>>
>> Taking that one step further -- RTE_STD_C11 evaluates to __extension__ (when the STD C version is sufficiently old).
> 
> I don't think __int128 is part of C11. Is it?
> 
> Ferruh, I cannot reproduce the compiler error. What is your compiler?
> Please, could you test this patch?

It is gcc [1]. Sure, I will test it.

[1]
gcc (GCC) 8.3.1 20190223 (Red Hat 8.3.1-2)

> 
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -220,7 +220,7 @@ typedef struct {
>         RTE_STD_C11
>         union {
>                 uint64_t val[2];
> -               __int128 int128;
> +               __extension__ __int128 int128;
>         };
>  } __rte_aligned(16) rte_int128_t;
> 
> 
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2019-04-04 12:52 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-28 17:29 [PATCH 0/1] Add 128-bit compare and set Gage Eads
2019-01-28 17:29 ` [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only) Gage Eads
2019-01-28 23:01   ` Ola Liljedahl
2019-02-01 17:06     ` Eads, Gage
2019-02-01 19:01       ` Ola Liljedahl
2019-02-01 19:28         ` Eads, Gage
2019-02-01 19:43           ` Ola Liljedahl
2019-02-01 21:05             ` Eads, Gage
2019-02-01 23:11               ` Ola Liljedahl
2019-02-04 18:33       ` Honnappa Nagarahalli
2019-01-31  5:48   ` Honnappa Nagarahalli
2019-02-01 17:11     ` Eads, Gage
2019-02-22 15:46 ` [PATCH v2 0/1] Add 128-bit compare and set Gage Eads
2019-02-22 15:46   ` [PATCH v2 1/1] eal: add 128-bit cmpxchg (x86-64 only) Gage Eads
2019-03-04 20:19     ` Honnappa Nagarahalli
2019-03-04 20:47       ` Eads, Gage
2019-03-04 20:51   ` [PATCH v3 0/1] Add 128-bit compare and set Gage Eads
2019-03-04 20:51     ` [PATCH v3 1/1] eal: add 128-bit compare exchange (x86-64 only) Gage Eads
2019-03-27 23:12       ` Thomas Monjalon
2019-03-28 16:22         ` Eads, Gage
2019-04-03 17:34     ` [PATCH v4 0/1] Add 128-bit compare and set Gage Eads
2019-04-03 17:34       ` [PATCH v4 1/1] eal: add 128-bit compare exchange (x86-64 only) Gage Eads
2019-04-03 19:04         ` Thomas Monjalon
2019-04-03 19:21           ` Eads, Gage
2019-04-03 19:27             ` Thomas Monjalon
2019-04-03 19:35 ` [PATCH v5] eal/x86: add 128-bit atomic compare exchange Thomas Monjalon
2019-04-03 19:44   ` [PATCH v6] " Gage Eads
2019-04-03 20:01     ` Thomas Monjalon
2019-04-04 11:47     ` Ferruh Yigit
2019-04-04 12:08       ` Thomas Monjalon
2019-04-04 12:12         ` Thomas Monjalon
2019-04-04 12:14           ` Eads, Gage
2019-04-04 12:18             ` Thomas Monjalon
2019-04-04 12:22               ` Eads, Gage
2019-04-04 12:24               ` Eads, Gage
2019-04-04 12:52               ` Ferruh Yigit
