All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics
@ 2019-03-11  8:14 Ruifeng Wang
  2019-03-11 14:16 ` Jerin Jacob Kollanukkaran
  2019-03-11 15:33 ` Honnappa Nagarahalli
  0 siblings, 2 replies; 5+ messages in thread
From: Ruifeng Wang @ 2019-03-11  8:14 UTC (permalink / raw)
  To: wenzhuo.lu, jingjing.wu, bernard.iremonger
  Cc: dev, jerinj, hemant.agrawal, Honnappa.Nagarahalli, nd, Ruifeng Wang

Improved MAC swap performance for ARM platform.
The improvement was achieved by using neon intrinsics
to save CPU cycles and doing swap for four packets
at a time.
The optimization had 15% - 20% throughput boost
in testpmd MAC swap mode.

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
---
 app/test-pmd/macswap.c      |  4 +-
 app/test-pmd/macswap_neon.h | 93 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 app/test-pmd/macswap_neon.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index cbb41b7..71af916 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,8 +66,10 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86)
 #include "macswap_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "macswap_neon.h"
 #else
 #include "macswap.h"
 #endif
diff --git a/app/test-pmd/macswap_neon.h b/app/test-pmd/macswap_neon.h
new file mode 100644
index 0000000..bad1b9b
--- /dev/null
+++ b/app/test-pmd/macswap_neon.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _MACSWAP_NEON_H_
+#define _MACSWAP_NEON_H_
+
+#include "macswap_common.h"
+#include "rte_vect.h"
+
+/* Swap dst/src MAC of nb packets in place: groups of four, then the rest. */
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr[4];
+	struct rte_mbuf *mb[4];
+	uint64_t ol_flags;
+	int i = 0;	/* index of the next packet to process */
+	int r = nb;	/* packets not yet handled by the bulk loop */
+	uint8x16_t v0, v1, v2, v3;
+	/**
+	 * Index map used to shuffle the 16 bytes loaded from each packet:
+	 * bytes 0-5 are swapped with bytes 6-11,
+	 * bytes 12-15 are kept unchanged.
+	 */
+	const uint8x16_t idx_map = {6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5,
+				12, 13, 14, 15};
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+	vlan_qinq_set(pkts, nb, ol_flags,
+			txp->tx_vlan_id, txp->tx_vlan_id_outer);
+
+	/* Bulk loop: four packets per pass; prefetch the next four if any. */
+	while (r >= 4) {
+		if (r >= 8) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 4], void *));
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 5], void *));
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 6], void *));
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 7], void *));
+		}
+
+		mb[0] = pkts[i++];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+
+		mb[1] = pkts[i++];
+		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct ether_hdr *);
+
+		mb[2] = pkts[i++];
+		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct ether_hdr *);
+
+		mb[3] = pkts[i++];
+		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct ether_hdr *);
+
+		v0 = vld1q_u8((uint8_t const *)eth_hdr[0]);
+		v1 = vld1q_u8((uint8_t const *)eth_hdr[1]);
+		v2 = vld1q_u8((uint8_t const *)eth_hdr[2]);
+		v3 = vld1q_u8((uint8_t const *)eth_hdr[3]);
+
+		v0 = vqtbl1q_u8(v0, idx_map);
+		v1 = vqtbl1q_u8(v1, idx_map);
+		v2 = vqtbl1q_u8(v2, idx_map);
+		v3 = vqtbl1q_u8(v3, idx_map);
+
+		vst1q_u8((uint8_t *)eth_hdr[0], v0);
+		vst1q_u8((uint8_t *)eth_hdr[1], v1);
+		vst1q_u8((uint8_t *)eth_hdr[2], v2);
+		vst1q_u8((uint8_t *)eth_hdr[3], v3);
+
+		mbuf_field_set(mb[0], ol_flags);
+		mbuf_field_set(mb[1], ol_flags);
+		mbuf_field_set(mb[2], ol_flags);
+		mbuf_field_set(mb[3], ol_flags);
+		r -= 4;
+	}
+
+	/* Tail loop: remaining (fewer than four) packets, one at a time. */
+	for ( ; i < nb; i++) {
+		if (i < nb - 1)
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb[0] = pkts[i];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		v0 = vld1q_u8((uint8_t const *)eth_hdr[0]);
+		v0 = vqtbl1q_u8(v0, idx_map);
+		vst1q_u8((uint8_t *)eth_hdr[0], v0);
+
+		mbuf_field_set(mb[0], ol_flags);
+	}
+}
+
+#endif /* _MACSWAP_NEON_H_ */
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics
  2019-03-11  8:14 [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics Ruifeng Wang
@ 2019-03-11 14:16 ` Jerin Jacob Kollanukkaran
  2019-03-12  1:34   ` Ruifeng Wang (Arm Technology China)
  2019-03-11 15:33 ` Honnappa Nagarahalli
  1 sibling, 1 reply; 5+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-03-11 14:16 UTC (permalink / raw)
  To: ruifeng.wang, jingjing.wu, bernard.iremonger, wenzhuo.lu
  Cc: Honnappa.Nagarahalli, nd, hemant.agrawal, dev

On Mon, 2019-03-11 at 16:14 +0800, Ruifeng Wang wrote:
> -------------------------------------------------------------------
> ---
> Improved MAC swap performance for ARM platform.
> The improvement was achieved by using neon intrinsics
> to save CPU cycles and doing swap for four packets
> at a time.
> The optimization had 15% - 20% throughput boost
> in testpmd MAC swap mode.
> 
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> ---
>  app/test-pmd/macswap.c      |  4 +-
>  app/test-pmd/macswap_neon.h | 93
> +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 96 insertions(+), 1 deletion(-)
>  create mode 100644 app/test-pmd/macswap_neon.h
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> 
> +static inline void
> +do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
> +		struct rte_port *txp)
> +{
> +	struct ether_hdr *eth_hdr[4];
> +	struct rte_mbuf *mb[4];
> +	uint64_t ol_flags;
> +	int i;
> +	int r;
> +	uint8x16_t v0, v1, v2, v3;
> +	/**
> +	 * Index map be used to shuffle the 16 bytes.
> +	 * byte 0-5 will be swapped with byte 6-11.
> +	 * byte 12-15 will keep unchanged.
> +	 */
> +	uint8x16_t idx_map = {6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5,
> +				12, 13, 14, 15};

Nit: I think, we can make it as "const uint8x16_t idx_map".

Other than that it looks good to me.
Regarding the performance, I have tested with two SoCs.

octeontx: +13% improvement
octeontx2: +46% improvement


Acked-by: Jerin Jacob <jerinj@marvell.com>





^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics
  2019-03-11  8:14 [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics Ruifeng Wang
  2019-03-11 14:16 ` Jerin Jacob Kollanukkaran
@ 2019-03-11 15:33 ` Honnappa Nagarahalli
  2019-03-12  1:37   ` Ruifeng Wang (Arm Technology China)
  1 sibling, 1 reply; 5+ messages in thread
From: Honnappa Nagarahalli @ 2019-03-11 15:33 UTC (permalink / raw)
  To: Ruifeng Wang (Arm Technology China),
	wenzhuo.lu, jingjing.wu, bernard.iremonger
  Cc: dev, jerinj, hemant.agrawal, nd,
	Ruifeng Wang (Arm Technology China),
	Honnappa Nagarahalli, nd

> Improved MAC swap performance for ARM platform.
> The improvement was achieved by using neon intrinsics to save CPU cycles and
> doing swap for four packets at a time.
> The optimization had 15% - 20% throughput boost in testpmd MAC swap mode.
> 
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> ---
>  app/test-pmd/macswap.c      |  4 +-
>  app/test-pmd/macswap_neon.h | 93 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 96 insertions(+), 1 deletion(-)
>  create mode 100644 app/test-pmd/macswap_neon.h
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> index cbb41b7..71af916 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -66,8 +66,10 @@
>  #include <rte_flow.h>
> 
>  #include "testpmd.h"
> -#ifdef RTE_ARCH_X86
> +#if defined(RTE_ARCH_X86)
>  #include "macswap_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "macswap_neon.h"
>  #else
>  #include "macswap.h"
>  #endif
> diff --git a/app/test-pmd/macswap_neon.h b/app/test-pmd/macswap_neon.h
> new file mode 100644
> index 0000000..bad1b9b
> --- /dev/null
> +++ b/app/test-pmd/macswap_neon.h
> @@ -0,0 +1,93 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2019 Arm Limited
This change is derived from Intel's work in macswap_sse.h. Can you please add a header similar to lib/librte_lpm/rte_lpm_neon.h?

> + */
> +
> +#ifndef _MACSWAP_NEON_H_
> +#define _MACSWAP_NEON_H_
> +

<snip>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics
  2019-03-11 14:16 ` Jerin Jacob Kollanukkaran
@ 2019-03-12  1:34   ` Ruifeng Wang (Arm Technology China)
  0 siblings, 0 replies; 5+ messages in thread
From: Ruifeng Wang (Arm Technology China) @ 2019-03-12  1:34 UTC (permalink / raw)
  To: jerinj, jingjing.wu, bernard.iremonger, wenzhuo.lu
  Cc: Honnappa Nagarahalli, nd, hemant.agrawal, dev, nd



Regards,
/Ruifeng

> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: 2019年3月11日 22:17
> To: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>;
> jingjing.wu@intel.com; bernard.iremonger@intel.com;
> wenzhuo.lu@intel.com
> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; nd
> <nd@arm.com>; hemant.agrawal@nxp.com; dev@dpdk.org
> Subject: Re: [PATCH v1] app/testpmd: optimized MAC swap by using neon
> intrinsics
> 
> On Mon, 2019-03-11 at 16:14 +0800, Ruifeng Wang wrote:
> > -------------------------------------------------------------------
> > ---
> > Improved MAC swap performance for ARM platform.
> > The improvement was achieved by using neon intrinsics to save CPU
> > cycles and doing swap for four packets at a time.
> > The optimization had 15% - 20% throughput boost in testpmd MAC swap
> > mode.
> >
> > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > ---
> >  app/test-pmd/macswap.c      |  4 +-
> >  app/test-pmd/macswap_neon.h | 93
> > +++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 96 insertions(+), 1 deletion(-)  create mode 100644
> > app/test-pmd/macswap_neon.h
> >
> > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> >
> > +static inline void
> > +do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
> > +		struct rte_port *txp)
> > +{
> > +	struct ether_hdr *eth_hdr[4];
> > +	struct rte_mbuf *mb[4];
> > +	uint64_t ol_flags;
> > +	int i;
> > +	int r;
> > +	uint8x16_t v0, v1, v2, v3;
> > +	/**
> > +	 * Index map be used to shuffle the 16 bytes.
> > +	 * byte 0-5 will be swapped with byte 6-11.
> > +	 * byte 12-15 will keep unchanged.
> > +	 */
> > +	uint8x16_t idx_map = {6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5,
> > +				12, 13, 14, 15};
> 
> Nit: I think, we can make it as "const uint8x16_t idx_map".
> 
> Other than that it looks good to me.
> Regarding the performance, I have tested with two SoCs.
> 
> octeontx: +13% improvement
> octeontx2: +46% improvement
> 
> 
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> 
Thanks Jerin for your test and data.
The code change will be included in v2.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics
  2019-03-11 15:33 ` Honnappa Nagarahalli
@ 2019-03-12  1:37   ` Ruifeng Wang (Arm Technology China)
  0 siblings, 0 replies; 5+ messages in thread
From: Ruifeng Wang (Arm Technology China) @ 2019-03-12  1:37 UTC (permalink / raw)
  To: Honnappa Nagarahalli, wenzhuo.lu, jingjing.wu, bernard.iremonger
  Cc: dev, jerinj, hemant.agrawal, nd, nd, nd



Regards,
/Ruifeng

> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: 2019年3月11日 23:33
> To: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>;
> wenzhuo.lu@intel.com; jingjing.wu@intel.com;
> bernard.iremonger@intel.com
> Cc: dev@dpdk.org; jerinj@marvell.com; hemant.agrawal@nxp.com; nd
> <nd@arm.com>; Ruifeng Wang (Arm Technology China)
> <Ruifeng.Wang@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v1] app/testpmd: optimized MAC swap by using neon
> intrinsics
> 
> > Improved MAC swap performance for ARM platform.
> > The improvement was achieved by using neon intrinsics to save CPU
> > cycles and doing swap for four packets at a time.
> > The optimization had 15% - 20% throughput boost in testpmd MAC swap
> > mode.
> >
> > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > ---
> >  app/test-pmd/macswap.c      |  4 +-
> >  app/test-pmd/macswap_neon.h | 93 +++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 96 insertions(+), 1 deletion(-)
> >  create mode 100644 app/test-pmd/macswap_neon.h
> >
> > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> > index cbb41b7..71af916 100644
> > --- a/app/test-pmd/macswap.c
> > +++ b/app/test-pmd/macswap.c
> > @@ -66,8 +66,10 @@
> >  #include <rte_flow.h>
> >
> >  #include "testpmd.h"
> > -#ifdef RTE_ARCH_X86
> > +#if defined(RTE_ARCH_X86)
> >  #include "macswap_sse.h"
> > +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> > +#include "macswap_neon.h"
> >  #else
> >  #include "macswap.h"
> >  #endif
> > diff --git a/app/test-pmd/macswap_neon.h b/app/test-pmd/macswap_neon.h
> > new file mode 100644
> > index 0000000..bad1b9b
> > --- /dev/null
> > +++ b/app/test-pmd/macswap_neon.h
> > @@ -0,0 +1,93 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2019 Arm Limited
> This change is derived from Intel's work in macswap_sse.h. Can you please
> add a header similar to lib/librte_lpm/rte_lpm_neon.h?
> 
Sure. Will update file header in v2.
Thanks for your suggestion.

> > + */
> > +
> > +#ifndef _MACSWAP_NEON_H_
> > +#define _MACSWAP_NEON_H_
> > +
> 
> <snip>


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-03-12  1:37 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-11  8:14 [PATCH v1] app/testpmd: optimized MAC swap by using neon intrinsics Ruifeng Wang
2019-03-11 14:16 ` Jerin Jacob Kollanukkaran
2019-03-12  1:34   ` Ruifeng Wang (Arm Technology China)
2019-03-11 15:33 ` Honnappa Nagarahalli
2019-03-12  1:37   ` Ruifeng Wang (Arm Technology China)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.